from stanza.nlp.corenlp import CoreNLPClient

# Lazily-created module-level client, shared across calls.
client = None


def annotate(sent):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words = []
    for s in client.annotate(sent).sentences:
        for tok in s:
            words.append(tok.word)
    return words
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and dependency parsing.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parsing.

        :param sent: string
        :return: a list with one (words, pos_tags, dependencies) tuple per sentence
        """
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            word, pos, dep = [], [], []
            for token in s:
                word += [token.word]
                pos += [token.pos]
            # Convert the 1-based governor/dependent indices to 0-based token indices.
            edges = s.depparse(mode='enhanced').to_json()
            for e in edges:
                dep.append({
                    'type': e['dep'],
                    'dep': e['dependent'] - 1,
                    'gov': e['governer'] - 1
                })
            tuples.append((word, pos, dep))
        return tuples
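A short usage sketch for the dependency-parsing class above. It assumes the snippet's CoreNLPClient import and a running CoreNLP server with the depparse annotator at the client's default address; the sample sentence is made up.

# Usage sketch (hypothetical sentence; requires a CoreNLP server with 'depparse').
if __name__ == '__main__':
    nlp = NLPParser()
    for words, pos_tags, deps in nlp.parse('Stanford is located in California.'):
        print(list(zip(words, pos_tags)))
        for d in deps:
            # 'gov'/'dep' are 0-based token indices; a governor of -1 would correspond to ROOT.
            gov = 'ROOT' if d['gov'] < 0 else words[d['gov']]
            print('%s(%s, %s)' % (d['type'], gov, words[d['dep']]))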
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and named entity recognition.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'ner'],
            server="http://localhost:9000")
        # self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            tokens, pos, ner = [], [], []
            for token in s:
                tokens += [token.word]
                pos += [token.pos]
                ner += [token.ner]
            tuples.append((tokens, pos, ner))
        return tuples
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
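A hedged usage sketch for annotate() above. It assumes the module-level client and CoreNLPClient import the function relies on, plus a tokenize/ssplit CoreNLP server; the sample sentence is invented.

# Usage sketch: the three lists are token-aligned, so 'gloss' + 'after' roughly
# reconstructs the original surface string.
if __name__ == '__main__':
    ann = annotate('Show me all flights from Denver to Boston.')
    print(ann['words'])  # lower-cased tokens (lower=True by default)
    print(''.join(g + a for g, a in zip(ann['gloss'], ann['after'])))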
def tokenize(df):
    from stanza.nlp.corenlp import CoreNLPClient
    parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize'],
                           server='http://localhost:9000')
    parsed = []
    for item in df['posts']:
        temp = []
        for sentence in item.strip().split('|||'):
            try:
                result = parser.annotate(sentence)
                tokens = []
                for i in range(len(result.sentences)):
                    tokens += result.sentences[i].tokens
                temp.append(' '.join([token.word for token in tokens]))
            except Exception:
                print('error', sentence)
        parsed.append(' <RETURN> '.join(temp))
    df['posts'] = parsed
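A small usage sketch for tokenize() above on a toy DataFrame. The 'posts' column and the '|||' separator come from the function itself; the sample text and the pandas import are assumptions, and a CoreNLP server must be running at http://localhost:9000.

# Usage sketch with a made-up two-post record.
import pandas as pd

df = pd.DataFrame({'posts': ['I love NLP.|||CoreNLP makes tokenization easy.']})
tokenize(df)                # modifies df['posts'] in place
print(df['posts'].iloc[0])  # tokens joined by spaces, posts joined by ' <RETURN> '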
import csv
from collections import namedtuple

from tqdm import tqdm

from stanza.nlp.corenlp import CoreNLPClient


def do_command(args):
    reader = csv.reader(args.input, delimiter="\t")
    header = next(reader)
    assert all(field in header for field in ("id", "text"))
    Tweet = namedtuple("Tweet", header)

    client = CoreNLPClient()
    annotators = "tokenize ssplit lemma pos".split()

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["id", "tokens", "lemmas", "pos_tags"])
    for tweet in tqdm(Tweet(*row) for row in reader):
        doc = client.annotate(tweet.text, annotators)
        tokens, lemmas, pos_tags = [], [], []
        for sentence in doc:
            tokens += sentence.words
            lemmas += sentence.lemmas
            pos_tags += sentence.pos_tags
        # to_psql_array is a project helper defined elsewhere (presumably it
        # formats a Python list as a PostgreSQL array literal).
        writer.writerow([tweet.id, to_psql_array(tokens), to_psql_array(lemmas),
                         to_psql_array(pos_tags)])
def get_annotate(sentence, lower=True):
    # note: returns 4 parallel (token-aligned) lists
    # todo: handle [Salmonella spp.] -> ['salmonella', 'spp.', '.']
    global client
    if client is None:
        client = CoreNLPClient(
            server='http://localhost:9000',
            default_annotators=['ssplit', 'tokenize', 'pos'])
    tokenize, origin, pos_tag, after = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            if lower:
                tokenize.append(t.word.lower())
                origin.append(t.originalText.lower())
            else:
                tokenize.append(t.word)
                origin.append(t.originalText)
            pos_tag.append(t.pos)
            after.append(t.after)
    return tokenize, origin, pos_tag, after
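A brief usage sketch for get_annotate() above, assuming the same lazily-initialised module-level client and a CoreNLP server at localhost:9000; it only illustrates that the four returned lists are parallel.

# Usage sketch (hypothetical sentence).
if __name__ == '__main__':
    toks, orig, tags, after = get_annotate('Salmonella was detected in 5 samples.')
    for w, o, p in zip(toks, orig, tags):
        print(w, o, p)  # token, original text, POS tag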
class NLPParser(object):
    """
    NLP parser, including named entity recognition.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def get_ner(self, tokens):
        # Annotate a pre-tokenized sentence (joined with spaces) and collect the
        # NER tag of each token; only the first sentence of the result is used.
        sent = ' '.join(tokens)
        result = self.parser.annotate(sent)
        ner = []
        for token in result.sentences[0]:
            ner.append(token.ner)
        return ner
class NLPParser(object):
    """
    NLP parser, including named entity recognition.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tokens_list, ner_list = [], []
        for s in result.sentences:
            tokens, ner = [], []
            currNERType = 'O'
            currNER = ''
            for token in s:
                token_ner = token.ner
                # Collapse any tag we do not care about to 'O'.
                if token_ner not in INTERESTED_STANFORD_EM_TYPES:
                    token_ner = 'O'
                tokens += [token.word]
                if token_ner == 'O':
                    # End of a mention, if one was being built.
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNER = ''
                elif token_ner == currNERType:
                    # Same entity type: extend the current mention.
                    currNER += token.word + ' '
                else:
                    # New entity type: flush the previous mention and start a new one.
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNERType = token_ner
                    currNER = token.word + ' '
            # Flush a mention that runs to the end of the sentence.
            if currNER != '':
                ner.append(currNER.strip())
            if len(tokens) == 0 or len(ner) == 0:
                continue
            tokens_list.append(tokens)
            ner_list.append(ner)
        return tokens_list, ner_list
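A hedged usage sketch for the chunking parser above. INTERESTED_STANFORD_EM_TYPES is referenced but not defined in the snippet, so the set below is a hypothetical placeholder and must live in the same module as the class; an NER-enabled CoreNLP server is also required.

# Usage sketch; INTERESTED_STANFORD_EM_TYPES is a hypothetical placeholder here.
INTERESTED_STANFORD_EM_TYPES = {'PERSON', 'ORGANIZATION', 'LOCATION'}

if __name__ == '__main__':
    nlp = NLPParser()
    tokens_list, ner_list = nlp.parse('Barack Obama visited Stanford University.')
    for tokens, mentions in zip(tokens_list, ner_list):
        print(tokens)    # all tokens of the sentence
        print(mentions)  # merged multi-word mentions, e.g. ['Barack Obama', 'Stanford University']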
                    type=int, default=50, help="Maximum sequence length")
parser.add_argument('-shuffle', type=int, default=1, help="Shuffle data")
parser.add_argument('-seed', type=int, default=3435, help="Random seed")
parser.add_argument('-lower', action='store_true', help='lowercase data')
parser.add_argument('-report_every', type=int, default=100000,
                    help="Report status every this many sentences")

opt = parser.parse_args()
corenlp = CoreNLPClient(default_annotators=['tokenize', 'ssplit'])


def annotate_sentence(corenlp, gloss):
    try:
        parse = corenlp.annotate(gloss)
    except Exception:
        # Give the CoreNLP server a moment to recover, then retry once.
        time.sleep(10)
        parse = corenlp.annotate(gloss)
    token_str = ' '.join([
        token['word']
        for sentence in parse.json['sentence']
        for token in sentence['token']
    ])
    # return parse.json['sentence'][0]['token']
    return token_str
            'text': para_string,
            'fbEMs': entities_fb,
            'dbEMs': entities_db,
            'sentences': sentences
        }
        paragraph_count += 1
        output_file.write(json.dumps(paragraph) + "\n")

    wiki_file.close()
    output_file.close()


'''
Takes the path to a directory of folders and files extracted using WikiExtractor, and the
path to an output directory, as inputs. Writes the sentences, with the required extracted
information, as JSON files in the output directory.

The Wikipedia dump is extracted with the following command (preserving links):
    python WikiExtractor.py -l enwiki-20160920-pages-articles-multistream.xml
'''
if __name__ == "__main__":
    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]
    corenlp_client = CoreNLPClient(
        server='http://localhost:9000',
        default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])
    sparql = SPARQLWrapper.SPARQLWrapper("http://localhost:8890/sparql/")
    process_file(input_file_path, sparql, corenlp_client, output_file_path)
# Python 2 script (uses the built-in `unicode` type).
import unicodedata

# consider using notebooks
from stanza.nlp.corenlp import CoreNLPClient

client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize'])

marquez = 'marquez/marquez'  # encoded in latin-1
ulysses = 'ulysses/ulysses'  # encoded in utf-8
emma = 'emma/emma'           # encoded in utf-8

with open(marquez + "token", 'w') as writeFile:
    for i in range(1, 21):
        filename = marquez + "%02d" % i
        with open(filename, 'r') as myfile:
            chapter = unicode(myfile.read(), encoding='latin-1')
        # Strip accents / non-ASCII characters before annotating.
        chapter = unicodedata.normalize('NFD', chapter)
        chapter = chapter.encode('ascii', 'ignore')
        annotated = client.annotate(chapter)
        for sentence in annotated:
            for token in sentence:
                writeFile.write(token.word.encode('ascii', 'ignore'))
                writeFile.write("\n")
def __init__(self):
    self.parser = CoreNLPClient(
        default_annotators=['ssplit', 'tokenize', 'ner'],
        server='http://localhost:9001')
            state = {'model': model.state_dict()}
            torch.save(state, os.path.join('.', 'model_best.pt'))

            state = {'model_bert': model_bert.state_dict()}
            torch.save(state, os.path.join('.', 'model_bert_best.pt'))

        print(f" Best Dev lx acc: {acc_lx_t_best} at epoch: {epoch_best}")

if args.do_infer:
    # To use recent corenlp: https://github.com/stanfordnlp/python-stanford-corenlp
    # 1. pip install stanford-corenlp
    # 2. download the Java version
    # 3. export CORENLP_HOME=/Users/wonseok/utils/stanford-corenlp-full-2018-10-05
    from stanza.nlp.corenlp import CoreNLPClient
    client = CoreNLPClient(server='http://localhost:9000',
                           default_annotators='ssplit,tokenize'.split(','))

    # import corenlp
    # client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))

    nlu1 = "Which company have more than 100 employees?"
    path_db = './data_and_model'
    db_name = 'dev'
    data_table = load_jsonl('./data_and_model/dev.tables.jsonl')
    table_name = 'table_10015132_11'  # change the table here depending on the question
    n_Q = 100000 if args.infer_loop else 1
    for i in range(n_Q):
        if n_Q > 1:
            nlu1 = input('Type question: ')
        pr_sql_i, pr_ans = infer(nlu1,
def __init__(self):
    self.parser = CoreNLPClient(
        default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])
def _client(cls):
    if cls.__client is None:
        cls.__client = CoreNLPClient(server=config.CORENLP_SERVER,
                                     default_annotators=config.CORENLP_ANNOTATORS)
    return cls.__client
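The accessor above pulls its connection settings from a config module that is not shown; the values below are a hypothetical stand-in, shaped like the server URLs and annotator lists used in the other snippets.

# Hypothetical config module for the lazy _client() accessor above.
CORENLP_SERVER = 'http://localhost:9000'
CORENLP_ANNOTATORS = ['ssplit', 'tokenize', 'pos', 'ner']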
from stanza.nlp.corenlp import CoreNLPClient

client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])

annotated = client.annotate('This is an example document. Here is a second sentence')
for sentence in annotated.sentences:
    print('sentence', sentence)
    for token in sentence:
        print(token.word, token.lemma, token.pos, token.ner)
from typing import Sequence

from stanza.nlp.corenlp import CoreNLPClient


def corenlp_tokenize(s: str) -> Sequence[str]:
    # Lazily create a module-level client on first use.
    global corenlp_client
    if "corenlp_client" not in globals():
        corenlp_client = CoreNLPClient(
            default_annotators=["ssplit", "tokenize"])
    return [t.word
            for sentence in corenlp_client.annotate(s).sentences
            for t in sentence]
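A one-line usage sketch for corenlp_tokenize() above, assuming a CoreNLP server is reachable at the client's default address; the input string is made up.

# Usage sketch: all sentences are flattened into a single token list.
if __name__ == '__main__':
    print(corenlp_tokenize('Hello world. This is a test.'))
    # expected shape: ['Hello', 'world', '.', 'This', 'is', 'a', 'test', '.']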