class NLPParser(object): """ NLP parse, including Part-Of-Speech tagging and dependency parse. Attributes ========== parser: StanfordCoreNLP the Staford Core NLP parser """ def __init__(self): self.parser = CoreNLPClient( default_annotators=['ssplit', 'tokenize', 'pos', 'depparse']) def parse(self, sent): """ Part-Of-Speech tagging and dependency parse. :param sent: string :return: a list of tuple (word, pos, dependency) """ result = self.parser.annotate(sent) tuples = [] for s in result.sentences: word, pos, dep = [], [], [] for token in s: word += [token.word] pos += [token.pos] edges = s.depparse(mode='enhanced').to_json() for e in edges: dep.append({ 'type': e['dep'], 'dep': e['dependent'] - 1, 'gov': e['governer'] - 1 }) tuples.append((word, pos, dep)) return tuples
class NLPParser(object): """ NLP parse, including Part-Of-Speech tagging. Attributes ========== parser: StanfordCoreNLP the Staford Core NLP parser """ def __init__(self): self.parser = CoreNLPClient( default_annotators=['ssplit', 'tokenize', 'pos', 'ner'], server="http://localhost:9000") #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar') def parse(self, sent): result = self.parser.annotate(sent) tuples = [] for sent in result.sentences: tokens, pos, ner = [], [], [] for token in sent: tokens += [token.word] pos += [token.pos] ner += [token.ner] tuples.append((tokens, pos, ner)) return tuples
from stanza.nlp.corenlp import CoreNLPClient

client = None


def annotate(sent):
    """Tokenize a sentence (or short text) into a flat list of words."""
    global client
    if client is None:
        client = CoreNLPClient(default_annotators=['ssplit', 'tokenize'])
    words = []
    for s in client.annotate(sent).sentences:
        for tok in s:
            words.append(tok.word)
    return words
from stanza.nlp.corenlp import CoreNLPClient

client = None


def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators=['ssplit', 'tokenize'])
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)  # token text as it appeared in the input
            after.append(t.after)         # whitespace following the token
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
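Because annotate keeps both the original token text ('gloss') and the trailing whitespace ('after'), the input can be re-assembled from the output; a small round-trip sketch under that assumption (the sample sentence is invented):

ann = annotate('Barack Obama was born in Hawaii.')
reconstructed = ''.join(g + a for g, a in zip(ann['gloss'], ann['after']))
print(ann['words'])   # lower-cased tokens
print(reconstructed)  # should match the input, minus any leading whitespace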
def tokenize(df):
    from stanza.nlp.corenlp import CoreNLPClient
    parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize'],
                           server='http://localhost:9000')
    parsed = []
    for item in df['posts']:
        temp = []
        for sentence in item.strip().split('|||'):
            try:
                result = parser.annotate(sentence)
                tokens = []
                for i in range(len(result.sentences)):
                    tokens += result.sentences[i].tokens
                temp.append(' '.join(token.word for token in tokens))
            except Exception:
                print('error', sentence)
        parsed.append(' <RETURN> '.join(temp))
    df['posts'] = parsed
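A sketch of how tokenize might be called; the 'posts' column and the '|||' separator come from the snippet above, while the sample row is invented and a CoreNLP server is assumed to be running on localhost:9000:

import pandas as pd

df = pd.DataFrame({'posts': ['Hello world!|||How are you?']})
tokenize(df)  # modifies df in place
print(df['posts'][0])  # e.g. 'Hello world ! <RETURN> How are you ?'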
class NLPParser(object): """ NLP parse, including Part-Of-Speech tagging. Attributes ========== parser: StanfordCoreNLP the Staford Core NLP parser """ def __init__(self): self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner']) def get_ner(self, tokens): sent = ' '.join(tokens) result = self.parser.annotate(sent) ner = [] for token in result.sentences[0]: ner.append(token.ner) return ner
from stanza.nlp.corenlp import CoreNLPClient

client = None


def get_annotate(sentence, lower=True):
    # Note: returns four parallel lists.
    # TODO: handle [Salmonella spp.] -> ['salmonella', 'spp.', '.']
    global client
    if client is None:
        client = CoreNLPClient(
            server='http://localhost:9000',
            default_annotators=['ssplit', 'tokenize', 'pos'])
    tokenize, origin, pos_tag, after = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            word = t.word.lower() if lower else t.word
            original = t.originalText.lower() if lower else t.originalText
            tokenize.append(word)
            origin.append(original)
            pos_tag.append(t.pos)
            after.append(t.after)
    return tokenize, origin, pos_tag, after
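A sketch of calling get_annotate; the sentence echoes the TODO above and the printed output is indicative only:

toks, orig, tags, after = get_annotate('Salmonella spp. cause illness.')
for tok, tag in zip(toks, tags):
    print(tok, tag)  # lower-cased token and its POS tag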
import csv
from collections import namedtuple

from tqdm import tqdm

from stanza.nlp.corenlp import CoreNLPClient


def do_command(args):
    reader = csv.reader(args.input, delimiter="\t")
    header = next(reader)
    assert all(field in header for field in ("id", "text"))
    Tweet = namedtuple("Tweet", header)

    client = CoreNLPClient()
    annotators = "tokenize ssplit lemma pos".split()

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["id", "tokens", "lemmas", "pos_tags"])
    for tweet in tqdm(Tweet(*row) for row in reader):
        doc = client.annotate(tweet.text, annotators)
        tokens, lemmas, pos_tags = [], [], []
        for sentence in doc:
            tokens += sentence.words
            lemmas += sentence.lemmas
            pos_tags += sentence.pos_tags
        writer.writerow([tweet.id, to_psql_array(tokens),
                         to_psql_array(lemmas), to_psql_array(pos_tags)])
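to_psql_array is not defined in the snippet above; a plausible implementation, assuming it should render a list of strings as a PostgreSQL text-array literal:

def to_psql_array(items):
    # Hypothetical helper: ['a', 'b'] -> '{"a","b"}', escaping embedded
    # backslashes and double quotes as PostgreSQL array literals require.
    escaped = ('"%s"' % str(x).replace('\\', '\\\\').replace('"', '\\"')
               for x in items)
    return '{%s}' % ','.join(escaped)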
class NLPParser(object): """ NLP parse, including Part-Of-Speech tagging. Attributes ========== parser: StanfordCoreNLP the Staford Core NLP parser """ def __init__(self): self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner']) def parse(self, sent): result = self.parser.annotate(sent) tokens_list, ner_list = [], [] for sent in result.sentences: tokens, ner = [], [] currNERType = 'O' currNER = '' for token in sent: token_ner = token.ner if token_ner not in INTERESTED_STANFORD_EM_TYPES: token_ner = 'O' tokens += [token.word] if token_ner == 'O': if currNER != '': ner.append(currNER.strip()) currNER = '' elif token_ner == currNERType: currNER += token.word + ' ' else: if currNER != '': ner.append(currNER.strip()) currNERType = token_ner currNER = token.word + ' ' if currNER != '': ner.append(currNER.strip()) if len(tokens) == 0 or len(ner) == 0: continue tokens_list.append(tokens) ner_list.append(ner) return tokens_list, ner_list
import unicodedata  # consider using notebooks

from stanza.nlp.corenlp import CoreNLPClient

client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize'])

marquez = 'marquez/marquez'  # encoded in latin-1
ulysses = 'ulysses/ulysses'  # encoded in utf-8
emma = 'emma/emma'           # encoded in utf-8

# Python 2 script: `unicode` and byte-string writes are used throughout.
with open(marquez + "token", 'w') as writeFile:
    for i in range(1, 21):
        filename = marquez + "%02d" % i
        with open(filename, 'r') as myfile:
            chapter = unicode(myfile.read(), encoding='latin-1')
            # Decompose accented characters, then drop the non-ASCII marks.
            chapter = unicodedata.normalize('NFD', chapter)
            chapter = chapter.encode('ascii', 'ignore')
            annotated = client.annotate(chapter)
            for sentence in annotated:
                for token in sentence:
                    writeFile.write(token.word.encode('ascii', 'ignore'))
                    writeFile.write("\n")
from typing import Sequence

from stanza.nlp.corenlp import CoreNLPClient


def corenlp_tokenize(s: str) -> Sequence[str]:
    # Lazily create a single module-level client on first use.
    global corenlp_client
    if "corenlp_client" not in globals():
        corenlp_client = CoreNLPClient(
            default_annotators=["ssplit", "tokenize"])
    return [t.word for sent in corenlp_client.annotate(s).sentences for t in sent]
from stanza.nlp.corenlp import CoreNLPClient

client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])

annotated = client.annotate('This is an example document. Here is a second sentence')
for sentence in annotated.sentences:
    print('sentence', sentence)
    for token in sentence:
        print(token.word, token.lemma, token.pos, token.ner)