Example #1
from stanza.nlp.corenlp import CoreNLPClient


class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and dependency parsing.
    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            word, pos, dep = [], [], []
            for token in s:
                word += [token.word]
                pos += [token.pos]
            edges = s.depparse(mode='enhanced').to_json()
            for e in edges:
                dep.append({
                    'type': e['dep'],
                    'dep': e['dependent'] - 1,
                    'gov': e['governor'] - 1
                })
            tuples.append((word, pos, dep))
        return tuples
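A minimal usage sketch for the class above (hypothetical sentence and output; it assumes a CoreNLP server is reachable with the client's default settings):

parser = NLPParser()
for words, pos, deps in parser.parse('The cat sat on the mat.'):
    print(words)  # the tokens of one sentence
    print(pos)    # one POS tag per token
    print(deps)   # dicts with 'type', 'dep' (0-based dependent index) and 'gov' (0-based governor index)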
Example #2
from stanza.nlp.corenlp import CoreNLPClient


class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and Named Entity Recognition.
    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'ner'],
            server="http://localhost:9000")

        #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')
    def parse(self, sent):
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            tokens, pos, ner = [], [], []
            for token in s:
                tokens += [token.word]
                pos += [token.pos]
                ner += [token.ner]
            tuples.append((tokens, pos, ner))
        return tuples
Example #3
from stanza.nlp.corenlp import CoreNLPClient

client = None


def annotate(sent):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words = []
    for s in client.annotate(sent).sentences:
        for tok in s:
            words.append(tok.word)
    return words
Example #4
from stanza.nlp.corenlp import CoreNLPClient

client = None


def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
Example #5
def tokenize(df):
    from stanza.nlp.corenlp import CoreNLPClient
    parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize'],
                           server='http://localhost:9000')
    parsed = []
    for item in df['posts']:
        temp = []
        for sentence in item.strip().split('|||'):
            try:
                result = parser.annotate(sentence)
                tokens = []
                for i in range(len(result.sentences)):
                    tokens += result.sentences[i].tokens
                temp.append(' '.join([token.word for token in tokens]))
            except Exception:
                print('error', sentence)
        parsed.append(' <RETURN> '.join(temp))
    df['posts'] = parsed
Example #6
from stanza.nlp.corenlp import CoreNLPClient


class NLPParser(object):
    """
    NLP parser, including Named Entity Recognition.
    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def get_ner(self, tokens):
        sent = ' '.join(tokens)
        result = self.parser.annotate(sent)
        ner = []
        for token in result.sentences[0]:
            ner.append(token.ner)
        return ner
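A minimal usage sketch for get_ner (hypothetical tokens and output; it assumes a running CoreNLP server with the default English models):

parser = NLPParser()
tags = parser.get_ner(['Barack', 'Obama', 'visited', 'Paris', '.'])
print(tags)  # one NER label per token, e.g. ['PERSON', 'PERSON', 'O', 'LOCATION', 'O']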
Example #7
from stanza.nlp.corenlp import CoreNLPClient

client = None


def get_annotate(sentence, lower=True):
    # note: returns four parallel lists (tokens, original text, POS tags,
    # and the whitespace following each token)
    # todo: handle [Salmonella spp.] -> ['salmonella', 'spp.', '.']
    global client
    if client is None:
        client = CoreNLPClient(
            server='http://localhost:9000',
            default_annotators=['ssplit', 'tokenize', 'pos'])
    tokenize, origin, pos_tag, after = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            if lower:
                tokenize.append(t.word.lower())
                origin.append(t.originalText.lower())
            else:
                tokenize.append(t.word)
                origin.append(t.originalText)
            pos_tag.append(t.pos)
            after.append(t.after)
    return tokenize, origin, pos_tag, after
Example #8
import csv
from collections import namedtuple

from tqdm import tqdm
from stanza.nlp.corenlp import CoreNLPClient

# to_psql_array is a project-specific helper from the original source (not shown here).


def do_command(args):
    reader = csv.reader(args.input, delimiter="\t")
    header = next(reader)
    assert all(field in header for field in ("id", "text"))

    Tweet = namedtuple("Tweet", header)
    client = CoreNLPClient()
    annotators = "tokenize ssplit lemma pos".split()

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["id", "tokens", "lemmas", "pos_tags"])

    for tweet in tqdm(Tweet(*row) for row in reader):
        doc = client.annotate(tweet.text, annotators)
        tokens, lemmas, pos_tags = [], [], []
        for sentence in doc:
            tokens += sentence.words
            lemmas += sentence.lemmas
            pos_tags += sentence.pos_tags
        writer.writerow([tweet.id, to_psql_array(tokens), to_psql_array(lemmas), to_psql_array(pos_tags)])
Example #9
from stanza.nlp.corenlp import CoreNLPClient


class NLPParser(object):
    """
    NLP parser, including Named Entity Recognition.
    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP parser
    """
    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tokens_list, ner_list = [], []
        for s in result.sentences:
            tokens, ner = [], []
            currNERType = 'O'
            currNER = ''
            for token in s:
                token_ner = token.ner
                # INTERESTED_STANFORD_EM_TYPES is a set of NER labels defined
                # elsewhere in the original project; other labels are treated as 'O'.
                if token_ner not in INTERESTED_STANFORD_EM_TYPES:
                    token_ner = 'O'
                tokens += [token.word]
                if token_ner == 'O':
                    # close the current entity span, if any
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNER = ''
                elif token_ner == currNERType:
                    # extend the current entity span
                    currNER += token.word + ' '
                else:
                    # a new entity type starts: flush the previous span
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNERType = token_ner
                    currNER = token.word + ' '
            if currNER != '':
                ner.append(currNER.strip())
            if len(tokens) == 0 or len(ner) == 0:
                continue
            tokens_list.append(tokens)
            ner_list.append(ner)
        return tokens_list, ner_list
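A usage sketch of the span-merging parse above (hypothetical input and output; it assumes a running CoreNLP server and that INTERESTED_STANFORD_EM_TYPES contains 'PERSON' and 'LOCATION'):

parser = NLPParser()
tokens_list, ner_list = parser.parse('Barack Obama visited Paris.')
# tokens_list might be [['Barack', 'Obama', 'visited', 'Paris', '.']]
# ner_list might be [['Barack Obama', 'Paris']]; adjacent tokens with the same
# NER type are merged into a single surface-form span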
Example #10
# Note: this snippet targets Python 2 (it relies on the built-in unicode()
# constructor and writes encoded byte strings to a text-mode file).
import unicodedata

# consider using notebooks

from stanza.nlp.corenlp import CoreNLPClient
client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize'])

marquez = 'marquez/marquez'  # encoded in latin-1
ulysses = 'ulysses/ulysses'  # encoded in utf-8
emma = 'emma/emma'  # encoded in utf-8

with open(marquez + "token", 'w') as writeFile:
    for i in range(1, 21):
        filename = marquez + "%02d" % i
        with open(filename, 'r') as myfile:
            chapter = unicode(myfile.read(), encoding='latin-1')
            chapter = unicodedata.normalize('NFD', chapter)
            chapter = chapter.encode('ascii', 'ignore')

        annotated = client.annotate(chapter)
        for sentence in annotated:
            for token in sentence:
                writeFile.write(token.word.encode('ascii', 'ignore'))
                writeFile.write("\n")
Example #11
from typing import Sequence

from stanza.nlp.corenlp import CoreNLPClient


def corenlp_tokenize(s: str) -> Sequence[str]:
    global corenlp_client
    if "corenlp_client" not in globals():
        corenlp_client = CoreNLPClient(
            default_annotators=["ssplit", "tokenize"])
    return [t.word for s in corenlp_client.annotate(s).sentences for t in s]
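A quick usage sketch for corenlp_tokenize (it assumes a CoreNLP server is reachable at the client's default endpoint):

tokens = corenlp_tokenize('Stanford CoreNLP splits this sentence into tokens.')
print(tokens)  # a flat list of token strings across all sentences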
Example #12
from stanza.nlp.corenlp import CoreNLPClient
client = CoreNLPClient(server='http://localhost:9000', default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])
annotated = client.annotate('This is an example document. Here is a second sentence')
for sentence in annotated.sentences:
    print('sentence', sentence)
    for token in sentence:
        print(token.word, token.lemma, token.pos, token.ner)