from stanza.nlp.corenlp import CoreNLPClient

# Lazily-created module-level client, shared across calls.
client = None


def annotate(sent):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words = []
    for s in client.annotate(sent).sentences:
        for tok in s:
            words.append(tok.word)
    return words
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and dependency parsing.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parsing.

        :param sent: string
        :return: a list with one (words, pos_tags, dependencies) tuple per sentence
        """
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            word, pos, dep = [], [], []
            for token in s:
                word += [token.word]
                pos += [token.pos]
            # Convert the 1-based governor/dependent indices to 0-based token indices.
            edges = s.depparse(mode='enhanced').to_json()
            for e in edges:
                dep.append({
                    'type': e['dep'],
                    'dep': e['dependent'] - 1,
                    'gov': e['governer'] - 1
                })
            tuples.append((word, pos, dep))
        return tuples
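A short usage sketch for the dependency-parsing class above. It assumes the snippet's CoreNLPClient import and a running CoreNLP server with the depparse annotator at the client's default address; the sample sentence is made up.

# Usage sketch (hypothetical sentence; requires a CoreNLP server with 'depparse').
if __name__ == '__main__':
    nlp = NLPParser()
    for words, pos_tags, deps in nlp.parse('Stanford is located in California.'):
        print(list(zip(words, pos_tags)))
        for d in deps:
            # 'gov'/'dep' are 0-based token indices; a governor of -1 would correspond to ROOT.
            gov = 'ROOT' if d['gov'] < 0 else words[d['gov']]
            print('%s(%s, %s)' % (d['type'], gov, words[d['dep']]))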
class NLPParser(object):
    """
    NLP parser, including Part-Of-Speech tagging and named entity recognition.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(
            default_annotators=['ssplit', 'tokenize', 'pos', 'ner'],
            server="http://localhost:9000")
        # self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar')

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tuples = []
        for s in result.sentences:
            tokens, pos, ner = [], [], []
            for token in s:
                tokens += [token.word]
                pos += [token.pos]
                ner += [token.ner]
            tuples.append((tokens, pos, ner))
        return tuples
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
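A hedged usage sketch for annotate() above. It assumes the module-level client and CoreNLPClient import the function relies on, plus a tokenize/ssplit CoreNLP server; the sample sentence is invented.

# Usage sketch: the three lists are token-aligned, so 'gloss' + 'after' roughly
# reconstructs the original surface string.
if __name__ == '__main__':
    ann = annotate('Show me all flights from Denver to Boston.')
    print(ann['words'])  # lower-cased tokens (lower=True by default)
    print(''.join(g + a for g, a in zip(ann['gloss'], ann['after'])))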
def tokenize(df):
    from stanza.nlp.corenlp import CoreNLPClient
    parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize'],
                           server='http://localhost:9000')
    parsed = []
    for item in df['posts']:
        temp = []
        for sentence in item.strip().split('|||'):
            try:
                result = parser.annotate(sentence)
                tokens = []
                for i in range(len(result.sentences)):
                    tokens += result.sentences[i].tokens
                temp.append(' '.join([token.word for token in tokens]))
            except Exception:
                print('error', sentence)
        parsed.append(' <RETURN> '.join(temp))
    df['posts'] = parsed
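A small usage sketch for tokenize() above on a toy DataFrame. The 'posts' column and the '|||' separator come from the function itself; the sample text and the pandas import are assumptions, and a CoreNLP server must be running at http://localhost:9000.

# Usage sketch with a made-up two-post record.
import pandas as pd

df = pd.DataFrame({'posts': ['I love NLP.|||CoreNLP makes tokenization easy.']})
tokenize(df)                # modifies df['posts'] in place
print(df['posts'].iloc[0])  # tokens joined by spaces, posts joined by ' <RETURN> '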
import csv
from collections import namedtuple

from tqdm import tqdm

from stanza.nlp.corenlp import CoreNLPClient


def do_command(args):
    reader = csv.reader(args.input, delimiter="\t")
    header = next(reader)
    assert all(field in header for field in ("id", "text"))
    Tweet = namedtuple("Tweet", header)

    client = CoreNLPClient()
    annotators = "tokenize ssplit lemma pos".split()

    writer = csv.writer(args.output, delimiter="\t")
    writer.writerow(["id", "tokens", "lemmas", "pos_tags"])
    for tweet in tqdm(Tweet(*row) for row in reader):
        doc = client.annotate(tweet.text, annotators)
        tokens, lemmas, pos_tags = [], [], []
        for sentence in doc:
            tokens += sentence.words
            lemmas += sentence.lemmas
            pos_tags += sentence.pos_tags
        # to_psql_array is a project helper defined elsewhere (presumably it
        # formats a Python list as a PostgreSQL array literal).
        writer.writerow([tweet.id, to_psql_array(tokens), to_psql_array(lemmas),
                         to_psql_array(pos_tags)])
def get_annotate(sentence, lower=True):
    # note: returns 4 parallel (token-aligned) lists
    # todo: handle [Salmonella spp.] -> ['salmonella', 'spp.', '.']
    global client
    if client is None:
        client = CoreNLPClient(
            server='http://localhost:9000',
            default_annotators=['ssplit', 'tokenize', 'pos'])
    tokenize, origin, pos_tag, after = [], [], [], []
    for s in client.annotate(sentence):
        for t in s:
            if lower:
                tokenize.append(t.word.lower())
                origin.append(t.originalText.lower())
            else:
                tokenize.append(t.word)
                origin.append(t.originalText)
            pos_tag.append(t.pos)
            after.append(t.after)
    return tokenize, origin, pos_tag, after
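A brief usage sketch for get_annotate() above, assuming the same lazily-initialised module-level client and a CoreNLP server at localhost:9000; it only illustrates that the four returned lists are parallel.

# Usage sketch (hypothetical sentence).
if __name__ == '__main__':
    toks, orig, tags, after = get_annotate('Salmonella was detected in 5 samples.')
    for w, o, p in zip(toks, orig, tags):
        print(w, o, p)  # token, original text, POS tag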
class NLPParser(object):
    """
    NLP parser, including named entity recognition.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def get_ner(self, tokens):
        # Annotate a pre-tokenized sentence (joined with spaces) and collect the
        # NER tag of each token; only the first sentence of the result is used.
        sent = ' '.join(tokens)
        result = self.parser.annotate(sent)
        ner = []
        for token in result.sentences[0]:
            ner.append(token.ner)
        return ner
class NLPParser(object):
    """
    NLP parser, including named entity recognition.

    Attributes
    ==========
    parser: CoreNLPClient
        the Stanford CoreNLP client
    """

    def __init__(self):
        self.parser = CoreNLPClient(default_annotators=['ssplit', 'tokenize', 'ner'])

    def parse(self, sent):
        result = self.parser.annotate(sent)
        tokens_list, ner_list = [], []
        for s in result.sentences:
            tokens, ner = [], []
            currNERType = 'O'
            currNER = ''
            for token in s:
                token_ner = token.ner
                # Collapse any tag we do not care about to 'O'.
                if token_ner not in INTERESTED_STANFORD_EM_TYPES:
                    token_ner = 'O'
                tokens += [token.word]
                if token_ner == 'O':
                    # End of a mention, if one was being built.
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNER = ''
                elif token_ner == currNERType:
                    # Same entity type: extend the current mention.
                    currNER += token.word + ' '
                else:
                    # New entity type: flush the previous mention and start a new one.
                    if currNER != '':
                        ner.append(currNER.strip())
                    currNERType = token_ner
                    currNER = token.word + ' '
            # Flush a mention that runs to the end of the sentence.
            if currNER != '':
                ner.append(currNER.strip())
            if len(tokens) == 0 or len(ner) == 0:
                continue
            tokens_list.append(tokens)
            ner_list.append(ner)
        return tokens_list, ner_list
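A hedged usage sketch for the chunking parser above. INTERESTED_STANFORD_EM_TYPES is referenced but not defined in the snippet, so the set below is a hypothetical placeholder and must live in the same module as the class; an NER-enabled CoreNLP server is also required.

# Usage sketch; INTERESTED_STANFORD_EM_TYPES is a hypothetical placeholder here.
INTERESTED_STANFORD_EM_TYPES = {'PERSON', 'ORGANIZATION', 'LOCATION'}

if __name__ == '__main__':
    nlp = NLPParser()
    tokens_list, ner_list = nlp.parse('Barack Obama visited Stanford University.')
    for tokens, mentions in zip(tokens_list, ner_list):
        print(tokens)    # all tokens of the sentence
        print(mentions)  # merged multi-word mentions, e.g. ['Barack Obama', 'Stanford University']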
                    type=int, default=50, help="Maximum sequence length")
parser.add_argument('-shuffle', type=int, default=1, help="Shuffle data")
parser.add_argument('-seed', type=int, default=3435, help="Random seed")
parser.add_argument('-lower', action='store_true', help='lowercase data')
parser.add_argument('-report_every', type=int, default=100000,
                    help="Report status every this many sentences")

opt = parser.parse_args()
corenlp = CoreNLPClient(default_annotators=['tokenize', 'ssplit'])


def annotate_sentence(corenlp, gloss):
    try:
        parse = corenlp.annotate(gloss)
    except Exception:
        # Give the CoreNLP server a moment to recover, then retry once.
        time.sleep(10)
        parse = corenlp.annotate(gloss)
    token_str = ' '.join([
        token['word']
        for sentence in parse.json['sentence']
        for token in sentence['token']
    ])
    # return parse.json['sentence'][0]['token']
    return token_str
            'text': para_string,
            'fbEMs': entities_fb,
            'dbEMs': entities_db,
            'sentences': sentences
        }
        paragraph_count += 1
        output_file.write(json.dumps(paragraph) + "\n")

    wiki_file.close()
    output_file.close()


'''
Takes the path to a directory of folders and files extracted using WikiExtractor, and the
path to an output directory, as inputs. Writes the sentences, with the required extracted
information, as JSON files in the output directory.

The Wikipedia dump is extracted with the following command (preserving links):
    python WikiExtractor.py -l enwiki-20160920-pages-articles-multistream.xml
'''
if __name__ == "__main__":
    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]
    corenlp_client = CoreNLPClient(
        server='http://localhost:9000',
        default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])
    sparql = SPARQLWrapper.SPARQLWrapper("http://localhost:8890/sparql/")
    process_file(input_file_path, sparql, corenlp_client, output_file_path)
# Python 2 script (uses the built-in `unicode` type).
import unicodedata

# consider using notebooks
from stanza.nlp.corenlp import CoreNLPClient

client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize'])

marquez = 'marquez/marquez'  # encoded in latin-1
ulysses = 'ulysses/ulysses'  # encoded in utf-8
emma = 'emma/emma'           # encoded in utf-8

with open(marquez + "token", 'w') as writeFile:
    for i in range(1, 21):
        filename = marquez + "%02d" % i
        with open(filename, 'r') as myfile:
            chapter = unicode(myfile.read(), encoding='latin-1')
        # Strip accents / non-ASCII characters before annotating.
        chapter = unicodedata.normalize('NFD', chapter)
        chapter = chapter.encode('ascii', 'ignore')
        annotated = client.annotate(chapter)
        for sentence in annotated:
            for token in sentence:
                writeFile.write(token.word.encode('ascii', 'ignore'))
                writeFile.write("\n")
def __init__(self):
    self.parser = CoreNLPClient(
        default_annotators=['ssplit', 'tokenize', 'ner'],
        server='http://localhost:9001')
            state = {'model': model.state_dict()}
            torch.save(state, os.path.join('.', 'model_best.pt'))

            state = {'model_bert': model_bert.state_dict()}
            torch.save(state, os.path.join('.', 'model_bert_best.pt'))

        print(f" Best Dev lx acc: {acc_lx_t_best} at epoch: {epoch_best}")

if args.do_infer:
    # To use recent corenlp: https://github.com/stanfordnlp/python-stanford-corenlp
    # 1. pip install stanford-corenlp
    # 2. download the Java version
    # 3. export CORENLP_HOME=/Users/wonseok/utils/stanford-corenlp-full-2018-10-05
    from stanza.nlp.corenlp import CoreNLPClient
    client = CoreNLPClient(server='http://localhost:9000',
                           default_annotators='ssplit,tokenize'.split(','))

    # import corenlp
    # client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))

    nlu1 = "Which company have more than 100 employees?"
    path_db = './data_and_model'
    db_name = 'dev'
    data_table = load_jsonl('./data_and_model/dev.tables.jsonl')
    table_name = 'table_10015132_11'  # change the table here depending on the question
    n_Q = 100000 if args.infer_loop else 1
    for i in range(n_Q):
        if n_Q > 1:
            nlu1 = input('Type question: ')
        pr_sql_i, pr_ans = infer(nlu1,
def __init__(self):
    self.parser = CoreNLPClient(
        default_annotators=['ssplit', 'tokenize', 'pos', 'depparse'])
def _client(cls):
    if cls.__client is None:
        cls.__client = CoreNLPClient(server=config.CORENLP_SERVER,
                                     default_annotators=config.CORENLP_ANNOTATORS)
    return cls.__client
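The accessor above pulls its connection settings from a config module that is not shown; the values below are a hypothetical stand-in, shaped like the server URLs and annotator lists used in the other snippets.

# Hypothetical config module for the lazy _client() accessor above.
CORENLP_SERVER = 'http://localhost:9000'
CORENLP_ANNOTATORS = ['ssplit', 'tokenize', 'pos', 'ner']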
from stanza.nlp.corenlp import CoreNLPClient

client = CoreNLPClient(server='http://localhost:9000',
                       default_annotators=['ssplit', 'tokenize', 'lemma', 'pos', 'ner'])

annotated = client.annotate('This is an example document. Here is a second sentence')
for sentence in annotated.sentences:
    print('sentence', sentence)
    for token in sentence:
        print(token.word, token.lemma, token.pos, token.ner)
from typing import Sequence

from stanza.nlp.corenlp import CoreNLPClient


def corenlp_tokenize(s: str) -> Sequence[str]:
    # Lazily create a module-level client on first use.
    global corenlp_client
    if "corenlp_client" not in globals():
        corenlp_client = CoreNLPClient(
            default_annotators=["ssplit", "tokenize"])
    return [t.word
            for sentence in corenlp_client.annotate(s).sentences
            for t in sentence]
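A one-line usage sketch for corenlp_tokenize() above, assuming a CoreNLP server is reachable at the client's default address; the input string is made up.

# Usage sketch: all sentences are flattened into a single token list.
if __name__ == '__main__':
    print(corenlp_tokenize('Hello world. This is a test.'))
    # expected shape: ['Hello', 'world', '.', 'This', 'is', 'a', 'test', '.']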