def __init__(self, src, src_folder="../data/", corenlp_path="../stanford-corenlp/",
             regexp=None, verbose=False):
    # initialization
    self.src = src
    self.filename = re.search(
        r"([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json", self.src).group(1)
    self.src_candidates = os.path.join(src_folder, "lexicon/candidates.json")
    self.dst_starred = os.path.join(src_folder, "starred_reviews/")
    self.corenlp_path = os.path.normpath(corenlp_path)
    self.regexp = regexp
    self.verbose = verbose
    self.entity = {}
    self.entity_name = ""
    self.entity_regexp = []
    self.ratings = []
    self.sentiment_words = []
    self.adv_adj_combinations = {}
    self.clean_reviews = []
    self.starred_reviews = []
    self.stopwords = set(stopwords.words("english"))
    self.stopwords.remove("not")
    self.stemmer = SnowballStemmer("english")
    # need to set the CORENLP_HOME path before creating the client
    os.environ["CORENLP_HOME"] = self.corenlp_path
    self.corenlp = corenlp.CoreNLPClient(
        annotators="tokenize ssplit dcoref".split(), timeout=50000)
def get_openIE_triples(document):
    # CoreNLP OpenIE
    """
    Input: document: str
    Output: tripleSet: set of ((subject, relation, object), confidence) pairs
    """
    tripleSet = set()
    try:
        with corenlp.CoreNLPClient(
                annotators="tokenize,ssplit,pos,lemma,depparse,natlog,openie".split(",")) as client:
            ann = client.annotate(document)  # <class 'doc.CoreNLP_pb2.Document'>
            for sent in ann.sentence:
                if len(sent.openieTriple) > 0:  # if there are any triples...
                    # <class 'google.protobuf.pyext._message.RepeatedCompositeContainer'>
                    triples = sent.openieTriple
                    for t in triples:
                        triple_tuple = (t.subject, t.relation, t.object)
                        c = t.confidence
                        out = (triple_tuple, c)
                        tripleSet.add(out)
    except Exception:
        # The CoreNLP server probably timed out because the document was too big.
        # TODO: grab a smaller section and annotate that.
        pass
    return tripleSet
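# A minimal usage sketch for get_openIE_triples above. The sample text and the print
# format are assumptions, not part of the original module; it only relies on the
# `corenlp` import and CORENLP_HOME pointing at a local Stanford CoreNLP install.
if __name__ == "__main__":
    sample = "Barack Obama was born in Hawaii. He served as the 44th president."
    for (subject, relation, obj), confidence in sorted(get_openIE_triples(sample)):
        print(f"{confidence:.2f}\t{subject} | {relation} | {obj}")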
def test_tokensregex():
    with corenlp.CoreNLPClient(
            annotators='tokenize ssplit ner depparse'.split(),
            timeout=60000) as client:
        # Example pattern from: https://nlp.stanford.edu/software/tokensregex.shtml
        pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
        matches = client.tokensregex(TEXT, pattern)
        assert len(matches["sentences"]) == 1
        assert matches["sentences"][0]["length"] == 1
        assert matches == {
            "sentences": [
                {
                    "0": {
                        "text": "Chris wrote a simple sentence",
                        "begin": 0,
                        "end": 5,
                        "1": {
                            "text": "Chris",
                            "begin": 0,
                            "end": 1
                        }
                    },
                    "length": 1
                },
            ]
        }
def find_all_pos_words(pos):
    results = []
    verbs = []
    with corenlp.CoreNLPClient(annotators="tokenize ssplit pos".split()) as client:
        for idx in tqdm(range(tokenizer.vocab_size)):
            word = tokenizer._convert_id_to_token(idx)
            if check_word_pos(word, pos):
                ann = client.annotate(word)
                pos_all = [[token.pos for token in sent.token] for sent in ann.sentence]
                pos_list = []
                for pos_sent in pos_all:
                    pos_list.extend(pos_sent)
                if len(pos_list) == 1 and pos_list[0] not in \
                        ['VVD', 'VVG', 'VVN', 'VVP', 'VVZ',
                         'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                         'VHD', 'VHG', 'VHN', 'VHP', 'VHZ', 'MD']:
                    results.append(idx)
                    verbs.append(word)
    # for idx in tqdm(range(tokenizer.vocab_size)):
    #     word = tokenizer._convert_id_to_token(idx)
    #     if check_word_pos(word, pos):
    #         results.append(idx)
    #         verbs.append(word)
    # print(results)
    print(len(results))
    return results, verbs
def tokenize_text(text):
    """
    Tokenizes and POS-tags the text for the embedding class
    :param text: The text to tokenize
    """
    # Filter out external links
    text = ' '.join([
        word for word in text.split()
        if "https://" not in word and "http://" not in word
    ])
    # The CoreNLP server must be started separately (start_server=False)
    with corenlp.CoreNLPClient(
            start_server=False,
            timeout=10000,
            annotators="tokenize ssplit lemma pos".split()) as client:
        ann = client.annotate(text)
    # Filter out stop words
    stop_words = set(stopwords.words('english')) | {
        word.capitalize() for word in stopwords.words('english')
    }
    # Remove punctuation from each word
    punctuation = set(string.punctuation) | {"\"\""} | {'\'\''} | {'``'}
    punctuation_and_stop_words = stop_words | punctuation
    # Couple each word with its POS tag if the word is neither a stop word nor punctuation
    text = [
        f"{token.lemma}_{TREEBANK_TO_UNIVERSAL[token.pos]}"
        for sentence in ann.sentence for token in sentence.token
        if token.lemma not in punctuation_and_stop_words
    ]
    return text
def annotate_question(question, table_id, dir_in, dir_out, split):
    if not os.path.isdir(dir_out):
        os.makedirs(dir_out)
    ftable = os.path.join(dir_in, split) + '.tables.jsonl'
    fout = os.path.join(dir_out, split) + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.jsonl'
    fbase = os.path.join(dir_in, split) + '_base.jsonl'
    copyfile(fbase, fout)
    with open(ftable) as ft, open(fout, 'a') as fo, corenlp.CoreNLPClient(
            annotators="tokenize ssplit".split()) as client:
        tables = {}
        for line in ft:
            d = json.loads(line)
            tables[d['id']] = d  # to get table headers
        raw_data = {
            "phase": 1,
            "table_id": table_id,
            "question": question,
            "sql": {
                "sel": 0,
                "conds": [[0, 0, 0]],
                "agg": 0
            }
        }
        a = annotate_example(client, raw_data, tables[table_id])
        # if not is_valid_example(a):
        #     raise Exception(str(a))
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(question)
        a["question"]["ent"] = [token.tag_ for token in doc]
        fo.write('\n' + json.dumps(a) + '\n')
    return fout, tables[table_id]['header']
def main(args):
    num_sentences = 0
    with open(args.input, encoding='utf-8') as f, \
            open(args.output, mode='w', encoding='utf-8') as out:
        with corenlp.CoreNLPClient(annotators="tokenize ner".split(),
                                   endpoint="http://localhost:5000") as client:
            for line in f.readlines():
                mrp_json = json.loads(line)
                tok = []
                ner = []
                ann = client.annotate(mrp_json['input'], output_format='json')
                for sentence in ann['sentences']:
                    for tokens in sentence['tokens']:
                        tok.append(tokens['word'])
                        ner.append(tokens['ner'])
                if len(ner) != len(mrp_json['nodes']):
                    print(mrp_json['id'], " error!")
                mrp_json['tok'] = tok
                mrp_json['ner'] = ner
                # for ner, nodes in zip(ner, mrp_json['nodes']):
                #     nodes['properties'].append('ner')
                #     nodes['values'].append(ner)
                out.write(json.dumps(mrp_json) + '\n')
                num_sentences += 1
                if num_sentences % 1000 == 0:
                    print(f"Processed {num_sentences} sentences!")
def test_tokenizer():
    cases = [
        (u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
         u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()),
        (u"HTML entities & other Web oddities can be an ácute <em class='grumpy'>pain</em> >:(",
         u"html entities and other web oddities can be an ácute".split() +
         [u"<em class='grumpy'>", u"pain", u"</em>", u">:("]),
        (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.",
         u"it's perhaps noteworthy that phone numbers like".split() +
         [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] +
         u"are treated as words despite their whitespace .".split())
    ]
    annotator = HappyFunTokenizer()
    annotator.start()
    try:
        with corenlp.CoreNLPClient(
                properties=annotator.properties,
                annotators="happyfun ssplit pos".split()) as client:
            for text, tokens in cases:
                ann = client.annotate(text)
                tokens_ = [t.word for t in ann.sentence[0].token]
                assert tokens == tokens_
    finally:
        annotator.terminate()
        annotator.join()
def corenlp_start():
    os.environ['CORENLP_HOME'] = './corenlp'
    corenlp_client = corenlp.CoreNLPClient(
        endpoint="http://localhost:9000",
        memory='16G',
        annotators="tokenize ssplit pos lemma depparse natlog openie".split())
    # Annotating a dummy sentence forces the Java server to start and load all annotators.
    corenlp_client.annotate('This is a dummy input sentence to initialize all CoreNLP components.')
    print('(Re)initialized coreNLP server...')
    return corenlp_client
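# Hedged usage sketch for corenlp_start above; the follow-up sentence is illustrative,
# and client.stop() (which shuts down the spawned Java server when finished) is
# assumed to be available on the CoreNLPClient wrapper.
if __name__ == "__main__":
    client = corenlp_start()
    ann = client.annotate("Stanford University is located in California.")
    print(len(ann.sentence), "sentence(s) annotated")
    client.stop()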
def qgnet_main(args_dict):
    # first, run shell script, if necessary, in qgnet to create model
    subprocess.call([
        '../{}/download_QG-Net.sh'.format(settings.qgnet_dir),
        args_dict['qgnet_path']
    ])
    # second, pre-process the pdfs
    jsonObj, allDocs = load_data('{}/da_embeddings.txt'.format(settings.models_dir))
    abstracts = []
    for value in jsonObj.values():
        if "summary" in value['metadata']:
            abstracts.append(value['metadata']["summary"])
        elif "abstract" in value['metadata']:
            abstracts.append(value['metadata']["abstract"])
    nlp = corenlp.CoreNLPClient(output_format='json', properties={'timeout': '50000'})
    features, tfidf = create_tf_idf(abstracts, False)
    for i, abstract in enumerate(abstracts):
        preprocess_pdf(abstract, features[i, :].toarray(), tfidf, nlp)
    # third, generate qg-net questions
    subprocess.call([
        '../{}/qg_reproduce_LS.sh'.format(settings.qgnet_dir),
        args_dict['qgnet_path'],
        settings.models_dir
    ])
def __init__(self, ontology_name):
    self.causal_headers = ["Source_File", "Query", "Score", "Span", "Relation Index",
                           "Relation", "Relation_Type", "Indicator", "Cause Index",
                           "Cause", "Effect Index", "Effect", "Sentence"]
    self.event_headers = ["Source_File", "Query", "Score", "Event Index", "Span",
                          "Sentence Span", "Relation", "Event_Type", "FrameNet_Frame",
                          "Indicator", "Location", "Time", "Agent Index", "Agent",
                          "Patient Index", "Patient", "Sentence"]
    self.entity_headers = ["Source_File", "Query", "Score", "Entity Index", "Span",
                           "Sentence Span", "Entity", "Entity_Type", "FrameNet_Frame",
                           "Indicator", "Qualifier", "Sentence"]
    self.variable_headers = ["Source_File", "Sentence", "Indicator", "Scoring", "Index"]
    self.entity_index = 0
    self.event_index = 0
    self.variable_index = 0
    self.causal_index = 0
    self.ontology_name = ontology_name
    if os.getenv('CORENLP_HOME') is not None and os.getenv('CORENLP_HOME') != '':
        print(f'using Stanford CoreNLP Server @ {os.getenv("CORENLP_HOME")}')
        self.CoreNLPclient = corenlp.CoreNLPClient(
            start_server=True,
            be_quiet=True,
            timeout=100000,
            annotators=['tokenize', 'ssplit', 'pos', 'parse', 'lemma', 'ner', 'depparse'])
        # warm up the CoreNLP client and start the java server
        self.CoreNLPclient.annotate("hello world")
    else:
        raise ValueError('the "CORENLP_HOME" environment variable is not set, '
                         'cannot run Stanford CoreNLP Server')
def __init__(self, corenlp_path):
    self.keyPhrases = {}
    self.rels = ['IsA', 'UsedFor', 'PartOf', 'HasA', 'CreatedBy', 'MadeOf']
    os.environ['CORENLP_HOME'] = corenlp_path
    self.CoreNLPclient = corenlp.CoreNLPClient(annotators=[
        'tokenize', 'ssplit', 'pos', 'depparse', 'lemma', 'parse'
    ])
def __init__(self, corenlp_path, previous_def_file):
    self.nounTags = ['NN', 'NNP', 'NNS', 'NNPS', 'VBG']
    # os.environ['CORENLP_HOME'] = '/Users/evangeliaspiliopoulou/Desktop/stanfordCoreNLP'
    os.environ['CORENLP_HOME'] = corenlp_path
    self.CoreNLPclient = corenlp.CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    if previous_def_file is not None:
        self.previous_def = json.load(open(previous_def_file))
    self.exceptionTerms = set()
def start():
    timer = Timer()
    timer.start('Start CoreNLP server')
    Corenlp.client = corenlp.CoreNLPClient(
        annotators=Corenlp.DEFAULT_ANNOTATORS,
        properties=Corenlp.DEFAULT_PROPERTIES)
    timer.stop()
def Parse(text, annotators=None):
    if annotators is None:
        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'parse', 'depparse', 'regexner', 'coref']
        annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'parse']
    with corenlp.CoreNLPClient(annotators=annotators,
                               properties=StanfordCoreNLP_chinese_properties,
                               timeout=15000) as client:
        ann = client.annotate(text)
    return ann
def POSTag(text, sent_split=True, tolist=True):
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if lang == "zh-cn":
            # If the text is Chinese, segment and tag it; otherwise leave it as is.
            if sent_split:
                annotators = ['tokenize', 'ssplit', 'pos']
                with corenlp.CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
                words = [[(token.word, token.pos) for token in sent.token]
                         for sent in ann.sentence]
                segmented_list = [
                    ' '.join(['#'.join(posted) for posted in wordlist])
                    for wordlist in words
                ]
                segmented = '\n'.join(segmented_list)
            else:
                annotators = ['tokenize', 'pos']
                with corenlp.CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
                words = [(token.word, token.pos) for token in ann.sentencelessToken]
                segmented = ' '.join(['#'.join(posted) for posted in words])
        else:
            segmented = text
            words = segmented.split()
    else:
        segmented = text
    if tolist:
        return words  # list
    else:
        return segmented  # string
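# Illustrative call of the POSTag helper above, reusing the sample sentence from the
# Chinese_CoreNLP_test snippet; StanfordCoreNLP_chinese_properties and langdetect are
# assumed to be available in the same module. With tolist=True, each sentence comes
# back as a list of (word, POS) tuples; with tolist=False, a "word#POS" string instead.
if __name__ == "__main__":
    sample = "国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。"
    print(POSTag(sample, sent_split=True, tolist=True))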
def __init__(self):
    # if not os.environ.get('CORENLP_HOME'):
    os.environ['CORENLP_HOME'] = os.path.abspath(
        os.path.join(os.path.dirname(__file__),
                     '../../third_party/stanford-corenlp-full-2018-10-05'))
    if not os.path.exists(os.environ['CORENLP_HOME']):
        raise Exception(
            f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}.
            Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
            Landing page: https://stanfordnlp.github.io/CoreNLP/''')
    self.client = corenlp.CoreNLPClient()
def Chinese_CoreNLP_test(text=None, annotators=None):
    if text is None:
        # "The State Council recently issued an urgent notice requiring all regions to
        #  earnestly implement the policies guaranteeing market supply and to keep
        #  non-staple food prices stable."
        text = "国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。"
    if annotators is None:
        annotators = ['tokenize', 'ssplit', 'pos']
        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
    with corenlp.CoreNLPClient(annotators=annotators,
                               properties=StanfordCoreNLP_chinese_properties,
                               timeout=15000) as client:
        ann = client.annotate(text)
    # sent_list = [token.word for token in ann.sentence[0].token]
    # ['国务院', '日前', '发出', '紧急', '通知', ',', '要求', '各地', '切实', '落实', '保证',
    #  '市场', '供应', '的', '各', '项', '政策', ',', '维护', '副食品', '价格', '稳定', '。']
    return ann
def __init__(self): if not os.environ.get("CORENLP_HOME"): os.environ["CORENLP_HOME"] = os.path.abspath( os.path.join( os.path.dirname(__file__), "/corenlp/stanford-corenlp-full-2018-10-05", )) if not os.path.exists(os.environ["CORENLP_HOME"]): raise Exception( """Please install Stanford CoreNLP and put it at {}. Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip Landing page: https://stanfordnlp.github.io/CoreNLP/""".format( os.environ["CORENLP_HOME"])) self.client = corenlp.CoreNLPClient()
def get_examples_from_forums(forums, guest_client):
    """Convert reviews from a list of forums into classification examples.

    Args:
        forums: A list of forum ids
        guest_client: An openreview.Client used to fetch forum content

    Returns:
        A list of openreview_lib.ClassificationExamples
    """
    sid_map, pairs = orl.get_review_rebuttal_pairs(forums, guest_client)
    with corenlp.CoreNLPClient(annotators=CORENLP_ANNOTATORS,
                               output_format='conll') as corenlp_client:
        return orl.get_classification_examples(pairs, "review", sid_map,
                                                corenlp_client)
def testPart():
    TEST_SENTENCE = "Which prize did Frederick Buechner create?"
    with corenlp.CoreNLPClient(
            annotators='tokenize ssplit parse lemma pos ner'.split()) as testclient:
        c = testclient.annotate(TEST_SENTENCE)
    TEST_PARSE_TREE = c.sentence[0].parseTree
    TEST_Q = queue.Queue(maxsize=MAX_SIZE)
    TEST_Q.put(TEST_PARSE_TREE)
    print(TEST_PARSE_TREE)
    tokens_list = parse_tree_WHNP(TEST_Q)
    print(tokens_list)
    spanList = generate_candidateAnswerSpans(tokens_list, c.sentence[0].token)
    sizeList = len(spanList)
    for s in spanList:
        print(str(s.tokens) + str(s.type))
    return 0
def __init__(self):
    directory = os.getcwd() + r'\\corenlp'
    os.environ['CORENLP_HOME'] = directory
    prop = {'pos.model': directory + r'\gate-EN-twitter.model'}
    self.client = corenlp.CoreNLPClient(
        annotators="tokenize pos lemma".split(),
        properties=prop,
        output_format='json')
    # Retry to work around connection errors that occur on the initial attempt
    for _ in range(3):
        try:
            self.client.annotate('Hello, World')
            break
        except Exception:
            continue
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence).sentence:
        for t in s.token:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
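# Minimal sketch of how the annotate helper above might be called; the sentence is an
# assumption, and the module-level `client = None` global that the helper lazily
# initializes is assumed to exist.
if __name__ == "__main__":
    result = annotate("The quick brown fox jumps over the lazy dog.")
    print(result['words'])   # lowercased tokens
    print(result['gloss'])   # original surface forms
    print(result['after'])   # trailing whitespace, useful for detokenization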
def separate_sentences(keywords, filepath, mode=0):
    global out_file
    # Build an alternation pattern like "(word1)|(word2)|..." from the keywords.
    pattern = ''
    for word in keywords:
        pattern += '(' + word + ')|'
    pattern = pattern[:-1]
    # print(pattern)
    # return
    # pattern += ']'
    # print(pattern)
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        with open(filepath) as file:
            i = 0
            text = ''
            headline = False
            for line in file:
                if i < 2:
                    if i == 0 and re.search(pattern, line, re.IGNORECASE):
                        headline = True
                        out_file.write(line)
                    i += 1
                    continue
                if line == '\n':
                    file.readline()
                    # print(text)
                    ann = client.annotate(text)
                    # Use a separate index so the header counter i is not clobbered.
                    for j in range(len(ann.sentence)):
                        sentence = corenlp.to_text(ann.sentence[j])
                        if headline and mode == 0:
                            out_file.write(sentence + '\n')
                        elif re.search(pattern, sentence, re.IGNORECASE):
                            out_file.write(sentence + '\n')
                    out_file.write('\n\n')
                    text = ''
                    headline = False
                    i = 0
                    continue
                text += line
def corenlp_client(self):
    if self._corenlp_client is None:
        if not os.environ.get('CORENLP_HOME'):
            os.environ['CORENLP_HOME'] = os.path.abspath(
                os.path.join(
                    os.path.dirname(__file__),
                    '../../third_party/stanford-corenlp-full-2018-10-05'))
        if not os.path.exists(os.environ['CORENLP_HOME']):
            raise Exception(
                '''Please install Stanford CoreNLP and put it at {}.
                Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
                Landing page: https://stanfordnlp.github.io/CoreNLP/'''.format(
                    os.environ['CORENLP_HOME']))
        self._corenlp_client = corenlp.CoreNLPClient(
            annotators="tokenize ssplit")
    return self._corenlp_client
def tokenize_lemmatize(data):
    import corenlp
    os.environ['CORENLP_HOME'] = 'lib/stanford-corenlp-full'
    with corenlp.CoreNLPClient(annotators='tokenize ssplit pos lemma'.split()) as client:
        for sample in data:
            ann = client.annotate(replace_email(replace_phone(replace_url(sample['text']))))
            tokens = []
            lemmas = []
            pos = []
            for sent in ann.sentence:
                tokens += [format_token(token.word) for token in sent.token]
                lemmas += [format_token(token.lemma) for token in sent.token]
                pos += [token.pos for token in sent.token]
            sample['tokens'] = tokens
            sample['lemmas'] = lemmas
            sample['pos'] = pos
    return data
def __init__(self, model="stanford-corenlp-full-2018-10-05", lemmatize=False): if not os.environ.get('CORENLP_HOME'): os.environ['CORENLP_HOME'] = os.path.abspath( os.path.join(os.path.dirname(__file__), f'../../third_party/{model}')) if not os.path.exists(os.environ['CORENLP_HOME']): raise Exception( f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}. Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip command: `curl https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip | jar xv` Landing page: https://stanfordnlp.github.io/CoreNLP/''') self.client = corenlp.CoreNLPClient() self.corenlp_annotators = ['tokenize', 'ssplit'] if lemmatize: self.corenlp_annotators.append('lemma')
def main():
    args = parser.parse_args()
    conn = ordb.create_connection(args.dbfile)
    if conn is not None:
        ordb.create_table(conn, ordb.CREATE_COMMENTS_TABLE)
    else:
        print("Error! cannot create the database connection.")
    conn = ordb.create_connection(args.dbfile)
    with corenlp.CoreNLPClient(annotators=ANNOTATORS,
                               output_format='conll') as corenlp_client:
        orl.get_datasets(args.inputfile, corenlp_client, conn, debug=args.debug)
    metadata = create_metadata_json(conn)
    relevant_text = create_text_json(conn)
def do_annotate(args):
    args.props = dict(args.props) if args.props else {}
    if args.sentence_mode:
        args.props["ssplit.isOneSentence"] = True
    with corenlp.CoreNLPClient(annotators=args.annotators,
                               properties=args.props,
                               be_quiet=not args.verbose_server) as client:
        for line in args.input:
            if line.startswith("#"):
                continue
            ann = client.annotate(line.strip(), output_format=args.format)
            if args.format == "json":
                if args.sentence_mode:
                    ann = ann["sentences"][0]
                args.output.write(json.dumps(ann))
                args.output.write("\n")
def main():
    guest_client = openreview.Client(baseurl='https://api.openreview.net')
    conference = orl.Conference.iclr18
    notes = list(openreview.tools.iterget_notes(
        guest_client, invitation=orl.INVITATION_MAP[conference]))
    obj = {conference: []}
    with corenlp.CoreNLPClient(
            annotators=orl.CORENLP_ANNOTATORS,
            output_format='conll') as corenlp_client:
        for note in notes:
            p = orl.get_tokenized_chunks(corenlp_client, note.content["abstract"])
            obj[conference].append(p)
    with open("tokenized_abstracts.json", 'w') as f:
        json.dump(obj, f)