Example No. 1
    def tokenize(self, text):
        """ 将text输入self.corenlp句柄
        :return: Tokens,Tokens中的data包括多个(TEXT, TEXT_WS, SPAN, POS, LEMMA, NER)
        """
        # logger.info(text[0:10] + "..." if len(text) > 10 else text)

        text = text.replace('\n', '\t')

        output = self.nlp.annotate(text, properties=self.props)
        """ 有效输出: 
        {
          "sentences": [
            {
              "index": 0,
              "entitymentions": [],
              "tokens": [
                {
                  "index": 1,
                  "word": "hello",
                  "originalText": "hello",
                  "lemma": "hello",
                  "characterOffsetBegin": 0,
                  "characterOffsetEnd": 5,
                  "pos": "UH",
                  "ner": "O",
                  "before": "",
                  "after": " "
                },
              ]
            }
          ]
        }"""
        try:
            output = json.loads(output)
        except Exception:
            logger.info(
                "ERROR in Tokenizer: %s\noutput: %s" %
                ((text[0:100] + "..." if len(text) > 100 else text), output))
            # The handle is stale: close it before restarting the backend below.
            if self.nlp:
                self.close()
                self.nlp = None
            self.nlp = StanfordCoreNLP(self.classpath,
                                       memory=self.heap,
                                       lang=self.language,
                                       timeout=self.timeout)
            return None

        data = []
        tokens = [t for s in output['sentences'] for t in s['tokens']]
        for i in range(len(tokens)):
            # Get the word and the whitespace that follows it (if any)
            start_whitespace = tokens[i]['characterOffsetBegin']
            if i + 1 < len(tokens):
                end_whitespace = tokens[i + 1]['characterOffsetBegin']
            else:
                end_whitespace = tokens[i]['characterOffsetEnd']

            data.append(
                (special_char(tokens[i]['word']),
                 text[start_whitespace:end_whitespace],
                 (tokens[i]['characterOffsetBegin'],
                  tokens[i]['characterOffsetEnd']), tokens[i].get('pos', None),
                 tokens[i].get('lemma', None), tokens[i].get('ner', None)))
        return Tokens(data, self.annotators)
Example No. 2
def getParse(sentence) -> str:
    # Preset
    nlp = StanfordCoreNLP('stanford-corenlp-4.2.0/', memory='8g')
    cc = OpenCC('t2s')

    # sentence = 'Those two splendid old electric trains.'
    print(
        "##################################################################################"
    )
    # # POS
    print('POS:', nlp.pos_tag(sentence))
    print(
        "##################################################################################"
    )

    # # Tokenize
    print('Tokenize:', nlp.word_tokenize(sentence))
    print(
        "##################################################################################"
    )

    # # NER
    print('NER:', nlp.ner(sentence))
    print(
        "##################################################################################"
    )

    # Parser
    tree = nlp.parse(sentence)
    parse_string = ' '.join(str(tree).split())
    print(parse_string)

    # ParserTest
    print('Parser:')
    print(nlp.parse(sentence))
    print(
        "##################################################################################"
    )

    #TREE Graph
    tagged = pos_tag(word_tokenize(sentence))
    # Extract all parts of speech from any text
    chunker = RegexpParser("""
                           NP: {<DT>?<JJ>*<NN>}    #To extract Noun Phrases
                           P: {<IN>}               #To extract Prepositions
                           V: {<V.*>}              #To extract Verbs
                           PP: {<P> <NP>}          #To extract Prepositional Phrases
                           VP: {<V> <NP|PP>*}      #To extract Verb Phrases
                           """)

    # Print all parts of speech in the above sentence
    output = chunker.parse(tagged)
    print("After Extracting\n", output)
    # To draw the parse tree
    output.draw()
    print(
        "##################################################################################"
    )

    # Close Stanford Parser
    nlp.close()
    return str(parse_string)
Example No. 3
Chinese/English word segmentation: StanfordTokenizer
Chinese/English POS tagging: StanfordPOSTagger
Chinese/English named entity recognition: StanfordNERTagger
Chinese/English constituency parsing: StanfordParser
Chinese/English dependency parsing: StanfordDependencyParser, StanfordNeuralDependencyParser
"""
from preprocessing import preprocess_string
from preprocessing import strip_numeric, remove_stopwords, strip_punctuation, tokenize
from timeit import default_timer
from stanfordcorenlp import StanfordCoreNLP

begin = default_timer()
str_test = u'''云南铜业股份有限公司(深交所:000878),简称云铜股份、云铜,前身为云南冶炼厂,成立于1958年,1998年改制为股份公司,更名为现称,1998年6月2日于深圳证券交易所上市。公司是中国第四大铜业企业,生产高纯阴极铜、电工用铜线坏、工业硫酸、金锭、银锭、电工用圆铜线、硫酸铜等主产品,并能综合回收金、银、铝、铋、铂、钯等多种有色金属。2007年10月,中国铝业收购云铜母公司云南铜业集团的49%股权,改名“中铝云南铜业集团”。'''
filter_setting = [tokenize, strip_punctuation]
text = preprocess_string(str_test, filter_setting)
nlp = StanfordCoreNLP('/home/weiwu/tools/stanford-corenlp-full-2017-06-09/',
                      lang='zh')
tokenize = nlp.word_tokenize(str_test)
pos_tag = nlp.pos_tag(str_test)
ner = nlp.ner(str_test)
parse = nlp.parse(str_test)
depend = nlp.dependency_parse(str_test)
end = default_timer()

from stanfordcorenlp import StanfordCoreNLP
import logging
import json
from collections import defaultdict


class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
Example No. 4
# stanfordcorenlp by Lynten Guo. A Python wrapper to Stanford CoreNLP server, version 3.9.1.
# PyPI page: pip install stanfordcorenlp

# Simple usage
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(
    r'D:\samli_202010\CoreNLP\CoreNLP\stanford-corenlp-4.1.0')
# nlp = StanfordCoreNLP('http://localhost', port=9000)
# Debug the wrapper
# nlp = StanfordCoreNLP(r'path_or_host', logging_level=logging.DEBUG)

# Check more info from the CoreNLP Server
# nlp = StanfordCoreNLP(r'path_or_host', quiet=False,
#   logging_level=logging.DEBUG)

sentence = 'I go to aist in Tokyo everyday.Tokyo is the capital city of Japan.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
# print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

# Do not forget to close! The backend server will consume a lot of memory.
nlp.close()
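
If forgetting the close() call is a concern, the client can also be wrapped so that the shutdown happens automatically. A minimal sketch, assuming the same stanfordcorenlp wrapper and a placeholder install path:

from contextlib import closing
from stanfordcorenlp import StanfordCoreNLP

# closing() calls nlp.close() when the with-block exits, even if an exception is raised.
with closing(StanfordCoreNLP(r'/path/to/stanford-corenlp')) as nlp:
    print('Tokenize:', nlp.word_tokenize('The backend is shut down afterwards.'))
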
Example No. 5
from stanfordcorenlp import StanfordCoreNLP
import teasting
import re
import gensim.downloader as api
import time
import open_traning_data
start = time.time()

word_vectors = api.load("glove-wiki-gigaword-100")

teasts = teasting.teast()

data = open_traning_data.open_data()

nlp = StanfordCoreNLP(r'./stanford-corenlp-full-2018-10-05')

sentece_we_got_wong = []

for test_time in range(4927):

    data = teasts.full_teast()

    sentence1 = (data[0])
    sentence2 = (data[1])

    # full stop remover
    if sentence1[-1] == ".":
        sentence1 = sentence1[:-1]
    if sentence2[-1] == ".":
        sentence2 = sentence2[:-1]
Example No. 6
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'F:\stanford-corenlp-full-2018-10-05')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

nlp.close()
Example No. 7
            rf.write(json.dumps(relation_id))
        print("count_re: ", count_re, "\t count_na: ", count_na,
              "\t count_total: ", count_re + count_na)
        print("total_sentence_used: ", total_sentence_used)
        total_len = count_re + count_na if count_re + count_na < args.max_sentence else args.max_sentence
        train_list = RES_list[:int(0.8 * total_len)]
        test_list = RES_list[int(0.8 * total_len):]
        json.dump(train_list, trf)
        json.dump(test_list, tef)


def linecount(file_path):
    count = -1
    for count, line in enumerate(open(file_path, 'r', encoding='utf-8')):
        pass
    return count + 1


if __name__ == '__main__':
    # use StanfordCoreNLP to tag ner
    nlp = StanfordCoreNLP(args.stanford_path,
                          lang='zh',
                          logging_level=logging.WARNING)

    # use jieba to seg sentence
    #    jieba.load_userdict(args.jieba_dict)

    clean_sql_output(args.raw_sql_input, args.raw_sql_output)
    build_entity_relation(args.raw_sql_output, args.train_file, args.test_file,
                          args.disambi_attr_title)
Example No. 8
class StanfordParser(OieParser):
    def __init__(self):
        self.nlp = StanfordCoreNLP('/home/xliucr/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='4g')
        # self.nlp = StanfordCoreNLP(r'/Users/Sean/Workspace/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='2g')
        self.wnl = WordNetLemmatizer()

    def _get_path(self, index, dependency_parse):
        path = []
        root_index = dependency_parse[0][2]
        # begin from 1
        index += 1
        if index == root_index:
            return path

        while index != root_index: 
            if index > root_index:
                index -= 1
            path.append(dependency_parse[index][0])
            index = dependency_parse[index][1]
        return path
    
    def parse(self, sentence_entity):
        try:
            _id, sentence, entities = sentence_entity
            sentence = sentence.strip()

            entities_replace = []
            sentence_replace = sentence
            for i in range(len(entities)):
                e = entities[i].replace('_', ' ').title()
                entities_replace.append((len(e), e))
            for i, e in enumerate(sorted(entities_replace, reverse=True)):
                sentence_replace = sentence_replace.replace(entities[i], e[1])
            
            if not isinstance(_id, str):
                _id = 'id_' + str(_id)

            # entity
            ner = self.nlp.ner(sentence_replace)
            ner_indice = [-1] * len(entities_replace)

            for j, x in enumerate(ner):
                for i, e in enumerate(entities_replace):           
                    if ner_indice[i] == -1 and e[1].startswith(x[0]):
                        ner_indice[i] = j

            for index in ner_indice:
                if index == -1:
                    return []
            
            entitiy_types = '-'.join(map(lambda x: (ner[x][1]), ner_indice))

            # tag
            tag = ' '.join(map(lambda x: x[1], self.nlp.pos_tag(sentence_replace)))
            # lexicalized dependency path
            dependency_parse = self.nlp.dependency_parse(sentence_replace)

            for i in range(1, len(dependency_parse)):
                # if dependency_parse[i][0] == 'ROOT':
                if dependency_parse[i][1] == 0:
                    return []

            left_path = self._get_path(ner_indice[0], dependency_parse)
            right_path = self._get_path(ner_indice[1], dependency_parse)
            

            # trigger
            root_index = dependency_parse[0][2] - 1
            root = self.wnl.lemmatize(self.wnl.lemmatize(ner[root_index][0]), 'v')
            trigger = 'TRIGGER:%s' %(root)

            dependency_path = ''
            for x in left_path:
                dependency_path += '<-' + x
            dependency_path += '<-' + root + '->'
            for x in right_path[::-1]:
                dependency_path += x + '->'
            
            return dependency_path, entities[0], entities[1], entitiy_types, trigger, _id, sentence, tag
        except Exception:
            return []
    
    def shutdown(self):
        self.nlp.close()
Example No. 9
                output_word += "'s"  # add the possessive morpheme
            output_word += token['after']
            print(output_word, end='')


text = "Tom and Jane are good friends. They are cool. He knows a lot of things and so does she. His car is red, but " \
       "hers is blue. It is older than hers. The big cat ate its dinner."

text0 = 'Barack Obama was born in Hawaii.  He is the president. Obama was elected in 2008.'

text2 = "The music was so loud that it couldn\'t be enjoyed." \
       "Our neighbors dislike the music. If they are angry, the cops will show up soon." \
       "If they are angry about the music, the neighbors will call the cops." \
       "Despite heri difficulty, Wilmai came to understand the point."

nlp = StanfordCoreNLP('/home/polo/Downloads/stanford-corenlp-full-2018-10-05/',
                      quiet=False)
props = {'annotators': 'dcoref', 'pipelineLanguage': 'en'}

output = json.loads(nlp.annotate(text, properties=props))
#output = nlp.annotate(text, properties= {'annotators':'dcoref','outputFormat':'json','ner.useSUTime':'false'})

resolve(output)

print('Original:', text)
print('_________________________________________')
print('Resolved: ', end='')
print_resolved(output)
nlp.close()
#draw()
Example No. 10
import operator

print(sorted(queryType.items(), key=operator.itemgetter(1)))
print(sum)

top20howMany = {}

top20howMany = collections.Counter(howMany).most_common(20)
# print(top20howMany)
newdict = {}
for obj in top20howMany:
    newdict[obj[0]] = obj[1]
    # print(obj[1][0])

nlp = StanfordCoreNLP('http://corenlp.run', port=80)
tags = []
parse = []
dependency = []
for key in newdict.keys():
    tags.append((nlp.pos_tag(newdict[key][0])))
    parse.append(nlp.parse(newdict[key][0]))
    dependency.append((nlp.dependency_parse(newdict[key][0])))
print('tag')

# print(tags[1])
# print(parse[1])
# print(dependency[1])
# print(howMany['1-1037590-1'])

for i in range(len(tags)):
Example No. 11
#encoding=utf8
import os,gc,re,sys
from stanfordcorenlp import StanfordCoreNLP



stanford_nlp = StanfordCoreNLP("/home/kuo/NLP/module"+os.sep+'stanfordnlp', lang='zh')



def ner_stanford(raw_sentence,return_list=True):
    if len(raw_sentence.strip())>0:
        return stanford_nlp.ner(raw_sentence) if return_list else iter(stanford_nlp.ner(raw_sentence))

def cut_stanford(raw_sentence,return_list=True):
    if len(raw_sentence.strip())>0:
        return stanford_nlp.pos_tag(raw_sentence) if return_list else iter(stanford_nlp.pos_tag(raw_sentence))




Example No. 12
# For Testing Purposes.
### DO NOT MAKE CHANGES TO TESTING VARIABLE HERE ###
### ONLY MAKE CHANGES FROM COMMAND-LINE OPTIONS ###
DEBUG = False  # True/False.

# Get the directory the executable files are located in, relative to this Python file.
execDir = os.path.dirname(os.path.realpath(__file__))

# Variable to hold training data location.  Can change via command-line parameter.
trainingDataDir = os.path.join(execDir, "TrainingData")

# Setup a Solr instance. The timeout is optional.
solr = pysolr.Solr('http://localhost:8983/solr/part3core', timeout=10)

# Setup a StanfordCoreNLP instance to get the head word.
nlp = StanfordCoreNLP('http://localhost', port=9000)

# Flags for query variables and their scores.
# Higher scores put more weight on those search terms.
# Default all to 1 for now.
# Can use these flags to automate testing later.
SENTENCE_FLAG = True
SENTENCE_WEIGHT = 1
LEMMA_FLAG = True
LEMMA_WEIGHT = 1
STEM_FLAG = True
STEM_WEIGHT = 1
POSTAG_FLAG = True
POSTAG_WEIGHT = 1
HEADWORD_FLAG = True
HEADWORD_WEIGHT = 1
Example No. 13
def get(url, country):
    import re
    import operator
    from py2neo import Graph, Node, Relationship
    from stanfordcorenlp import StanfordCoreNLP
    import urllib.request, urllib.error, urllib.parse
    import json
    import os
    from pprint import pprint
    try:
        from urllib import request
    except:
        from urllib2 import urlopen as request
        from urllib2 import Request
    from bs4 import BeautifulSoup

    #open a graph database
    graph = Graph("http://*****:*****@id']}")
                    continue
            if class_details["@id"] not in id_set:
                id_set.append(class_details["@id"])
                label_set.append(class_details["prefLabel"])

            if result["hierarchy"]:
                print("\n\tHierarchy annotations")
                for annotation in result["hierarchy"]:
                    try:
                        class_details = get_json(annotation["annotatedClass"]["links"]["self"])
                    except urllib.error.HTTPError:
                        print(f"Error retrieving {annotation['annotatedClass']['@id']}")
                        continue
                    pref_label = class_details["prefLabel"] or "no label"
                    print("\t\tClass details")
                    print("\t\t\tid: " + class_details["@id"])
                    print("\t\t\tprefLabel: " + class_details["prefLabel"])
                    print("\t\t\tontology: " + class_details["links"]["ontology"])
                    print("\t\t\tdistance from originally annotated class: " + str(annotation["distance"]))
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    }
    m = request.urlopen(request.Request(url, headers=HEADERS)).read()
    s = BeautifulSoup(m, "html.parser")
    metadata = s.findAll("p", attrs={"class":"bodytext"})

    n=len(metadata)

    #find official name
    official_name = ""
    for nn in range(1,n):
        if len(metadata[nn].get_text().split()) < 3:
            nx=nn
            break
        else:
            official_name += metadata[nn].get_text()

    print(country + "--" + official_name)
    print("processing...")
    ##create ontology
    official_name_onto = Node("dietary guidelines", name=official_name, area = country)
    graph.create(official_name_onto)
    ##nlp
    nation_set=[]
    res = stanford_model.ner(official_name)
    ##"n_t" = a temporary used number
    for n_t in range(0,len(res)):
        id_set=[]
        label_set=[]
        if res[n_t][1]=="NATIONALITY":
            if res[n_t][0] not in nation_set:
                nation_set.append(res[n_t][0])
                text_to_annotate = res[n_t][0]
                annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate)+"&ontologies=NCIT,MESH")
                print_annotations(annotations)
                Nation_onto=Node("nation", name=res[n_t][0])
                relation = Relationship(official_name_onto, "nation/language", Nation_onto)
                graph.create(relation)
                for nn in range(0,len(id_set)):
                    identifier=Node("identifier", url=id_set[nn])
                    relation = Relationship(Nation_onto, "identifier", identifier)
                    graph.create(relation)

    #find all publication years
    publication_year = []

    for nn in range(nx+1,n):
        if len(metadata[nn].get_text().split()) < 4:
            nx=nn
            break
        else:
            publication_year.append(metadata[nn].get_text())

    publication_year_para_onto = Node("description", content = publication_year)
    relation = Relationship(official_name_onto, "publication year", publication_year_para_onto)
    graph.create(relation)

    res = stanford_model.ner(str(publication_year))

    date_set=[]
    for n_t in range(0,len(res)):
        if res[n_t][1]=="DATE":
            if res[n_t][0] not in nation_set:
                date_set.append(res[n_t][0])

    date_set = re.findall('\d+', str(date_set))
    date_set = sorted(date_set, reverse=False)

    for date in date_set:
        publication_year_onto=Node("value", name = date, value = date, unit= "year")
        relation = Relationship(publication_year_para_onto, "has value", publication_year_onto)
        graph.create(relation)


    #calculate the publication frequency
    if len(date_set) > 1:
        frequency=(int(date_set[len(date_set)-1]) - int(date_set[0]))/(len(date_set)-1)
        frequency_onto=Node("publication frequency", name="each " + str(int(frequency))+" years", value=int(frequency), unit="year")
        relation = Relationship(official_name_onto, "publication frequency", frequency_onto)
        graph.create(relation)

    #find stakeholders
    stakeholders = []
    for nn in range(nx+1,n):
        if len(metadata[nn].get_text().split()) < 3:
            nx=nn
            break
        else:
            stakeholders.append(str(metadata[nn].get_text()))

    stakeholders_para_onto = Node("description", content = stakeholders)
    relation = Relationship(official_name_onto, "stakeholders", stakeholders_para_onto)
    graph.create(relation)

    res = stanford_model.ner(str(stakeholders))
    organization = ""
    organization_set = []
    for n_t in range(0,len(res)):
        if res[n_t][1]=="ORGANIZATION":
            organization += " " + res[n_t][0]
        else:
            if organization != "":
                organization_set.append(str(organization))
                organization = ""

    organization_set_clean=[]
    for organization in organization_set:
        if organization not in organization_set_clean:
            organization_set_clean.append(organization)

    for organization in organization_set_clean:
        stakeholder_onto = Node("stakeholder", name = organization)
        relation = Relationship(stakeholders_para_onto, "has value", stakeholder_onto)
        graph.create(relation)


    #find audience
    audience = []
    for nn in range(nx+1,n):
        nx=nn
        if len(metadata[nn].get_text().split()) < 3:
            break
        else:
            audience.append(metadata[nn].get_text())

    audience_onto = Node("audience", content = audience)
    relation = Relationship(official_name_onto, "audience", audience_onto)
    graph.create(relation)


    res = stanford_model.ner(str(audience))
    age=""
    age_set=[]

    ##nlp for getting age
    for n_t in range(0,len(res)):
        if res[n_t][1]=="DURATION":
            age += " " + res[n_t][0]
        else:
            if age != "":
                age_set.append(age)
                age = ""

    age_set_clean=[]
    for age in age_set:
        if age not in age_set_clean:
            age_set_clean.append(age)

    for age in age_set_clean:
        age_onto = Node("age", name = "over"+age)
        relation = Relationship(audience_onto, "age", age_onto)
        graph.create(relation)

    #find food guide
    food_guide=[]
    for nn in range(nx+1,n):
        nx=nn
        if len(metadata[nn].get_text().split()) < 3:
            break
        else:
            food_guide.append(metadata[nn].get_text())

    food_guide_onto = Node("food guide", name="food guide", content = food_guide)
    relation = Relationship(official_name_onto, "food guide", food_guide_onto)
    graph.create(relation)
    ##add identifiers
    id_set = []
    annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(str(food_guide))+"&ontologies=FOODON")
    print_annotations(annotations)

    for nn in range(0,len(id_set)):
        identifier=Node("identifier", url=id_set[nn])
        relation = Relationship(food_guide_onto, "identifier", identifier)
        graph.create(relation)

    #find food guidelines(messages)
    guidelines = s.findAll('ul')
    n_max = len(guidelines)-6

    for n in range(2, n_max):
        for li in guidelines[n].findAll('li'):
            guideline = Node("message", content=li.get_text())
            relation = Relationship(official_name_onto, "message", guideline)
            graph.create(relation)
            id_set = []
            label_set = []
            annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(str(li.get_text()))+"&ontologies=FOODON")
            print_annotations(annotations)
            if len(id_set) != 0:
                for nn in range(0,len(id_set)):
                    food_onto = Node("food", name=label_set[nn], url=id_set[nn])
                    relation = Relationship(guideline, "has value", food_onto)
                    graph.create(relation)
                    identifier=Node("identifier", url=id_set[nn])
                    relation = Relationship(food_onto, "identifier", identifier)
                    graph.create(relation)
    print("done")
    print("")
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  7 21:31:36 2018

@author: User
"""

import json
import io
io = io.StringIO('["streaming API"]')
json.load(io)  # parse the JSON document held by the StringIO object

import json
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost', port=9000)
props = {'annotators': 'coref', 'pipelineLanguage': 'en'}

text = 'Barack Obama was born in Hawaii .  He is the president . Obama was elected in 2008 .'
result = json.loads(nlp.annotate(text, properties=props))

num, mentions = list(result['corefs'].items())[0]
for mention in mentions:
    print(mention)

props = {'annotators': 'coref', 'pipelineLanguage': 'en'}
nlp = StanfordCoreNLP('http://localhost', port=9000)

i = 0
j = 0
mentionsList = []
# Simple usage
from stanfordcorenlp import StanfordCoreNLP

#nlp = StanfordCoreNLP(r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/')
#nlp = StanfordCoreNLP(r'/Users/luisvalencia/Projects/Hackaton/CoreNLP/stanford-corenlp-full-2017-06-09/')
nlp = StanfordCoreNLP('http://localhost', port=9001, lang="es")


sentence = 'El perro de San Roque no tiene rabo'
#'Guangdong University of Foreign Studies is located in Guangzhou.'
'''
print ('Tokenize:', nlp.word_tokenize(sentence))
print ('Part of Speech:', nlp.pos_tag(sentence))
print ('Named Entities:', nlp.ner(sentence))
print ('Constituency Parsing:', nlp.parse(sentence))
print ('Dependency Parsing:', nlp.dependency_parse(sentence))
'''
Example No. 16
 def __init__(self):
     self.nlp = StanfordCoreNLP('/home/xliucr/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='4g')
     # self.nlp = StanfordCoreNLP(r'/Users/Sean/Workspace/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='2g')
     self.wnl = WordNetLemmatizer()
Example No. 17
from stanfordcorenlp import StanfordCoreNLP
import os
import yaml
import re
import json
dir_path = os.getcwd()

host = "http://192.168.12.39"
port = 9000
neg_file = "/home/ankit/NLP/API_NLP/Sentiment/negative-words.txt"
posi_file = "/home/ankit/NLP/API_NLP/Sentiment/positive-words.txt"
nlp = StanfordCoreNLP(host, port=port, timeout=300000000000)


def extractNounsAndAdj(Text):
    tagged = nlp.pos_tag(Text)
    return tagged


def tenGram(Text):
    pos = []
    senti_pos = {}
    typesOfNouns = ['NN', 'NNS', 'NNP', 'NNPS']
    ls = extractNounsAndAdj(Text)
    for ele in range(0, len(ls)):
        if ele == 0 and ls[ele][1] in typesOfNouns:
            for position in range(0, 5):
                if ls[position][1] in ('JJ', 'JJR', 'JJS'):
                    pos.append((ls[ele][0], ls[position][0], position))
                    senti_pos[ls[0][0]] = pos
Example No. 18
from keras import backend as K
from GCN import *
import re,os
from stanfordcorenlp import StanfordCoreNLP
import numpy as np
import tensorflow as tf

os.environ['CUDA_VISIBLE_DEVICES']='3'
os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'

nlp = StanfordCoreNLP(r'../stanford-corenlp-full-2017-06-09/')

sentence = 'which you step on to activate it'
de = nlp.dependency_parse(sentence)

print ('Dependency Parsing:', de)

dep_sentences = []
for i in range(10):
    dep_sentences.append(de)

# Dependency relation labels
_DEP_LABELS = ['ROOT', 'DOBJ','ADV', 'ADV-GAP', 'AMOD', 'APPO', 'BNF', 'CONJ', 'COORD', 'DEP',
               'DEP-GAP', 'DIR', 'DIR-GAP', 'DIR-OPRD', 'DIR-PRD', 'DTV', 'EXT',
               'EXT-GAP', 'EXTR', 'GAP-LGS', 'GAP-LOC', 'GAP-LOC-PRD', 'GAP-MNR',
               'GAP-NMOD', 'GAP-OBJ', 'GAP-OPRD', 'GAP-PMOD', 'GAP-PRD', 'GAP-PRP',
               'GAP-SBJ', 'GAP-TMP', 'GAP-VC', 'HMOD', 'HYPH', 'IM', 'LGS', 'LOC',
               'LOC-OPRD', 'LOC-PRD', 'LOC-TMP', 'MNR', 'MNR-PRD', 'MNR-TMP', 'NAME',
               'NMOD', 'NSUBJ','OBJ', 'OPRD', 'P', 'PMOD', 'POSTHON', 'PRD', 'PRD-PRP',
               'PRD-TMP', 'PRN', 'PRP', 'PRT', 'PUT', 'SBJ', 'SUB', 'SUFFIX',
Example No. 19
from scipy import spatial
from nltk.tree import *
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
import string
from multiprocessing import Pool
from nltk.draw.tree import TreeView
from fuzzywuzzy import fuzz
from collections import Counter

public = '/home/users2/mehrotsh/scripts/packages/stanford-corenlp-full-2018-02-27/'
personal = '/home/samarth/stanford-corenlp-full-2018-02-27/'

nlp = StanfordCoreNLP(public)

#################################################### Functions ############################################################################


def tree():
    return defaultdict(tree)


def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())


def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
Example No. 20
class ParseTree:

    def __init__(self, text):
        self.text = text  # the input text
        # set up the parsing model
        self.nlp = StanfordCoreNLP(r'E:/py/stanford-corenlp-4.2.0', lang='zh', quiet=False, logging_level=logging.DEBUG)
        # split into sentences
        self.sentences = self.preprocess()
        print(self.sentences)
        self.stopwords = []

    def load_dicts(self):
        stop = PATH + 'stop1205.txt'
        self.stopwords = self.dict_load(stop)

#     def preprocess(self):
#         """
#         Preprocessing:
#         1. Remove line breaks, extra spaces and percent signs
#         2. Split into sentences and store them in a list
#         :return: the list of sentences
#         """
#         sentences = []
#         self.text = re.sub('%', '', re.sub(' ', '', re.sub('\xa0\xa0\xa0\r\n', '', self.text)))
#         start = 0
#         for i in range(len(self.text)):
#             if self.text[i] in ['。', '!', ';', '?', '……']:
#                 sentences.append(self.text[start:i + 1])
#                 start = i + 1
#         return sentences

    def preprocess(self):
        """
        Turn the text into a list of summary sentences
        """
        return get_sum(self.text)
        
    def tree(self, sentence):
        sentence = sentence.replace(' ','')
        print(sentence)
        res = self.nlp.parse(sentence)
        # nlp.close()
        return res

    def sum_of_heights(self):
        """
        Sum the heights of the parse trees built from every sentence in the text
        :return: the sum of heights
        """
        sumHeights = []
        for sentence in self.sentences:
            sentence.replace('%','')
            res = self.tree(sentence)  # the parse tree, as a string
            sumHeights.append(len(res.split("\r\n")))
        return np.sum(sumHeights)

    def avg_height(self):
        """
        Average parse-tree height per sentence in this text
        :return:
        """
        return self.sum_of_heights() / len(self.sentences)

    def no_less_than_16(self):
        """
        Count the parse trees in the text whose height is at least 16
        :return:
        """
        num = 0
        for sentence in self.sentences:
            res = self.tree(sentence)
            if len(res.split("\r\n")) >= 16:  # tree height, measured the same way as in sum_of_heights
                num += 1
        return num

    def no_less_than_16_percent(self):
        """
        Proportion of parse trees whose height is at least 16
        :return:
        """
        return self.no_less_than_16() / len(self.sentences)

    def nodes_sum(self):
        """
        Total number of nodes
        :return:
        """
        node_sums = []
        for sentence in self.sentences:
            res = self.nlp.parse(sentence)
            result = -1  # exclude ROOT
            for i in res:
                if i == '(':
                    result += 1
            node_sums.append(result)
        return np.sum(node_sums)

    def avg_nodes_sentence(self):
        """
        Average number of nodes per sentence
        :return:
        """
        return self.nodes_sum() / len(self.sentences)

    def seg_sentence(self, sentence):
        """
        Take a string and return the list of words after segmentation
        :param sentence:
        :return:
        """
        jieba.load_userdict('../词典/userdict.txt')
        sentence_seged = jieba.cut(sentence.strip())
        outstr = ''
        for word in sentence_seged:
            if word not in self.stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        return outstr.split(' ')

    def avg_nodes_word(self):
        """
        Average number of nodes per word
        :return:
        """
        # count how many words there are
        num = 0
        for sentence in self.sentences:
            sentence = self.seg_sentence(sentence)
            num += len(sentence)
        return self.nodes_sum() / num

    def np_sum(self):
        """
        Count the noun phrases in the whole text
        :return:
        """
        num = 0
        for sentence in self.sentences:
            res = self.tree(sentence).split("\r\n")
            for i in res:
                if 'NP' in i:
                    num += 1
        return num

    def avg_np(self):
        """
        Average number of noun phrases per parse tree
        :return:
        """
        return self.np_sum() / len(self.sentences)

    def vp_sum(self):
        """
        Count the verb phrases in the whole text
        :return:
        """
        num = 0
        for sentence in self.sentences:
            print(sentence)
            res = self.tree(sentence).split("\r\n")
            for i in res:
                if 'VP' in i:
                    num += 1
        return num

    def avg_vp(self):
        """
        Average number of verb phrases per parse tree
        :return:
        """
        return self.vp_sum() / len(self.sentences)

    def adjp_sum(self):
        """
        Count the adjective phrases in the whole text
        :return:
        """
        num = 0
        for sentence in self.sentences:
            res = self.tree(sentence).split("\r\n")
            for i in res:
                if 'ADJP' in i:
                    num += 1
        return num

    def avg_adjp(self):
        """
        Average number of adjective phrases per parse tree
        :return:
        """
        return self.adjp_sum() / len(self.sentences)

    def get_res(self):
        res = {}
        res['sum_height'] = self.sum_of_heights()
        res['height_16'] = self.no_less_than_16()
        res['sum_node'] = self.nodes_sum()
        res['sum_n'] = self.np_sum()
        res['sum_v'] = self.vp_sum()
        res['sum_adj'] = self.adjp_sum()
        res['avg_height'] = self.avg_height()
        res['16_ratio'] = self.no_less_than_16_percent()
        res['avg_node'] = self.avg_nodes_sentence()
        res['word_avg_node'] = self.avg_nodes_word()
        res['avg_n'] = self.avg_np()
        res['avg_v'] = self.avg_vp()
        res['avg_adj'] = self.avg_adjp()
        return res
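
A minimal usage sketch for the class above (not part of the original example; it assumes the surrounding project provides PATH, get_sum, dict_load and the jieba user dictionary, and the text is a placeholder):

if __name__ == '__main__':
    sample_text = '这是一个占位文本。它包含两个句子。'  # placeholder text
    pt = ParseTree(sample_text)   # starts the CoreNLP backend configured in __init__
    pt.load_dicts()               # load the stopword list used by the word-level statistics
    print(pt.get_res())           # dict of tree-height / node / phrase statistics
    pt.nlp.close()                # release the CoreNLP backend
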
Example No. 21
class ChatBot:
    """
        Intelligent dialogue model based on:
        1. Template-based: AIML
        2. Knowledge-based: MySQL
        3. Web search
        4. Deep learning: RNN
    """

    # initialize
    colorama.init()
    ws.load()

    #nltk.download()

    def __init__(self,
                 config_file='config.cfg',
                 host='http://localhost',
                 port=9000):
        config = configparser.ConfigParser()
        config.read(config_file)
        self.filter_file = config.get('resource', 'filter_file')
        self.load_file = config.get('resource', 'load_file')
        self.save_file = config.get('resource', 'save_file')
        self.shelve_file = config.get('resource', 'shelve_file')

        corp_dir = os.path.join(PROJECT_ROOT, 'Data', 'Corpus')
        knbs_dir = os.path.join(PROJECT_ROOT, 'Data', 'KnowledgeBase')
        res_dir = os.path.join(PROJECT_ROOT, 'Data', 'Result')

        # Initialize the KERNEL
        self.mybot = aiml.Kernel()
        sess = tf.Session()
        self.predictor = BotPredictor(sess,
                                      corpus_dir=corp_dir,
                                      knbase_dir=knbs_dir,
                                      result_dir=res_dir,
                                      result_file='basic')
        self.session_id = self.predictor.session_data.add_session()

        # Create AI Engine
        if os.path.isfile("model\AIChatEngine.brn"):
            self.mybot.bootstrap(brainFile="model\AIChatEngine.brn")
        else:
            self.mybot.bootstrap(learnFiles=self.load_file,
                                 commands='load aiml b')
            self.mybot.saveBrain("model\AIChatEngine.brn")

        # Initialize the learning library
        self.template = '<aiml version="1.0" encoding="UTF-8">\n{rule}\n</aiml>'
        self.category_template = '<category><pattern>{pattern}</pattern><template>{answer}</template></category>'

        # Initialize Filter sensitive words
        #self.gfw = filter.DFAFilter()
        #self.gfw.parse(self.filter_file)

        # Use an existing server: StanfordCoreNLP
        self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)
        self.props = {
            'annotators':
            'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

        # Initialize the Language Tool for GEC
        self.tool = language_check.LanguageTool('en-US')


# ###########################################################

    def response(self, user_message):
        print('# User -->: ' + user_message)

        # Limit word count
        if len(user_message) > 200:
            return self.mybot.respond('MAX')
        elif len(user_message) < 2:
            return self.mybot.respond('MIN')

        # **************************************************
        #       Filter sensitive words
        # **************************************************
        #message = self.gfw.filter(message, "*")
        #if message.find("*") != -1:
        #return self.mybot.respond('FILTER')

        # **************************************************
        #       Grammar Error Check and Prompt to User
        # **************************************************
        gec_message = user_message

        # **************************************************
        #       Start Conversation
        # **************************************************
        responseAnswer = ''
        botresponse = self.mybot.respond(gec_message)
        print('# Bot1  --> ' + botresponse)

        if botresponse[0] == '@':
            botresponse = botresponse.replace('@', '')
            print('# After Confirmation--> ' + botresponse)
            if gec_message == 'Yes':
                botresponse = self.mybot.respond(botresponse)
            else:
                return self.mybot.respond('ASK NEW QUERY')

        # Initialize Lemmatization
        wordnet_lemmatizer = WordNetLemmatizer()

        # User Sentence Tokenization
        word_tokens = self.nlp.word_tokenize(botresponse)

        # Removing stopwords
        stop_words = set(stopwords.words('english'))
        #stopwords.extend(string.punctuation)
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        filtered_stop_words = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_stop_words.append(w)

        print(
            colorama.Fore.RED +
            '\n------------------ User Input Words --> Lemma -------------------------- '
            + colorama.Fore.RESET)
        final_sentence = []
        for word in filtered_stop_words:
            final_sentence.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
            print("{0:10}{1:5}{2:20}".format(
                word, '--> ', wordnet_lemmatizer.lemmatize(word, pos="v")))

        #print(colorama.Fore.GREEN+'\n********************* Dependency Parser ********************* '+colorama.Fore.RESET)
        #dependency_parser = self.nlp.dependency_parse(' '.join(final_sentence))
        #print(dependency_parser)

        # POS Tagger
        postagger = self.nlp.pos_tag(' '.join(final_sentence))
        print(
            colorama.Fore.YELLOW +
            '\n------------------ Identify POS Tagger -------------------------- '
            + colorama.Fore.RESET)
        print('pos tagger: ', postagger)

        print(
            "-----------------------------------------------------------------------"
        )
        grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        cp = nltk.RegexpParser(grammar)
        #tree = cp.parse(postagger)
        #print ("CP: ", cp)
        tree = cp.parse(postagger)
        print(tree)

        for word, pos in postagger:
            if pos == 'NNP':
                print(word)
        print(
            "-----------------------------------------------------------------------"
        )
        #https://github.com/ayat-rashad/ayat-rashad.github.io/blob/master/triples.ipynb

        # Add all NOUNs into list
        nounEntityList = []
        for pos in postagger:
            if pos[1] in ('NN', 'NNS', 'NNP', 'NNPS'):
                nounEntityList.append(pos[0])
        print(
            colorama.Fore.GREEN +
            '\n------------------ Added NOUN into Entity List ------------------------- '
            + colorama.Fore.RESET)
        print(nounEntityList, '\n')

        # 1: Template-based Strategy
        if botresponse[0] != '#':
            print('Template-based Strategy')
            responseAnswer = botresponse

        # 2: KB Searching Strategy
        elif botresponse.find('#NONE#') != -1:
            nounEntityList.remove('#NONE')
            ans = ''
            #ans = kb.kdd_search(nounEntityList, ' '.join(final_sentence), gec_message)
            if ans != '':
                print('KB Searching Strategy')
                responseAnswer = ans.encode('utf-8')

            # 3: Internet Retrieval Strategy
            else:
                #ans = crawler.web_search(gec_message)
                if ans != '':
                    print('Internet Retrieval Strategy')
                    responseAnswer = ans.encode('utf-8')

                # 4: Generative Strategy- RNN
                else:
                    if gec_message == 'Yes':
                        confirm_mgs = botresponse.replace('#NONE#:', '')
                        ans = deep.neural_network(self, confirm_mgs)
                        print('Generative Strategy with - YES')
                        print(confirm_mgs)
                    else:
                        ans = deep.neural_network(self, gec_message)
                        print('Generative Strategy')

                    responseAnswer = ans.encode('utf-8')

        # Learning Mode
        elif botresponse.find('#LEARN#') != -1:
            question = botresponse[8:]
            answer = user_message
            self.save(question, answer)
            return self.mybot.respond('Already studied')

        else:
            responseAnswer = self.mybot.respond('I don\'t know.')

        return responseAnswer

    # Grammar Error Check on Raw User Input
    def checkGrammarError(self, user_message):
        print(
            colorama.Fore.GREEN +
            '\n------------------ Grammar Error Correction -------------------------- '
            + colorama.Fore.RESET)
        matches = self.tool.check(user_message)
        gec_user_message = language_check.correct(user_message, matches)
        if (len(matches) > 0):
            i = 0
            for x in matches:
                print('Grammatical Error --> ', matches[i])
                print('Apply Rules--> ', matches[i].replacements)
                i = i + 1
        else:
            print('No Error Found.')
        return gec_user_message

    # SAVE Model
    def save(self, question, answer):
        db = shelve.open(self.shelve_file, 'c', writeback=True)
        db[question] = answer
        db.sync()
        rules = []
        for r in db:
            rules.append(self.category_template.format(pattern=r,
                                                       answer=db[r]))
        with open(self.save_file, 'w') as fp:
            fp.write(self.template.format(rule='\n'.join(rules)))

    def forget(self):
        os.remove(self.save_file) if os.path.exists(self.save_file) else None
        os.remove(self.shelve_file) if os.path.exists(
            self.shelve_file) else None
        self.mybot.bootstrap(learnFiles=self.load_file, commands='load aiml b')
import json
from collections import Counter
from stanfordcorenlp import StanfordCoreNLP
from termcolor import colored
from tqdm import tqdm

from nltk.tokenize.treebank import TreebankWordDetokenizer

nlp = StanfordCoreNLP('../span_bert/SpanBERT/stanford-corenlp-full-2018-10-05')
# nlp.close()

ALL_RELATIONS_TYPES = {'per:title': ['PERSON', 'TITLE'], 'org:top_members/employees': ['ORGANIZATION', 'PERSON'],
                       'org:country_of_headquarters': ['ORGANIZATION', 'COUNTRY'], 'per:parents': ['PERSON', 'PERSON'],
                       'per:age': ['PERSON', 'NUMBER'], 'per:countries_of_residence': ['PERSON', 'COUNTRY'],
                       'per:children': ['PERSON', 'PERSON'], 'org:alternate_names': ['ORGANIZATION', 'ORGANIZATION'],
                       'per:charges': ['PERSON', 'CRIMINAL_CHARGE'], 'per:cities_of_residence': ['PERSON', 'CITY'],
                       'per:origin': ['PERSON', 'NATIONALITY'], 'org:founded_by': ['ORGANIZATION', 'PERSON'],
                       'per:employee_of': ['PERSON', 'ORGANIZATION'], 'per:siblings': ['PERSON', 'PERSON'],
                       'per:alternate_names': ['PERSON', 'PERSON'], 'org:website': ['ORGANIZATION', 'URL'],
                       'per:religion': ['PERSON', 'RELIGION'], 'per:stateorprovince_of_death': ['PERSON', 'LOCATION'],
                       'org:parents': ['ORGANIZATION', 'ORGANIZATION'],
                       'org:subsidiaries': ['ORGANIZATION', 'ORGANIZATION'], 'per:other_family': ['PERSON', 'PERSON'],
                       'per:stateorprovinces_of_residence': ['PERSON', 'STATE_OR_PROVINCE'],
                       'org:members': ['ORGANIZATION', 'ORGANIZATION'],
                       'per:cause_of_death': ['PERSON', 'CAUSE_OF_DEATH'],
                       'org:member_of': ['ORGANIZATION', 'LOCATION'],
                       'org:number_of_employees/members': ['ORGANIZATION', 'NUMBER'],
                       'per:country_of_birth': ['PERSON', 'COUNTRY'],
                       'org:shareholders': ['ORGANIZATION', 'ORGANIZATION'],
                       'org:stateorprovince_of_headquarters': ['ORGANIZATION', 'STATE_OR_PROVINCE'],
                       'per:city_of_death': ['PERSON', 'CITY'], 'per:date_of_birth': ['PERSON', 'DATE'],
Example No. 23
f = open('C:/Users/text/PycharmProjects/fin_network/data/result.txt')

text = f.read()

# Keyword extraction based on the TF-IDF algorithm
jieba.load_userdict('C:/Users/text/PycharmProjects/fin_network/data/newdict.txt')
keywords = tfidf(text, topK=50)
print "keywords by tfidf:"
# Print the extracted keywords
for keyword in keywords:
    print keyword + "/",

'''
# from nltk.parse import stanford

from stanfordcorenlp import StanfordCoreNLP
import uniout
import jieba.posseg as pseg

nlp = StanfordCoreNLP(
    r'C:\ProgramData\Anaconda2\stanfordNLP\stanford-corenlp-full-2018-10-05',
    lang='zh')  # change this to the directory where your stanford-corenlp lives
sentence = '对公司未来业绩造成不利影响'
print[(word, flag) for (word, flag) in pseg.cut(sentence)]
# print 'Tokenize:', nlp.word_tokenize(sentence)
# print 'Part of Speech:', nlp.pos_tag(sentence)
# print 'Named Entities:', nlp.ner(sentence)
print 'Constituency Parsing:', nlp.parse(sentence)
# print 'Dependency Parsing:', nlp.dependency_parse(sentence)
nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
def create_nlp_pool(num_threads):
    return [StanfordCoreNLP('http://localhost', port=9000) for _ in range(num_threads)]
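
One way such a pool might be consumed (a sketch, not from the original source, assuming a CoreNLP server is already listening on localhost:9000 as create_nlp_pool expects): give each chunk of sentences its own client, tag the chunks in parallel threads, then close every client.

from concurrent.futures import ThreadPoolExecutor

def pos_tag_chunks(chunks):
    # one client per chunk, so no two threads share a connection
    pool = create_nlp_pool(len(chunks))
    try:
        with ThreadPoolExecutor(max_workers=max(1, len(chunks))) as executor:
            return list(executor.map(
                lambda pair: [pair[0].pos_tag(s) for s in pair[1]],
                zip(pool, chunks)))
    finally:
        for client in pool:
            client.close()
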
Example No. 25
# /usr/bin/python
# coding:utf-8
import pickle
import re
import sys
import json
import os
import argparse
from stanfordcorenlp import StanfordCoreNLP
from amr_utils import read_json, remove_wiki, read_anonymized, get_concepts
from generate_parent_index import gen_par_index_seq

if __name__ == '__main__':
    nlp = StanfordCoreNLP(r'/home/wangante/stanford-corenlp-full-2018-10-05', lang='en')
    in_file, in_dir, o_file1, o_file2, o_file3, o_file4, o_file5, o_file6, o_file7 = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[
        5], sys.argv[6], sys.argv[7], sys.argv[8], sys.argv[9]

    origin_amr = {}
    for file_name in os.listdir(in_dir):
        with open(os.path.join(in_dir, file_name)) as file:
            for example in file.read().strip().split('\n\n')[1:]:
                example = example.split('\n')
                origin_amr[example[0].split()[2]] = [example[2][len('# ::alignments '):],
                                                     example[1][len('# ::tok '):].lower(),
                                                     ' '.join(example[3:])]

    input_list = read_json(in_file)
    num_items = len(input_list[0])
    regu = re.compile(r'[^:][^\s]*?~e\.\d+')
    for id, amr in origin_amr.items():
        amr=amr[2].split()
Example No. 26
class CoreNlpTokenizer():
    def __init__(self, **kwargs):
        """
        :arg language: the language
        :arg classpath: directory containing the corenlp jars
        :arg annotators: a set that may contain 'pos', 'lemma', 'ner'
        :arg heap: java heap size
        """
        self.language = kwargs.get('language', DEFAULTS['tokenizer_language'])
        self.annotators = copy.deepcopy(
            kwargs.get('annotators', DEFAULTS['tokenizer_annotators']))
        self.classpath = os.path.join(
            DATA_DIR, kwargs.get('classpath', DEFAULTS['tokenizer_classpath']))
        self.heap = kwargs.get('heap', DEFAULTS['tokenizer_heap'])
        self.timeout = kwargs.get('timeout', DEFAULTS['tokenizer_timeout'])

        # annotators: tokenize (word segmentation), ssplit (sentence splitting), pos (POS tagging), lemma (lemmatization), ner (named entity recognition)
        annotators = ['tokenize', 'ssplit']
        if 'ner' in self.annotators:
            annotators.extend(['pos', 'lemma', 'ner'])
        elif 'lemma' in self.annotators:
            annotators.extend(['pos', 'lemma'])
        elif 'pos' in self.annotators:
            annotators.extend(['pos'])
        annotators = ','.join(annotators)
        options = ','.join(['untokenizable=noneDelete', 'invertible=true'])

        self.nlp = StanfordCoreNLP(self.classpath,
                                   port=random.randint(9000, 65535),
                                   memory=self.heap,
                                   lang=self.language,
                                   timeout=self.timeout)
        self.props = {
            'timeout': str(self.timeout),
            'annotators': annotators,
            'pipelineLanguage': 'zh',
            'outputFormat': 'json',
            'prettyPrint': 'False',
            'tokenize.options': options,
            # 'nthreads': 4
        }

    def tokenize(self, text):
        """ 将text输入self.corenlp句柄
        :return: Tokens,Tokens中的data包括多个(TEXT, TEXT_WS, SPAN, POS, LEMMA, NER)
        """
        # logger.info(text[0:10] + "..." if len(text) > 10 else text)

        text = text.replace('\n', '\t')

        output = self.nlp.annotate(text, properties=self.props)
        """ 有效输出: 
        {
          "sentences": [
            {
              "index": 0,
              "entitymentions": [],
              "tokens": [
                {
                  "index": 1,
                  "word": "hello",
                  "originalText": "hello",
                  "lemma": "hello",
                  "characterOffsetBegin": 0,
                  "characterOffsetEnd": 5,
                  "pos": "UH",
                  "ner": "O",
                  "before": "",
                  "after": " "
                },
              ]
            }
          ]
        }"""
        try:
            output = json.loads(output)
        except Exception:
            logger.info(
                "ERROR in Tokenizer: %s\noutput: %s" %
                ((text[0:100] + "..." if len(text) > 100 else text), output))
            # The handle is stale: close it before restarting the backend below.
            if self.nlp:
                self.close()
                self.nlp = None
            self.nlp = StanfordCoreNLP(self.classpath,
                                       memory=self.heap,
                                       lang=self.language,
                                       timeout=self.timeout)
            return None

        data = []
        tokens = [t for s in output['sentences'] for t in s['tokens']]
        for i in range(len(tokens)):
            # Get the word and the whitespace that follows it (if any)
            start_whitespace = tokens[i]['characterOffsetBegin']
            if i + 1 < len(tokens):
                end_whitespace = tokens[i + 1]['characterOffsetBegin']
            else:
                end_whitespace = tokens[i]['characterOffsetEnd']

            data.append(
                (special_char(tokens[i]['word']),
                 text[start_whitespace:end_whitespace],
                 (tokens[i]['characterOffsetBegin'],
                  tokens[i]['characterOffsetEnd']), tokens[i].get('pos', None),
                 tokens[i].get('lemma', None), tokens[i].get('ner', None)))
        return Tokens(data, self.annotators)

    def close(self):
        self.nlp.close()
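
A possible usage sketch for the tokenizer above (hypothetical; DEFAULTS, DATA_DIR, Tokens and special_char come from the surrounding project and are assumed to be importable):

tokenizer = CoreNlpTokenizer(language='zh', annotators={'pos', 'lemma', 'ner'})
tokens = tokenizer.tokenize('今天天气不错。')  # returns None if the backend had to be restarted
print(tokens)
tokenizer.close()
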
Example No. 27
import jieba
import logging
from stanfordcorenlp import StanfordCoreNLP

# Start a CoreNLP Remote Server with Terminal command:
'''
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 
-annotators tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,quote -port 9000 -timeout 30000
'''

nlp = StanfordCoreNLP('http://localhost',
                      port=9000,
                      lang='zh',
                      logging_level=logging.DEBUG)

text_path = 'test_chinese_news.txt'
sentence = ''

with open(text_path, 'r') as f:
    for line in f.readlines():
        if line.strip():
            sentence += line


def cut(string):
    return ' '.join(jieba.cut(string))


cut_sentence = cut(sentence)

# print(nlp.word_tokenize(sentence))
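
The snippet above stops before sending anything to the server; a possible next step (a sketch reusing the properties style seen in the other examples on this page) is to annotate the jieba-cut text and load the JSON reply:

import json

props = {'annotators': 'tokenize,ssplit,pos',
         'pipelineLanguage': 'zh',
         'outputFormat': 'json'}
annotation = json.loads(nlp.annotate(cut_sentence, properties=props))
print('sentences annotated:', len(annotation['sentences']))
nlp.close()
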
Example No. 28
def main():
    is_it_test = True
    train = False
    path_to_word_2_vec = ""
    path_to_data = ""
    path_to_nlp = ""
    path_to_our_model = ""
    if os.name == "nt":
        if is_it_test:
            path_to_word_2_vec = r"E:\FinalProject3\GoogleNews-vectors-negative300.bin"
        else:
            path_to_word_2_vec = r"E:\FinalProject3\wiki.en.vec"
        path_to_data = r"E:\FinalProject3\data"
        path_to_nlp = r'E:\FinalProject3\stanford-corenlp-full-2018-02-27'
        path_to_our_model = r'E:\FinalProject3\auto_de_only_wiki'
    else:
        if is_it_test:
            path_to_word_2_vec = "/home/ubuntu/Projet/FinalProject3/GoogleNews-vectors-negative300.bin"
        else:
            path_to_word_2_vec = "/home/ubuntu/Projet/FinalProject3/wiki.en.vec"
        path_to_data = "/home/ubuntu/Projet/FinalProject3/data/"
        path_to_nlp = "/home/ubuntu/Projet/FinalProject3/stanford-corenlp-full-2018-02-27"
        path_to_our_model = "/home/ubuntu/Projet/FinalProject3/auto_de_only_wiki"

    # for debug -->, quiet=False, logging_level=logging.DEBUG)
    nlp = StanfordCoreNLP(path_to_nlp)
    if train:
        dataset = DataClass(path_to_data)
        dataset.laod_data()
    else:
        dataset = DataClass(path_to_data)
    modelwords = MyWord2vec(path_to_word_2_vec)
    # ("/home/ubuntu/Project/FinalProject/", "wiki.en.vec")
    try:
        word2vec
    except NameError:
        var_exists = False
    else:
        var_exists = True

    if not var_exists:
        modelwords.load_embeddings()
        try:
            modelwords.model["check"]
            word2vec = True
        except:
            word2vec = False
            print('word2vec not configure')

    preprocessData = PreprocessClass(dataset, modelwords, nlp, "ml", train)
    preprocessData.getMaxLength()
    preprocessData.preprocessing_data()

    if not train:
        nnmodel = DLClass()
        nnmodel.model = load_model(path_to_our_model)
        graph = tf.get_default_graph()
        return {
            "preproc": preprocessData,
            'nnmodel': nnmodel.model,
            'graph': graph
        }

    predict(
        preprocessData,
        "A wiki is a Web site that allows users to add and update content"
        " on the site using their own Web browser.", path_to_our_model)

    if train:
        preprocessData.X, preprocessData.classified_output = shuffle(
            preprocessData.X, preprocessData.classified_output, random_state=0)
        # 1 to save model 10 for statistic result
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        scores = defaultdict(int)
    nlp.close()

    if train:
        for train, test in kfold.split(preprocessData.X,
                                       preprocessData.classified_output):
            nnmodel = DLClass()
            nnmodel.build_model(preprocessData.X[train],
                                preprocessData.classified_output[train],
                                "cblstm")
            print('Predicting...')
            preds = np.array([
                i[0]
                for i in nnmodel.model.predict_classes(preprocessData.X[test])
            ])
            p = precision(preds, preprocessData.classified_output[test])
            r = recall(preds, preprocessData.classified_output[test])
            f1 = f1_score(preds, preprocessData.classified_output[test])
            print('(Fold) Precision: ', p, ' | Recall: ', r, ' | F: ', f1)
            scores['Precision'] += p
            scores['Recall'] += r
            scores['F1'] += f1

        nnmodel.model.save("/home/ubuntu/auto_de_only_wiki")
        print('Overall scores:')
        for n, sc in scores.items():
            print(n, '-> ', sc / 10 * 1.0)
Example No. 29
# _*_coding:utf-8_*_

from __future__ import print_function

from stanfordcorenlp import StanfordCoreNLP

local_corenlp_path = r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/'
# local_corenlp_path = r'/home/gld/JavaLibs/stanford-corenlp-full-2016-10-31'

# Simple usage
nlp = StanfordCoreNLP(local_corenlp_path)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))
position, cluster_no, text = nlp.dcorf(sentence)

nlp.close()  # shut down the backend before switching languages
# Other human languages support, e.g. Chinese
nlp = StanfordCoreNLP(local_corenlp_path, lang='zh', quiet=False)

sentence = '清华大学位于北京。'
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))
Example No. 30
#!/usr/bin/python3
# coding: utf-8
## run with sudo
from stanfordcorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP(r'/Users/coder352/datasets/Lib/stanford-corenlp-full-2018-02-27')

sentence = ":) ... 'll 're http://jmlr.org/papers/v15/srivastava14 u.s. http://baidu.com `` '' 's 1,2:3 2018.03.07 2018/03/07 2018-03-07 for 23-years old. pi is 3.1415, .8 0.8%% is good, +0.2 well-known -lrb- mr. mra mrs. no.1 ##-mill-dollar ###.##-### <unk>"
print('Tokenize:', nlp.word_tokenize(sentence)); print()
print('Part of Speech:', nlp.pos_tag(sentence)); print()
# print('Named Entities:', nlp.ner(sentence))  # this call raises an error
print('Constituency Parsing:', nlp.parse(sentence)); print()
print('Dependency Parsing:', nlp.dependency_parse(sentence)); print()
nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
Example No. 31
# coding=utf-8

from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'D:\NLP_sourceCode\stanfordcorenlp')
# If everything is configured correctly but it still raises an error:
# check D:\Anaconda3\Lib\site-packages\stanfordcorenlp\corenlp.py
# memory defaults to 4g; with only 8g of RAM there may be less than 4g free, so lower the value (it will be slower) or add more RAM.

# Step 1: start the server
# Run a server using Chinese properties
# java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-chinese.properties -port 9000 -timeout 15000
# nlp = StanfordCoreNLP('http://localhost', port=9000)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou'

# print (nlp.word_tokenize(sentence))
# print (nlp.pos_tag(sentence))
# print (nlp.ner(sentence))
print (nlp.parse(sentence))  # constituency parse tree
# print (nlp.dependency_parse(sentence)) # dependency parse

nlp.close()  # release it, otherwise the backend server will consume a lot of memory