Example #1
class StanfordSentimentAnalyzer(SentimentAnalyzer):
    def __init__(self):
        self.snlp = StanfordCoreNLP('http://localhost:9000')

    def lemmatize(self, text):
        lemmatized = self.snlp.annotate(text, properties={
            'annotators': 'tokenize, ssplit, pos, lemma',
            'outputFormat': 'json'
        })

        sentence = lemmatized['sentences'][0]
        tokens = [x['lemma'] for x in sentence['tokens']]
        return ' '.join(tokens)

    def get_analyzer_type(self):
        return 'snlp'

    def get_sentiment_from_text(self, text):
        nearest_ascii = unidecode.unidecode(text)
        nearest_ascii = ' '.join(nearest_ascii.split())
        resp = self.snlp.annotate(nearest_ascii, properties={
                    'timeout': '50000',
                    'annotators': 'tokenize, ssplit, pos, lemma, sentiment',
                    'outputFormat': 'json'
             })

        sentiment = np.zeros(2)
        tokens = []
        # logging.debug(resp)
        for sentence in resp['sentences']:
            i = int(sentence['sentimentValue'])
            sentiment += np.array([max(i - 2, 0), max(2 - i, 0)])
            tokens += [x['lemma'] for x in sentence['tokens']]

        return sentiment, tokens
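
A minimal usage sketch for this analyzer, assuming a CoreNLP server on localhost:9000 plus the SentimentAnalyzer base class, numpy (np) and unidecode imports the snippet relies on; the example text and printed values are illustrative only:

analyzer = StanfordSentimentAnalyzer()
sentiment, tokens = analyzer.get_sentiment_from_text("I loved the acting. The plot was terrible.")
print(sentiment)  # e.g. [1. 1.] -- one positive and one negative sentence
print(tokens)     # lemmas from both sentences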
Example #2
def test_stanford_corenlp_server():
    '''
    Tests that a connection can be made to the Stanford CoreNLP server.
    '''
    try:
        nlp = StanfordCoreNLP('http://localhost:9000')
        nlp.annotate("HOLA")
    except Exception:
        assert False, "Connection error: could not reach the StanfordCoreNLP server"
class Parser:
    def __init__(self, coreNLPServer='http://localhost:9000'):
        self.nlp = StanfordCoreNLP(coreNLPServer)

    def word_list(self, text):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit',
            'outputFormat': 'json'
        })
        word_array = []
        for sentence in nlp_output['sentences']:
            for w in sentence['tokens']:
                word_array.append(w['word'].lower())
        return word_array


    def parse_tree(self, text, binary=False, preprocessed=False):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,parse',
            'outputFormat': 'json',
            'parse.binaryTrees': 'true'
        })
        if type(nlp_output) == str:
            nlp_output = json.loads(nlp_output, strict=False)

        if len(nlp_output['sentences']) > 1:
            #merge trees from sentences
            tree_string = "(Top "
            for s in nlp_output['sentences']:
                p_tree = Tree.fromstring(s['parse'])
                tree_string += str(p_tree[0])
            tree_string += ")"
            merged_tree = Tree.fromstring(tree_string)
        else:
            #no merging required
            merged_tree = Tree.fromstring(nlp_output['sentences'][0]['parse'])
            #remove root
            merged_tree = merged_tree[0]

        if binary:
            nltk.treetransforms.chomsky_normal_form(merged_tree)

        if preprocessed:
            merged_tree = preprocess_parse_tree(merged_tree)

        return merged_tree

    def draw_parse_tree(self, parse_tree):
        nltk.draw.tree.draw_trees(parse_tree)
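
A hedged usage sketch for the Parser class above, assuming a running CoreNLP server and the nltk/Tree imports the snippet depends on; the input sentences are only examples:

parser = Parser()
print(parser.word_list("The dog chased the cat."))   # ['the', 'dog', 'chased', 'the', 'cat', '.']
tree = parser.parse_tree("The dog chased the cat. The cat ran away.")
tree.pretty_print()                                  # merged constituency tree for both sentences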
def anaphora(text):
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse,coref',
        'outputFormat': 'text'})
    sents = nltk.sent_tokenize(text)
    a = []
    for sent in sents:
        a.append(sent.split())
    output = str(output.replace('\r', '').replace('\t', ''))
    output = output.split('Coreference set:')
    for out in output[1:]:
        out = str(out.replace('\r', '').replace('\t', ''))
        out = out.split('\n')
        for i in out[1:-1]:
            i = i.split(', that is:')
            toFrom = i[0].split('->')
            fromSent, fromStart, fromEnd = sentenceRange(toFrom[0])
            toSent, toStart, toEnd = sentenceRange(toFrom[1])
            fromText, toText = fromTo(i[1])

            if len(toText.split()) > 1:
                toText = shorten(toText)
                toText = [toText]
                a[fromSent - 1][fromStart - 1:fromEnd - 1] = toText
    return a
Example #5
def ner(text):
    nlp = StanfordCoreNLP('http://localhost:8098/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,pos,ssplit,ner,lemma',
        'outputFormat': 'json',
    })
    return output['sentences']
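
A possible way to consume the result, assuming the local server address from the snippet; the sentence is illustrative:

for sentence in ner("Barack Obama was born in Hawaii."):
    print([(tok['word'], tok['ner']) for tok in sentence['tokens']])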
Example #6
class CoreNLP:
    """Used to initialize the Stanford Core NLP in servlet mode and then connect to it using a socket"""
    mongo = MongoClient()
    mongo_db = mongo.get_database('dependencies')

    def __init__(self, timeout=15000, port=9000, buffer_size=4096):
        """Used to initialize the StanfordAPI object with the host, port and buffer"""
        # self.host = socket.gethostname()
        self.port = str(port)
        # self.timeout = str(timeout)
        # self.buffer = str(buffer_size)
        # self.process = Popen(
        #     args=['java', '-mx4g', '-cp', 'commons/corenlp/*', 'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
        #           '-port', self.port, '-timeout', self.timeout])
        # time.sleep(5)
        self.nlp = StanfordCoreNLP('http://localhost:' + self.port)

    def parse(self, text):
        dobj = self.mongo_db.get_collection('dependency').find_one({'text': text})
        if not dobj or dobj['deps'] == 'CoreNLP request timed out. Your document may be too long.':
            output = self.nlp.annotate(text, properties={
                'annotators': 'tokenize,ssplit,pos,depparse,parse,coref',
                'coref.algorithm': 'neural',
                'outputFormat': 'json',
            })
            dep = {'text': text, 'deps': output}
            self.mongo_db.get_collection('dependency').insert_one(dep)
            return output
        else:
            return dobj['deps']
Example #7
class CoreNLP(object):
    def __init__(self):
        self.corenlp = StanfordCoreNLP('http://localhost:9000')

    def tokenize_sentence(self, sentence):
        doc = self.corenlp.annotate(
            sentence,
            properties={
                'annotators': 'tokenize,lemma,ssplit,pos,depparse,parse',
                'outputFormat': 'json'
            })
        s = doc['sentences'][0]

        return self.structure_tokens(s)

    def structure_tokens(self, sentence):
        words = []
        for token in sentence['tokens']:
            words.append({
                "pos": token['pos'],
                "token": token['word'],
                "links": [],
                "lemma": token['lemma']
            })

        for dep in sentence['enhancedPlusPlusDependencies']:
            words[dep['governor'] - 1]['links'].append(
                [dep['dep'], (dep['dependent'] - 1)])
        return words
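
A short usage sketch for the CoreNLP wrapper above (server address as in the snippet; the sentence is an example):

corenlp = CoreNLP()
for entry in corenlp.tokenize_sentence("The cat chased the mouse."):
    print(entry)   # {'pos': ..., 'token': ..., 'links': [...], 'lemma': ...}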
def making_parsed_tree(sentiment_code, file_name):
    splited_sentence_first = []
    parsed_sentence_first = []

    pcn = StanfordCoreNLP('http://*****:*****@", '', text)
        text = re.sub(r'http\S+', '', text)
        return text

    for a in tqdm(range(len(df_amazon))):
        tweet_txt = about_symbol(text[a])
        if label[a] == sentiment_code:
            if len(tweet_txt) > 3:
                tweet_txt = " ".join(tweet_txt.split())
                tweet_txt = contractions.fix(tweet_txt)

                doc = nlp(tweet_txt)
                splited_sentence_second = []
                parsed_sentence_second = []

                for sentence in doc.sentences:
                    temp = []
                    for token in sentence.tokens:
                        temp.append(token.text)
                    sum_text = " ".join(temp)
                    sum_text = about_symbol(sum_text)
                    output = pcn.annotate(sum_text,
                                          properties={
                                              'annotators': 'parse',
                                              'outputFormat': 'json'
                                          })
                    parsed_sent = output['sentences'][0]['parse']
                    parsed_sent = " ".join(parsed_sent.split())
                    parsed_sent = parsed_sent.replace('(', '<')
                    parsed_sent = parsed_sent.replace(')', '>')

                    parsed_sentence_second.append(parsed_sent)
                    splited_sentence_second.append(sum_text)
                    # print(parsed_sent)
                splited_sentence_first.append(splited_sentence_second)
                parsed_sentence_first.append(parsed_sentence_second)

            sent_json['splited_sentence'] = []
            sent_json['parsed_sentence'] = []
            sent_json['original_sentence'] = []
            sent_json['splited_sentence'].append(splited_sentence_first)
            sent_json['parsed_sentence'].append(parsed_sentence_first)
            sent_json['original_sentence'].append(tweet_txt)

    with open(file_name, 'w') as out_file:
        json.dump(sent_json, out_file, indent=4)
Example #9
def extract_triples_openie():

    # run corenlp server from shell
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "openie" -port 9000 -timeout 30000
    # http://corenlp.run/
    nlp = StanfordCoreNLP("http://localhost:9000")
    output = nlp.annotate(
        text,
        properties={
            #'annotators': 'tokenize, ssplit, pos, depparse, parse, openie',
            'annotators': 'openie',
            'outputFormat': 'json'
        })

    #print(output['sentences'][0].keys)

    for sentence in output['sentences']:

        for result in sentence['openie']:
            print("{" + result['subject'] + ", " + result['relation'] + ", " +
                  result['object'] + "}")


# RUN OpenIE extraction
#extract_triples_openie()
Example #10
def annotate(text, url=None, properties=None):
    if url is None:
        url = NLP_SERVER
    if properties is None:
        properties = NLP_PROPERTIES
    nlp = StanfordCoreNLP(url)
    return nlp.annotate(text, properties)
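
This wrapper relies on two module-level constants that are not shown; given how they are used, plausible (assumed) definitions would look like:

NLP_SERVER = 'http://localhost:9000'
NLP_PROPERTIES = {
    'annotators': 'tokenize,ssplit,pos,lemma',
    'outputFormat': 'json'
}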
Example #11
def annotate_story(text, name, corenlp_url, props=neural_props):
    out_path = os.path.join('data', 'writing-prompts', 'annotations',
                            f'{name}.json')

    # Remove <newline>
    cleaned_text = []
    for token in text.split():
        if token != '<newline>':
            cleaned_text.append(token)
    cleaned_text = ' '.join(cleaned_text)

    if os.path.exists(out_path):
        return

    nlp = StanfordCoreNLP(corenlp_url)
    try:
        annotation = nlp.annotate(cleaned_text, properties=props)
    except requests.exceptions.ConnectionError as e:
        logger.error(f'Connection Error for {name}: {e}.')
        return

    if isinstance(annotation, str):
        logger.error(f'Error for {name}: {annotation}.')
        # Let's try a statistical approach
        if 'Error making document' in annotation and props[
                'coref.algorithm'] == 'neural':
            logger.info(f'Switching to statistical coref for {name}')
            annotate_story(text, name, corenlp_url, props=stats_props)
        else:
            logger.info(f'Please check {name}')
            return
    else:
        with open(out_path, 'w') as f:
            json.dump(annotation, f)
Example #12
def tokenize_and_tag(idx, sentence):
    stanford_corenlp = StanfordCoreNLP(corenlp_url)
    tries = 0
    while True:
        try:
            annotation = stanford_corenlp.annotate(sentence.encode('utf8'),
                                                   properties={
                                                       'annotators':
                                                       'tokenize,pos,ner',
                                                       'outputFormat': 'json'
                                                   })
            assert type(annotation) == dict
            break
        except Exception:
            time.sleep(1)
            tries += 1
            if tries == 10:
                print "Failed for %s" % sentence
                return (idx, None, None, None)
            pass
    tokens, pos_tags, ner_tags = [], [], []
    for sentence in annotation['sentences']:
        tokens.extend([token['word'] for token in sentence['tokens']])
        pos_tags.extend([token['pos'] for token in sentence['tokens']])
        ner_tags.extend([token['ner'] for token in sentence['tokens']])
    return (idx, tokens, pos_tags, ner_tags)
def get_pt_features_coreNLP(doc, ignoreleaf=True):
    en = doc.encode('utf-8')
    de = en.decode('utf-8')
    doc = de
    chars_to_remove = ['{', '}', '(', ')']
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    doc = re.sub(rx, '', doc)
    nlp = StanfordCoreNLP('http://localhost:9000')
    sentences = sent_tokenize(doc)
    ptree_features = list()
    for sentence in sentences:
        try:
            if sentence != "" and len(
                    word_tokenize(sentence)) <= 80:  # less than 50 words
                output = nlp.annotate(sentence,
                                      properties={
                                          'annotators': 'parse',
                                          'outputFormat': 'json'
                                      })

                parsed = (output['sentences'][0]['parse'])
                rules = traverse(parsed, ignoreleaf=ignoreleaf)
                ptree_features.append(rules)
        except Exception:
            print('Problem parsing sentence: %s' % sentence)

    return ptree_features
class SentmentEnvironment:
    def __init__(self):
        self.reset()
        self.stanford = StanfordCoreNLP('http://localhost:9000')

    def step(self, action: str):
        result = self.stanford.annotate(action,
                                        properties={
                                            'annotators': 'sentiment',
                                            'outputFormat': 'json',
                                            'timeout': '5000'
                                        })

        # Sentiment values from CoreNLP:
        # negative: 1; neutral: 2; positive: 3
        s_scores = [int(s['sentimentValue']) - 2 for s in result['sentences']]
        reward = math.tanh(sum(s_scores))

        done = len(self.history) > CONVO_LEN
        self.history.append(action)

        state = random.sample(sentences, 1)[0]

        return state, reward, done

    def reset(self):
        self.history = []
Example #15
def sentiment_stanford(input_text):
    if input_text != '':
        nlp = StanfordCoreNLP('http://localhost:9000')
        res = nlp.annotate(input_text,
                           properties={
                               'annotators': 'sentiment',
                               'outputFormat': 'json',
                               'timeout': 1000,
                           })
        rows = []
        for s in res["sentences"]:
            rows.append([s["sentimentValue"], s["sentiment"]])

        df = pd.DataFrame(rows, columns=['sentiment_value', 'sentiment'])
        df['sentiment_value'] = df['sentiment_value'].apply(float)
        grouped_obj = df.groupby('sentiment')

        scores = {'pos': 0.0, 'neu': 0.0, 'neg': 0.0, 'compound': 0.0}
        for gr_name, gr_df in grouped_obj:
            mean_score = float(gr_df.mean())
            if gr_name == 'Positive':
                scores['pos'] = mean_score
            elif gr_name == 'Negative':
                scores['neg'] = mean_score
            elif gr_name == 'Neutral':
                scores['neu'] = mean_score

        scores['compound'] = scores['pos'] - scores['neg'] + (scores['neu'] /
                                                              2.)
        return scores
    else:
        return None
Example #16
class StanfordServerParser(Parser, GenericStanfordParser):
    """Follow the readme to setup the Stanford CoreNLP server"""
    def __init__(self, host='localhost', port=9000, properties={}):
        url = 'http://{0}:{1}'.format(host, port)
        self.nlp = StanfordCoreNLP(url)

        if not properties:
            self.properties = {
                'annotators': 'parse',
                'outputFormat': 'json',
            }
        else:
            self.properties = properties

    def _make_tree(self, result):
        return Tree.fromstring(result)

    def parse(self, sent):
        output = self.nlp.annotate(sent, properties=self.properties)

        # Got random html, return empty tree
        if isinstance(output, str):
            return Tree('', [])

        parse_output = output['sentences'][0]['parse'] + '\n\n'
        tree = next(next(self._parse_trees_output(parse_output)))[0]
        return tree
class isQuestionBasic():

    # Init Constructor
    # Initialize StanfordCore NLP local instance on port 9000
    def __init__(self):
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    # Input: Sentence to be predicted
    # Processing: 1. Uses Stanford NLP's 'annotate' method to create a parse tree
    # 2. Checks for occurrence of 'SQ' or 'SBARQ' in the parse tree
    # Return: 1 - If the sentence is a question | 0 - If the sentence is not a question
    def isQuestion(self, sentence):
        if '?' in sentence:
            return 1
        output = self.nlp.annotate(sentence,
                                   properties={
                                       'annotators': 'parse',
                                       'outputFormat': 'json',
                                       'timeout': 1000,
                                   })

        parse_tree = output['sentences'][0]["parse"]
        if 'SQ' in parse_tree or 'SBARQ' in parse_tree:
            return 1
        else:
            return 0
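
A quick, hedged usage sketch (server on localhost:9000 as above; expected outputs depend on the parser model):

detector = isQuestionBasic()
print(detector.isQuestion("Is this the right room for an argument"))   # expected: 1
print(detector.isQuestion("This is the right room."))                  # expected: 0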
Example #18
def poemAnalysis(poemObject):
    #https://stackoverflow.com/questions/32879532/stanford-nlp-for-python
    #Connect to the Stanford NLP server. Note that in order to run this code,
    #the Stanford NLP server must be running. The local_corenlp_path above must be
    #changed, and the instructions must be followed from the link above.
    nlp = StanfordCoreNLP('http://localhost:9000')
    #Set the analysis to be of sentiment.
    pros = {'annotators': 'sentiment', 'outputFormat': 'json'}
    poem = poemObject.get('poems')[0]
    res = nlp.annotate(poem, properties=pros)
    totalSentiment = 0
    count = 0
    # calculate the average sentiment across the sentences.
    for s in res["sentences"]:
        totalSentiment = totalSentiment + float(s["sentimentValue"])
        count = count + 1
    averageSentiment = totalSentiment / count
    # 0: Very Negative
    # 1: Negative
    # 2: Neutral
    # 3: Positive
    # 4: Very Positive
    title = poemObject.get('title')
    fascicle = poemObject.get('fasc')
    publication_date = poemObject.get('pubdate')
    # returns all relevant information.
    return [title, fascicle, publication_date, averageSentiment, count]
def get_score(text):

    # Connect to the server
    nlp = StanfordCoreNLP('http://localhost:9000')
    text = text.lower()
    res = nlp.annotate(text,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json',
                           'timeout': 10000,
                       })

    # Average sentiment over sentences
    sum = 0
    tot_words = 0
    for s in res["sentences"]:
        value = int(s["sentimentValue"]) - 2  # so that neutral is 0
        scaled_val = value * len(s["tokens"])
        tot_words += len(s["tokens"])
        sum += scaled_val
    score = sum / tot_words

    # If the review is "neutral", either randomly assign it as either positive or negative
    # sentiment or ignore the review all together
    if score == 0:
        #sent_score = randint(0,1)
        sent_score = -1

    if score < 0:
        sent_score = 0
    elif score > 0:
        sent_score = 1

    return sent_score
Example #20
class StanfordServerParser(Parser, GenericStanfordParser):
    """Follow the readme to setup the Stanford CoreNLP server"""
    def __init__(self, host='localhost', port=9000, properties={}):
        url = 'http://{0}:{1}'.format(host, port)
        self.nlp = StanfordCoreNLP(url)

        if not properties:
            self.properties = {
                'annotators': 'parse',
                'outputFormat': 'json',
            }
        else:
            self.properties = properties

    def _make_tree(self, result):
        return Tree.fromstring(result)

    def parse(self, sent):
        output = self.nlp.annotate(sent, properties=self.properties)

        # Got random html, return empty tree
        if isinstance(output, unicode):
            return Tree('', [])

        parse_output = output['sentences'][0]['parse'] + '\n\n'
        tree = next(next(self._parse_trees_output(parse_output)))[0]
        return tree
Example #21
def extension_headline_simple():
    headline = request.args.get('q')
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(headline,
                          properties={
                              'annotators': 'tokenize,openie,depparse',
                              'outputFormat': 'json'
                          })

    result = {"voice": [], "relationships": []}

    for dep in output["sentences"][0]["basicDependencies"]:
        if dep["dep"] == "nsubj":
            result["voice"].append("Active Voice: " + dep["dependentGloss"] +
                                   " -> " + dep["governorGloss"])
        if dep["dep"] == "nsubj:pass":
            result["voice"].append("Passive Voice: " + dep["dependentGloss"] +
                                   " -> " + dep["governorGloss"])

    for openie in output["sentences"][0]["openie"]:
        result["relationships"].append(f"Object: " + openie["object"] +
                                       ", Relation: " + openie["relation"] +
                                       ", Subject: " + openie["subject"])

    return json.dumps(result)
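
The function reads request.args, so it is evidently meant to be mounted as a route in a Flask-style app; the wiring below is a hypothetical sketch, not part of the original:

from flask import Flask, request
import json

app = Flask(__name__)
app.add_url_rule('/extension/headline', 'extension_headline_simple', extension_headline_simple)
# then e.g.: GET /extension/headline?q=The+ball+was+thrown+by+the+pitcher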
Example #22
def test_parsing(df):
    nlp = StanfordCoreNLP('http://localhost:9000')


    utt = 'There is a pub called Wildwood which serves English food. It has a low customer rating and price range - typically less than £20.'

    output = nlp.annotate(utt, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })

    # divide the parse tree into lines
    ptree = '\n'.join([sent['parse'] for sent in output['sentences']])
    print(ptree)

    # if find_apposition(ptree.split('\n')):
    # if find_fronted_adjective_phrase(ptree.split('\n')):
    # if find_fronted_prepositional_phrase(ptree.split('\n')):
    # if find_fronted_verb_phrase(ptree.split('\n')):
    # if find_fronted_imperative_phrase(ptree.split('\n')):
    # if find_subordinate_clause_non_wh(ptree.split('\n')):
    # if find_subordinate_clause_wh(ptree.split('\n')):
    # if find_gerund_verb(ptree.split('\n')):
    if find_modal_verb(ptree.split('\n')):
    # if find_contrast(ptree.split('\n')):
    # if find_agreement(ptree.split('\n')):
    # if find_prepositions(ptree.split('\n')):
    # if find_existential_there(ptree.split('\n')):
        print(utt)
Example #23
def main():
    args = parse_args()
    parser = RstParser()
    parser.load('../data/model')
    with gzip.open('features/bc3200.pickle.gz') as fin:
        print('Load Brown clusters for creating features ...')
        brown_clusters = pickle.load(fin)
    core_nlp = StanfordCoreNLP('http://localhost:9000')
    annotate = lambda x: core_nlp.annotate(
        x,
        properties={
            'annotators': 'tokenize,ssplit,pos,lemma,parse,depparse',
            'outputFormat': 'json',
            'ssplit.isOneSentence': True
        })
    edu_file_list = [
        os.path.join(args.edu_file_dir, fname)
        for fname in os.listdir(args.edu_file_dir)
        if fname.endswith('.edu.txt')
    ]
    for edu_file in edu_file_list:
        print('Parsing {}...'.format(edu_file))
        doc = create_doc_from_edu_file(edu_file, annotate_func=annotate)
        pred_rst = parser.sr_parse(doc, brown_clusters)
        tree_str = pred_rst.get_parse()
        pprint_tree_str = Tree.fromstring(tree_str).pformat(margin=150)
        with open(
                os.path.join(args.output_dir,
                             os.path.basename(edu_file) + '.parse'),
                'w') as fout:
            fout.write(pprint_tree_str)
def main():

    nlp = StanfordCoreNLP('http://localhost:9000')

    negationWords = []

    with open(sys.argv[1], 'r') as f:
        for line in f:
            text = line.rstrip()
            output = nlp.annotate(text, properties={'annotators': 'depparse', 'outputFormat': 'json'})

            try:
                dep = output['sentences'][0]['basicDependencies']

                for i in range(len(dep)):
                    if dep[i]['dep'] == 'neg':
                        word = dep[i]['dependentGloss']
                        if word == "n't":
                            dep_temp = dep[i - 1]
                            if dep_temp['dep'] == 'expl':
                                word = dep_temp['governorGloss'] + word
                            else:
                                word = dep_temp['dependentGloss'] + word

                        word = word.lower()

                        if word not in negationWords:
                            negationWords.append(word)

            except Exception:
                pass

if __name__ == "__main__":
    main()
def main():
    nlp = StanfordCoreNLP('http://localhost:9000')
    df_dev = pd.read_csv("train_v1.csv")
    df_tokens = pd.DataFrame()
    index = 0
    corenlp_json = {}
    total_num = set(df_dev["id"].values)
    for question_id in df_dev["id"].values:
        paragraph = df_dev.loc[df_dev["id"] == question_id, "context"].iloc[0]
        sentences = split_paragraph_into_sentences(paragraph)

        question_json_result = []
        for sentence in sentences:
            output = nlp.annotate(sentence,
                                  properties={
                                      'annotators':
                                      'tokenize, pos, lemma, ner',
                                      'outputFormat': 'json'
                                  })
            if len(output["sentences"]) > 0:
                question_json_result.append(output["sentences"][0]["tokens"])
        corenlp_json[question_id] = question_json_result
        print(index)
        index += 1
        if index % 10001 == 0:
            with open("corenlp_paragraph_to" + str(index) + ".json",
                      'w+') as fp:
                json.dump(corenlp_json, fp)
                corenlp_json = {}
def stanfordNLP(data):
    sentimentLevel = 0

    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(data,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json',
                           'timeout': 100000
                       })
    #print (res)
    for i in res["sentences"]:
        val = int(i["sentimentValue"])
        if i["sentiment"] == "Verypositive":
            sentimentLevel = sentimentLevel + val + 5

        elif i["sentiment"] == "Positive":
            sentimentLevel = sentimentLevel + val + 1

        elif i["sentiment"] == "Neutral":
            sentimentLevel = 0

        elif i["sentiment"] == "Negative":
            sentimentLevel = sentimentLevel - val - 1

        elif i["sentiment"] == "Verynegative":
            sentimentLevel = sentimentLevel - val - 5

    stanfordLevel = calSentimentLevel(sentimentLevel)
    return stanfordLevel
Example #27
def Sentiment_StanfordNLP(text):
    from pycorenlp import StanfordCoreNLP
    import numpy as np
    nlpStanford = StanfordCoreNLP('http://localhost:9000')
    results = nlpStanford.annotate(text,
                                   properties={
                                       'annotators': 'sentiment, ner, pos',
                                       'outputFormat': 'json',
                                       'timeout': 50000,
                                   })
    sentiment = []
    for s in results["sentences"]:
        sentiment.append(s["sentiment"])
    new_sentiment = []
    for sent in sentiment:
        # Replace "Verynegative"/"Verypositive" before their substrings
        # "Negative"/"Positive", otherwise they are never matched.
        new_string = sent.replace("Verynegative", "-2").replace(
            "Verypositive", "2").replace("Negative", "-1").replace(
                "Positive", "1").replace("Neutral", "0")
        new_sentiment.append(new_string)
    sentiment_mean = []
    for x in new_sentiment:
        sentiment_mean.append(int(x))
    return np.mean(sentiment_mean)
Example #28
def parseDoc(docFileName, outFileName):
	df = pd.read_csv(docFileName, sep="\t")
	df.columns = ["docName","sentence"]
	docNames = df.docName.unique()
	nlp = StanfordCoreNLP('http://localhost:9000')
	parseResults = {}
	nb_sent = 0
	for docName in docNames:
		sents = df[df['docName'] == docName]['sentence'].values
		start_time = time.time()
		#print "Number of sentences : ",len(sents)
		nb_sent = nb_sent + len(sents)
		output = {}
		docStr = ( ' '.join([str(x) for x in sents]))
		output = nlp.annotate(docStr, properties={'timeout': '100000000','annotators': 'tokenize,ssplit,pos,lemma,ner,depparse, parse,coref','outputFormat': 'json'})
		#parseResults[docName] = output
		#print output
		elapsed_time = time.time() - start_time	
		#print "Number of sentence: ",len(sents), "Number of coreference chains : ",len(output['sentences']), " Elapsed time: ",elapsed_time

	print nb_sent, "," , len(docNames)
	print nb_sent/(len(docNames) * 1.0)

	
	for docName, result in parseResults.iteritems():
		print docName,"\t number of coreference chains :", len(result['corefs'])

	with open(outFileName, "w") as json_file:
		json.dump(parseResults, json_file)
Example #29
def tweet_whole_sentiment(data):
    '''
    input: whole corpus
    output: a dict for tweet whole-sentence sentiment;
            keys: tweet_id, values: sentimentValue (1 -- Negative, 2 -- Neutral, 3 -- Positive)
    '''
    try:
        nlp_wrapper = StanfordCoreNLP('http://localhost:5000')
        feature_dict = {}
        for tweet in data:
            tokenized = tweet.tweet_words()
            new_words = [word for word in tokenized if word.isalnum()]
            if not new_words:
                feature_dict[tweet.tweet_id] = 2
            text = " ".join(new_words)
            annotate = nlp_wrapper.annotate(text,
                                            properties={
                                                'annotators': 'sentiment',
                                                'outputFormat': 'json',
                                                'timeout': 10000,
                                            })
            for sentence in annotate["sentences"]:
                #                 feature_dict[tweet.tweet_id]=sentence["sentimentValue"]
                feature_dict[tweet.tweet_id] = [sentence["sentimentValue"]]
        # print(feature_dict)
        return feature_dict
    except Exception as e:
        print("In whole sentiment exception")
        print(str(e))
Example #30
def dep_parse(sentence):
    """
    Parse a sentence using CoreNLP dependency parser and extract lemmatization and dependencies in the extradependencies
    mode. The function depends on CoreNLP server being set up.
    See http://stanfordnlp.github.io/CoreNLP/corenlp-server.html
    :param sentence: sentence to be parsed
    :return: a dictionary whose keys are word indeces and values are lemmas;
    a dictionary whose keys are (parent, child) tuples and values are edge labels
    """
    nlp = StanfordCoreNLP('http://localhost:9000')
    annotation = nlp.annotate(
        (sentence),
        properties={
            'annotators': 'tokenize,ssplit,lemma,pos,depparse',
            'outputFormat': 'json',
            'depparse.extradependencies': 'MAXIMAL'
        })
    lemmas = {}
    words = {}
    for t in annotation['sentences'][0]['tokens']:
        lemmas[t['index']] = t['lemma']
        words[t['index']] = t['word']
    lemmas[0] = 'ROOT'
    words[0] = 'ROOT'
    dependencies = rename_dependencies(
        annotation['sentences'][0]['collapsed-ccprocessed-dependencies'])
    dep_edge_dict = collections.defaultdict(str)
    for dep in dependencies:
        dep_edge_dict[(dep['governor'], dep['dependent'])] = dep['dep']
    return lemmas, words, dep_edge_dict
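
A hedged usage sketch, assuming a local CoreNLP server and the rename_dependencies helper referenced in the snippet; the sentence is illustrative:

lemmas, words, edges = dep_parse('The cat sat on the mat.')
print(words[1], lemmas[1])   # first token and its lemma
print(dict(edges))           # {(governor_index, dependent_index): 'label', ...}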
Example #31
def pdtb_preprocess(args):
    sections = os.listdir(PathConfig.pipe_data_dir)
    if not os.path.exists(PathConfig.json_data_dir):
        os.mkdir(PathConfig.json_data_dir)
    core_nlp = StanfordCoreNLP('http://localhost:9000')
    annotate_func = lambda x: core_nlp.annotate(x, properties={
        'annotators': 'tokenize,ssplit,pos,lemma,parse,depparse',
        'outputFormat': 'json',
        # 'ssplit.isOneSentence': True
    })
    instance_cnt = 0
    for section in sections:
        raw_sec_dir = os.path.join(PathConfig.pipe_data_dir, section)
        if not os.path.isdir(raw_sec_dir):
            continue
        converted_sec_dir = os.path.join(PathConfig.json_data_dir, section)
        if not os.path.exists(converted_sec_dir):
            os.mkdir(converted_sec_dir)
        for file in os.listdir(raw_sec_dir):
            fpath = os.path.join(raw_sec_dir, file)
            pipe_instances = load_pipe_file(fpath, types=['Implicit'])
            basename_prefix = os.path.basename(fpath).split('.')[0]
            for idx, inst in enumerate(pipe_instances, 1):
                inst.arg1_parse_result = annotate_func(inst.arg1)
                inst.arg2_parse_result = annotate_func(inst.arg2)
                with open(os.path.join(converted_sec_dir, '{}.{}.pickle'.format(basename_prefix, idx)), 'wb') as fout:
                    pickle.dump(inst, fout)
                instance_cnt += 1
                if instance_cnt % 100 == 0:
                    print(instance_cnt)
    print('Totally, {} instances are converted.'.format(instance_cnt))
Example #32
def tokenize_and_tag(idx, sentence):
    stanford_corenlp = StanfordCoreNLP(corenlp_url)
    tries = 0
    while True:
        try:
            annotation = stanford_corenlp.annotate(
                (sentence),
                properties={
                    'annotators': 'tokenize,pos,ner,depparse',
                    'outputFormat': 'json'
                })
            assert type(annotation) == dict
            break
        except Exception:
            time.sleep(1)
            tries += 1
            if tries == 10:
                print("Failed for {}".format(sentence))
                return (idx, None, None, None, None)
            pass
    tokens, pos_tags, ner_tags, depparse = [], [], [], []
    for sentence in annotation['sentences']:
        tokens.append([token['word'] for token in sentence['tokens']])
        pos_tags.append([token['pos'] for token in sentence['tokens']])
        ner_tags.append([token['ner'] for token in sentence['tokens']])
        depparse.append([(token['dependent'], token['governor'])
                         for token in sentence['basicDependencies']])
    return (idx, tokens, pos_tags, ner_tags, depparse)
Example #33
    def annotateText(self, text):
        nlp = StanfordCoreNLP('http://localhost:9000')
        return nlp.annotate(text,
                            properties={
                                'annotators': 'tokenize,ssplit,pos',
                                'outputFormat': 'json'
                            })
def pos_tag(sentence):

    # Set up the Stanford CoreNLP Server
    nlp = StanfordCoreNLP('http://localhost:9000')

    # Use the API to POS-tag the sentence and get a json file back as output
    output = nlp.annotate(sentence, properties={
        'annotators': 'pos',
        'outputFormat': 'json',
    })

    # dict_replacements maps each important word that we will find synonyms for
    # to the corresponding POS tag recognized by WordNet.
    # If a word matches the specified POS tag, add it to dict_replacements along
    # with the correct WordNet POS tag.
    # Loop over each word of the input sentence.
    # POS tags (Stanford CoreNLP): NN - Noun (singular), NNS - Noun (plural), VB - Verb
    # POS tags (WordNet): n - noun, v - verb
    dict_replacements = {}
    pos_list = []
    for sent in output['sentences']:
        for word in sent['tokens']:
            if word['pos'] == 'NNS' or word['pos'] == 'NN':
                dict_replacements[word['word']] = 'n'
                
    return dict_replacements            
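
A hypothetical follow-up showing how the word-to-WordNet-POS map could feed synonym lookup (the WordNet import is an assumption, not part of the snippet):

from nltk.corpus import wordnet

for word, wn_pos in pos_tag("The dogs chased the cats").items():
    synonyms = {lemma.name() for syn in wordnet.synsets(word, pos=wn_pos)
                for lemma in syn.lemmas()}
    print(word, synonyms)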
Example #35
def corenlp_tokenize(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(output['sentences'][0]['parse'])

    return output
def stanford_parsing_result():
    text =""" I shot an elephant. The dog chased the cat. School go to boy. """
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(res['sentences'][0]['parse'])
    print(res['sentences'][2]['parse'])
def NERGetter(text):
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
    'annotators': 'tokenize,ssplit,pos,ner',
    'outputFormat': 'text'})
    output = str(output.replace('\r','').replace('\t',''))
    output = output.split('[', 1)[1]
    output = str(output)
    output = output.split('\n')
    for i in output[0:-1]:
        i = i.replace(']','')
        i = i.split('NamedEntityTag=')
    return i[1]
Example #38
class StanfordAnnotator(PR):
    def __init__(self, annotators='tokenize,ssplit,pos,parse'):#depparse
        self.annotators=annotators
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    def process(self, doc):
        output=self.nlp.annotate(doc.getText(), properties={
              'annotators': self.annotators,
              'outputFormat': 'json',
              'timeout': '600000'

        })
        sents=[]
        tokens=[]
#         print "output", json.dumps(output)
        tStart=0
        tEnd=0
        for s in output['sentences']:
            sentText=[]
            sentTokens=[]
            for t in s['tokens']:
#                 print t
                sentText.append(t['before'])
                sentText.append(t['originalText'])

                token=Annotation(t['originalText'],tEnd,tEnd,t['characterOffsetBegin'], t['characterOffsetEnd'], 'Token', doc)
                token.setFeature('pos', t['pos'])
                token.setFeature('index', t['index'])
                tokens.append(token)
                sentTokens.append(token)
                tEnd+=1

            cStart=s['tokens'][0]['characterOffsetBegin']
            cEnd=s['tokens'][-1]['characterOffsetEnd']
            sentText="".join(sentText)
            print sentText
            sent=Annotation(sentText, tStart, tEnd, cStart, cEnd, 'Sentence', doc)
            tStart=tEnd

            sent.setFeature('constituency-parse', s['parse'])
            sent.setFeature('dep-parse', 'not implemented!')
            sent.setFeature('index', s['index'])
#           sent.setRelation('tokens',sentTokens)
            sents.append(sent)
        doc.setSents(sents)
        doc.setTokens(tokens)
Example #39
class NLPFactory:
    def __init__(self):
        self.url = os.environ.get("CORENLP_URL", "http://localhost:9000")
        self.nlp = StanfordCoreNLP(self.url)

    def annotate(self, text):
        """
        annotate by dependence parser
        Args:
            text (str): input data

        Returns:
            json
        """
        # CoreNLP treats sentences separated by a full stop independently,
        # so replace sentence-final punctuation with commas to keep one sentence
        text = text.replace('.', ',').replace('!', ',')
        return self.nlp.annotate(text, properties={"annotators": "pos,lemma,depparse,sentiment", "outputFormat": "json"})
    def resolve(self, text):

        sentences_all = sent_tokenize(text, 'English')

        for i in range(2, len(sentences_all)):
            text2 = sentences_all[i-2]+' '+sentences_all[i-1]+' '+sentences_all[i]
            print(text2)
            sentences = sent_tokenize(text2, 'English')
            print(sentences)
            nlp = StanfordCoreNLP('http://localhost:9000')
            output = nlp.annotate(text2, properties={
                'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,mention,dcoref',
                'outputFormat': 'json'
            })

            # target.write(output)
            # target.close()
            corefs = output['corefs']
            cnt = 1

            for key, chains in corefs.items():

                substitute = ''
                print("\nchain number "+str(cnt))
                cnt += 1
                for chain in chains:

                    # print(chain['isRepresentativeMention']+'\n')
                    print(chain['type'] + ' ' + chain['text'])
                    if (chain['isRepresentativeMention'] is True) and (chain['type'] != 'PRONOMINAL'):
                        substitute = str(chain['text'])
                        print(substitute+'\n')

                    if (chain['type'] == 'PRONOMINAL') and (substitute != ''):
                        sentence_num = chain['sentNum']
                        words = word_tokenize(sentences[sentence_num - 1], 'English')
                        words[chain['startIndex'] - 1] = substitute
                        new_sentence = ' '.join(words)
                        sentences[sentence_num - 1] = new_sentence

            sentences_all[i-2] = sentences[0]
            sentences_all[i-1] = sentences[1]
            sentences_all[i] = sentences[2]

        return sentences_all
def standford_sentiment_answer(text_str):
    asw_sentiment = make_default_sentiment()
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text_str,
                       properties={
                           'annotators': 'sentiment',
                           'outputFormat': 'json',
                           'timeout': 20000,
                       })
    try:
        total_value = 0.0
        for s in res["sentences"]:
            total_value += float(s["sentimentValue"])
            asw_sentiment[s["sentiment"]] += 1
        asw_sentiment['score'] = total_value
        return asw_sentiment
    except:
        return asw_sentiment
class NerToBratConverter(object):
    def __init__(self, corenlp_url='http://localhost:9000'):
        '''
        Create a converter for turning CoreNLP NER annotations into Brat
        annotation files (e.g. for classifier training data).

        To start the server checkout: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)

    def convertToBrat(self, text_file, ann_file):
        print("Processing %s" % text_file)
        with open(text_file) as f:
            text = f.read()

        props = { 'annotators': 'tokenize,ssplit,pos,ner', 'outputFormat': 'json'}
        output = self.corenlp.annotate(text, properties=props)
        # flatten sentences and tokens
        tokenlists = [s['tokens'] for s in output['sentences']]
        tokens = itertools.chain.from_iterable(tokenlists)

        count = 1
        with open(ann_file, 'w', 1) as out:
            for token in tokens:
                if token['ner'] != 'O':
                    rec = "T%d\t%s %d %d\t%s" % (count,
                            token['ner'],
                            token['characterOffsetBegin'],
                            token['characterOffsetEnd'],
                            token['originalText'])
                    # print(rec)
                    out.write(rec)
                    out.write("\n")
                    count += 1
        print("Wrote %s" % ann_file)

    def convert_all(self, input_paths):
        with open(input_paths) as paths:
            for d in map(lambda x: x.split(','), map(lambda x: x.strip(), paths)):
                self.convertToBrat(d[0], d[1])
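
A brief usage sketch for the converter; the file names are hypothetical and a CoreNLP server with the ner annotator must be running at the given URL:

converter = NerToBratConverter()
converter.convertToBrat('article.txt', 'article.ann')
# or, for a file listing "text_file,ann_file" pairs, one per line:
# converter.convert_all('paths.txt')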
class StanfordNERApi():
    '''
        Make use of the StanfordCoreNLP Server
        Extract keywords through named entity recognition
    '''
    def __init__(self):
        self.nlp = StanfordCoreNLP(NLP_SERVER)
        
    def ner_groupby_ner(self, text):
        response = self.nlp.annotate(text, properties={
            'annotators': 'ner,lemma',
            'outputFormat': 'json'
        })
        return self.__process_ner_groupby_ner(response)        
        
    def __process_ner_groupby_ner(self, response):
        output_dict = dict()
        '''The response is generally organized as {sentences:[{tokens:[]},{}]}'''
        if type(response) == dict and 'sentences' in response:
            for sentence in response['sentences']:
                for item in sentence['tokens']:
                    # we only care about ner in set TARGET_NER
                    if item.get('ner') in TARGET_NER:
                        if item['ner'] not in output_dict:
                            output_dict[item['ner']] = set()
                        output_dict[item['ner']].add(item['originalText']) 
            
            # convert from set to list for further json dumps
            for key in output_dict:
                output_dict[key] = list(output_dict[key])
            # convert dict to string by json dumps
            if len(output_dict) > 0:
                return json.dumps(output_dict)
            else:
                return None
        else:
            logger.warning('sentences part is not in the response from NLP server.')
            return None
Example #44
class Preprocess():
    def __init__(self, argv):
        self.input = ""
        self.output_folder = ""       # output has to be a folder
        self.input_type = ""

        # Start Stanford CoreNLP Server
        self.nlp = StanfordCoreNLP('http://localhost:9000')

        # Read User Command Line
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
        for opt, arg in opts:
            if opt == '-h':
                print("Type 'python3.5 text_preprocessing/preprocess.py -i <inputfile> "
                      "-o <outputfile>' in run_source_code.sh file")
                sys.exit()
            elif opt in ("-i", "--ifile"):
                self.input = arg
                if os.path.exists(arg) == False:
                    print("Input doesn't exist")
                    sys.exit()
                if os.path.isdir(arg) == True:
                    self.input_type = "dir"
                elif os.path.isfile(arg) == True:
                    self.input_type = "file"
            elif opt in ("-o", "--ofile"):
                self.output_folder = arg

        print("Input: " + self.input +", " + self.input_type)
        print("Output: " + self.output_folder)


    def sentence_parsing(self, row_string):
        parsed_json = self.nlp.annotate(row_string, properties={
                       'annotators': 'tokenize,ssplit,pos',
                       'outputFormat': 'json'
                   })
        return parsed_json


    def output_preprocessed_data(self, json_input, file_name):
        rows = []
        for sent in json_input['sentences']:
            parsed_sent = " ".join([t['originalText'] + "/" + t['pos'] for t in sent['tokens']])
            rows.append(parsed_sent)
        output_file_path = self.output_folder + file_name
        with open(output_file_path, 'a') as preprocessed_out:
            for r in rows:
                preprocessed_out.write(r + "\n")


    def pos_tagging(self):
        if self.input_type == "file":
            input_path_elems = self.input.split("/")
            file_name = ""
            if input_path_elems[-1] != "/":
                file_name = input_path_elems[-1]
            else:
                file_name = input_path_elems[-2]
            text_string = ""
            with open(self.input, 'rb') as file_input:
                for r in file_input:
                    text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')])
            print(self.input)
            parsed_json = self.sentence_parsing(text_string)
            self.output_preprocessed_data(parsed_json, file_name)
        elif self.input_type == "dir":
            for file_name in os.listdir(self.input):
                input_file_path = self.input + file_name
                text_string = ""
                with open(input_file_path, 'rb') as file_input:
                    for r in file_input:
                        text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')])
                parsed_json = self.sentence_parsing(text_string)
                print(input_file_path)
                self.output_preprocessed_data(parsed_json, file_name)
class StanfordTFIDFApi():
    '''
        Make use of the StanfordCoreNLP Server
        Extract keywords through the tf-idf algorithm
    '''
    def __init__(self):
        self.nlp = StanfordCoreNLP(NLP_SERVER)

    def __tf_by_pos(self, text, pos='N'):
        response = self.nlp.annotate(text, properties={
            'annotators': 'ner,lemma',
            'outputFormat': 'json'
        })
        logger.debug(json.dumps(response))
        '''The response is generally organized as {sentences:[{tokens:[]},{}]}'''
        
        result = list()
        if type(response) == dict and 'sentences' in response:
            for sentence in response['sentences']:
                for item in sentence['tokens']:
                    if item['pos'].startswith(pos):
                        # only accept English words that are not in STOPWORDS
                        if acceptable_word(item['lemma'].lower()):
                            result.append((item['lemma'].lower()))
            toks_count = Counter(result)
            return toks_count
        else:
            logger.warning('sentences part is not in the response from NLP server.')
            return Counter()

    def tf_idf_groupby_pos(self, text, df_cache):
        
        output = dict()
        output['NOUN'] = self.__tf_by_pos(text, 'N')
        output['VERB'] = self.__tf_by_pos(text, 'V')
        
        for pos in output:
            logger.debug('Computed tf for %s: ' % pos + json.dumps(output[pos]))
            for word in output[pos]:
                '''Formula is: tf*log(N/df)'''
                if word in df_cache:
                    output[pos][word] = output[pos][word]*math.log(df_cache['total_document']
                                              /df_cache[word])
                else:
                    output[pos][word] = output[pos][word]*math.log(df_cache['total_document'])
                    
            # return the top 10 words
            output[pos] = [word for word, count in output[pos].most_common(10)]
            logger.debug('Computed tf-idf for %s:' % pos + json.dumps(output[pos]))
        
        return json.dumps(output)
        
    def compute_df(self, document_list):
        '''Compute document frequency based on input document list'''  
        df_cache = dict()
        df_output = dict()
        
        d_index = 0
        for document in document_list:
            d_index += 1
            # tokenize each document
            reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
            for item in reg_toks:
                # change each word to lower case and lemmatize
                item = normalise(item)
                if item not in df_cache:
                    df_cache[item] = set([d_index])
                else:
                    df_cache[item].add(d_index)
        
        for item in df_cache:
            if acceptable_word(item):
                df_output[item] = len(df_cache[item])
        
        df_output['total_document'] = len(document_list)
        
        return df_output
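
A hedged usage sketch, assuming NLP_SERVER and the acceptable_word/normalise/SENTENCE_RE helpers referenced above are defined in the surrounding module; the documents are illustrative:

api = StanfordTFIDFApi()
docs = ["The cat sat on the mat.", "Dogs chase cats in the garden.", "The stock market fell sharply."]
df_cache = api.compute_df(docs)
print(api.tf_idf_groupby_pos(docs[0], df_cache))   # JSON with the top NOUN / VERB keywords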
Example #46
class StanfordRE(ReModel):
    def __init__(self, corpus, relationtype, modelname="stanfordre_classifier.ser"):
        super(StanfordRE, self).__init__()
        self.modelname = modelname
        self.pairs = {}
        self.corenlp_client = None
        self.relationtype = relationtype
        self.corpus = corpus

    def generate_data(self, corpus, modelname, pairtypes):
        if os.path.isfile(self.temp_dir + modelname + ".txt"):
            print "removed old data"
            os.remove(self.temp_dir + modelname + ".txt")
        trainlines = []
        # get all entities of this document
        # doc_entities = []
        pcount = 0
        truepcount = 0
        ns = 0
        for sentence in corpus.get_sentences("goldstandard"):
            logging.info("{}".format(sentence.sid))
            nt_to_entity = {}
            for e in sentence.entities.elist['goldstandard']:
                # TODO: merge tokens of entity
                nt = str(e.tokens[0].order)
                nt_to_entity[nt] = e
            # print nt_to_entity
            # ns = sentence.sid.split("s")[-1]
            for t in sentence.tokens:
                nt = str(t.order)
                # print nt, nt in nt_to_entity
                if nt in nt_to_entity:
                    # print nt, nt_to_entity[nt], nt_to_entity[nt].type
                    #l = [str(ns), nt_to_entity[nt].type, nt, "O", t.pos, t.text, "O", "O", "O"]
                    # TODO: change other to entitiy name
                    l = [str(ns), "Other", nt, "O", t.pos, t.text, "O", "O", "O"]
                else:
                    # print nt, nt_to_entity
                    l = [str(ns), "O", nt, "O", t.pos, t.text, "O", "O", "O"]
                trainlines.append(l)
            trainlines.append([""])
            sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]]
            # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"])))
            for pair in itertools.combinations(sentence_entities, 2):
                if pair[0].type == pairtypes[0] and pair[1].type == pairtypes[1] or pair[1].type == pairtypes[0] and pair[0].type == pairtypes[1]:
                    # logging.debug(pair)
                    if pair[0].type == pairtypes[0]:
                        e1id = pair[0].eid
                        e2id = pair[1].eid
                    else:
                        e1id = pair[1].eid
                        e2id = pair[0].eid
                        pair = (pair[1], pair[0])
                    pid = sentence.did + ".p" + str(pcount)
                    # self.pairs[pid] = (e1id, e2id)
                    self.pairs[pid] = pair
                    if e2id in pair[0].targets:
                        truepcount += 1
                        nt1 = str(pair[0].tokens[0].order)
                        nt2 = str(pair[1].tokens[0].order)
                        trainlines.append([nt1, nt2, "+".join(pairtypes)])
                pcount += 1
                trainlines.append([""])
                ns += 1



        logging.info("Writing {} lines...".format(len(trainlines)))
        with codecs.open(self.temp_dir + modelname + ".corp", 'w', "utf-8") as trainfile:
            for l in trainlines:
                # print l
                trainfile.write("\t".join(l) + "\n")
        logging.info("True/total relations:{}/{} ({})".format(truepcount, pcount, str(1.0*truepcount/pcount)))

    def write_props(self):
        with open(config.corenlp_dir + "roth.properties", 'r') as propfile:
            lines = propfile.readlines()

        print lines
        with open(config.corenlp_dir + "roth.properties", 'w') as propfile:
            for l in lines:
                if l.startswith("serializedRelationExtractorPath"):
                    propfile.write("serializedRelationExtractorPath = {}\n".format(config.corenlp_dir + self.modelname))
                elif l.startswith("trainPath"):
                    propfile.write("trainPath = {}\n".format(self.temp_dir + self.modelname + ".corp"))
                else:
                    propfile.write(l)

    def train(self):
        self.generate_data(self.corpus, self.modelname, pairtypes=self.relationtype)
        # java -cp classpath edu.stanford.nlp.ie.machinereading.MachineReading --arguments roth.properties
        if os.path.isfile(config.corenlp_dir + self.modelname):
            print "removed old model"
            os.remove(config.corenlp_dir + self.modelname)
        if not os.path.isfile(self.temp_dir + self.modelname  + ".corp"):
            print "could not find training file " + config.corenlp_dir + self.modelname + ".corp"
            sys.exit()
        self.write_props()
        classpath = config.corenlp_dir + "*"
        srecall = ['java', '-mx3g', '-classpath', classpath, "edu.stanford.nlp.ie.machinereading.MachineReading",
                          "--arguments",  config.corenlp_dir + "roth.properties"]
        print " ".join(srecall)
        # sys.exit()
        srecall = Popen(srecall, stdout=PIPE, stderr=PIPE)
        res = srecall.communicate()
        if not os.path.isfile(config.corenlp_dir + self.modelname):
            print "error with StanfordRE! model file was not created"
            print res[1]
            sys.exit()
        else:
            statinfo = os.stat(config.corenlp_dir + self.modelname)
            if statinfo.st_size == 0:
                print "error with StanfordRE! model has 0 bytes"
                print res[0]
                print res[1]
                sys.exit()
        # logging.debug(res)

    def load_classifier(self, inputfile="slk_classifier.model.txt", outputfile="jsre_results.txt"):
        self.corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # sup.relation.model=
        tokenkeys = set()
        sentencekeys = set()
        for d in self.corpus.documents:
            for s in self.corpus.documents[d].sentences:
                corenlpres = self.corenlp_client.annotate(s.text.encode("utf8"), properties={
                        'ssplit.eolonly': True,
                        'openie.triple.all_nominals': True,
                        'openie.triple.strict': False,
                        'openie.max_entailments_per_clause': 500,
                        'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie',
                        #'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, relation, openie',
                        'outputFormat': 'json',
                        # 'sup.relation.model': self.modelname
                    })
                for o in corenlpres["sentences"][0]["openie"]:
                    if "mir" in o["object"] or "mir" in o["subject"]:
                        print "{}={}>{}".format(o["subject"], o["relation"], o["object"])


    def test(self, outputfile="jsre_results.txt"):
        pass

    def get_predictions(self, corpus, examplesfile="slk_classifier.model.txt", resultfile="jsre_results.txt"):
        pass
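For reference, the .corp file written by generate_data above is plain tab-separated text: one row per token (sentence index, entity label, token index, an "O" filler, POS tag, token text, three more "O" fillers), a blank row, then one row per gold relation listing the token indices of its two arguments and the joined pair type, followed by another blank row. A minimal sketch with toy data; the "Other" label and the "mirna+gene" pair type are placeholders, not values taken from any real corpus:

# Hedged sketch of the .corp layout built by generate_data above (toy data).
import codecs

trainlines = [
    ["0", "Other", "0", "O", "NN", "mir-21", "O", "O", "O"],      # entity token
    ["0", "O",     "1", "O", "VBZ", "regulates", "O", "O", "O"],  # plain token
    ["0", "Other", "2", "O", "NN", "PTEN", "O", "O", "O"],        # entity token
    [""],                                 # blank row closes the token block
    ["0", "2", "mirna+gene"],             # relation between tokens 0 and 2
    [""],                                 # blank row closes the sentence
]

with codecs.open("toy.corp", "w", "utf-8") as trainfile:
    for l in trainlines:
        trainfile.write("\t".join(l) + "\n")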
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

text = 'To summarize the sprawling, byzantine plot: warning - possible spoilers ahead - an elderly Louvre curator is murdered in the museum. Although shot in the chest, he manages to disrobe and surround himself with cryptographic clues - written in blood AND invisible ink (!) - to the reason for his death. His estranged granddaughter, who, coincidentally is a police inspector (!!) AND a cryptologist (!!!), enlists the aid of a visiting Harvard professor and symbologist (!!!!) in unraveling the multiple mysteries of: '
res = nlp.annotate(text, properties={
                       'annotators': 'tokenize,ssplit,pos',
                       'outputFormat': 'json',
                   })
Exemple #48
0
#!/usr/bin/python

import cgi, cgitb 
import json
cgitb.enable()  # for troubleshooting

#the cgi library gets vars from html
data = cgi.FieldStorage()

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

text = data['text'].value
annotators = data['annotators'].value

output = nlp.annotate(text, properties={'annotators': annotators, 'outputFormat': 'json'})

#this is the actual output
print "Content-Type: text/html\n"
print json.dumps(output)
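The CGI script above expects `text` and `annotators` form fields and writes the CoreNLP JSON back as the response body. A hedged client-side sketch; the URL is a placeholder for wherever the script is actually served:

# Hedged client sketch: POST the two form fields the CGI script reads.
# 'http://localhost/cgi-bin/annotate.py' is a placeholder URL.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

params = urlencode({
    'text': 'Stanford University is located in California.',
    'annotators': 'tokenize,ssplit,pos,ner',
}).encode('utf-8')

resp = urlopen('http://localhost/cgi-bin/annotate.py', data=params)
output = json.loads(resp.read().decode('utf-8'))
print(output['sentences'][0]['tokens'][0]['ner'])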
Exemple #49
0
class StanfordAnnotator(PR):
    def __init__(self, annotators='tokenize,ssplit,pos,parse,lemma,ner', cacheDir='./corenlp'):#depparse
        self.annotators=annotators
        self.nlp = StanfordCoreNLP('http://localhost:9000')
        if not os.path.exists(cacheDir):
            os.makedirs(cacheDir)
        self.cache= os.listdir(cacheDir)
        self.cacheDir=cacheDir

    def getOutput(self, doc):
        jsonFile=doc.getId()+'.json'
        output=None
        outfile=None
        if jsonFile in self.cache:
            outfile=open(self.cacheDir+"/"+jsonFile, 'r')
            output=json.load(outfile, encoding='UTF-8')
        else:
            outfile=open(self.cacheDir+"/"+jsonFile, 'w')
            output=self.nlp.annotate(doc.getString(), properties={
                  'annotators': self.annotators,
                  'outputFormat': 'json',
                  'timeout': '600000'

            }, encoding='UTF-8')
            json.dump(output, outfile)
        outfile.close()
        return output

    def process(self, doc):
        output=self.getOutput(doc)

        sents=[]
        tokens=[]
#         print "output", json.dumps(output)
        tStart=0
        tEnd=0
        cStart=0
        cEnd=0
        text=doc.getText()
        for s in output['sentences']:
            sentText=[]
            sentTokens=[]
            for t in s['tokens']:
#               print t
                txt_bfr=t['before']
                txt_tkn=t['originalText']
                sentText.append(txt_bfr)
                sentText.append(txt_tkn)

                cStart=text.find(txt_tkn, cStart)
                cEnd=cStart+len(txt_tkn)

                token=Annotation(t['originalText'],tEnd,tEnd,cStart, cEnd, 'Token', doc)
                token.setFeature('pos', t['pos'])
                token.setFeature('lemma', t['lemma'])
                token.setFeature('ner', t['ner'])
                token.setFeature('index', t['index'])
                tokens.append(token)
                sentTokens.append(token)
                tEnd+=1

            sentCStart=sentTokens[0].cStart
            sentCEnd=sentTokens[-1].cEnd
            sentText=u''.join(sentText)
            # print sentText
            sent=Annotation(sentText, tStart, tEnd, sentCStart, sentCEnd, 'Sentence', doc)
            tStart=tEnd

            sent.setFeature('constituency-parse', s['parse'])
            sent.setFeature('dep-parse', 'not implemented!')
            sent.setFeature('index', s['index'])
            sent.setRelation('tokens',sentTokens)
            sents.append(sent)
        doc.setSents(sents)
        doc.setTokens(tokens)
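Each document handled by StanfordAnnotator.getOutput above leaves a `<docId>.json` file in the cache directory with the raw CoreNLP output, which is reused on later runs. A small sketch of inspecting such a cached file, limited to the token fields the class actually reads; the path and document id are made up:

# Hedged sketch: './corenlp/doc42.json' is a placeholder cache file name.
import json

with open('./corenlp/doc42.json', 'r') as cached:
    output = json.load(cached)

for s in output['sentences']:
    for t in s['tokens']:
        print(t['index'], t['originalText'], t['pos'], t['lemma'], t['ner'])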
Exemple #50
0

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')

    for line in orig_file:
        if not line.startswith("PMID"):
            info = line.split('\t')
            pmid = info[0]
            ta = info[1]
            sentence = info[2]
            sentence = sentence.rstrip('\n')
            cleanSentence = removeBracket(sentence)
            extraClean = removeParenth(cleanSentence)
            output =  nlp.annotate(extraClean,properties={
            'annotators':'tokenize,ssplit,pos,depparse,parse',
            'outputFormat' : 'json'})
            try:
                result = output['sentences'][0]['parse']
                getPOS = extractPOS(result)
                fixBioName_POS = bioName(getPOS,sentence)
                newLine = pmid+'\t'+ta+'\t'+sentence+'\t'+fixBioName_POS+'\n'
                newFile.write(newLine)
            except:
                pass



orig_file.close()
newFile.close()
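In the loop above, output['sentences'][0]['parse'] is a Penn-Treebank-style bracketed constituency tree serialized as a string, which extractPOS presumably walks. A small standalone sketch (hypothetical sentence, server assumed at localhost:9000) showing both that string and the per-token POS tags that are available directly:

# Hedged sketch; the sentence is a made-up example.
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
output = nlp.annotate('BRCA1 mutations increase cancer risk.', properties={
    'annotators': 'tokenize,ssplit,pos,parse',
    'outputFormat': 'json'})

sent = output['sentences'][0]
print(sent['parse'])                                     # "(ROOT (S (NP ...) ...))"
print([(t['word'], t['pos']) for t in sent['tokens']])   # [('BRCA1', 'NN'), ...]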
keep_all_dependencies = False
sent_count = 0
encoding = "utf-8"

with codecs.open(output_file, "a", "utf-8") as outfile:
    outfile.write("{\"corpus\":[\n")

    for line in codecs.open(input_file, "r", encoding):

        # if encoding.lower != "utf-8":
        #     line = line.encode("utf-8")

        sent_count += 1
        print("Processing sent #{:d}".format(sent_count))
        if sent_count == 16:
            print()

        output = nlp.annotate(line.replace("\n", "").strip(), properties, encoding="utf-8")

        if isinstance(output, str):
            json_obj = utils.process_json(output, sent_count, keep_all_dependencies)
        else:
            json_obj = utils.process_json(json.dumps(output, ensure_ascii=False), sent_count, keep_all_dependencies)

        json_str = json.dumps(json_obj, ensure_ascii=False)
        outfile.write("{:s},\n".format(json_str))

    outfile.write("]}")

print("Done!")
class StanfordMethods:
    def __init__(self):
        self.webparser = StanfordCoreNLP('http://localhost:9020')
        self.load_pickle_file()
        #To use this parser an instance has to be started in parallel:
        #Download Stanford CoreNLP from: https://stanfordnlp.github.io/CoreNLP/index.html
        #Extract anywhere and execute following command: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9020

    def webparse(self, text):
        return self.webparser.annotate(text, properties={
            'timeout': '500000',
            'annotators': 'tokenize,ssplit,truecase,pos,depparse,parse,sentiment',
            'outputFormat': 'json'
        })

    def load_pickle_file(self):
        try:
            self.known_ids = pickle.load(open(os.path.join(_pickled_data_folder, _stanford_pickle_database_file), 'rb'))
            print('loaded known_ids pickle')
            #print(self.known_ids)
        except:
            print('Stanford pickle does not exist')
            self.known_ids= {}


    def store_pickle_file(self):
        with open(os.path.join(_pickled_data_folder, _stanford_pickle_database_file), 'wb') as f:
            pickle.dump(self.known_ids, f, pickle.HIGHEST_PROTOCOL)


    def getStanfordInfo(self, type, body_id, head_id, text, max_number_of_sentences=99):
        '''
        Returns cached Stanford information if available, otherwise parses the text and caches the result.
        :param type: either 'body' or 'headline'; bodies are cached under body_id, headlines under str(head_id) + str(body_id)
        :param body_id: id of the body the Stanford information is needed for
        :param head_id: id of the headline (used together with body_id as the cache key)
        :param text: text from which the Stanford information shall be extracted
        :param max_number_of_sentences: maximum number of sentences to parse
        :return: [nouns], [verbs], [negation_count, [root_dist]], [sentiment_values], words_per_sentence
        '''
        if (type == 'body') and (body_id in self.known_ids):
            return self.known_ids[body_id]
        elif (type == 'headline') and (str(head_id) + str(body_id) in self.known_ids):
            return self.known_ids[str(head_id) + str(body_id)]
        else :
            try:
                result = self.extract_stanford_information(text, max_number_of_sentences)
                if type == 'body':
                    self.known_ids[body_id] = result
                elif type == 'headline':
                    self.known_ids[str(head_id) + str(body_id)] = result
                return result
            except Exception as e:
                self.store_pickle_file()
                print('problem with id: ' + str(body_id) + " type:" + type)
                print(text)
                print(e)
                raise e

    def extract_stanford_information(self, text, max_number_of_sentences=99):
        '''
        Stanford-parses the given text.
        :param text: text to parse
        :param max_number_of_sentences: maximum number of sentences to parse
        :return: [nouns], [verbs], [negation_count, [root_dist]], [sentiment_values], words_per_sentence
        '''
        # Since the CoreNLP server can struggle with long texts,
        # the text is split into sentences before it is parsed.

        nouns = []
        verbs = []
        # sentiment_list = []
        sentiment_value_list = []
        negation_count = 0
        root_dist = []
        current_sentence = 0
        number_of_words = 0
        _refuting_words = [
            'fake',
            'fraud',
            'hoax',
            'false',
            'deny', 'denies',
            # 'refute',
            'not',
            'despite',
            'nope',
            'doubt', 'doubts',
            'bogus',
            'debunk',
            'pranks',
            'retract'
        ]

        for raw_sentence in tokenizer.tokenize(text):
            try:
                tagged_text = dict(self.webparse(raw_sentence))
                # Normally raw_sentence should contain only one sentence,
                # but the nltk Punkt sentence tokenizer might have missed a split
                for sentence in tagged_text['sentences']:
                    current_sentence += 1
                    # Extract nouns and verbs
                    sentiment_value_list.append(int(sentence['sentimentValue']))
                    for token in sentence['tokens']:
                        if 'NN' in token['pos']:
                            nouns.append(token['originalText'])
                        elif 'V' in token['pos']:
                            verbs.append(token['originalText'])

                        if token['originalText'] in _refuting_words:
                            negation_count += 1
                            root_dist.append(calculate_distance(sentence, find_root_node(sentence), token['index']))
                    # Count negations
                    '''
                    # This only works correctly when at least one sentence can be parsed per text
                    for dependency in sentence['basicDependencies']['dep']:
                        try:
                           # dep, dependent, dependentGloss, governor, governorGloss = dependency.values()

                            if dependency == 'neg':
                                negation_count += 1
                                #calculate distance to negated words
                                #find head token

                                #find negated token

                                print('Negated token: ' + sentence['tokens'][i-1]['originalText'])

                                distance = calculate_distance(tagged_text, 'not', 'I')
                            # sentiment_list.append(sentence['sentiment'])

                        # Skip sentence if problem occurs while parsing
                        except Exception as e:
                            print('Error parsing sentence: ' + raw_sentence)
                            print(e)
                            #raise e
                            continue
                    '''
                    number_of_words += len(sentence['tokens'])  # count tokens, not sentences
                #only parse number of given sentences per call
                if current_sentence == max_number_of_sentences:
                    break
            except Exception as e:
                print('Error parsing sentence: ' + raw_sentence)
                print(e)
                #raise e
                continue
        #ToDo: Think about good way to combine the distance
        if negation_count >= 1:
            negation = [negation_count, root_dist]
        else:
            negation = [-1, -1]

        # calculate the average number of words per sentence
        words_per_sentence = number_of_words / current_sentence if current_sentence else 0

        return nouns, verbs, negation, sentiment_value_list, words_per_sentence

    def check_if_already_parsed(self, id):
        return id in self.known_ids
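A hedged usage sketch for StanfordMethods above, unpacking the five values getStanfordInfo returns; the ids and the text are placeholders, and a CoreNLP server must be listening on port 9020 as described in __init__:

# Hedged usage sketch; ids and text are placeholders.
sm = StanfordMethods()
nouns, verbs, negation, sentiment_values, words_per_sentence = sm.getStanfordInfo(
    'body', body_id=1, head_id=0,
    text='Officials deny the hoax report about the incident.')

print(nouns)             # tokens tagged NN*
print(verbs)             # tokens tagged V*
print(negation)          # [negation_count, [root_dist]] or [-1, -1]
print(sentiment_values)  # one CoreNLP sentimentValue per parsed sentence
sm.store_pickle_file()   # persist the cache of parsed ids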
class BratToNerConverter(object):
    def __init__(self, corenlp_url='http://localhost:9000'):
        '''
        Create Converter for converting brat annotations to Core NLP NER CRF
        classifier training data.

        To start the server, see: http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)


    def convert(self, text_file, ann_file):
        text, tree = self.parse(text_file, ann_file)
        props = { 'annotators': 'tokenize,ssplit', 'outputFormat': 'json'}
        if text[0].isspace():
            text = '.' + text[1:]
            # Reason: some tools trim/strip off the white spaces
            # which will mismatch the character offsets
        output = self.corenlp.annotate(text, properties=props)
        records = []
        for sentence in output['sentences']:
            continue_ann, continue_ann_end = None, None
            for tok in sentence['tokens']:
                begin, tok_end = tok['characterOffsetBegin'], tok['characterOffsetEnd']
                label = 'O'
                if begin in tree:
                    node = tree[begin]
                    if len(node) > 1:
                        print("WARN: multiple starts at ", begin, node)
                        if tok_end in node:
                            node = {tok_end: node[tok_end]} # picking one
                            print("Chose:", node)

                    ann_end, labels = node.items()[0]
                    if not len(labels) == 1:
                        print("WARN: Duplicate labels for token: %s, label:%s. Using the first one!" % (tok['word'], str(labels)))
                    if accept_labels is not None and labels[0] in accept_labels:
                        label = labels[0]
                    if tok_end == ann_end: # annotation ends where token ends
                        continue_ann = None
                    elif tok_end < ann_end and label != 'O':
                        print("Continue for the next %d chars" % (ann_end - tok_end))
                        continue_ann = label
                        continue_ann_end = ann_end
                elif continue_ann is not None and tok_end <= continue_ann_end:
                    print("Continuing the annotation %s, %d:%d %d]" % (continue_ann, begin, tok_end, continue_ann_end))
                    label = continue_ann            # previous label is this label
                    if continue_ann_end == tok_end: # continuation ends here
                        print("End")
                        continue_ann = None
                yield "%s\t%s" % (tok['word'], label)
            #yield "" # end of sentence
        yield "" # end of document


    def parse(self, txt_file, ann_file):
        with open(txt_file) as text_file, open(ann_file) as ann_file:
            texts = text_file.read().decode('utf8')
            text_file.close()
            #texts = text_file.read()
            anns = map(lambda x: x.strip().split('\t'), ann_file)
            anns = filter(lambda x: len(x) > 2, anns)
            # FIXME: ignoring annotations that are complex

            anns = filter(lambda x: ';' not in x[1], anns)
            # FIXME: some annotations' spans are split into several fragments separated by ';'; ignoring them

            def __parse_ann(ann):
                spec = ann[1].split()
                name = spec[0]
                markers = list(map(lambda x: int(x), spec[1:]))
                #t = ' '.join([texts[begin:end] for begin,end in zip(markers[::2], markers[1::2])])
                t = texts[markers[0]:markers[1]]
                if not t == ann[2]:
                    print("Error: Annotation mis-match, file=%s, ann=%s" % (txt_file, str(ann)))
                    return None
                return (name, markers, t)
            anns = map(__parse_ann, anns) # format
            anns = filter(lambda x: x, anns) # skip None

            # building a tree index for easy accessing
            tree = {}
            for entity_type, pos, name in anns:
                begin, end = pos[0], pos[1]
                if begin not in tree:
                    tree[begin] = {}
                node = tree[begin]
                if end not in node:
                    node[end] = []
                node[end].append(entity_type)

            # Re-read the file without decoding it
            text_file = open(txt_file)
            texts = text_file.read()
            text_file.close()
            return texts, tree

    def convert_all(self, input_paths, output):
        with open(input_paths) as paths, open(output, 'w') as out:
            for p in map(lambda x: x.strip(), paths):
                d = p.split(',')
                print(d)
                for line in self.convert(d[0], d[1]):
                    out.write(line)
                    out.write("\n")
                out.write("\n") # end of document
        count=0
        flag=0

        for line in data:
            # print line
            if(line.isspace()): continue
            elif count==5 or flag: break
            else: count+=1

            line = line.lstrip().rstrip()
            # line = remove_non_ascii(line)
            # line = filter(lambda x: x in printable, line)

            output = nlp.annotate(line, properties={
              'annotators': 'tokenize,ssplit,pos,lemma,ner',
              'outputFormat': 'json'
              })

            # print 'output: ', output
            tagged = []
            for sentence in output['sentences']:
                for token in sentence['tokens']:
                    tagged.append((token['originalText'], token['ner']))

            # print tagged

            name = []
            for (el1,el2) in tagged:
                if el2 == u'PERSON':
                    print "Name identified: "+ el1
                    name.append(el1)
class CoreNLPParser(JournalParser):
    CORENLP_PARSER = "edu.stanford.nlp.pipeline.CoreNLPServer"

    def __init__(self, **kwargs):
        super(CoreNLPParser, self).__init__(**kwargs)
        self.corenlp = StanfordCoreNLP(kwargs['corenlp_url'] )
        self.props = {
            'annotators': 'tokenize,ssplit,lemma,pos,ner',
            'outputFormat': 'json',
            'ner.useSUTime': False,  # don't want the SUTime model
            'ner.applyNumericClassifiers': False,  # don't want numeric classifiers
        }
        if kwargs.get('ner_model'): # set NER model from CLI
            if not os.path.exists(kwargs.get('ner_model')):
                print('Error: Could not find NER model %s.' % 
                      kwargs.get('ner_model'))
                sys.exit(1)
            self.props['ner.model'] = kwargs['ner_model']
        print("CoreNLP Properties : ", self.props)

    def parse_names(self, text, meta):
        if type(text) != str:
            text = text.encode('utf8') #, errors='ignore')
        if text[0].isspace():  # don't let leading whitespace be stripped (it would shift offsets)
            text = '.' + text[1:]

        output = self.corenlp.annotate(text, properties=self.props)
        # flatten sentences and tokens
        tokenlists = [s['tokens'] for s in output['sentences']]
        tokens = itertools.chain.from_iterable(tokenlists)
        names = []
        for token in tokens:
            if token['ner'] != 'O':
                name = {
                    'label': token['ner'],
                    'begin': token['characterOffsetBegin'],
                    'end': token['characterOffsetEnd'],
                    'text': token['originalText'],
                    'source': 'corenlp'
                }
                names.append(name)

        # Handle multi-word tokens:
        # Merge any adjacent Target tokens, if of the same type and 
        # separated by a space, into one span.
        names.sort(key=lambda x: int(x['begin']))
        new_names = []
        skip_names = []
        for n in names:
            if n in skip_names:
                continue
            next_name = [n2 for n2 in names if \
                         n['label'] == 'Target' and
                         n2['label'] == 'Target' and
                         int(n2['begin']) == int(n['end']) + 1]
            if len(next_name) > 0:
                print('%s: Merging %s and %s' % 
                      (meta['resourceName'],
                       n['text'], next_name[0]['text']))
                n['text'] += ' ' + next_name[0]['text']
                n['end']  = next_name[0]['end']
                skip_names.append(next_name[0])

            # Either way, save this one
            new_names.append(n)

        if len(names) != len(new_names):
            print('%d -> %d NERs' % (len(names), len(new_names)))

        if names:
            meta['ner'] = new_names
            meta['X-Parsed-By'].append(CoreNLPParser.CORENLP_PARSER)
        meta['sentences'] = output['sentences']
        return meta
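parse_names above both mutates and returns the metadata dictionary, so the caller needs to supply at least a 'resourceName' and an 'X-Parsed-By' list up front. A hedged usage sketch; the URL and the text are placeholders, and it assumes JournalParser accepts these keyword arguments:

# Hedged usage sketch; corenlp_url and the input text are placeholders.
parser = CoreNLPParser(corenlp_url='http://localhost:9000')
meta = {'resourceName': 'example-doc', 'X-Parsed-By': []}
meta = parser.parse_names('Curiosity landed in Gale crater on Mars.', meta)
for name in meta.get('ner', []):
    print(name['label'], name['text'], name['begin'], name['end'])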