Example #1
from nltk.tag import StanfordNERTagger
import nltk
from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPDependencyParser
from graphviz import Source
from pattern.vector import stemmer
from pycorenlp import StanfordCoreNLP
from sutime import SUTime
from textblob import TextBlob
from stanfordnlp.server import CoreNLPClient
from pynlp import StanfordCoreNLP as PyNLPCoreNLP  # aliased so it does not shadow the pycorenlp import above
import sys

annotators = 'tokenize, ssplit, pos, ner, coref'
options = {'openie.resolve_coref': True}

nlp = PyNLPCoreNLP(annotators=annotators, options=options)
sdp = CoreNLPDependencyParser()

#-----------------------------------------------------------------------------------------------------------------------
#LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

#PREPROCESSING START
#CREATE TEMPORARY LIST OF ADJACENT SENTENCES FOR COREFERENCE RESOLUTION (PREVIOUS 2 SENTENCES)
for i in range(len(list_sentences)):
    adj_sentences = []
    start_index = i - 1
    # (the rest of this preprocessing loop is omitted in this excerpt)

tales=['FundeVogel','Rapunzel','TheGooseGirl','Golden Bird','HansInGoodLuck','JorindaAndJorindel','TravelingMusicians','OldSultan','TheStraw','BriarRose','DogAndSparrow','TwelveDancingPrincesses','FishermanAndWife','TheWillowRen','FrogPrince','CatAndMouse']
taleSentiments=[]
for taleName in tales:
    #f = open("./Corefs/"+taleName,'r',encoding="utf8")
    print(taleName)
    if (sys.argv[1] == '1'):
        f=open("./Stories/"+taleName,'r',encoding="utf8")
    else:
        f=open("./Corefs/"+taleName,'r',encoding="utf8")
    tale= f.read()
    tale = tale.replace('\n', ' ')
    tale = tale.replace('\r', ' ')
    #pprint.pprint(tale)
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    #doc = "Ronaldo has moved from Real Madrid to Juventus. While Messi still plays for Barcelona"
    doc=tale
    #pprint.pprint(doc)
    annot_doc = nlp_wrapper.annotate(doc,
        properties={
            'annotators': 'ner, pos,depparse',
            'outputFormat': 'json',
            'timeout': 100000,
        })

    nsubjs=[]
    #pprint.pprint(annot_doc)
    for sentence in annot_doc['sentences']:
        for element in sentence['basicDependencies']:
            if(element['dep']=='nsubj'):
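                # hedged completion (the original listing is truncated here):
                # collect the surface form of each nominal-subject dependent
                nsubjs.append(element['dependentGloss'])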
Example #3
 def __init__(self):
     self.load_data()
     self.nlp = StanfordCoreNLP(config.StanfordCoreNLP_Path)
def brat_to_conll(input_folder, output_filepath, tokenizer, language):
    '''
    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.
    '''
    if tokenizer == 'spacy':
        spacy_nlp = spacy.load(language)
    elif tokenizer == 'stanford':
        core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise ValueError("tokenizer should be either 'spacy' or 'stanford'.")
    verbose = False
    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type),
          end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    output_file = codecs.open(output_filepath, 'w', 'latin-1')
    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath),
                                           base_filename + '.ann')
        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'latin-1').close()

        text, entities = get_entities_from_brat(text_filepath,
                                                annotation_filepath)
        entities = sorted(entities, key=lambda entity: entity["start"])

        if tokenizer == 'spacy':
            sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
        elif tokenizer == 'stanford':
            sentences = get_sentences_and_tokens_from_stanford(text, core_nlp)

        for sentence in sentences:
            inside = False
            previous_token_label = 'O'
            for token in sentence:
                token['label'] = 'O'
                for entity in entities:
                    if entity['start'] <= token['start'] < entity['end'] or \
                       entity['start'] < token['end'] <= entity['end'] or \
                       token['start'] < entity['start'] < entity['end'] < token['end']:

                        token['label'] = entity['type'].replace(
                            '-', '_'
                        )  # because the .ann format doesn't support tags containing '-'

                        break
                    elif token['end'] < entity['start']:
                        break

                if len(entities) == 0:
                    entity = {'end': 0}
                if token['label'] == 'O':
                    gold_label = 'O'
                    inside = False
                elif inside and token['label'] == previous_token_label:
                    gold_label = 'I-{0}'.format(token['label'])
                else:
                    inside = True
                    gold_label = 'B-{0}'.format(token['label'])
                if token['end'] == entity['end']:
                    inside = False
                previous_token_label = token['label']
                if verbose:
                    print('{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'],
                        token['end'], gold_label))
                output_file.write('{0} {1} {2} {3} {4}\n'.format(
                    token['text'], base_filename, token['start'], token['end'],
                    gold_label))
            if verbose: print('\n')
            output_file.write('\n')

    output_file.close()
    print('Done.')
    if tokenizer == 'spacy':
        del spacy_nlp
    elif tokenizer == 'stanford':
        del core_nlp
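

# A minimal usage sketch for brat_to_conll() above (not part of the original example).
# The paths below are illustrative assumptions; the 'stanford' tokenizer expects a
# CoreNLP server on localhost:9000, while 'spacy' expects an installed spaCy model.
if __name__ == '__main__':
    brat_to_conll(input_folder='data/brat/train',
                  output_filepath='data/conll/train.txt',
                  tokenizer='stanford',
                  language='en_core_web_sm')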
Example #5
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY,
                                     'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')

if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)

sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# looping through .ann files in the data directory
ann_data_files = [
    f for f in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann'
]

for file in ann_data_files:
    entities = []
    relations = []

    # process .ann file - place entities and relations into two separate lists of tuples
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        for line in lines:
Example #6
from pycorenlp import StanfordCoreNLP
from scipy import spatial
nlp = StanfordCoreNLP('http://10.4.100.141:9000')
text = 'Timmy the elephant has eyes, ears, tusks and legs. Timmy the dog has four legs and four eyes. Timmy the hippo has a big nose and huge ears'
output = nlp.annotate(text,
                      properties={
                          'annotators':
                          'tokenize,ssplit,pos,depparse,parse,openie,ner',
                          'outputFormat': 'json'
                      })
contexts = []
words = []
occsCont = {}
for i in range(len(output['sentences'])):
    occs = {}
    contexts.append(output['sentences'][i]['openie'][0]['subject'])
    for x in output['sentences'][i]['tokens']:
        if (x['pos'] == 'CD' or x['pos'] == 'JJ' or x['pos'] == 'NN'
                or x['pos'] == 'NNS'):
            words.append(x['word'])
sWords = set(words)
for i in range(len(output['sentences'])):
    for x in sWords:
        occs[x] = 0
        occsCont[i] = dict(occs)
for i in range(len(output['sentences'])):
    senWords = (text.split('.'))
    for x in sWords:
        occsCont[i][x] = senWords[i].count(x)
sim = {}
for x in (range(1, len(contexts) + 1)):
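    # Hedged completion sketch (the original listing is truncated here):
    # compare each sentence's word-occurrence vector with the first sentence's
    # vector using cosine similarity from scipy.spatial. The loop runs from 1 to
    # len(contexts), so x - 1 is the sentence index used in occsCont.
    vocab = sorted(sWords)
    reference = [occsCont[0][w] for w in vocab]
    current = [occsCont[x - 1][w] for w in vocab]
    sim[contexts[x - 1]] = 1 - spatial.distance.cosine(reference, current)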
Example #7
 def __init__(self):
     self.corenlp = StanfordCoreNLP('http://localhost:9000')
Example #8
# Visualize the result of Stanford Core NLP's dependency parsing (collapsed-dependencies)
# as a directed graph. For the visualization, convert the dependency tree into the DOT
# language and use Graphviz. To draw a directed graph directly from Python,
# pydot is a convenient choice.
import pprint
from nltk.stem.porter import PorterStemmer
import pydot_ng as pydot
import pydotplus
from pycorenlp import StanfordCoreNLP

ipath = '../../data/input/'
opath = '../../data/output/'

nlp = StanfordCoreNLP("http://localhost:9000")
prop = {"annotators":"depparse", "outputFormat":"json"}

tokenized_list = []

with open(ipath+'nlp.txt', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        tokenized_list.append(nlp.annotate(line, properties=prop))

dots = []
edges = []
for line in tokenized_list:
    sentences = line['sentences']
    if len(sentences) == 0:
        pass
    else:
        for sentence in sentences:
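            # Hedged completion sketch (the original listing is truncated here):
            # turn the dependency parse into a directed graph and render it with
            # pydot. Newer CoreNLP servers expose the parse under 'basicDependencies';
            # older ones use keys such as 'collapsed-dependencies'.
            graph = pydot.Dot(graph_type='digraph')
            for dep in sentence['basicDependencies']:
                edge = pydot.Edge(dep['governorGloss'], dep['dependentGloss'],
                                  label=dep['dep'])
                graph.add_edge(edge)
                edges.append((dep['governorGloss'], dep['dependentGloss']))
            dots.append(graph)
            graph.write_png(opath + 'dep_graph_{0}.png'.format(len(dots)))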
Example #9
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://101.132.182.124:9000')

from datetime import datetime,timedelta
from collections import defaultdict
from pprint import pprint
from tqdm import tqdm
import re

import pymongo
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError
client = pymongo.MongoClient('34.224.37.110:27017')
db = client.tweet

import pandas as pd

def get_ner_dict(word,ner):
	ner_tuple = zip(word,ner)
	splits = [0]
	for index,nt in enumerate(ner_tuple):
		if index == 0:
			temp = nt[1]
			continue
		if temp != nt[1]:
			splits.append(index)
			temp = nt[1]
			continue
		else:
			temp = nt[1]
			continue
def get_clues(text):
    print("*--------(%s)-------------*" % (text))
    print(type(text))
    nlp = StanfordCoreNLP('http://localhost:9001')
    stop_words = set(stopwords.words('english'))
    '''
		Method to remove numbers appended at the end
	'''
    dep_parse = nlp.annotate(text,
                             properties={
                                 'annotators': 'depparse',
                                 'outputFormat': 'json',
                                 'timeout': 10000,
                             })

    pos = nlp.annotate(text,
                       properties={
                           'annotators': 'lemma',
                           'outputFormat': 'json',
                           'timeout': 10000,
                       })

    sn = SenticNet()
    word_to_dep = [{} for i in range(len(dep_parse['sentences']))]
    word_to_par = [{} for i in range(len(dep_parse['sentences']))]
    word_to_pos = [{} for i in range(len(dep_parse['sentences']))]
    word_to_lemma = [{} for i in range(len(dep_parse['sentences']))]
    word_to_child = [{} for i in range(len(dep_parse['sentences']))]
    sents = [[] for i in range(len(dep_parse['sentences']))]
    index_to_word = {}
    '''
		Constructing dicts for maintaining the dependencies among words. 
	'''
    '''
		Appending the occurrence index to each word to keep repeated words distinct
	'''
    #print(dep_parse['sentences'])
    print("********")
    for i, sent in enumerate(dep_parse['sentences']):
        for dep in sent['basicDependencies']:
            word_to_dep[i][dep['dependentGloss'] +
                           str(dep['dependent'])] = dep['dep']
            word_to_par[i][dep['dependentGloss'] +
                           str(dep['dependent'])] = dep['governorGloss'] + str(
                               dep['governor'])
            index_to_word[dep['dependentGloss'] +
                          str(dep['dependent'])] = dep['dependentGloss']

            if (dep['governorGloss'] + str(dep['governor'])
                    not in word_to_child[i]):
                word_to_child[i][dep['governorGloss'] +
                                 str(dep['governor'])] = []
            if (dep['dependentGloss'] + str(dep['dependent'])
                    not in word_to_child[i]):
                word_to_child[i][dep['dependentGloss'] +
                                 str(dep['dependent'])] = []
            word_to_child[i][dep['governorGloss'] +
                             str(dep['governor'])].append(
                                 dep['dependentGloss'] + str(dep['dependent']))
            sents[i].append(dep['dependentGloss'] + str(dep['dependent']))
        word_to_dep[i]['ROOT0'] = 'root'
        word_to_par[i]['ROOT0'] = 'root'

    for i, sent in enumerate(pos['sentences']):
        for pos_tagger in sent['tokens']:
            word_to_pos[i][pos_tagger['word']] = pos_tagger['pos']
            word_to_lemma[i][pos_tagger['word']] = pos_tagger['lemma']
        word_to_pos[i]['ROOT'] = 'root'
        word_to_lemma[i]['ROOT'] = 'root'
    '''
		Displaying the deps
	'''

    ## Implementing rules to extract aspects
    for i, sent in enumerate(sents):
        if (__name__ == '__main__'):
            print(word_to_dep[i], word_to_par[i], word_to_pos[i])
            print("Children==>")
            print(word_to_child[i])

    aspects = []
    for i, sent in enumerate(sents):
        for word in sent:
            '''
				Rule 0
			'''
            if ('subj' in word_to_dep[i][word]):
                for child in word_to_child[i][word_to_par[i][word]]:
                    if ('amod' in word_to_dep[i][child]
                            or 'advmod' in word_to_dep[i][child]):
                        aspects.append(word_to_par[i][word])
                        if (__name__ == '__main__'):
                            print("Rule 0 triggered.")
            '''
				Rule 1 (without sub): Very big to hold.
			'''
            if (word_to_dep[i][word] == 'xcomp' and
                ('JJ' in word_to_pos[i][index_to_word[word_to_par[i][word]]] or
                 'RB' in word_to_pos[i][index_to_word[word_to_par[i][word]]])):
                if (__name__ == '__main__'):
                    print("Rule 1 triggered")
                aspects.append(word_to_par[i][word])
            '''
				Rule 2 (without subj): Not to mention the price of the phone
			'''
            if (word_to_dep[i][word] == 'dobj' and 'VB'
                    in word_to_pos[i][index_to_word[(word_to_par[i][word])]]
                    and ('NN' in word_to_pos[i][index_to_word[(word)]]
                         or 'JJ' in word_to_pos[i][index_to_word[(word)]])):
                aspects.append(word)
                if (__name__ == '__main__'):
                    print("Rule 2 triggered")
                    print(word)
            '''
				Rule 3 (without subj): Love the sleekness of the player
			'''

            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'nmod'):
                aspects.append(word_to_par[i][word])
                if (__name__ == '__main__'):
                    print("Rule 3 triggered")
                    print(word_to_par[i][word])
                '''
				Rule 4 (with sub): The battery lasts little 
				two aspects 
			'''
            if (word_to_dep[i][word] == 'advmod'
                    or word_to_dep[i][word] == 'amod' or word_to_dep[i][word]
                    == 'advcl') and ('VB' in word_to_pos[i][index_to_word[(
                        word_to_par[i][word])]]):
                aspects.append(word_to_par[i][word])
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'nsubj'
                            and word_to_par[i][word2] == word_to_par[i][word]
                            and
                        ('NN' in word_to_pos[i][index_to_word[word2]]
                         or 'JJ' in word_to_pos[i][index_to_word[word2]])):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 4 triggered")
                            print(word2)
                '''
				Rule 5 (with sub): I like the lens of this camera
			'''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'dobj'):
                if (__name__ == '__main__'):
                    print("Rule 5 triggered")
                    print(word)
                try:
                    concept_info = sn.concept((word))
                    print("present in senticnet")
                except KeyError:
                    print("Yay")
                    aspects.append(word)
            '''
				Rule 6 : I like the beauty of the screen.
				Check if senticnet condition should be added
			'''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'dobj'):
                try:
                    concept_info = sn.concept((word))
                    aspects.append(word)
                    print("yay!")
                except KeyError:
                    print("oops, not there in SenticNet")
                for word2 in sent:
                    if (word2 != word and word_to_par[i][word2] == word and
                            'NN' in word_to_pos[i][index_to_word[(word2)]]):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 6 triggered.")
                            print(word2)
            '''
				Rule 7 : I would like to comment on the camera of this phone. 
			
			'''
            if (word_to_dep[i][word] == 'xcomp'):
                try:
                    concept_info = sn.concept((word))
                    aspects.append(word)
                    print("yay!")
                except KeyError:
                    print("oops, not there in SenticNet")
                for child in word_to_child[i][word]:
                    if ('NN' in word_to_pos[i][index_to_word[child]]):
                        aspects.append(child)
                        if (__name__ == '__main__'):
                            print("Rule 7 triggered.")
                            print(word)
                            print(child)
            '''
				Rule 8 : The car is expensive.
			'''
            if (word_to_dep[i][word] == 'nsubj'):
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'cop'
                            and word_to_par[i][word2] == word_to_par[i][word]):
                        aspects.append(word_to_par[i][word])
                        if (__name__ == '__main__'):
                            print("Rule 8 triggered")
                            print(word_to_par[i][word])
            '''			
				Rule 9 : The camera is nice.
			'''
            if (word_to_dep[i][word] == 'nsubj'
                    and 'NN' in word_to_pos[i][index_to_word[(word)]]):
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'cop'
                            and word_to_par[i][word2] == word_to_par[i][word]):
                        aspects.append(word)
                        if (__name__ == '__main__'):
                            print("Rule 9 triggered")
                            print(word)
            '''
				Rule 10 : The phone is very lightweight to carry.
			'''
            if (word_to_dep[i][word] == 'cop'):
                for word2 in sent:
                    if (word2 != word
                            and 'VB' in word_to_pos[i][index_to_word[(word2)]]
                            and word_to_par[i][word] == word_to_par[i][word2]):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 10 triggered.")
                            print(word2)
            '''
				Extracting mods of dobjs

			'''
            if (word_to_dep[i][word] == 'dobj'):
                for child in word_to_child[i][word]:
                    if ('mod' in word_to_dep[i][child] and 'JJ'
                            in word_to_pos[i][index_to_word[(child)]]):
                        aspects.append(child)
            '''
				Rule 11 : Checking for conjunctions
			'''
        for asp in aspects:
            for word in sent:
                if (word_to_dep[i][word] == 'conj'
                        and word_to_par[i][word] == asp):
                    aspects.append(word)
                    if (__name__ == '__main__'):
                        print("Rule conj triggered.")
                        print(word)

    finalIAC = set(aspects)
    finalIAC = [index_to_word[f] for f in finalIAC]
    finalIAC = [w for w in finalIAC if not w in stop_words]

    finalSenti = []
    for iac in finalIAC:
        try:
            concept_info = sn.concept((iac))
            finalSenti.append(iac)
        except KeyError:
            print("No word available for " + iac)

    return finalIAC, finalSenti
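

# A minimal usage sketch for get_clues() above (not part of the original example).
# It assumes the same dependencies the function itself relies on: a CoreNLP server
# on localhost:9001, the NLTK stopwords corpus, and the senticnet package.
if __name__ == '__main__':
    aspect_candidates, sentic_aspects = get_clues(
        "I like the camera of this phone but the battery lasts little.")
    print("Aspect candidates:", aspect_candidates)
    print("Aspects also found in SenticNet:", sentic_aspects)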
Example #11
 def start(self):
     command = self.SERVER_COMMAND_PATTERN.format(self._memory,
                                                  self._timeout)
     self._process = self._open_process(command, wait=False)
     self._wait_for_server()
     self._http_client = StanfordCoreNLP(self.SERVER_URL)
#!/usr/bin/python3
from pycorenlp import StanfordCoreNLP
import os

# nlp = StanfordCoreNLP('http://13.67.115.74:9000')
# nlp = StanfordCoreNLP('http://119.63.99.173:9000')

if os.environ.get('LEXICA', '').lower() == "true":  # environment variables are strings, so compare against "true"
    # nlp = StanfordCoreNLP('http://192.168.0.100:9000')
    nlp = StanfordCoreNLP('http://localhost:9000')
else:
    nlp = StanfordCoreNLP('http://13.67.115.74:9000')

# nlp = StanfordCoreNLP('http://13.67.115.74:9000')

# nlp = StanfordCoreNLP('http://localhost:9000')
# nlp = StanfordCoreNLP('http://192.168.0.100:9000')


def stanford_tree(line, annotators='parse'):
    output = nlp.annotate(line, properties={
        'annotators': annotators,
        'outputFormat': 'json'
    })
    try:
        return output
    except IndexError:
        pass
# ### Using spacy vector similarity function

# In[18]:


def getSimilarity(word1, word2):
    tokens = nlp(word1 + " " + word2)
    return tokens[0].similarity(tokens[1])


# ## Stanford CoreNLP

# In[1]:

from pycorenlp import StanfordCoreNLP
stanford_nlp = StanfordCoreNLP('http://localhost:9001')

# ### Pos tagging

# In[2]:


def getPOSTaggedDataFromTextUsingStanford(text):
    posSentences = []
    output = stanford_nlp.annotate(text,
                                   properties={
                                       'annotators': 'tokenize,ssplit,pos',
                                       'outputFormat': 'json'
                                   })
    for s in output['sentences']:
        posSentences.append(" ".join(
def get_NER(qc, query):
    nlp = StanfordCoreNLP(
        'http://corenlp.run/')  # normal corenlp server through internet
    #nlp = StanfordCoreNLP('http://localhost:9000')
    #nlp = StanfordCoreNLP('http://10.2.6.65:9099/') ## Desktop
    #nlp = StanfordCoreNLP('http://10.4.16.160:9094/') ## Lab server

    print("ner qc : ", qc, query)
    query_tokens = word_tokenize(query)

    if (qc[0] == 'LOCATION'):
        qc.append("CITY")
        qc.append("COUNTRY")
        qc.append("STATE_OR_PROVINCE")

        ###### For Spacy #########
        qc.append("LOC")
        qc.append("GPE")
        #qc.append("NORP")
        qc.append("FAC")
        qc.append(
            "ORG")  ### bcz we have very less samples on ORGA in QC part so

    if (qc[0] == 'ORGANIZATION'):
        ###### For Spacy #########
        qc.append("FAC")
        qc.append(
            "ORG")  ### bcz we have very less samples on ORGA in QC part so

    if (qc[0] == 'NUMBER'):
        ##### For Spacy ##########
        qc.append("PERCENT")
        qc.append("ORDINAL")
        qc.append("CARDINAL")

    if (qc[0] == 'DATE'):
        qc.append("DURATION")
        qc.append('TIME')
        qc.append("SET")

    ##### IF TIME & DATE ARE COMBINED, THE 5-LINE SNIPPET BELOW IS NOT NEEDED
    if (qc[0] == 'TIME'):
        qc.append("DURATION")
        qc.append('DATE')
        qc.append("SET")

    print("In NER Section QC : ", qc)
    ###################### Only Spacy NER Tool for Query ###############################
    doc = Ner_script_spacy.NER_Spacy_funct(quest[0].decode('utf8'))
    for entity in doc.ents:
        #print(entity.text,entity.label_)
        q_ner.append(entity.label_)
        Whole_output.append(
            entity.label_ +
            "\n")  #### Does this Spacy has Any timeout Error...??

    print("\n Fiding Same type NERs in Top 10 Sentences:\n")
    for ii in range(len(Top_similarity_sent)):
        text1 = Top_similarity_sent[ii]
        text = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore')
        #text = text1
        print("text: ", text)
        nerr = []
        o_text = []
        new_ner_o_text = []
        chunk_ner = []

        ################# ONly Spacy NER Tool for Sentences #####################
        doc = Ner_script_spacy.NER_Spacy_funct(text.decode('utf8'))
        for entity in doc.ents:
            nerr.append(
                entity.label_
            )  ####### NEED TO MODIFY HERE FOR THE CHUNKING PROBLEM, BECAUSE WE NEED THE SEQUENCE
            o_text.append(entity.text)
            if (entity.label_ != "O"):
                random_ans.append(entity.text)

        #print("\n############# NER Preprocessing ###################\n")
        print("\n")
        for b in range(len(o_text)):
            new_ner_o_text.append(o_text[b])

        for k in range(len(qc)):
            for z in range(len(new_ner_o_text)):
                if (new_ner_o_text[z] != "NULL"):
                    if (new_ner_o_text[z] != "."):
                        if (nerr[z] == qc[k]):  # v.v imp condition
                            chunk_ner.append(new_ner_o_text[z])
                            final_answers.append(new_ner_o_text[z])

    if (len(random_ans) == 0):
        print("random_ans: ", random_ans)
        print(
            "NEED Figure out some Solution When NER is not recognizing any word...!!!!!!!!!!"
        )
        stop_words = set(stopwords.words('english'))
        Tokens = []
        for sent in Top_similarity_sent:  ### Here also we can take some sentences from K-ranked sentences
            Tokens.append(word_tokenize(sent))

        Tokens = list(itertools.chain(*Tokens))
        filtered_sent_tokens = [w for w in Tokens if not w in stop_words]

        d = Counter(filtered_sent_tokens)
        words = [
            pair[0] for pair in sorted(
                d.items(), key=lambda item: item[1], reverse=True)
        ]
        print(words)
        '''
		sent = Top_similarity_sent[0]
		sent_tokens = word_tokenize(sent)
		for tk in sent_tokens:
			random_ans.append(tk)
		'''
        print("words len : ", len(words))
        limit = 10
        if (len(words) < limit):
            limit = len(words)

        for ra_i in range(
                limit
        ):  ##### Taking 10 random answers from top K-ranked sentences by ignoring the stopwords.
            random_ans.append(words[ra_i])

    max_final_ans = []
    for i in range(len(final_answers)):
        max_final_ans.append(final_answers.count(final_answers[i]))
    ss = sorted(range(len(max_final_ans)),
                key=max_final_ans.__getitem__,
                reverse=True)

    print("\n")
    print("Answers Set : ", random_ans)
    print("\n\nAfter Chunking of NERs:\n")
    print("final_answers : ", final_answers)
    print("max_final_ans : ", max_final_ans)
    print("lenghts of [AnswersSet, final_answers, max_final_ans,ss] : ",
          len(random_ans), len(final_answers), len(max_final_ans), len(ss))
    optional_print = []
    print("query_tokens : ", query_tokens)
    print("\n------------- > > Output < < --------------\n")
    #p_n=10
    p_n = len(ss)
    z = 0
    for i in range(len(ss)):
        j = ss[i]
        x = []
        x.append(final_answers[j])
        if ((set(optional_print).intersection(x))):
            print(" ")
        else:
            if not (final_answers[j] in query_tokens):
                #print("else ans not in Q : ",final_answers[j])
                optional_print.append(final_answers[j])
                label_ans.append(final_answers[j])

            z = z + 1
    if (z == 0 and (len(final_answers) != 0)):
        print(random.choice(final_answers))

    print("Possible Predictable label_answers : ", label_ans)
Example #15
def making_parsed_tree(sentiment_code, file_name):
    splited_sentence_first = []
    parsed_sentence_first = []
    sent_json = {}

    pcn = StanfordCoreNLP('http://*****:*****')  # server URL redacted in the original listing

    def about_symbol(text):
        # strip @-handles and URLs from the tweet text
        # (the first pattern is a reconstruction; this line was garbled in the original)
        text = re.sub(r"\S*@\S*", '', text)
        text = re.sub(r'http\S+', '', text)
        return text

    for a in tqdm(range(len(df))):
        tweet_txt = about_symbol(text[a])
        if label[a] == sentiment_code:
            if len(tweet_txt) > 3:
                tweet_txt = " ".join(tweet_txt.split())
                tweet_txt = contractions.fix(tweet_txt)

                doc = nlp(tweet_txt)
                splited_sentence_second = []
                parsed_sentence_second = []

                for sentence in doc.sentences:
                    temp = []
                    for token in sentence.tokens:
                        temp.append(token.text)
                    sum_text = " ".join(temp)
                    sum_text = about_symbol(sum_text)
                    output = pcn.annotate(sum_text,
                                          properties={
                                              'annotators': 'parse',
                                              'outputFormat': 'json'
                                          })
                    parsed_sent = output['sentences'][0]['parse']
                    parsed_sent = " ".join(parsed_sent.split())
                    parsed_sent = parsed_sent.replace('(', '<')
                    parsed_sent = parsed_sent.replace(')', '>')

                    parsed_sentence_second.append(parsed_sent)
                    splited_sentence_second.append(sum_text)
                    # print(parsed_sent)
                splited_sentence_first.append(splited_sentence_second)
                parsed_sentence_first.append(parsed_sentence_second)

            sent_json['splited_sentence'] = []
            sent_json['parsed_sentence'] = []
            sent_json['original_sentence'] = []
            sent_json['splited_sentence'].append(splited_sentence_first)
            sent_json['parsed_sentence'].append(parsed_sentence_first)
            sent_json['original_sentence'].append(tweet_txt)

    with open(file_name, 'w') as out_file:
        json.dump(sent_json, out_file, indent=4)
                    logger.info('Skipped question due to offset mismatch:')
                    logger.info(question)
                qa['question_entities'] = question_entities
    logger.info('In total, {} contexts and {} questions are skipped...'.format(
        skip_context_cnt, skip_question_cnt))


if __name__ == '__main__':
    args = parse_args()

    # make output directory if not exist
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # register corenlp server
    nlp = StanfordCoreNLP('http://localhost:9753')

    # load train and dev datasets
    ftrain = open(args.train_file, 'r', encoding='utf-8')
    trainset = json.load(ftrain)
    fdev = open(args.predict_file, 'r', encoding='utf-8')
    devset = json.load(fdev)

    for dataset, path, name in zip((trainset, devset),
                                   (args.train_file, args.predict_file),
                                   ('train', 'dev')):
        tagging(dataset, nlp)
        output_path = os.path.join(
            args.output_dir,
            "{}.tagged.json".format(os.path.basename(path)[:-5]))
        json.dump(dataset, open(output_path, 'w', encoding='utf-8'))
Example #17
 def get_stanforcorenlp(self):
     self.stanfordCoreNLP = StanfordCoreNLP('http://localhost:9000')
     return self.stanfordCoreNLP
Example #18
    def __init__(self, app, prefix=''):
        self.app = app
        self.prefix = prefix

    def __call__(self, environ, start_response):

        if environ['PATH_INFO'].startswith(self.prefix):
            environ['PATH_INFO'] = environ['PATH_INFO'][len(self.prefix):]
            environ['SCRIPT_NAME'] = self.prefix
            return self.app(environ, start_response)
        else:
            start_response('404 Not Found', [('Content-Type', 'text/plain')])
            return ["This url does not belong to the app.".encode()]


core.NLP = StanfordCoreNLP(os.environ['PPAXE_CORENLP'])
app = Flask(__name__)  # create the application instance
app.wsgi_app = ReverseProxied(app.wsgi_app)

# app.wsgi_app = PrefixMiddleware(app.wsgi_app, prefix=environ.get('SCRIPT_NAME', ''))


# FUNCTIONS
# -----------------------------------------------------------------------
def create_pdf(pdf_data):
    '''
    Creates pdf file
    '''
    pdf = StringIO()
    pisa.CreatePDF(StringIO(pdf_data), pdf)
    return pdf
from pycorenlp import StanfordCoreNLP
from pprint import pprint
import json

FILE = "data/test200"

nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))


def get_stanford_annotations(
        text,
        port=9000,
        annotators='tokenize,ssplit,pos,lemma,depparse,parse'):
    output = nlp.annotate(text,
                          properties={
                              "timeout": "10000",
                              "ssplit.isOneSentence": "true",
                              'annotators': annotators,
                          })
    return output


with open(FILE + '.txt',
          encoding='utf-8') as in_file, open(FILE + '.NRE',
                                             'w',
                                             encoding='utf-8') as out_file:
    for line in in_file:
        ls = line.strip().split('\t')
        sent_id = ls[0].strip()
        document = ' '.join(ls[1].strip().split())
        token1 = ls[2]
Example #20
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.",
                        choices=[
                            "load_corpus", "annotate", "classify",
                            "write_results", "write_goldstandard", "train",
                            "test", "train_multiple", "test_multiple",
                            "train_matcher", "test_matcher", "crossvalidation",
                            "train_relations", "test_relations"
                        ])
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers"),
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        action="store",
        default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
        help="Text to classify.")
    parser.add_argument(
        "--corpus",
        dest="corpus",
        nargs=2,
        default=[
            "chemdner",
            "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"
        ],
        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag",
                        dest="tag",
                        default="0",
                        help="Tag to identify the text.")
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument("--annotated",
                        action="store_true",
                        default=False,
                        dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf",
                        dest="crf",
                        help="CRF implementation",
                        default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]

        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format,
                             corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  #add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # rext-add annotation to corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = config.paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            #results = ResultsNER(options.output[1])
            #results.get_ner_results(corpus, model)
            results.save(options.output[1] + ".pickle")
            #logging.info("saved gold standard results to " + options.output[1] + ".txt")

        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":  # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
            #model.train("TermList.txt")
        elif options.actions == "train_multiple":  # Train one classifier for each type of entity in this corpus
            # logging.info(corpus.subtypes)
            models = TaggerCollection(basepath=options.models,
                                      corpus=corpus,
                                      subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "multir":
                model = MultiR(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus,
                                    feature_extractors.keys(),
                                    mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            #with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            #    lines = final_results.corpus.write_chemdner_results(options.models, outfile)
            #final_results.lines = lines
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w',
                             'utf-8') as outfile:
                outfile.write('\n'.join(allentities))

            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" +
                                              submodel,
                                              baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
Example #21
def process_request(conn, addr):
	print("connected client:", addr)
	lst = b''
	data_com = conn.recv(4096)
	data_com = data_com.decode("utf8")
	data_com = data_com.split(' ')
	lenght = int(data_com[1])
	i = 0
	while i < lenght:
		data = conn.recv(1024)
		if not data:
			break
		lst += data
		i += len(data)
	# print(data_com)
	lst2 = pickle.loads(lst)

	if data_com[0].upper() == 'STAT':
		if len(lst2) < 10:
			error = 'Not enough data'
			conn.sendall(error.encode("utf8"))
		else:
			tweet_top = tweet_top10(lst2)
			retweet_top = (list(retweet_top10(lst2)))[:10]
			retweet_top10_necessary = []
			for i in range(len(retweet_top)):
				retweet_top10_necessary.append([])
				retweet_top10_necessary[i].append(retweet_top[i][6])
				retweet_top10_necessary[i].append(retweet_top[i][3])
				retweet_top10_necessary[i].append(retweet_top[i][8])
			author_top = author_top10(lst2)
			country_tweet, country_retweet = country(lst2)
			# print(tweet_top)
			# print(retweet_top10_necessary)
			# print(author_top)
			data_for_client = [['Popular words', 'Number of words']]
			data_for_client.extend(tweet_top)
			data_for_client.extend([])
			data_for_client.extend([['Tweet content', 'author', 'RT']])
			data_for_client.extend(retweet_top10_necessary)
			data_for_client.extend([['author', 'followers']])
			data_for_client.extend(author_top)
			data_for_client.extend([['country_tweet'], country_tweet])
			data_for_client.extend([['country_retweet'], country_retweet])
			# print(data_for_client)
			message = pickle.dumps(data_for_client)
			size = len(message)
			conn.sendall((str(size)).encode("utf8"))
			time.sleep(1)
			conn.sendall(message)

	if data_com[0].upper() == 'ENTI':
		nlp = StanfordCoreNLP('http://localhost:9000')
		pos = []
		for i in lst2:
			text = i[6].replace('\n',' ')
			# print(i[6])
			result = nlp.annotate( text, properties = {'annotators': 'ner', 'outputFormat': 'json', 'timeout': 100000, })
			# print(result["sentences"][0])
			for word in result["sentences"][0]["tokens"]:
				pos.append('{} ({})'.format(word["word"], word["ner"]))
				# print(pos)
			# print('')
			# print(text)
		string = " ".join(pos)
		# print(pos)
		message = pickle.dumps(string)
		size = len(message)
		conn.sendall((str(size)).encode("utf8"))
		time.sleep(1)
		conn.sendall(message)

	conn.close()
Example #22
def nlp_partial_sent(host_url):
    nlp_server = StanfordCoreNLP(host_url)
    return partial(nlp_server.annotate, properties={'outputFormat': 'json'})
 def __init__(self, files=None):
     self.sources = files
     self.triples = []
     self.news = ""
     self.nlp = StanfordCoreNLP('http://localhost:9000')
Example #24
    def _find_nodes(self, tag_with):
        """
        Apply NER to extract entities and their positional information from the
        context.
        When working with flair, a heuristic is used to counteract cases
        in which an entity contains trailing punctuation (this would conflict
        with BertTokenizer later on).
        :param tag_with: either 'stanford' or an instance of flair.models.SequenceTagger
        """
        ent_id = 0

        if tag_with == 'stanford':
            tagger = StanfordCoreNLP("http://corenlp.run/")
            for para_id, paragraph in enumerate(
                    self.context):  # between 0 and 10 paragraphs
                sentences = [
                    paragraph[0]
                ] + paragraph[1]  # merge header and sentences to one list
                for sent_id, sentence in enumerate(
                        sentences):  # first sentence is the paragraph title
                    annotated = tagger.annotate(sentence,
                                                properties={
                                                    "annotators": "ner",
                                                    "outputFormat": "json"
                                                })
                    entities = annotated['sentences'][0][
                        'entitymentions']  # list of dicts

                    for e in entities:
                        self.graph[ent_id] = {
                            "address":
                            (para_id, sent_id, e['characterOffsetBegin'],
                             e['characterOffsetEnd']),
                            "links": [],  # relations
                            "mention":
                            e['text']  # name of the node
                        }
                        #print(f"in EntityGraph._find_nodes(): address & mention: {self.graph[ent_id]['address']} -- {self.graph[ent_id]['mention']}") #CLEANUP
                        ent_id += 1

        elif type(tag_with) == SequenceTagger:
            tagger = tag_with
            #print(f"in EntityGraph._find_nodes(): context:\n{self.context}") #CLEANUP
            for para_id, paragraph in enumerate(
                    self.context):  # between 0 and 10 paragraphs
                # merge header and sentences to one list and convert to Sentence object
                sentences = [
                    Sentence(s) for s in [paragraph[0]] + paragraph[1]
                ]
                tagged_sentences = tagger.predict(sentences)

                for sent_id, sentence in enumerate(
                        tagged_sentences
                ):  # first sentence is the paragraph title
                    entities = sentence.get_spans('ner')
                    for e in entities:
                        if e.text.endswith(('.', '?', '!', ',',
                                            ':')):  # counter tagging errors
                            end_pos = e.end_pos - 1
                            text = e.text[:-1]
                        else:
                            end_pos = e.end_pos
                            text = e.text
                        self.graph.update({
                            ent_id: {
                                "address":
                                (para_id, sent_id, e.start_pos, end_pos),
                                "links": [],  # relations
                                "mention": text  # name of the node
                            }
                        })
                        ent_id += 1
        else:
            print(
                f"invalid tagger; {tag_with}. Continuing with a flair tagger.")
            self._find_nodes(SequenceTagger.load('ner'))
Example #25
# -*- coding:utf-8 -*-
from pycorenlp import StanfordCoreNLP
import re
from nltk import RegexpParser

#nlp = StanfordCoreNLP('http://localhost:9000/')
nlp = StanfordCoreNLP("http://corenlp.run/")

grammar = """
    V: {<VB.*><PR>?<IN|TO>?}
    W: {<NN*|JJ|RB.*|PRP.*|DT>}
    P: {<IN|TO|PR>}
    VP2: {<V><P>}
    VP3: {<V><W>+<P>}
    VP1: {<V>}
"""

vp_parser = RegexpParser(grammar)


def clean(word):
    if "(" in word:
        word = word[:word.find("(")]
    return word


def analyze(sentence):
    output = nlp.annotate(sentence, properties={
        'annotators': 'tokenize,ssplit,pos,parse,depparse,coref',
        'tokenize.whitespace': True,
        'outputFormat': 'json'
Example #26
def nlp_partial_sent(host_url):
    from pycorenlp import StanfordCoreNLP
    from functools import partial
    nlp_server = StanfordCoreNLP(host_url)
    return partial(nlp_server.annotate, properties={'outputFormat': 'json'})
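

# A brief usage sketch for nlp_partial_sent() above (not part of the original example).
# The returned callable sends requests to the CoreNLP server named in host_url; which
# keys appear in the JSON output depends on that server's default annotator pipeline.
annotate_json = nlp_partial_sent('http://localhost:9000')
parsed = annotate_json("Stanford CoreNLP parses this sentence.")
for token in parsed['sentences'][0]['tokens']:
    # assumes the server's default pipeline includes the pos annotator
    print(token['word'], token['pos'])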
import re
import json
import pickle
import os

from tqdm import tqdm
from pycorenlp import StanfordCoreNLP

nlp_server = StanfordCoreNLP('http://ink-molly.usc.edu:9000')

version = "1.0"

# File name
file_path = "data/01.src.txt"

# Display options
IF_DISP_PREFIX = False
IF_DISP_TQDM = False
IF_DISP_VB_UNMATCH = False
IF_DISP_IF_UNMATCH = False
IF_DISP_BAN = False
IF_DISP_ALL_SEN = False
IF_VERB_ONLY = True

# Character filter
character_patterns = [
    '^Craig:.*',
    '^Cestero:.*',
]

Example #28
def get_tokens_and_dependencies(sentence):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(sentence, properties={'annotators': 'tokenize,ssplit,pos,depparse,parse,dcoref','outputFormat': 'json'})
    tokens = output['sentences'][0]['tokens']
    dependencies = output['sentences'][0]['basic-dependencies']
    return tokens, dependencies
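

# A short usage sketch for get_tokens_and_dependencies() above (not part of the
# original example); it assumes a CoreNLP server is running on localhost:9000.
if __name__ == '__main__':
    tokens, dependencies = get_tokens_and_dependencies(
        "Ronaldo has moved from Real Madrid to Juventus.")
    print([t['word'] for t in tokens])
    print([(d['dep'], d['governorGloss'], d['dependentGloss']) for d in dependencies])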
Example #29
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import pickle
import re
import random
import string
import json
from xml.etree import ElementTree as etree
from pycorenlp import StanfordCoreNLP
from xmljson import BadgerFish
from collections import OrderedDict
nlp = StanfordCoreNLP('http://localhost:9000')


def parse(input):
    dbg = open("debug","w")


    output = nlp.annotate(input, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'xml',
        'timeout': 30000})
    fixed = []
    for o in output:
        fixed.append(o)
    return("".join(fixed))


def lparse(input):
import logging
import os

from pycorenlp import StanfordCoreNLP
from typing import Set

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

all_zeroes = "ALL_ZERO"
unknown_el = "_UNKNOWN"
epsilon = 10e-8

special_tokens = {"&ndash;": "–",
                  "&mdash;": "—",
                  "@card@": "0"
                  }

corenlp = StanfordCoreNLP('http://semanticparsing:9000')
corenlp_properties = {
    'annotators': 'tokenize, pos, ner',
    'outputFormat': 'json'
}
corenlp_caseless = {
    'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
    'ner.model': #'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz,' +
                 'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz,'
                 #+ 'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz'
}


module_location = os.path.abspath(__file__)
module_location = os.path.dirname(module_location)
RESOURCES_FOLDER = os.path.join(module_location, "..", "resources/")