from nltk.tag import StanfordNERTagger
import nltk
from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPDependencyParser
from graphviz import Source
from pattern.vector import stemmer
from pycorenlp import StanfordCoreNLP
from sutime import SUTime
from textblob import TextBlob
from stanfordnlp.server import CoreNLPClient
# Note: this import shadows pycorenlp's StanfordCoreNLP; the keyword-argument
# constructor below is pynlp's API.
from pynlp import StanfordCoreNLP

annotators = 'tokenize, ssplit, pos, ner, coref'
options = {'openie.resolve_coref': True}
nlp = StanfordCoreNLP(annotators=annotators, options=options)
sdp = CoreNLPDependencyParser()

# ---------------------------------------------------------------------------
# LOAD THE SENTENCES
filepath = 'kolbuszowa.txt'
list_sentences = []
with open(filepath, encoding="utf8") as file:
    for line in file:
        list_sentences.append([line[:line.rfind(".") + 1]])

# PREPROCESSING START
# CREATE TEMPORARY LIST OF ADJACENT SENTENCES FOR COREFERENCING (PREVIOUS 2 SENTENCES)
for i in range(len(list_sentences)):
    adj_sentences = []
    start_index = i - 1
    return sum / (len(sentiments))

# tales = ['FundeVogel', 'Rapunzel', 'TheGooseGirl', 'Golden Bird', 'HansInGoodLuck',
#          'JorindaAndJorindel', 'TravelingMusicians', 'OldSultan', 'TheStraw',
#          'BriarRose', 'DogAndSparrow', 'TwelveDancingPrincesses', 'FishermanAndWife',
#          'TheWillowRen', 'FrogPrince', 'CatAndMouse']
taleSentiments = []
for taleName in tales:
    # f = open("./Corefs/" + taleName, 'r', encoding="utf8")
    p(taleName)
    # sys.argv holds strings, so compare against '1'; comparing against the
    # int 1 (as in the original) can never be true
    if (sys.argv[1] == '1'):
        f = open("./Stories/" + taleName, 'r', encoding="utf8")
    else:
        f = open("./Corefs/" + taleName, 'r', encoding="utf8")
    tale = f.read()
    tale = tale.replace('\n', ' ')
    tale = tale.replace('\r', ' ')
    # pprint.pprint(tale)
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    # doc = "Ronaldo has moved from Real Madrid to Juventus. While Messi still plays for Barcelona"
    doc = tale
    # pprint.pprint(doc)
    annot_doc = nlp_wrapper.annotate(doc,
                                     properties={
                                         'annotators': 'ner, pos, depparse',
                                         'outputFormat': 'json',
                                         'timeout': 100000,
                                     })
    nsubjs = []
    # pprint.pprint(annot_doc)
    for sentence in annot_doc['sentences']:
        for element in sentence['basicDependencies']:
            if (element['dep'] == 'nsubj'):
    def __init__(self):
        self.load_data()
        self.nlp = StanfordCoreNLP(config.StanfordCoreNLP_Path)
def brat_to_conll(input_folder, output_filepath, tokenizer, language):
    '''
    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks for the compatibility between .txt and .ann at the same time.
    '''
    if tokenizer == 'spacy':
        spacy_nlp = spacy.load(language)
    elif tokenizer == 'stanford':
        core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise ValueError("tokenizer should be either 'spacy' or 'stanford'.")
    verbose = False
    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type), end='')
    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    output_file = codecs.open(output_filepath, 'w', 'latin-1')
    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath),
                                           base_filename + '.ann')
        # create annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'latin-1').close()
        text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
        entities = sorted(entities, key=lambda entity: entity["start"])
        if tokenizer == 'spacy':
            sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
        elif tokenizer == 'stanford':
            sentences = get_sentences_and_tokens_from_stanford(text, core_nlp)
        for sentence in sentences:
            inside = False
            previous_token_label = 'O'
            for token in sentence:
                token['label'] = 'O'
                for entity in entities:
                    if entity['start'] <= token['start'] < entity['end'] or \
                       entity['start'] < token['end'] <= entity['end'] or \
                       token['start'] < entity['start'] < entity['end'] < token['end']:
                        # replace '-' because ANN doesn't support tags with '-' in them
                        token['label'] = entity['type'].replace('-', '_')
                        break
                    elif token['end'] < entity['start']:
                        break
                if len(entities) == 0:
                    entity = {'end': 0}
                if token['label'] == 'O':
                    gold_label = 'O'
                    inside = False
                elif inside and token['label'] == previous_token_label:
                    gold_label = 'I-{0}'.format(token['label'])
                else:
                    inside = True
                    gold_label = 'B-{0}'.format(token['label'])
                if token['end'] == entity['end']:
                    inside = False
                previous_token_label = token['label']
                if verbose:
                    print('{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'],
                        token['end'], gold_label))
                output_file.write('{0} {1} {2} {3} {4}\n'.format(
                    token['text'], base_filename, token['start'],
                    token['end'], gold_label))
            if verbose:
                print('\n')
            output_file.write('\n')
    output_file.close()
    print('Done.')
    if tokenizer == 'spacy':
        del spacy_nlp
    elif tokenizer == 'stanford':
        del core_nlp
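# A minimal usage sketch for brat_to_conll as defined above. The folder layout
# ('data/brat/train' holding paired .txt/.ann files) and the output path are
# illustrative assumptions; a CoreNLP server must already be listening on
# port 9000 when the 'stanford' tokenizer is selected.
if __name__ == '__main__':
    brat_to_conll('data/brat/train', 'data/conll/train.txt',
                  tokenizer='stanford', language='en')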
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')

if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)

sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# loop through the .ann files in the data directory
ann_data_files = [
    f for f in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann'
]

for file in ann_data_files:
    entities = []
    relations = []

    # process the .ann file - place entities and relations into 2 separate
    # lists of tuples
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        for line in lines:
from pycorenlp import StanfordCoreNLP
from scipy import spatial

nlp = StanfordCoreNLP('http://10.4.100.141:9000')

text = 'Timmy the elephant has eyes, ears, tusks and legs. Timmy the dog has four legs and four eyes. Timmy the hippo has a big nose and huge ears'
output = nlp.annotate(text, properties={
    'annotators': 'tokenize,ssplit,pos,depparse,parse,openie,ner',
    'outputFormat': 'json'
})

contexts = []
words = []
occsCont = {}
for i in range(len(output['sentences'])):
    occs = {}
    contexts.append(output['sentences'][i]['openie'][0]['subject'])
    for x in output['sentences'][i]['tokens']:
        if (x['pos'] == 'CD' or x['pos'] == 'JJ' or x['pos'] == 'NN'
                or x['pos'] == 'NNS'):
            words.append(x['word'])

sWords = set(words)
for i in range(len(output['sentences'])):
    for x in sWords:
        occs[x] = 0
    occsCont[i] = dict(occs)

for i in range(len(output['sentences'])):
    senWords = (text.split('.'))
    for x in sWords:
        occsCont[i][x] = senWords[i].count(x)

sim = {}
for x in (range(1, len(contexts) + 1)):
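# The snippet breaks off inside the loop above. Given the occsCont word-count
# dicts it builds and the scipy import, a plausible completion (an assumption,
# not the original code) compares sentences by cosine similarity:
sorted_words = sorted(sWords)
vec0 = [occsCont[0][w] for w in sorted_words]
for i in range(1, len(output['sentences'])):
    vec = [occsCont[i][w] for w in sorted_words]
    # spatial.distance.cosine returns a distance, so similarity is 1 - d
    sim[i] = 1 - spatial.distance.cosine(vec0, vec)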
    def __init__(self):
        self.corenlp = StanfordCoreNLP('http://localhost:9000')
# Visualize the dependency parse (collapsed dependencies) produced by
# Stanford CoreNLP as a directed graph. For the visualization, convert the
# dependency tree to the DOT language and use Graphviz; to draw a directed
# graph directly from Python, pydot works well.
import pprint
from nltk.stem.porter import PorterStemmer
import pydot_ng as pydot
import pydotplus
from pycorenlp import StanfordCoreNLP

ipath = '../../data/input/'
opath = '../../data/output/'

nlp = StanfordCoreNLP("http://localhost:9000")
prop = {"annotators": "depparse", "outputFormat": "json"}

tokenized_list = []
with open(ipath + 'nlp.txt', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        tokenized_list.append(nlp.annotate(line, properties=prop))

dots = []
edges = []
for line in tokenized_list:
    sentences = line['sentences']
    if len(sentences) == 0:
        pass
    else:
        for sentence in sentences:
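# The snippet breaks off inside the loop above. As a sketch of the step the
# task comment describes (an assumption, not the original continuation), each
# sentence's governor->dependent pairs can be rendered with pydot:
def render_sentence(sentence, out_path):
    # 'basicDependencies' is assumed here; the exact key can vary by CoreNLP version
    graph = pydot.Dot(graph_type='digraph')
    for dep in sentence['basicDependencies']:
        if dep['dep'] == 'ROOT':
            continue
        graph.add_edge(pydot.Edge(dep['governorGloss'], dep['dependentGloss']))
    graph.write_png(out_path)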
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://101.132.182.124:9000')

from datetime import datetime, timedelta
from collections import defaultdict
from pprint import pprint
from tqdm import tqdm
import re

import pymongo
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
from pymongo.errors import BulkWriteError
client = pymongo.MongoClient('34.224.37.110:27017')
db = client.tweet

import pandas as pd


def get_ner_dict(word, ner):
    ner_tuple = zip(word, ner)
    splits = [0]
    for index, nt in enumerate(ner_tuple):
        if index == 0:
            temp = nt[1]
            continue
        if temp != nt[1]:
            splits.append(index)
            temp = nt[1]
            continue
        else:
            temp = nt[1]
            continue
def get_clues(text):
    text = text
    print("*--------(%s)-------------*" % (text))
    print(type(text))
    nlp = StanfordCoreNLP('http://localhost:9001')
    stop_words = set(stopwords.words('english'))
    '''
    Method to remove numbers appended at the end.
    '''
    dep_parse = nlp.annotate(text, properties={
        'annotators': 'depparse',
        'outputFormat': 'json',
        'timeout': 10000,
    })
    pos = nlp.annotate(text, properties={
        'annotators': 'lemma',
        'outputFormat': 'json',
        'timeout': 10000,
    })
    sn = SenticNet()
    word_to_dep = [{} for i in range(len(dep_parse['sentences']))]
    word_to_par = [{} for i in range(len(dep_parse['sentences']))]
    word_to_pos = [{} for i in range(len(dep_parse['sentences']))]
    word_to_lemma = [{} for i in range(len(dep_parse['sentences']))]
    word_to_child = [{} for i in range(len(dep_parse['sentences']))]
    sents = [[] for i in range(len(dep_parse['sentences']))]
    index_to_word = {}
    '''
    Constructing dicts for maintaining the dependencies among words.
    The occurrence number is appended to each word to keep repeated words distinct.
    '''
    # print(dep_parse['sentences'])
    print("********")
    for i, sent in enumerate(dep_parse['sentences']):
        for dep in sent['basicDependencies']:
            word_to_dep[i][dep['dependentGloss'] + str(dep['dependent'])] = dep['dep']
            word_to_par[i][dep['dependentGloss'] + str(dep['dependent'])] = \
                dep['governorGloss'] + str(dep['governor'])
            index_to_word[dep['dependentGloss'] + str(dep['dependent'])] = \
                dep['dependentGloss']
            if (dep['governorGloss'] + str(dep['governor']) not in word_to_child[i]):
                word_to_child[i][dep['governorGloss'] + str(dep['governor'])] = []
            if (dep['dependentGloss'] + str(dep['dependent']) not in word_to_child[i]):
                word_to_child[i][dep['dependentGloss'] + str(dep['dependent'])] = []
            word_to_child[i][dep['governorGloss'] + str(dep['governor'])].append(
                dep['dependentGloss'] + str(dep['dependent']))
            sents[i].append(dep['dependentGloss'] + str(dep['dependent']))
        word_to_dep[i]['ROOT0'] = 'root'
        word_to_par[i]['ROOT0'] = 'root'
    for i, sent in enumerate(pos['sentences']):
        for pos_tagger in sent['tokens']:
            word_to_pos[i][pos_tagger['word']] = pos_tagger['pos']
            word_to_lemma[i][pos_tagger['word']] = pos_tagger['lemma']
        word_to_pos[i]['ROOT'] = 'root'
        word_to_lemma[i]['ROOT'] = 'root'
    '''
    Displaying the deps
    '''
    # Implementing rules to extract aspects
    for i, sent in enumerate(sents):
        if (__name__ == '__main__'):
            print(word_to_dep[i], word_to_par[i], word_to_pos[i])
            print("Children==>")
            print(word_to_child[i])
    aspects = []
    for i, sent in enumerate(sents):
        for word in sent:
            '''
            Rule 0
            '''
            if ('subj' in word_to_dep[i][word]):
                for child in word_to_child[i][word_to_par[i][word]]:
                    if ('amod' in word_to_dep[i][child]
                            or 'advmod' in word_to_dep[i][child]):
                        aspects.append(word_to_par[i][word])
                        if (__name__ == '__main__'):
                            print("Rule 0 triggered.")
            '''
            Rule 1 (without subj): Very big to hold.
            '''
            if (word_to_dep[i][word] == 'xcomp'
                    and ('JJ' in word_to_pos[i][index_to_word[word_to_par[i][word]]]
                         or 'RB' in word_to_pos[i][index_to_word[word_to_par[i][word]]])):
                if (__name__ == '__main__'):
                    print("Rule 1 triggered")
                aspects.append(word_to_par[i][word])
            '''
            Rule 2 (without subj): Not to mention the price of the phone.
            '''
            if (word_to_dep[i][word] == 'dobj'
                    and 'VB' in word_to_pos[i][index_to_word[(word_to_par[i][word])]]
                    and ('NN' in word_to_pos[i][index_to_word[(word)]]
                         or 'JJ' in word_to_pos[i][index_to_word[(word)]])):
                aspects.append(word)
                if (__name__ == '__main__'):
                    print("Rule 2 triggered")
                    print(word)
            '''
            Rule 3 (without subj): Love the sleekness of the player.
            '''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'nmod'):
                aspects.append(word_to_par[i][word])
                if (__name__ == '__main__'):
                    print("Rule 3 triggered")
                    print(word_to_par[i][word])
            '''
            Rule 4 (with subj): The battery lasts little (two aspects).
            '''
            if (word_to_dep[i][word] == 'advmod'
                    or word_to_dep[i][word] == 'amod'
                    or word_to_dep[i][word] == 'advcl') and \
                    ('VB' in word_to_pos[i][index_to_word[(word_to_par[i][word])]]):
                aspects.append(word_to_par[i][word])
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'nsubj'
                            and word_to_par[i][word2] == word_to_par[i][word]
                            and ('NN' in word_to_pos[i][index_to_word[word2]]
                                 or 'JJ' in word_to_pos[i][index_to_word[word2]])):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 4 triggered")
                            print(word2)
            '''
            Rule 5 (with subj): I like the lens of this camera.
            '''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'dobj'):
                if (__name__ == '__main__'):
                    print("Rule 5 triggered")
                    print(word)
                try:
                    concept_info = sn.concept((word))
                    print("present in senticnet")
                except KeyError:
                    print("Yay")
                    aspects.append(word)
            '''
            Rule 6: I like the beauty of the screen.
            Check whether the SenticNet condition should be added.
            '''
            if ('NN' in word_to_pos[i][index_to_word[(word)]]
                    and word_to_dep[i][word] == 'dobj'):
                try:
                    concept_info = sn.concept((word))
                    aspects.append(word)
                    print("yay!")
                except KeyError:
                    print("oops, not there in SenticNet")
                for word2 in sent:
                    if (word2 != word and word_to_par[i][word2] == word
                            and 'NN' in word_to_pos[i][index_to_word[(word2)]]):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 6 triggered.")
                            print(word2)
            '''
            Rule 7: I would like to comment on the camera of this phone.
            '''
            if (word_to_dep[i][word] == 'xcomp'):
                try:
                    concept_info = sn.concept((word))
                    aspects.append(word)
                    print("yay!")
                except KeyError:
                    print("oops, not there in SenticNet")
                for child in word_to_child[i][word]:
                    if ('NN' in word_to_pos[i][index_to_word[child]]):
                        aspects.append(child)
                        if (__name__ == '__main__'):
                            print("Rule 7 triggered.")
                            print(word)
                            print(child)
            '''
            Rule 8: The car is expensive.
            '''
            if (word_to_dep[i][word] == 'nsubj'):
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'cop'
                            and word_to_par[i][word2] == word_to_par[i][word]):
                        aspects.append(word_to_par[i][word])
                        if (__name__ == '__main__'):
                            print("Rule 8 triggered")
                            print(word_to_par[i][word])
            '''
            Rule 9: The camera is nice.
            '''
            if (word_to_dep[i][word] == 'nsubj'
                    and 'NN' in word_to_pos[i][index_to_word[(word)]]):
                for word2 in sent:
                    if (word2 != word and word_to_dep[i][word2] == 'cop'
                            and word_to_par[i][word2] == word_to_par[i][word]):
                        aspects.append(word)
                        if (__name__ == '__main__'):
                            print("Rule 9 triggered")
                            print(word)
            '''
            Rule 10: The phone is very lightweight to carry.
            '''
            if (word_to_dep[i][word] == 'cop'):
                for word2 in sent:
                    if (word2 != word
                            and 'VB' in word_to_pos[i][index_to_word[(word2)]]
                            and word_to_par[i][word] == word_to_par[i][word2]):
                        aspects.append(word2)
                        if (__name__ == '__main__'):
                            print("Rule 10 triggered.")
                            print(word2)
            '''
            Extracting mods of dobjs
            '''
            if (word_to_dep[i][word] == 'dobj'):
                for child in word_to_child[i][word]:
                    if ('mod' in word_to_dep[i][child]
                            and 'JJ' in word_to_pos[i][index_to_word[(child)]]):
                        aspects.append(child)
        '''
        Rule 11: checking for conjunctions
        '''
        for asp in aspects:
            for word in sent:
                if (word_to_dep[i][word] == 'conj' and word_to_par[i][word] == asp):
                    aspects.append(word)
                    if (__name__ == '__main__'):
                        print("Rule conj triggered.")
                        print(word)
    finalIAC = set(aspects)
    finalIAC = [index_to_word[f] for f in finalIAC]
    finalIAC = [w for w in finalIAC if w not in stop_words]
    finalSenti = []
    for iac in finalIAC:
        try:
            concept_info = sn.concept((iac))
            finalSenti.append(iac)
        except KeyError:
            print("No word available for " + iac)
    return finalIAC, finalSenti
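# A usage sketch for get_clues (the review sentence is illustrative): the
# function returns the candidate aspect terms and the subset of them that
# are also found in SenticNet.
if __name__ == '__main__':
    iacs, senti = get_clues('The battery of this phone lasts long.')
    print(iacs, senti)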
    def start(self):
        command = self.SERVER_COMMAND_PATTERN.format(self._memory, self._timeout)
        self._process = self._open_process(command, wait=False)
        self._wait_for_server()
        self._http_client = StanfordCoreNLP(self.SERVER_URL)
#!/usr/bin/python3
from pycorenlp import StanfordCoreNLP
import os

# nlp = StanfordCoreNLP('http://13.67.115.74:9000')
# nlp = StanfordCoreNLP('http://119.63.99.173:9000')

# os.environ.get() returns a string (or None), so compare against strings;
# the original compared against the boolean True, which never matches.
if os.environ.get('LEXICA') == "True" or os.environ.get('LEXICA') == "true":
    # nlp = StanfordCoreNLP('http://192.168.0.100:9000')
    nlp = StanfordCoreNLP('http://localhost:9000')
else:
    nlp = StanfordCoreNLP('http://13.67.115.74:9000')
# nlp = StanfordCoreNLP('http://13.67.115.74:9000')
# nlp = StanfordCoreNLP('http://localhost:9000')
nlp = StanfordCoreNLP('http://192.168.0.100:9000')  # note: overrides the choice above


def stanford_tree(line, annotators='parse'):
    output = nlp.annotate(line, properties={
        'annotators': annotators,
        'outputFormat': 'json'
    })
    try:
        return output
    except IndexError:
        pass
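# A small usage sketch for stanford_tree (the sentence is illustrative): with
# annotators='parse', each entry in output['sentences'] carries a 'parse'
# field holding the bracketed constituency tree.
result = stanford_tree('The quick brown fox jumps over the lazy dog.')
if result:
    print(result['sentences'][0]['parse'])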
# ### Using spacy vector similarity function

# In[18]:


def getSimilarity(word1, word2):
    tokens = nlp(word1 + " " + word2)
    return tokens[0].similarity(tokens[1])


# ## Stanford CoreNLP

# In[1]:

from pycorenlp import StanfordCoreNLP
stanford_nlp = StanfordCoreNLP('http://localhost:9001')


# ### Pos tagging

# In[2]:


def getPOSTaggedDataFromTextUsingStanford(text):
    posSentences = []
    output = stanford_nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'json'
    })
    for s in output['sentences']:
        posSentences.append(" ".join(
def get_NER(qc, query):
    nlp = StanfordCoreNLP('http://corenlp.run/')  # public CoreNLP server over the internet
    # nlp = StanfordCoreNLP('http://localhost:9000')
    # nlp = StanfordCoreNLP('http://10.2.6.65:9099/')   # Desktop
    # nlp = StanfordCoreNLP('http://10.4.16.160:9094/') # Lab server
    print("ner qc : ", qc, query)
    query_tokens = word_tokenize(query)
    if (qc[0] == 'LOCATION'):
        qc.append("CITY")
        qc.append("COUNTRY")
        qc.append("STATE_OR_PROVINCE")
        # ##### For spaCy #########
        qc.append("LOC")
        qc.append("GPE")
        # qc.append("NORP")
        qc.append("FAC")
        qc.append("ORG")  # because we have very few ORGANIZATION samples in the QC part
    if (qc[0] == 'ORGANIZATION'):
        # ##### For spaCy #########
        qc.append("FAC")
        qc.append("ORG")  # because we have very few ORGANIZATION samples in the QC part
    if (qc[0] == 'NUMBER'):
        # #### For spaCy ##########
        qc.append("PERCENT")
        qc.append("ORDINAL")
        qc.append("CARDINAL")
    if (qc[0] == 'DATE'):
        qc.append("DURATION")
        qc.append('TIME')
        qc.append("SET")
    # If TIME & DATE are combined, the five lines below are not needed.
    if (qc[0] == 'TIME'):
        qc.append("DURATION")
        qc.append('DATE')
        qc.append("SET")
    print("In NER Section QC : ", qc)

    # ################## Only spaCy NER tool for the query ##################
    doc = Ner_script_spacy.NER_Spacy_funct(quest[0].decode('utf8'))
    for entity in doc.ents:
        # print(entity.text, entity.label_)
        q_ner.append(entity.label_)
        Whole_output.append(entity.label_ + "\n")  # does spaCy have any timeout error?

    print("\n Finding same-type NERs in the top 10 sentences:\n")
    for ii in range(len(Top_similarity_sent)):
        text1 = Top_similarity_sent[ii]
        text = unicodedata.normalize('NFKD', text1).encode('ascii', 'ignore')
        # text = text1
        print("text: ", text)
        nerr = []
        o_text = []
        new_ner_o_text = []
        chunk_ner = []
        # ################ Only spaCy NER tool for sentences ################
        doc = Ner_script_spacy.NER_Spacy_funct(text.decode('utf8'))
        for entity in doc.ents:
            # NOTE: may need changes for the chunking problem, because the sequence matters
            nerr.append(entity.label_)
            o_text.append(entity.text)
            if (entity.label_ != "O"):
                random_ans.append(entity.text)
        # print("\n############# NER Preprocessing ###################\n")
        print("\n")
        for b in range(len(o_text)):
            new_ner_o_text.append(o_text[b])
        for k in range(len(qc)):
            for z in range(len(new_ner_o_text)):
                if (new_ner_o_text[z] != "NULL"):
                    if (new_ner_o_text[z] != "."):
                        if (nerr[z] == qc[k]):  # very important condition
                            chunk_ner.append(new_ner_o_text[z])
                            final_answers.append(new_ner_o_text[z])
    if (len(random_ans) == 0):
        print("random_ans: ", random_ans)
        print("NEED to figure out some solution for when NER does not recognize any word!")
        stop_words = set(stopwords.words('english'))
        Tokens = []
        # here we could also take some sentences from the K-ranked sentences
        for sent in Top_similarity_sent:
            Tokens.append(word_tokenize(sent))
        Tokens = list(itertools.chain(*Tokens))
        filtered_sent_tokens = [w for w in Tokens if w not in stop_words]
        d = Counter(filtered_sent_tokens)
        words = [
            pair[0] for pair in sorted(
                d.items(), key=lambda item: item[1], reverse=True)
        ]
        print(words)
        '''
        sent = Top_similarity_sent[0]
        sent_tokens = word_tokenize(sent)
        for tk in sent_tokens:
            random_ans.append(tk)
        '''
        print("words len : ", len(words))
        limit = 10
        if (len(words) < limit):
            limit = len(words)
        # take 10 random answers from the top K-ranked sentences, ignoring stopwords
        for ra_i in range(limit):
            random_ans.append(words[ra_i])

    max_final_ans = []
    for i in range(len(final_answers)):
        max_final_ans.append(final_answers.count(final_answers[i]))
    ss = sorted(range(len(max_final_ans)),
                key=max_final_ans.__getitem__,
                reverse=True)
    print("\n")
    print("Answers Set : ", random_ans)
    print("\n\nAfter Chunking of NERs:\n")
    print("final_answers : ", final_answers)
    print("max_final_ans : ", max_final_ans)
    print("lengths of [AnswersSet, final_answers, max_final_ans, ss] : ",
          len(random_ans), len(final_answers), len(max_final_ans), len(ss))
    optional_print = []
    print("query_tokens : ", query_tokens)
    print("\n------------- > > Output < < --------------\n")
    # p_n = 10
    # NOTE: both branches assign the same value, so p_n is always len(ss)
    if (len(ss) >= 10):
        p_n = len(ss)
    else:
        p_n = len(ss)
    z = 0
    for i in range(len(ss)):
        j = ss[i]
        x = []
        x.append(final_answers[j])
        if ((set(optional_print).intersection(x))):
            print(" ")
        else:
            if not (final_answers[j] in query_tokens):
                # print("else ans not in Q : ", final_answers[j])
                optional_print.append(final_answers[j])
                label_ans.append(final_answers[j])
                z = z + 1
    if (z == 0 and (len(final_answers) != 0)):
        print(random.choice(final_answers))
    print("Possible Predictable label_answers : ", label_ans)
def making_parsed_tree(sentiment_code, file_name):
    splited_sentence_first = []
    parsed_sentence_first = []
    # the server URL and credentials are masked in the original source
    pcn = StanfordCoreNLP('http://*****:*****')

    # The original helper was partially swallowed by the credential masking;
    # its reconstructed intent is to strip @-mentions and URLs from a tweet
    # (the exact mention pattern is an assumption).
    def about_symbol(text):
        text = re.sub(r"@\S+", '', text)
        text = re.sub(r'http\S+', '', text)
        return text

    for a in tqdm(range(len(df))):
        tweet_txt = about_symbol(text[a])
        if label[a] == sentiment_code:
            if len(tweet_txt) > 3:
                tweet_txt = " ".join(tweet_txt.split())
                tweet_txt = contractions.fix(tweet_txt)
                doc = nlp(tweet_txt)
                splited_sentence_second = []
                parsed_sentence_second = []
                for sentence in doc.sentences:
                    temp = []
                    for token in sentence.tokens:
                        temp.append(token.text)
                    sum_text = " ".join(temp)
                    sum_text = about_symbol(sum_text)
                    output = pcn.annotate(sum_text, properties={
                        'annotators': 'parse',
                        'outputFormat': 'json'
                    })
                    parsed_sent = output['sentences'][0]['parse']
                    parsed_sent = " ".join(parsed_sent.split())
                    parsed_sent = parsed_sent.replace('(', '<')
                    parsed_sent = parsed_sent.replace(')', '>')
                    parsed_sentence_second.append(parsed_sent)
                    splited_sentence_second.append(sum_text)
                    # print(parsed_sent)
                splited_sentence_first.append(splited_sentence_second)
                parsed_sentence_first.append(parsed_sentence_second)
    sent_json['splited_sentence'] = []
    sent_json['parsed_sentence'] = []
    sent_json['original_sentence'] = []
    sent_json['splited_sentence'].append(splited_sentence_first)
    sent_json['parsed_sentence'].append(parsed_sentence_first)
    sent_json['original_sentence'].append(tweet_txt)
    with open(file_name, 'w') as out_file:
        json.dump(sent_json, out_file, indent=4)
                logger.info('Skipped question due to offset mismatch:')
                logger.info(question)
            qa['question_entities'] = question_entities
    logger.info('In total, {} contexts and {} questions are skipped...'.format(
        skip_context_cnt, skip_question_cnt))


if __name__ == '__main__':
    args = parse_args()

    # make the output directory if it does not exist
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # register the CoreNLP server
    nlp = StanfordCoreNLP('http://localhost:9753')

    # load the train and dev datasets
    ftrain = open(args.train_file, 'r', encoding='utf-8')
    trainset = json.load(ftrain)
    fdev = open(args.predict_file, 'r', encoding='utf-8')
    devset = json.load(fdev)

    for dataset, path, name in zip((trainset, devset),
                                   (args.train_file, args.predict_file),
                                   ('train', 'dev')):
        tagging(dataset, nlp)
        output_path = os.path.join(
            args.output_dir,
            "{}.tagged.json".format(os.path.basename(path)[:-5]))
        json.dump(dataset, open(output_path, 'w', encoding='utf-8'))
    def get_stanforcorenlp(self):
        self.stanfordCoreNLP = StanfordCoreNLP('http://localhost:9000')
        return self.stanfordCoreNLP
    def __init__(self, app, prefix=''):
        self.app = app
        self.prefix = prefix

    def __call__(self, environ, start_response):
        if environ['PATH_INFO'].startswith(self.prefix):
            environ['PATH_INFO'] = environ['PATH_INFO'][len(self.prefix):]
            environ['SCRIPT_NAME'] = self.prefix
            return self.app(environ, start_response)
        else:
            # WSGI status strings must include the reason phrase
            start_response('404 Not Found', [('Content-Type', 'text/plain')])
            return ["This url does not belong to the app.".encode()]


core.NLP = StanfordCoreNLP(os.environ['PPAXE_CORENLP'])
app = Flask(__name__)  # create the application instance
app.wsgi_app = ReverseProxied(app.wsgi_app)
# app.wsgi_app = PrefixMiddleware(app.wsgi_app, prefix=environ.get('SCRIPT_NAME', ''))


# FUNCTIONS
# ---------------------------------------------------------------------------
def create_pdf(pdf_data):
    '''Creates pdf file'''
    pdf = StringIO()
    pisa.CreatePDF(StringIO(pdf_data), pdf)
    return pdf
from pycorenlp import StanfordCoreNLP
from pprint import pprint
import json

FILE = "data/test200"
nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))


def get_stanford_annotations(text, port=9000,
                             annotators='tokenize,ssplit,pos,lemma,depparse,parse'):
    output = nlp.annotate(text, properties={
        "timeout": "10000",
        "ssplit.isOneSentence": "true",
        'annotators': annotators,
    })
    return output


with open(FILE + '.txt', encoding='utf-8') as in_file, \
        open(FILE + '.NRE', 'w', encoding='utf-8') as out_file:
    for line in in_file:
        ls = line.strip().split('\t')
        sent_id = ls[0].strip()
        document = ' '.join(ls[1].strip().split())
        token1 = ls[2]
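# A standalone usage sketch (the sentence is illustrative). Note that the
# properties in get_stanford_annotations above omit 'outputFormat', so
# pycorenlp hands back the raw server response; requesting JSON explicitly,
# as below, yields a parsed dict instead:
ann = nlp.annotate('The cat sat on the mat.', properties={
    'annotators': 'tokenize,ssplit,pos',
    'outputFormat': 'json'
})
print([tok['word'] for tok in ann['sentences'][0]['tokens']])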
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions",
                        default="classify",
                        help="Actions to be performed.",
                        choices=[
                            "load_corpus", "annotate", "classify",
                            "write_results", "write_goldstandard", "train",
                            "test", "train_multiple", "test_multiple",
                            "train_matcher", "test_matcher", "crossvalidation",
                            "train_relations", "test_relations"
                        ])
    parser.add_argument(
        "--goldstd",
        default="",
        dest="goldstd",
        nargs="+",
        help="Gold standard to be used. Will override corpus, annotations",
        choices=config.paths.keys())
    parser.add_argument("--submodels",
                        default="",
                        nargs='+',
                        help="sub types of classifiers")
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        action="store",
        default='''Administration of a higher dose of indinavir should be \\ considered when coadministering with megestrol acetate.''',
        help="Text to classify.")
    parser.add_argument(
        "--corpus",
        dest="corpus",
        nargs=2,
        default=[
            "chemdner",
            "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"
        ],
        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag",
                        dest="tag",
                        default="0",
                        help="Tag to identify the text.")
    parser.add_argument("--models",
                        dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--entitytype",
                        dest="etype",
                        help="type of entities to be considered",
                        default="all")
    parser.add_argument("--pairtype",
                        dest="ptype",
                        help="type of pairs to be considered",
                        default="all")
    parser.add_argument("--doctype",
                        dest="doctype",
                        help="type of document to be considered",
                        default="all")
    parser.add_argument("--annotated",
                        action="store_true",
                        default=False,
                        dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument(
        "-o",
        "--output",
        "--format",
        dest="output",
        nargs=2,
        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf",
                        dest="crf",
                        help="CRF implementation",
                        default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log",
                        action="store",
                        dest="loglevel",
                        default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel",
                        action="store",
                        dest="kernel",
                        default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set up the logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions,
                                                       options.goldstd))

    # Set configuration variables based on the goldstd option if the corpus
    # has a gold standard, or on the corpus and annotation options otherwise.
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format,
                             corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(config.paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # rext - add annotation to corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = config.paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)

    if options.actions == "write_goldstandard":
        model = BiasModel(options.output[1])
        model.load_data(corpus, [])
        results = model.test()
        # results = ResultsNER(options.output[1])
        # results.get_ner_results(corpus, model)
        results.save(options.output[1] + ".pickle")
        # logging.info("saved gold standard results to " + options.output[1] + ".txt")

    # training
    elif options.actions == "train":
        if options.crf == "stanford":
            model = StanfordNERModel(options.models, options.etype)
        elif options.crf == "crfsuite":
            model = CrfSuiteModel(options.models, options.etype)
        model.load_data(corpus, feature_extractors.keys(), options.etype)
        model.train()
    elif options.actions == "train_matcher":
        # train a simple classifier based on string matching
        model = MatcherModel(options.models)
        model.train(corpus)
        # TODO: term list option
        # model.train("TermList.txt")
    elif options.actions == "train_multiple":
        # train one classifier for each type of entity in this corpus
        # logging.info(corpus.subtypes)
        models = TaggerCollection(basepath=options.models,
                                  corpus=corpus,
                                  subtypes=corpus.subtypes)
        models.train_types()
    elif options.actions == "train_relations":
        if options.kernel == "jsre":
            model = JSREKernel(corpus, options.ptype)
        elif options.kernel == "svmtk":
            model = SVMTKernel(corpus, options.ptype)
        elif options.kernel == "stanfordre":
            model = StanfordRE(corpus, options.ptype)
        elif options.kernel == "multir":
            model = MultiR(corpus, options.ptype)
        elif options.kernel == "scikit":
            model = ScikitRE(corpus, options.ptype)
        elif options.kernel == "crf":
            model = CrfSuiteRE(corpus, options.ptype)
        model.train()

    # testing
    elif options.actions == "test":
        base_port = 9191
        if len(options.submodels) > 1:
            allresults = ResultSetNER(corpus, options.output[1])
            for i, submodel in enumerate(options.submodels):
                model = StanfordNERModel(options.models + "_" + submodel)
                model.load_tagger(base_port + i)
                # load data into the model format
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                # run the classifier on the data
                results = model.test(corpus, port=base_port + i)
                allresults.add_results(results)
                model.kill_process()
            # save the results to an object that can be read again,
            # and log files to debug
            final_results = allresults.combine_results()
        else:
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_tagger()
            model.load_data(corpus, feature_extractors.keys(), mode="test")
            final_results = model.test(corpus)
        # with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
        #     lines = final_results.corpus.write_chemdner_results(options.models, outfile)
        # final_results.lines = lines
        final_results.save(options.output[1] + ".pickle")
    elif options.actions == "test_matcher":
        if "mirna" in options.models:
            model = MirnaMatcher(options.models)
        else:
            model = MatcherModel(options.models)
        results = ResultsNER(options.models)
        results.corpus, results.entities = model.test(corpus)
        allentities = set()
        for e in results.entities:
            allentities.add(results.entities[e].text)
        with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
            outfile.write('\n'.join(allentities))
        results.save(options.output[1] + ".pickle")
    elif options.actions == "test_multiple":
        logging.info("testing with multiple classifiers... {}".format(
            ' '.join(options.submodels)))
        allresults = ResultSetNER(corpus, options.output[1])
        if len(options.submodels) < 2:
            models = TaggerCollection(basepath=options.models)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
        else:
            base_port = 9191
            for submodel in options.submodels:
                models = TaggerCollection(basepath=options.models + "_" + submodel,
                                          baseport=base_port)
                models.load_models()
                results = models.test_types(corpus)
                logging.info("combining results...")
                submodel_results = results.combine_results()
                allresults.add_results(submodel_results)
                base_port += len(models.models)
            final_results = allresults.combine_results()
        logging.info("saving results...")
        final_results.save(options.output[1] + ".pickle")
    elif options.actions == "test_relations":
        if options.kernel == "jsre":
            model = JSREKernel(corpus, options.ptype, train=False)
        elif options.kernel == "svmtk":
            model = SVMTKernel(corpus, options.ptype)
        elif options.kernel == "rules":
            model = RuleClassifier(corpus, options.ptype)
        elif options.kernel == "stanfordre":
            model = StanfordRE(corpus, options.ptype)
        elif options.kernel == "scikit":
            model = ScikitRE(corpus, options.ptype)
        elif options.kernel == "crf":
            model = CrfSuiteRE(corpus, options.ptype, test=True)
        model.load_classifier()
        model.test()
        results = model.get_predictions(corpus)
        results.save(options.output[1] + ".pickle")
    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
def process_request(conn, addr):
    print("connected client:", addr)
    lst = b''
    data_com = conn.recv(4096)
    data_com = data_com.decode("utf8")
    data_com = data_com.split(' ')
    length = int(data_com[1])
    i = 0
    while i < length:
        data = conn.recv(1024)
        lst += data
        i += 1024
    # print(data_com)
    lst2 = pickle.loads(lst)
    if data_com[0].upper() == 'STAT':
        if len(lst2) < 10:
            error = 'Not enough data'
            conn.sendall(error.encode("utf8"))
        else:
            tweet_top = tweet_top10(lst2)
            retweet_top = (list(retweet_top10(lst2)))[:10]
            retweet_top10_necessary = []
            for i in range(len(retweet_top)):
                retweet_top10_necessary.append([])
                retweet_top10_necessary[i].append(retweet_top[i][6])
                retweet_top10_necessary[i].append(retweet_top[i][3])
                retweet_top10_necessary[i].append(retweet_top[i][8])
            author_top = author_top10(lst2)
            country_tweet, country_retweet = country(lst2)
            # print(tweet_top)
            # print(retweet_top10_necessary)
            # print(author_top)
            data_for_client = [['Popular words', 'Number of words']]
            data_for_client.extend(tweet_top)
            data_for_client.extend([])
            data_for_client.extend([['Tweet content', 'author', 'RT']])
            data_for_client.extend(retweet_top10_necessary)
            data_for_client.extend([['author', 'followers']])
            data_for_client.extend(author_top)
            data_for_client.extend([['country_tweet'], country_tweet])
            data_for_client.extend([['country_retweet'], country_retweet])
            # print(data_for_client)
            message = pickle.dumps(data_for_client)
            size = len(message)
            conn.sendall((str(size)).encode("utf8"))
            time.sleep(1)
            conn.sendall(message)
    if data_com[0].upper() == 'ENTI':
        nlp = StanfordCoreNLP('http://localhost:9000')
        pos = []
        for i in lst2:
            text = i[6].replace('\n', ' ')
            # print(i[6])
            result = nlp.annotate(text, properties={
                'annotators': 'ner',
                'outputFormat': 'json',
                'timeout': 100000,
            })
            # print(result["sentences"][0])
            # note: only the first sentence of each tweet is inspected here
            for word in result["sentences"][0]["tokens"]:
                pos.append('{} ({})'.format(word["word"], word["ner"]))
            # print(pos)
            # print('')
            # print(text)
        string = " ".join(pos)
        # print(pos)
        message = pickle.dumps(string)
        size = len(message)
        conn.sendall((str(size)).encode("utf8"))
        time.sleep(1)
        conn.sendall(message)
    conn.close()
def nlp_partial_sent(host_url):
    # note: host_url is accepted but the URL below is hard-coded
    nlp_server = StanfordCoreNLP('http://localhost:9000')
    return partial(nlp_server.annotate, properties={'outputFormat': 'json'})
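# Usage sketch for nlp_partial_sent (the sentence is illustrative): the
# returned partial fixes only the properties, so each call just supplies the
# text, and annotators fall back to the server defaults.
annotate = nlp_partial_sent('http://localhost:9000')
result = annotate('Stanford CoreNLP runs as a web service.')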
    def __init__(self, files=None):
        self.sources = files
        self.triples = []
        self.news = ""
        self.nlp = StanfordCoreNLP('http://localhost:9000')
    def _find_nodes(self, tag_with):
        """
        Apply NER to extract entities and their positional information from
        the context. When working with flair, a heuristic is used to
        counteract cases in which an entity contains trailing punctuation
        (this would conflict with BertTokenizer later on).
        :param tag_with: either 'stanford' or an instance of flair.models.SequenceTagger
        """
        ent_id = 0
        if tag_with == 'stanford':
            tagger = StanfordCoreNLP("http://corenlp.run/")
            for para_id, paragraph in enumerate(self.context):  # between 0 and 10 paragraphs
                # merge header and sentences into one list
                sentences = [paragraph[0]] + paragraph[1]
                for sent_id, sentence in enumerate(sentences):
                    # first sentence is the paragraph title
                    annotated = tagger.annotate(sentence, properties={
                        "annotators": "ner",
                        "outputFormat": "json"
                    })
                    entities = annotated['sentences'][0]['entitymentions']  # list of dicts
                    for e in entities:
                        self.graph[ent_id] = {
                            "address": (para_id, sent_id,
                                        e['characterOffsetBegin'],
                                        e['characterOffsetEnd']),
                            "links": [],  # relations
                            "mention": e['text']  # name of the node
                        }
                        # print(f"in EntityGraph._find_nodes(): address & mention: {self.graph[ent_id]['address']} -- {self.graph[ent_id]['mention']}")  # CLEANUP
                        ent_id += 1
        elif type(tag_with) == SequenceTagger:
            tagger = tag_with
            # print(f"in EntityGraph._find_nodes(): context:\n{self.context}")  # CLEANUP
            for para_id, paragraph in enumerate(self.context):  # between 0 and 10 paragraphs
                # merge header and sentences into one list and convert to Sentence objects
                sentences = [Sentence(s) for s in [paragraph[0]] + paragraph[1]]
                tagged_sentences = tagger.predict(sentences)
                for sent_id, sentence in enumerate(tagged_sentences):
                    # first sentence is the paragraph title
                    entities = sentence.get_spans('ner')
                    for e in entities:
                        if e.text.endswith(('.', '?', '!', ',', ':')):
                            # counter tagging errors
                            end_pos = e.end_pos - 1
                            text = e.text[:-1]
                        else:
                            end_pos = e.end_pos
                            text = e.text
                        self.graph.update({
                            ent_id: {
                                "address": (para_id, sent_id, e.start_pos, end_pos),
                                "links": [],  # relations
                                "mention": text  # name of the node
                            }
                        })
                        ent_id += 1
        else:
            print(f"invalid tagger: {tag_with}. Continuing with a flair tagger.")
            self._find_nodes(SequenceTagger.load('ner'))
# -*- coding:utf-8 -*-
from pycorenlp import StanfordCoreNLP
import re
from nltk import RegexpParser

# nlp = StanfordCoreNLP('http://localhost:9000/')
nlp = StanfordCoreNLP("http://corenlp.run/")

grammar = """
V: {<VB.*><PR>?<IN|TO>?}
W: {<NN*|JJ|RB.*|PRP.*|DT>}
P: {<IN|TO|PR>}
VP2: {<V><P>}
VP3: {<V><W>+<P>}
VP1: {<V>}
"""
vp_parser = RegexpParser(grammar)


def clean(word):
    if "(" in word:
        word = word[:word.find("(")]
    return word


def analyze(sentence):
    output = nlp.annotate(sentence, properties={
        'annotators': 'tokenize,ssplit,pos,parse,depparse,coref',
        'tokenize.whitespace': True,
        'outputFormat': 'json'
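# The annotate call above is cut off in the original. Separately, a small
# usage sketch for vp_parser (the tagged tokens are illustrative):
# RegexpParser chunks a POS-tagged sequence, and the cascaded rules first
# build V/W/P chunks and then combine them into VP1/VP2/VP3.
tagged = [('He', 'PRP'), ('went', 'VBD'), ('to', 'TO'), ('the', 'DT'), ('store', 'NN')]
print(vp_parser.parse(tagged))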
def nlp_partial_sent(host_url):
    from pycorenlp import StanfordCoreNLP
    from functools import partial
    nlp_server = StanfordCoreNLP('http://localhost:9000')
    return partial(nlp_server.annotate, properties={'outputFormat': 'json'})
import re
import json
import pickle
import os
from tqdm import tqdm
from pycorenlp import StanfordCoreNLP

nlp_server = StanfordCoreNLP('http://ink-molly.usc.edu:9000')

version = "1.0"

# File name
file_path = "data/01.src.txt"

# Display options
IF_DISP_PREFIX = False
IF_DISP_TQDM = False
IF_DISP_VB_UNMATCH = False
IF_DISP_IF_UNMATCH = False
IF_DISP_BAN = False
IF_DISP_ALL_SEN = False
IF_VERB_ONLY = True

# Character filter
character_patterns = [
    '^Craig:.*',
    '^Cestero:.*',
]
def get_tokens_and_dependencies(sentence):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(sentence, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse,dcoref',
        'outputFormat': 'json'
    })
    tokens = output['sentences'][0]['tokens']
    # older CoreNLP versions use 'basic-dependencies'; newer ones use 'basicDependencies'
    dependencies = output['sentences'][0]['basic-dependencies']
    return tokens, dependencies
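# Usage sketch for get_tokens_and_dependencies (the sentence is illustrative);
# each dependency dict carries 'dep', 'governorGloss', and 'dependentGloss'.
tokens, deps = get_tokens_and_dependencies('The dog chased the cat.')
for dep in deps:
    print(dep['dep'], dep['governorGloss'], dep['dependentGloss'])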
import sys
reload(sys)  # Python 2 only: needed for setdefaultencoding below
sys.setdefaultencoding('utf8')
import pickle
import re
import random
import string
import json
from xml.etree import ElementTree as etree
from pycorenlp import StanfordCoreNLP
from xmljson import BadgerFish
from collections import OrderedDict

nlp = StanfordCoreNLP('http://localhost:9000')


def parse(input):
    dbg = open("debug", "w")
    output = nlp.annotate(input, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'xml',
        'timeout': 30000})
    fixed = []
    for o in output:
        fixed.append(o)
    return ("".join(fixed))


def lparse(input):
from pycorenlp import StanfordCoreNLP
from typing import Set

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

all_zeroes = "ALL_ZERO"
unknown_el = "_UNKNOWN"
epsilon = 10e-8

# map HTML entity strings to their characters (the keys appear already
# decoded in some copies of this file, which would make the mapping a no-op)
special_tokens = {"&ndash;": "–",
                  "&mdash;": "—",
                  "@card@": "0"}

corenlp = StanfordCoreNLP('http://semanticparsing:9000')
corenlp_properties = {
    'annotators': 'tokenize, pos, ner',
    'outputFormat': 'json'
}
corenlp_caseless = {
    'pos.model': 'edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
    'ner.model':
        # 'edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz,' +
        'edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz,'
        # + 'edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz'
}

module_location = os.path.abspath(__file__)
module_location = os.path.dirname(module_location)
RESOURCES_FOLDER = os.path.join(module_location, "..", "resources/")
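# Sketch of how the caseless models above would be used (the sample sentence
# is illustrative): merging corenlp_caseless into the request properties
# swaps in the case-insensitive POS and NER models for a single annotate call.
def annotate_caseless(text):
    props = dict(corenlp_properties)
    props.update(corenlp_caseless)
    return corenlp.annotate(text, properties=props)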