def anaphora(text):
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse,coref',
        'outputFormat': 'text'})
    sents = nltk.sent_tokenize(text)
    a = []
    for sent in sents:
        a.append(sent.split())
    output = str(output.replace('\r', '').replace('\t', ''))
    # output = output.split('Coreference set:', 1)[1]
    output = output.split('Coreference set:')
    # output = str(output.replace('\r','').replace('\t',''))
    # output = output.split('\n');
    for out in output[1:]:
        # print out
        out = str(out.replace('\r', '').replace('\t', ''))
        out = out.split('\n')
        for i in out[1:-1]:
            i = i.split(', that is:')
            toFrom = i[0].split('->')
            fromSent, fromStart, fromEnd = sentenceRange(toFrom[0])
            toSent, toStart, toEnd = sentenceRange(toFrom[1])
            fromText, toText = fromTo(i[1])
            if len(toText.split()) > 1:
                toText = shorten(toText)
            toText = [toText]
            # a[fromSent - 1][fromStart - 1:fromEnd - 1] = a[toSent - 1][toStart - 1:toEnd - 1]
            a[fromSent - 1][fromStart - 1:fromEnd - 1] = toText
    return a
def corenlp_tokenize(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(output['sentences'][0]['parse'])
    return output
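# A minimal sketch (not part of the original snippets) of walking the JSON that
# nlp.annotate() returns when 'outputFormat' is 'json': each entry in
# output['sentences'] carries a 'tokens' list whose items expose fields such as
# 'word' and 'pos'. Assumes a CoreNLP server is already running on localhost:9000.
from pycorenlp import StanfordCoreNLP

def print_pos_tags(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'json'
    })
    for sentence in output['sentences']:
        for token in sentence['tokens']:
            print(token['word'], token['pos'])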
def stanford_parsing_result():
    text = """ I shot an elephant. The dog chased the cat. School go to boy. """
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    print(res['sentences'][0]['parse'])
    print(res['sentences'][2]['parse'])
def NERGetter(text):
    nlp = StanfordCoreNLP('http://192.168.54.210:9000/')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,ner',
        'outputFormat': 'text'})
    output = str(output.replace('\r', '').replace('\t', ''))
    output = output.split('[', 1)[1]
    output = str(output)
    output = output.split('\n')
    for i in output[0:-1]:
        i = i.replace(']', '')
        i = i.split('NamedEntityTag=')
        return i[1]
class Parser:
    def __init__(self, coreNLPServer='http://localhost:9000'):
        # use the configured server URL rather than a hard-coded address
        self.nlp = StanfordCoreNLP(coreNLPServer)

    def word_list(self, text):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit',
            'outputFormat': 'json'
        })
        word_array = []
        for sentence in nlp_output['sentences']:
            for w in sentence['tokens']:
                word_array.append(w['word'].lower())
        return word_array

    def parse_tree(self, text, binary=False, preprocessed=False):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,parse',
            'outputFormat': 'json',
            'parse.binaryTrees': 'true'
        })
        if type(nlp_output) == str:
            nlp_output = json.loads(nlp_output, strict=False)
        if len(nlp_output['sentences']) > 1:
            # merge trees from sentences
            tree_string = "(Top "
            for s in nlp_output['sentences']:
                p_tree = Tree.fromstring(s['parse'])
                tree_string += str(p_tree[0])
            tree_string += ")"
            merged_tree = Tree.fromstring(tree_string)
        else:
            # no merging required
            merged_tree = Tree.fromstring(nlp_output['sentences'][0]['parse'])
            # remove root
            merged_tree = merged_tree[0]
        if binary:
            nltk.treetransforms.chomsky_normal_form(merged_tree)
        if preprocessed:
            merged_tree = preprocess_parse_tree(merged_tree)
        return merged_tree

    def draw_parse_tree(self, parse_tree):
        nltk.draw.tree.draw_trees(parse_tree)
def __init__(self, argv):
    self.input = ""
    self.output_folder = ""  # output has to be a folder
    self.input_type = ""
    # Start Stanford CoreNLP Server
    self.nlp = StanfordCoreNLP('http://localhost:9000')
    # Read User Command Line
    opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    for opt, arg in opts:
        if opt == '-h':
            print("Type 'python3.5 text_preprocessing/preprocess.py -i <inputfile> -o <outputfile>' in run_source_code.sh file")
            sys.exit()
        elif opt in ("-i", "--ifile"):
            self.input = arg
            if os.path.exists(arg) == False:
                print("Input doesn't exist")
                sys.exit()
            if os.path.isdir(arg) == True:
                self.input_type = "dir"
            elif os.path.isfile(arg) == True:
                self.input_type = "file"
        elif opt in ("-o", "--ofile"):
            self.output_folder = arg
    print("Input: " + self.input + ", " + self.input_type)
    print("Output: " + self.output_folder)
def __init__(self, annotators='tokenize,ssplit,pos,parse,lemma,ner', cacheDir='./corenlp'):  # depparse
    self.annotators = annotators
    self.nlp = StanfordCoreNLP('http://localhost:9000')
    if not os.path.exists(cacheDir):
        os.makedirs(cacheDir)
    self.cache = os.listdir(cacheDir)
    self.cacheDir = cacheDir
class StanfordServerParser(Parser, GenericStanfordParser):
    """Follow the readme to setup the Stanford CoreNLP server"""

    def __init__(self, host='localhost', port=9000, properties={}):
        url = 'http://{0}:{1}'.format(host, port)
        self.nlp = StanfordCoreNLP(url)
        if not properties:
            self.properties = {
                'annotators': 'parse',
                'outputFormat': 'json',
            }
        else:
            self.properties = properties

    def _make_tree(self, result):
        return Tree.fromstring(result)

    def parse(self, sent):
        output = self.nlp.annotate(sent, properties=self.properties)
        # Got random html, return empty tree
        if isinstance(output, str):
            return Tree('', [])
        parse_output = output['sentences'][0]['parse'] + '\n\n'
        tree = next(next(self._parse_trees_output(parse_output)))[0]
        return tree
class CoreNLP:
    """Used to initialize the Stanford Core NLP in servlet mode and then
    connect to it using a socket"""
    mongo = MongoClient()
    mongo_db = mongo.get_database('dependencies')

    def __init__(self, timeout=15000, port=9000, buffer_size=4096):
        """Used to initialize the StanfordAPI object with the host, port and buffer"""
        # self.host = socket.gethostname()
        self.port = str(port)
        # self.timeout = str(timeout)
        # self.buffer = str(buffer_size)
        # self.process = Popen(
        #     args=['java', '-mx4g', '-cp', 'commons/corenlp/*',
        #           'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
        #           '-port', self.port, '-timeout', self.timeout])
        # time.sleep(5)
        self.nlp = StanfordCoreNLP('http://localhost:' + self.port)

    def parse(self, text):
        dobj = self.mongo_db.get_collection('dependency').find_one({'text': text})
        if not dobj or dobj['deps'] == 'CoreNLP request timed out. Your document may be too long.':
            output = self.nlp.annotate(text, properties={
                'annotators': 'tokenize,ssplit,pos,depparse,parse,coref',
                'coref.algorithm': 'neural',
                'outputFormat': 'json',
            })
            dep = {'text': text, 'deps': output}
            self.mongo_db.get_collection('dependency').insert_one(dep)
            return output
        else:
            return dobj['deps']
def __init__(self, corenlp_url='http://localhost:9000'):
    '''
    Create Converter for converting NER annotations to Brat annotations
    classifier training data.
    To start the server checkout:
    http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
    '''
    self.corenlp = StanfordCoreNLP(corenlp_url)
def resolve(self, text):
    sentences_all = sent_tokenize(text, 'English')
    for i in range(2, len(sentences_all)):
        text2 = sentences_all[i - 2] + ' ' + sentences_all[i - 1] + ' ' + sentences_all[i]
        print(text2)
        sentences = sent_tokenize(text2, 'English')
        print(sentences)
        nlp = StanfordCoreNLP('http://localhost:9000')
        output = nlp.annotate(text2, properties={
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,mention,dcoref',
            'outputFormat': 'json'
        })
        # target.write(output)
        # target.close()
        corefs = output['corefs']
        cnt = 1
        for key, chains in corefs.items():
            substitute = ''
            print("\nchain number " + str(cnt))
            cnt += 1
            for chain in chains:
                # print(chain['isRepresentativeMention']+'\n')
                print(chain['type'] + ' ' + chain['text'])
                if (chain['isRepresentativeMention'] is True) and (chain['type'] != 'PRONOMINAL'):
                    substitute = str(chain['text'])
                    print(substitute + '\n')
                if (chain['type'] == 'PRONOMINAL') and (substitute != ''):
                    sentence_num = chain['sentNum']
                    words = word_tokenize(sentences[sentence_num - 1], 'English')
                    words[chain['startIndex'] - 1] = substitute
                    new_sentence = ' '.join(words)
                    sentences[sentence_num - 1] = new_sentence
        sentences_all[i - 2] = sentences[0]
        sentences_all[i - 1] = sentences[1]
        sentences_all[i] = sentences[2]
    return sentences_all
def standford_sentiment_answer(text_str):
    asw_sentiment = make_default_sentiment()
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text_str, properties={
        'annotators': 'sentiment',
        'outputFormat': 'json',
        'timeout': 20000,
    })
    try:
        total_value = 0.0
        for s in res["sentences"]:
            total_value += float(s["sentimentValue"])
            asw_sentiment[s["sentiment"]] += 1
        asw_sentiment['score'] = total_value
        return asw_sentiment
    except:
        return asw_sentiment
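# Note (not from the original snippets): with the 'sentiment' annotator, each
# entry in res["sentences"] carries a "sentimentValue" string from "0" (very
# negative) to "4" (very positive) plus a coarse "sentiment" label such as
# "Negative", "Neutral" or "Positive", which is what the function above sums
# and counts. A minimal sketch, assuming a local server on port 9000:
from pycorenlp import StanfordCoreNLP

def sentence_sentiments(text):
    nlp = StanfordCoreNLP('http://localhost:9000')
    res = nlp.annotate(text, properties={'annotators': 'sentiment',
                                         'outputFormat': 'json'})
    return [(s['sentiment'], int(s['sentimentValue'])) for s in res['sentences']]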
def __init__(self, host='localhost', port=9000, properties={}):
    url = 'http://{0}:{1}'.format(host, port)
    self.nlp = StanfordCoreNLP(url)
    if not properties:
        self.properties = {
            'annotators': 'parse',
            'outputFormat': 'json',
        }
    else:
        self.properties = properties
def __init__(self, timeout=15000, port=9000, buffer_size=4096):
    """Used to initialize the StanfordAPI object with the host, port and buffer"""
    # self.host = socket.gethostname()
    self.port = str(port)
    # self.timeout = str(timeout)
    # self.buffer = str(buffer_size)
    # self.process = Popen(
    #     args=['java', '-mx4g', '-cp', 'commons/corenlp/*',
    #           'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
    #           '-port', self.port, '-timeout', self.timeout])
    # time.sleep(5)
    self.nlp = StanfordCoreNLP('http://localhost:' + self.port)
def __init__(self, **kwargs):
    super(CoreNLPParser, self).__init__(**kwargs)
    self.corenlp = StanfordCoreNLP(kwargs['corenlp_url'])
    self.props = {
        'annotators': 'tokenize,ssplit,lemma,pos,ner',
        'outputFormat': 'json',
        'ner.useSUTime': False,                 # dont want SUTime model
        'ner.applyNumericClassifiers': False,   # Dont want numeric classifier
    }
    if kwargs.get('ner_model'):  # set NER model from CLI
        if not os.path.exists(kwargs.get('ner_model')):
            print('Error: Could not find NER model %s.' % kwargs.get('ner_model'))
            sys.exit(1)
        self.props['ner.model'] = kwargs['ner_model']
    print("CoreNLP Properties : ", self.props)
class StanfordAnnotator(PR):
    def __init__(self, annotators='tokenize,ssplit,pos,parse'):  # depparse
        self.annotators = annotators
        self.nlp = StanfordCoreNLP('http://localhost:9000')

    def process(self, doc):
        output = self.nlp.annotate(doc.getText(), properties={
            'annotators': self.annotators,
            'outputFormat': 'json',
            'timeout': '600000'
        })
        sents = []
        tokens = []
        # print("output", json.dumps(output))
        tStart = 0
        tEnd = 0
        for s in output['sentences']:
            sentText = []
            sentTokens = []
            for t in s['tokens']:
                # print(t)
                sentText.append(t['before'])
                sentText.append(t['originalText'])
                token = Annotation(t['originalText'], tEnd, tEnd,
                                   t['characterOffsetBegin'], t['characterOffsetEnd'],
                                   'Token', doc)
                token.setFeature('pos', t['pos'])
                token.setFeature('index', t['index'])
                tokens.append(token)
                sentTokens.append(token)
                tEnd += 1
            cStart = s['tokens'][0]['characterOffsetBegin']
            cEnd = s['tokens'][-1]['characterOffsetEnd']
            sentText = "".join(sentText)
            print(sentText)
            sent = Annotation(sentText, tStart, tEnd, cStart, cEnd, 'Sentence', doc)
            tStart = tEnd
            sent.setFeature('constituency-parse', s['parse'])
            sent.setFeature('dep-parse', 'not implemented!')
            sent.setFeature('index', s['index'])
            # sent.setRelation('tokens', sentTokens)
            sents.append(sent)
        # pr-
        doc.setSents(sents)
        doc.setTokens(tokens)
class NLPFactory:
    def __init__(self):
        self.url = os.environ.get("CORENLP_URL", "http://localhost:9000")
        self.nlp = StanfordCoreNLP(self.url)

    def annotate(self, text):
        """
        annotate with the dependency parser
        Args:
            text (str): input data
        Returns:
            json
        """
        # corenlp will treat sentences with full stop independently
        text = text.replace('.', ',').replace('!', ',')
        return self.nlp.annotate(text, properties={"annotators": "pos,lemma,depparse,sentiment",
                                                   "outputFormat": "json"})
def load_classifier(self, inputfile="slk_classifier.model.txt", outputfile="jsre_results.txt"):
    self.corenlp_client = StanfordCoreNLP('http://localhost:9000')
    # sup.relation.model=
    tokenkeys = set()
    sentencekeys = set()
    for d in self.corpus.documents:
        for s in self.corpus.documents[d].sentences:
            corenlpres = self.corenlp_client.annotate(s.text.encode("utf8"), properties={
                'ssplit.eolonly': True,
                'openie.triple.all_nominals': True,
                'openie.triple.strict': False,
                'openie.max_entailments_per_clause': 500,
                'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie',
                # 'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, relation, openie',
                'outputFormat': 'json',
                # 'sup.relation.model': self.modelname
            })
            for o in corenlpres["sentences"][0]["openie"]:
                if "mir" in o["object"] or "mir" in o["subject"]:
                    print("{}={}>{}".format(o["subject"], o["relation"], o["object"]))
class NerToBratConverter(object):
    def __init__(self, corenlp_url='http://localhost:9000'):
        '''
        Create Converter for converting NER annotations to Brat annotations
        classifier training data.
        To start the server checkout:
        http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
        '''
        self.corenlp = StanfordCoreNLP(corenlp_url)

    def convertToBrat(self, text_file, ann_file):
        print("Processing %s" % text_file)
        with open(text_file) as f:
            text = f.read()
        props = {
            'annotators': 'tokenize,ssplit,pos,ner',
            'outputFormat': 'json'}
        output = self.corenlp.annotate(text, properties=props)
        # flatten sentences and tokens
        tokenlists = [s['tokens'] for s in output['sentences']]
        tokens = itertools.chain.from_iterable(tokenlists)
        count = 1
        with open(ann_file, 'w', 1) as out:
            for token in tokens:
                if token['ner'] != 'O':
                    rec = "T%d\t%s %d %d\t%s" % (count, token['ner'],
                                                 token['characterOffsetBegin'],
                                                 token['characterOffsetEnd'],
                                                 token['originalText'])
                    # print(rec)
                    out.write(rec)
                    out.write("\n")
                    count += 1
        print("Wrote %s" % ann_file)

    def convert_all(self, input_paths):
        with open(input_paths) as paths:
            for d in map(lambda x: x.split(','), map(lambda x: x.strip(), paths)):
                self.convertToBrat(d[0], d[1])
class StanfordNERApi():
    '''
    Make use of StanfordCoreNLP Server
    Extract keywords through named entity recognition
    '''

    def __init__(self):
        self.nlp = StanfordCoreNLP(NLP_SERVER)

    def ner_groupby_ner(self, text):
        response = self.nlp.annotate(text, properties={
            'annotators': 'ner,lemma',
            'outputFormat': 'json'
        })
        return self.__process_ner_groupby_ner(response)

    def __process_ner_groupby_ner(self, response):
        output_dict = dict()
        '''The response is generally organized as {sentences:[{tokens:[]},{}]}'''
        if type(response) == dict and 'sentences' in response:
            for sentence in response['sentences']:
                for item in sentence['tokens']:
                    # we only care about ner in set TARGET_NER
                    if item.get('ner') in TARGET_NER:
                        if item['ner'] not in output_dict:
                            output_dict[item['ner']] = set()
                        output_dict[item['ner']].add(item['originalText'])
            # convert from set to list for further json dumps
            for key in output_dict:
                output_dict[key] = list(output_dict[key])
            # convert dict to string by json dumps
            if len(output_dict) > 0:
                return json.dumps(output_dict)
            else:
                return None
        else:
            logger.warning('sentences part is not in the response from NLP server.')
            return None
def __init__(self, files=None):
    self.sources = files
    self.triples = []
    self.news = ""
    self.nlp = StanfordCoreNLP('http://localhost:9000')
def __init__(self):
    self.url = os.environ.get("CORENLP_URL", "http://localhost:9000")
    self.nlp = StanfordCoreNLP(self.url)
import os
import sys
from unidecode import unidecode
import string

printable = set(string.printable)

# from nltk.tag import StanfordNERTagger
# st = StanfordNERTagger("./classifiers/english.all.3class.distsim.crf.ser.gz",
#                        path_to_jar="./stanford-ner.jar")

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

path = sys.argv[1]
spath = path + '/../names/'
d = os.path.dirname(spath)
if not os.path.exists(d):
    os.makedirs(d)

def remove_non_ascii(text):
    return unidecode(unicode(text, encoding="utf-8"))

for file in os.listdir(path):
    current = os.path.join(path, file)
    if os.path.isfile(current):
        data = open(current, 'rb')
    else:
        return x

# tagger keeps freaking out at numbers in parentheses.
# this function removes the parentheses
def removeParenth(x):
    findParenth = re.search('\([0-9]+\)', x)
    if findParenth:
        x = re.sub('\(', '', x)
        x = re.sub('\)', '', x)
        return x
    else:
        return x

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')
    for line in orig_file:
        if not line.startswith("PMID"):
            info = line.split('\t')
            pmid = info[0]
            ta = info[1]
            sentence = info[2]
            sentence = sentence.rstrip('\n')
            cleanSentence = removeBracket(sentence)
            extraClean = removeParenth(cleanSentence)
            output = nlp.annotate(extraClean, properties={
                'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'outputFormat': 'json'})
            try:
                result = output['sentences'][0]['parse']
from pycorenlp import StanfordCoreNLP
from pprint import pprint
import json

FILE = "data/test200"
nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))

def get_stanford_annotations(text, port=9000,
                             annotators='tokenize,ssplit,pos,lemma,depparse,parse'):
    output = nlp.annotate(text, properties={
        "timeout": "10000",
        "ssplit.isOneSentence": "true",
        'annotators': annotators,
    })
    return output

with open(FILE + '.txt', encoding='utf-8') as in_file, \
        open(FILE + '.NRE', 'w', encoding='utf-8') as out_file:
    for line in in_file:
        ls = line.strip().split('\t')
        sent_id = ls[0].strip()
        document = ' '.join(ls[1].strip().split())
        token1 = ls[2]
def rawfileprocess(rawfile, outputFile, aspect_file):
    f = open(rawfile)
    fout = open(outputFile, 'a')
    faspectfile = open(aspect_file, 'a')
    result = []
    aspectset = set()
    nlp = StanfordCoreNLP('http://localhost:9000')
    for line in f:
        line = line.strip()
        seperatorIndex = line.find('##')
        if seperatorIndex <= 0:
            continue
        # aspect preprocess
        aspectString = line[:seperatorIndex].strip()
        if aspectString.find('[') < 0:
            continue
        aspectsTmp = aspectString.split(',')
        aspects = []
        for aspectScore in aspectsTmp:
            aspectScore = aspectScore.strip(' ')
            if aspectScore.find('[u]') >= 0 or aspectScore.find('[p]') >= 0:
                continue
            endIndex = aspectScore.find('[')
            if endIndex < 0:
                continue
            aspects.append(aspectScore[:endIndex].split())
            for aspectitem in aspectScore[:endIndex].split():
                if aspectitem != ' ':
                    aspectset.add(aspectitem)
        if len(aspects) == 0:
            continue
        # sentence tokenizer and word tokenizer and dep pos
        rawReview_1 = line[seperatorIndex + 2:].strip()
        output3 = nlp.annotate(rawReview_1, properties={
            'annotators': 'tokenize,pos,depparse',
            'outputFormat': 'json'
        })
        for index_sentence in range(0, len(output3['sentences'])):
            subsentence = output3['sentences'][index_sentence]['tokens']
            subsentencedep = output3['sentences'][index_sentence]['enhancedPlusPlusDependencies']
            tmpword = []
            tmppos = []
            tmpdeps = []
            for index in range(0, len(subsentence)):
                tmpword.append(subsentence[index]['word'])
                tmppos.append(subsentence[index]['pos'])
                tmpdep = ''
                for deps in subsentencedep:
                    if deps['dependent'] == index + 1 or deps['governor'] == index + 1:
                        dependent_index = deps['dependent'] - 1
                        gov_index = deps['governor'] - 1
                        if deps['governorGloss'] == 'ROOT':
                            govpos = '#'
                        else:
                            govpos = subsentence[gov_index]['pos']
                        if deps['dependentGloss'] == 'ROOT':
                            deppos = '#'
                        else:
                            deppos = subsentence[dependent_index]['pos']
                        tmpdep += '(' + deps['dep'] + ' ' + deps['governorGloss'] + ' ' + govpos \
                                  + ' ' + deps['dependentGloss'] + ' ' + deppos + ')\t'
                tmpdep.strip('\t')
                tmpdeps.append(tmpdep)
            lables = get_lable_2(aspects, tmpword)
            if 'B-TERM' not in lables:
                continue
            '''
            for i in range(0, len(lables)):
                result.append(tmpword[i]+'\t'+tmppos[i] +'\t'+lables[i]+'\n')  #+'\t'+tmpdeps[i]
                #result.append('\n')
            result.append('\n')
            '''
            for i in range(0, len(lables)):
                result.append(tmpword[i] + '\t' + tmppos[i] + '\t' + lables[i] + '\t' + tmpdeps[i] + '\n')
            result.append('\n')
    try:
        for aspect in aspectset:
            faspectfile.write(aspect + '\n')
        for word in result:
            fout.write(word)
    except IOError:
        print("IOError exception")
        exit(0)
    f.close()
    fout.close()
    faspectfile.close()
from pycorenlp import StanfordCoreNLP
from functools import reduce
import pandas as pd

nlp = StanfordCoreNLP('http://localhost:9000')

def keywordsInSen(sen):
    words = [(t['lemma'], t['index'], t['ner']) for t in sen['tokens'] if t['ner'] != 'O']
    reducedWords = []
    if len(words) == 0:
        return []
    parWord = words[0]
    if len(words) > 1:
        for w in words[1:]:
            if w[1] == parWord[1] + 1 and w[2] == parWord[2]:
                parWord = (parWord[0] + ' ' + w[0], parWord[1] + 1, parWord[2])
            else:
                reducedWords.append((parWord[0], parWord[2]))
                parWord = w
    reducedWords.append((parWord[0], parWord[2]))
    return reducedWords

def keywordsInTxt(txt):
    an = nlp.annotate(txt, properties={
        'annotators': 'ner',
        'outputFormat': 'json'
    })
    wordLists = [keywordsInSen(s) for s in an['sentences']]
    return set(reduce(lambda x, y: x + y, wordLists))

def extractFromHeadlines(headlineTable):
import re
import json
import pickle
import os
from tqdm import tqdm
from pycorenlp import StanfordCoreNLP

nlp_server = StanfordCoreNLP('http://ink-molly.usc.edu:9000')

version = "1.0"

# File name
file_path = "data/01.src.txt"

# Display options
IF_DISP_PREFIX = False
IF_DISP_TQDM = False
IF_DISP_VB_UNMATCH = False
IF_DISP_IF_UNMATCH = False
IF_DISP_BAN = False
IF_DISP_ALL_SEN = False
IF_VERB_ONLY = True

# Character filter
character_patterns = [
    '^Craig:.*',
    '^Cestero:.*',
]
""" from bs4 import BeautifulSoup as bS import collections import networkx as nx import os from pycorenlp import StanfordCoreNLP import re import requests import unicodedata # starting coreNLP server via the following command. # for the latest version, see: https://stanfordnlp.github.io/CoreNLP/ # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 nlp = StanfordCoreNLP('http://localhost:9000') # <editor-fold desc='Define the functions that we will need for this script..'> def edge2graph(input_label, input_edges, input_diag_name, input_bio_dict): """ Combines all the info in one graph object... This is a preprocessing step to store the info as a json. :param input_label: The input category we are going to prepocess :param input_edges: List of all edges that are found while analyzing the text :param input_diag_name: List of all diag people names :param input_bio_dict: Dict of the bio text of all the diag people :return: """ ppl_label_edge_list = [x[:-1] for x in input_edges if input_label in x]
from pycorenlp import StanfordCoreNLP
import pymysql

db = pymysql.connect(host="localhost", user="******", password="******", db="ArticleNYT")
print(db)

nlp = StanfordCoreNLP('http://localhost:9000')
print(nlp)

cur = db.cursor()
cur.execute("""SELECT COUNT(NID) FROM Tech_2018;""")
temp = cur.fetchone()
counter = temp[0] - 1
print(counter)

while (counter >= 0):
    try:
        cur.execute("""SELECT Abstract from Tech_2018 WHERE NID = %s;""", (counter))
        tmpabs = cur.fetchone()
        finabs = str(tmpabs[0])
        res = nlp.annotate(finabs, properties={
            'annotators': 'sentiment',
            'outputFormat': 'json',
            'timeout': 1000000000,
            "ssplit.eolonly": "true"
        })
        for s in res["sentences"]:
            print("%s" % (s["sentimentValue"]))
            score = s['sentimentValue']
    return sum / (len(sentiments))

# tales = ['FundeVogel','Rapunzel','TheGooseGirl','Golden Bird','HansInGoodLuck','JorindaAndJorindel',
#          'TravelingMusicians','OldSultan','TheStraw','BriarRose','DogAndSparrow',
#          'TwelveDancingPrincesses','FishermanAndWife','TheWillowRen','FrogPrince','CatAndMouse']

taleSentiments = []
for taleName in tales:
    # f = open("./Corefs/"+taleName, 'r', encoding="utf8")
    p(taleName)
    if int(sys.argv[1]) == 1:  # sys.argv entries are strings, so compare as int
        f = open("./Stories/" + taleName, 'r', encoding="utf8")
    else:
        f = open("./Corefs/" + taleName, 'r', encoding="utf8")
    tale = f.read()
    tale = tale.replace('\n', ' ')
    tale = tale.replace('\r', ' ')
    # pprint.pprint(tale)
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    # doc = "Ronaldo has moved from Real Madrid to Juventus. While Messi still plays for Barcelona"
    doc = tale
    # pprint.pprint(doc)
    annot_doc = nlp_wrapper.annotate(doc, properties={
        'annotators': 'ner, pos,depparse',
        'outputFormat': 'json',
        'timeout': 100000,
    })
    nsubjs = []
    # pprint.pprint(annot_doc)
    for sentence in annot_doc['sentences']:
        for element in sentence['basicDependencies']:
            if (element['dep'] == 'nsubj'):
# -*- coding:utf-8 -*-
from pycorenlp import StanfordCoreNLP
import re
from nltk import RegexpParser

# nlp = StanfordCoreNLP('http://localhost:9000/')
nlp = StanfordCoreNLP("http://corenlp.run/")

grammar = """
V: {<VB.*><PR>?<IN|TO>?}
W: {<NN*|JJ|RB.*|PRP.*|DT>}
P: {<IN|TO|PR>}
VP2: {<V><P>}
VP3: {<V><W>+<P>}
VP1: {<V>}
"""
vp_parser = RegexpParser(grammar)

def clean(word):
    if "(" in word:
        word = word[:word.find("(")]
    return word

def analyze(sentence):
    output = nlp.annotate(sentence, properties={
        'annotators': 'tokenize,ssplit,pos,parse,depparse,coref',
        'tokenize.whitespace': True,
        'outputFormat': 'json'
#import json
#import nltk
#import math
#import re
import string
from pycorenlp import StanfordCoreNLP
from textblob import TextBlob
from nltk.corpus import wordnet
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import test_main_rules as rules

analyser = SentimentIntensityAnalyzer()
nlp = StanfordCoreNLP('http://localhost:9000')

# f = open("sample_sentences.txt", "r")
line = ''' I have received redmi note 4 black matte 64gb version today. Packaging is so good. About phone: its fabulous phone.Amazing battery back up, good camera, great memory, beautiful colour of phone with classy primium look of black matte makes it different from other phone. I am loving every feature of this phone. '''

asp_sent = {}
asp_rating = {}

def corefResolver(line):
    ind_sent = []
    complete_coref_output = nlp.annotate(line, properties={
        'annotators': 'dcoref',
        'outputFormat': 'json'
    })
    coref_output = complete_coref_output['corefs']
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')

if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)

sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# looping through .ann files in the data directory
ann_data_files = [f for f in listdir(DATA_DIRECTORY)
                  if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann']

for file in ann_data_files:
    entities = []
    relations = []
    # process .ann file - place entities and relations into 2 seperate lists of tuples
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        for line in lines:
def brat_to_conll(input_folder, output_filepath, tokenizer, language): ''' Assumes '.txt' and '.ann' files are in the input_folder. Checks for the compatibility between .txt and .ann at the same time. ''' if tokenizer == 'spacy': spacy_nlp = spacy.load(language) elif tokenizer == 'stanford': core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000)) else: raise ValueError("tokenizer should be either 'spacy' or 'stanford'.") verbose = False dataset_type = os.path.basename(input_folder) print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type), end='') text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt'))) output_file = codecs.open(output_filepath, 'w', 'latin-1') for text_filepath in text_filepaths: base_filename = os.path.splitext(os.path.basename(text_filepath))[0] annotation_filepath = os.path.join(os.path.dirname(text_filepath), base_filename + '.ann') # create annotation file if it does not exist if not os.path.exists(annotation_filepath): codecs.open(annotation_filepath, 'w', 'latin-1').close() text, entities = get_entities_from_brat(text_filepath, annotation_filepath) entities = sorted(entities, key=lambda entity: entity["start"]) if tokenizer == 'spacy': sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp) elif tokenizer == 'stanford': sentences = get_sentences_and_tokens_from_stanford(text, core_nlp) for sentence in sentences: inside = False previous_token_label = 'O' for token in sentence: token['label'] = 'O' for entity in entities: if entity['start'] <= token['start'] < entity['end'] or \ entity['start'] < token['end'] <= entity['end'] or \ token['start'] < entity['start'] < entity['end'] < token['end']: token['label'] = entity['type'].replace( '-', '_' ) # Because the ANN doesn't support tag with '-' in it break elif token['end'] < entity['start']: break if len(entities) == 0: entity = {'end': 0} if token['label'] == 'O': gold_label = 'O' inside = False elif inside and token['label'] == previous_token_label: gold_label = 'I-{0}'.format(token['label']) else: inside = True gold_label = 'B-{0}'.format(token['label']) if token['end'] == entity['end']: inside = False previous_token_label = token['label'] if verbose: print('{0} {1} {2} {3} {4}\n'.format( token['text'], base_filename, token['start'], token['end'], gold_label)) output_file.write('{0} {1} {2} {3} {4}\n'.format( token['text'], base_filename, token['start'], token['end'], gold_label)) if verbose: print('\n') output_file.write('\n') output_file.close() print('Done.') if tokenizer == 'spacy': del spacy_nlp elif tokenizer == 'stanford': del core_nlp
import sys
import json
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')

# file_name = './python_code/test.txt'
# input = open(file_name).read().splitlines()
file_name = json.loads(sys.stdin.readlines()[0])
input = file_name.splitlines()

i = 0
ans_index = 1
res_sentence_arr_disp = []
pos_tags = ['NN', 'NNP', 'NNS', 'NNPS', 'CD', 'JJ']
res_sentence_disp = ''

while (i < len(input)):
    input[i] = input[i].lower()
    res = nlp.annotate(input[i], properties={
        'annotators': 'pos',
        'outputFormat': 'json',
        'timeout': 1000000,
    })
    for k in range(0, len(res["sentences"])):
        tokens = res["sentences"][k]['tokens']
        b_flag = False
        for token in tokens:
#-----Lemmanizing Words------------
lemmas = []
wordnet_lemmatizer = WordNetLemmatizer()
for word in unic_nouns:
    lemma = wordnet_lemmatizer.lemmatize(word)
    lemmas.append(lemma)
#-----------------------------------

#--------Word Cloud-----------------
# Print to generate Word Clouds
""""
for i in range(len(lemmas)):
    print(str(nouns_frequency[i]) + " " + lemmas[i])
"""

#-------------------------------------Item 4-----------------------------------------------------------
sentence = "The last love letter I wrote was probably about 10 years ago."
# tokenized = nltk.word_tokenize(sentence)
parse = StanfordCoreNLP('http://localhost:9000')
output = parse.annotate(sentence, properties={
    'annotators': 'parse',
    'outputFormat': 'json'
})
tree1 = output['sentences'][0]['parse'] + ""
treeFinal = Tree.fromstring(tree1)
treeFinal.draw()
# t = Tree.
# t.draw()
def __init__(self):
    self.corenlp = StanfordCoreNLP('http://localhost:9000')
import json
import os
import re
import requests
import sys
import traceback

from json import JSONDecodeError
from requests.exceptions import RequestException
from nltk.tokenize import sent_tokenize
from pycorenlp import StanfordCoreNLP

nlpserver = StanfordCoreNLP("http://localhost:9000")

def clean_depparse(dep):
    """
    Given a dependency dictionary, return a formatted string representation.
    """
    return str(dep['dep'] + "(" + dep['governorGloss'].lower() + "-" + str(dep['governor']) +
               ", " + dep['dependentGloss'] + "-" + str(dep['dependent']) + ")")

def clean_treeparse(tree):
    cleaned_tree = re.sub(r' {2,}', ' ', tree)
    cleaned_tree = re.sub(r'\n', '', cleaned_tree)
    cleaned_tree = re.sub(r'\([^\s]*\s', '', cleaned_tree)
    cleaned_tree = re.sub(r'\)', '', cleaned_tree)
    cleaned_tree = re.sub(r'-LRB-', '(', cleaned_tree)
    cleaned_tree = re.sub(r'-RRB-', ')', cleaned_tree)
class GoldenSupervision(): def __init__(self): self.load_data() self.nlp = StanfordCoreNLP(config.StanfordCoreNLP_Path) def load_data(self): # loading webcomplexquestions with open(config.complexwebquestions_dir + 'ComplexWebQuestions_' + config.EVALUATION_SET + '.json') as f: questions = json.load(f) print(len(questions)) print(pd.DataFrame(questions)['compositionality_type'].value_counts()) # aliases version compWebQ = pd.DataFrame( [{'ID': question['ID'], 'question': question['question'], 'webqsp_question': question['webqsp_question'], \ 'machine_question': question['machine_question'], 'comp': question['compositionality_type'], \ } for question in questions]) print(compWebQ['comp'].value_counts()) self.compWebQ = compWebQ.to_dict(orient="rows") def calc_split_point(self, question): question['question'] = question['question'].replace('?', '').replace( '.', '') question['machine_question'] = question['machine_question'].replace( '?', '').replace('.', '') machine_annotations = self.annotat(question['machine_question'], annotators='tokenize') webqsp_annotations = self.annotat(question['webqsp_question'], annotators='tokenize') question['machine_tokens'] = machine_annotations question['webqsp_tokens'] = webqsp_annotations # calculating original split point org_q_vec = question['webqsp_tokens'] machine_q_vec = question['machine_tokens'] org_q_offset = 0 for word in machine_q_vec: if org_q_offset < len( org_q_vec) and org_q_vec[org_q_offset] == word: org_q_offset += 1 else: break # adding split_point2 for composition if question['comp'] == 'composition': org_q_offset2 = len(machine_q_vec) - 1 for word in org_q_vec[::-1]: if org_q_offset2 > 0 and machine_q_vec[org_q_offset2] == word: org_q_offset2 -= 1 else: break if org_q_offset2 != len(machine_q_vec) - 1: question['split_point2'] = org_q_offset2 else: question['split_point2'] = org_q_offset2 question['machine_comp_internal'] = ' '.join( question['machine_tokens'] [org_q_offset:question['split_point2'] + 1]) question['split_point'] = org_q_offset if question['split_point'] == 0: question['split_point'] = 1 org_q_offset = 0 new_part = [] for word in question['machine_tokens']: if org_q_offset < len(question['webqsp_tokens']) and question[ 'webqsp_tokens'][org_q_offset] == word: org_q_offset += 1 else: new_part.append(word) question['split_point'] = org_q_offset question['new_part'] = ' '.join(new_part) return question # Generating golden supervision def gen_golden_supervision(self): qind = 0 num_q_to_proc = len(self.compWebQ) for question in self.compWebQ[0:num_q_to_proc]: # print question qind += 1 if qind % 100 == 0: print(qind) if question['comp'] is None or question['comp'] in [ 'comparative', 'superlative' ]: continue question = self.calc_split_point(question) mg_question = question['machine_question'].split() if question['split_point'] == 0: question['split_point'] = 1 question['flip_rephrase'] = 0 if question['comp'] == 'conjunction': tokens_anno = self.annotat(' '.join(mg_question)) question['machine_comp_internal'] = '' s = question['split_point'] question['split_part1'] = ' '.join(mg_question[:s]) question['split_part2'] = mg_question[s:] if question['split_part2'][ 0] == 'and': # delete conjunction word question['split_part2'] = question['split_part2'][1:] # add wh- and nouns of first part head_part = [] for i in range(len(tokens_anno)): # if we meet a verb, or a that(WDT) in the middle, we break if 'V' in tokens_anno[i]['pos'] or ( 'WDT' in tokens_anno[i]['pos'] and i != 0): break else: head_part.append(mg_question[i]) 
question['split_part2'] = ' '.join(head_part + question['split_part2']) else: if question['split_point2'] <= question['split_point']: print('found error in split point 2') question['split_point2'] = question['split_point'] = 1 s1, s2 = question['split_point'], question['split_point2'] question['split_part1'] = question['machine_comp_internal'] question['split_part2'] = ' '.join(mg_question[:s1] + [ '%composition', ] + mg_question[s2 + 1:]) # print('{}[{}]\n[{}]\n[{}]\n{}'.format(question['comp'], ' '.join(mg_question), # question['split_part1'], question['split_part2'], '-' * 100)) out = pd.DataFrame(self.compWebQ[0:num_q_to_proc])[[ 'ID', 'comp', 'flip_rephrase', 'split_part1', 'machine_comp_internal', 'split_part2', 'question', 'machine_question' ]] with open( config.golden_supervision_dir + config.EVALUATION_SET + '.json', 'w') as outfile: json.dump(out.to_dict(orient="rows"), outfile, sort_keys=True, indent=4) def annotat(self, text, annotators='pos'): question = text.replace('?', '') text = unicodedata.normalize('NFKD', question).encode( 'ascii', 'ignore').decode(encoding='UTF-8') output = self.nlp.annotate(text, properties={ 'annotators': annotators, 'outputFormat': 'json' }) try: tokens_anno = output['sentences'][0]['tokens'] except KeyError: tokens_anno = [k['word'] for k in output['tokens']] return tokens_anno
class Text2Vec(object): def __init__(self, wordvec_path, preload=False): self.wp = wordvec_path self.wv = {} self.__read_wv__(preload=preload) self.nlp = StanfordCoreNLP('http://localhost:9000') def __read_wv__(self, sep=" ", preload=False): if not preload: with open(self.wp, 'r') as f: for line in f: tmp = line.split(sep) word = tmp[0] vec = np.array([float(each) for each in tmp[1:]]) self.wv[word] = vec print("Number of tokens: ", len(self.wv)) # pprint(self.wv.keys()) # dump wordvector with open('gloveWordVector.bin', 'wb') as f2: pickle.dump(self.wv, f2) else: with open(self.wp, 'rb') as f: self.wv = pickle.load(f) def convert2vec(self, in_path, out_path, has_keywords=False): vectors = [] labels = [] with open(in_path) as f: for i, line in enumerate(f): data = json.loads(line) section = data['section'] if has_keywords: keywords = data['keywords'] else: keywords = None headline = data['headline'] lead_paragraph = data['lead_paragraph'] tokens = [] try: out = self.nlp.annotate(lead_paragraph, properties={ 'annotators': 'tokenize, ssplit, pos', 'outputFormat': 'json' }) # take sentence 1 if isinstance(out, dict) and out['sentences']: sentence = out['sentences'][0] for each in sentence['tokens']: word = each['word'].lower() word = word.strip('.') word = word.strip(',') word = word.strip(')') word = word.strip('(') pos = each['pos'] if "JJ" in pos or "NN" in pos or "VB" in pos: tokens.append(word) except AssertionError: pass # add keywords if keywords: for each in keywords: tmp = each['value'] tmp = tmp.split(" ") tmp = [each.strip(',').lower() for each in tmp] tmp = [each.strip('.').lower() for each in tmp] tmp = [each.strip(')').lower() for each in tmp] tmp = [each.strip('(').lower() for each in tmp] tokens += tmp # add headline if headline: tmp = headline.split(' ') tmp = [each.strip(',').lower() for each in tmp] tmp = [each.strip('.').lower() for each in tmp] tmp = [each.strip(')').lower() for each in tmp] tmp = [each.strip('(').lower() for each in tmp] tokens += tmp wv = None fail2find = [] count = 0 for t in tokens: if t in self.wv: if wv is None: if float('inf') not in self.wv[t] and -float( 'inf') not in self.wv[t] and all( self.wv[t] < 1e5): wv = self.wv[t] count += 1 else: if float('inf') not in self.wv[t] and -float( 'inf') not in self.wv[t] and all( self.wv[t] < 1e5): wv += self.wv[t] count += 1 else: fail2find.append(t) print( "article %s -- Tokens not in word vector dictionary: %s" % (i, fail2find)) if wv is not None: vectors.append(wv / count) labels.append(section) vectors = np.array(vectors) print(vectors.shape) print(vectors) unique_labels = set(labels) label_mapping = {} for i, each in enumerate(unique_labels): label_mapping[each] = i new_labels = [] for each in labels: new_labels.append(label_mapping[each]) new_labels = np.array(new_labels).reshape(-1, 1) print(new_labels.shape) complete_data = np.concatenate((vectors, new_labels), axis=1) print(complete_data) np.savetxt('NewYorkTime.csv', complete_data, delimiter=',') @staticmethod def plot_data(data): num_sample = 5000 label = data[:, -1] feature = data[:, :-1] assignment = {} for i in range(len(feature)): if label[i] not in assignment: assignment[label[i]] = [] assignment[label[i]].append(i) # down sample old_assignment = assignment assignment = {} indicies = [] for label in old_assignment: last_length = len(indicies) indicies += np.random.choice( old_assignment[label], size=min(int(num_sample / len(old_assignment)), len(old_assignment[label])), replace=False).tolist() assignment[label] = np.arange(last_length, 
len(indicies)) feature = feature[indicies] print(feature.shape) print(len(indicies)) print(len(np.unique(indicies))) tsne = TSNE() x = tsne.fit_transform(feature) fig, ax = plt.subplots() # ax.plot(x[:, 0], x[:, 1], '*') r = RandomColor() colors = r.generate(count=len(assignment)) for i, label in enumerate(assignment): ax.plot(x[assignment[label]][:, 0], x[assignment[label]][:, 1], '*', color=colors[i], label=label) plt.legend() plt.show()
class StanfordRE(ReModel):
    def __init__(self, corpus, relationtype, modelname="stanfordre_classifier.ser"):
        super(StanfordRE, self).__init__()
        self.modelname = modelname
        self.pairs = {}
        self.corenlp_client = None
        self.relationtype = relationtype
        self.corpus = corpus

    def generate_data(self, corpus, modelname, pairtypes):
        if os.path.isfile(self.temp_dir + modelname + ".txt"):
            print("removed old data")
            os.remove(self.temp_dir + modelname + ".txt")
        trainlines = []
        # get all entities of this document
        # doc_entities = []
        pcount = 0
        truepcount = 0
        ns = 0
        for sentence in corpus.get_sentences("goldstandard"):
            logging.info("{}".format(sentence.sid))
            nt_to_entity = {}
            for e in sentence.entities.elist['goldstandard']:
                # TODO: merge tokens of entity
                nt = str(e.tokens[0].order)
                nt_to_entity[nt] = e
            # print nt_to_entity
            # ns = sentence.sid.split("s")[-1]
            for t in sentence.tokens:
                nt = str(t.order)
                # print nt, nt in nt_to_entity
                if nt in nt_to_entity:
                    # print nt, nt_to_entity[nt], nt_to_entity[nt].type
                    # l = [str(ns), nt_to_entity[nt].type, nt, "O", t.pos, t.text, "O", "O", "O"]
                    # TODO: change other to entity name
                    l = [str(ns), "Other", nt, "O", t.pos, t.text, "O", "O", "O"]
                else:
                    # print nt, nt_to_entity
                    l = [str(ns), "O", nt, "O", t.pos, t.text, "O", "O", "O"]
                trainlines.append(l)
            trainlines.append([""])
            sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]]
            # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities),
            #                                                         len(sentence.entities.elist["goldstandard"])))
            for pair in itertools.combinations(sentence_entities, 2):
                if pair[0].type == pairtypes[0] and pair[1].type == pairtypes[1] or \
                        pair[1].type == pairtypes[0] and pair[0].type == pairtypes[1]:
                    # logging.debug(pair)
                    if pair[0].type == pairtypes[0]:
                        e1id = pair[0].eid
                        e2id = pair[1].eid
                    else:
                        e1id = pair[1].eid
                        e2id = pair[0].eid
                        pair = (pair[1], pair[0])
                    pid = sentence.did + ".p" + str(pcount)
                    # self.pairs[pid] = (e1id, e2id)
                    self.pairs[pid] = pair
                    if e2id in pair[0].targets:
                        truepcount += 1
                        nt1 = str(pair[0].tokens[0].order)
                        nt2 = str(pair[1].tokens[0].order)
                        trainlines.append([nt1, nt2, "+".join(pairtypes)])
                    pcount += 1
            trainlines.append([""])
            ns += 1
        logging.info("Writing {} lines...".format(len(trainlines)))
        with codecs.open(self.temp_dir + modelname + ".corp", 'w', "utf-8") as trainfile:
            for l in trainlines:
                # print l
                trainfile.write("\t".join(l) + "\n")
        logging.info("True/total relations:{}/{} ({})".format(truepcount, pcount,
                                                              str(1.0 * truepcount / pcount)))

    def write_props(self):
        with open(config.corenlp_dir + "roth.properties", 'r') as propfile:
            lines = propfile.readlines()
            print(lines)
        with open(config.corenlp_dir + "roth.properties", 'w') as propfile:
            for l in lines:
                if l.startswith("serializedRelationExtractorPath"):
                    propfile.write("serializedRelationExtractorPath = {}\n".format(config.corenlp_dir + self.modelname))
                elif l.startswith("trainPath"):
                    propfile.write("trainPath = {}\n".format(self.temp_dir + self.modelname + ".corp"))
                else:
                    propfile.write(l)

    def train(self):
        self.generate_data(self.corpus, self.modelname, pairtypes=self.relationtype)
        # java -cp classpath edu.stanford.nlp.ie.machinereading.MachineReading --arguments roth.properties
        if os.path.isfile(config.corenlp_dir + self.modelname):
            print("removed old model")
            os.remove(config.corenlp_dir + self.modelname)
        if not os.path.isfile(self.temp_dir + self.modelname + ".corp"):
            print("could not find training file " + config.corenlp_dir + self.modelname + ".corp")
            sys.exit()
        self.write_props()
        classpath = config.corenlp_dir + "*"
        srecall = ['java', '-mx3g', '-classpath', classpath,
                   "edu.stanford.nlp.ie.machinereading.MachineReading",
                   "--arguments", config.corenlp_dir + "roth.properties"]
        print(" ".join(srecall))
        # sys.exit()
        srecall = Popen(srecall)  # , stdout=PIPE, stderr=PIPE)
        res = srecall.communicate()
        if not os.path.isfile(config.corenlp_dir + self.modelname):
            print("error with StanfordRE! model file was not created")
            print(res[1])
            sys.exit()
        else:
            statinfo = os.stat(config.corenlp_dir + self.modelname)
            if statinfo.st_size == 0:
                print("error with StanfordRE! model has 0 bytes")
                print(res[0])
                print(res[1])
                sys.exit()
        # logging.debug(res)

    def load_classifier(self, inputfile="slk_classifier.model.txt", outputfile="jsre_results.txt"):
        self.corenlp_client = StanfordCoreNLP('http://localhost:9000')
        # sup.relation.model=
        tokenkeys = set()
        sentencekeys = set()
        for d in self.corpus.documents:
            for s in self.corpus.documents[d].sentences:
                corenlpres = self.corenlp_client.annotate(s.text.encode("utf8"), properties={
                    'ssplit.eolonly': True,
                    'openie.triple.all_nominals': True,
                    'openie.triple.strict': False,
                    'openie.max_entailments_per_clause': 500,
                    'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie',
                    # 'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, relation, openie',
                    'outputFormat': 'json',
                    # 'sup.relation.model': self.modelname
                })
                for o in corenlpres["sentences"][0]["openie"]:
                    if "mir" in o["object"] or "mir" in o["subject"]:
                        print("{}={}>{}".format(o["subject"], o["relation"], o["object"]))

    def test(self, outputfile="jsre_results.txt"):
        pass

    def get_predictions(self, corpus, examplesfile="slk_classifier.model.txt", resultfile="jsre_results.txt"):
        pass
author: Giancarlo D. Salton
"""

from pycorenlp import StanfordCoreNLP
import codecs
import json
import utils

properties = {
    "annotators": "tokenize,ssplit,pos,depparse,lemma",
    "depparse.extradependencies": "NONE",
    "outputFormat": "json"
}

nlp = StanfordCoreNLP('http://localhost:9000')

input_file = "sample.en"
output_file = "{:s}.json".format(input_file)
keep_all_dependencies = False

sent_count = 0
encoding = "utf-8"

with codecs.open(output_file, "a", "utf-8") as outfile:
    outfile.write("{\"corpus\":[\n")
    for line in codecs.open(input_file, "r", encoding):
        # if encoding.lower != "utf-8":
        #     line = line.encode("utf-8")
class StanfordTFIDFApi(): ''' Make use of StanfordCoreNLP Server Extract keyword through tf-idf algorithm ''' def __init__(self): self.nlp = StanfordCoreNLP(NLP_SERVER) def __tf_by_pos(self, text, pos='N'): response = self.nlp.annotate(text, properties={ 'annotators': 'ner,lemma', 'outputFormat': 'json' }) logger.debug(json.dumps(response)) '''The response is generally organized as {sentences:[{tokens:[]},{}]}''' result = list() if type(response) == dict and 'sentences' in response: for sentence in response['sentences']: for item in sentence['tokens']: if item['pos'].startswith(pos): # only accept engish word, and not in STOPWORDS if acceptable_word(item['lemma'].lower()): result.append((item['lemma'].lower())) toks_count = Counter(result) return toks_count else: logger.warning('sentences part is not in the response from NLP server.') return Counter() def tf_idf_groupby_pos(self, text, df_cache): output = dict() output['NOUN'] = self.__tf_by_pos(text, 'N') output['VERB'] = self.__tf_by_pos(text, 'V') for pos in output: logger.debug('Computed tf for %s:' % pos + json.dumps(output['VERB'])) for word in output[pos]: '''Formula is: tf*log(N/df)''' if word in df_cache: output[pos][word] = output[pos][word]*math.log(df_cache['total_document'] /df_cache[word]) else: output[pos][word] = output[pos][word]*math.log(df_cache['total_document']) # return the top 10 words output[pos] = [word for word, count in output[pos].most_common(10)] logger.debug('Computed tf-idf for %s:' % pos + json.dumps(output[pos])) return json.dumps(output) def compute_df(self, document_list): '''Compute document frequency based on input document list''' df_cache = dict() df_output = dict() d_index = 0 for document in document_list: d_index += 1 # tokenize each document reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE) for item in reg_toks: # change each word to lower case and lemmatize item = normalise(item) if item not in df_cache: df_cache[item] = set([d_index]) else: df_cache[item].add(d_index) for item in df_cache: if acceptable_word(item): df_output[item] = len(df_cache[item]) df_output['total_document'] = len(document_list) return df_output
def process_request(conn, addr): print("connected client:", addr) lst = b'' data_com = conn.recv(4096) data_com = data_com.decode("utf8") data_com = data_com.split(' ') lenght = int(data_com[1]) i = 0 while i < lenght: data = conn.recv(1024) lst += data i += 1024 # print(data_com) lst2 = pickle.loads(lst) if data_com[0].upper() == 'STAT': if len(lst2) < 10: error = 'Not enough data' conn.sendall(error.encode("utf8")) else: tweet_top = tweet_top10(lst2) retweet_top = (list(retweet_top10(lst2)))[:10] retweet_top10_necessary = [] for i in range(len(retweet_top)): retweet_top10_necessary.append([]) retweet_top10_necessary[i].append(retweet_top[i][6]) retweet_top10_necessary[i].append(retweet_top[i][3]) retweet_top10_necessary[i].append(retweet_top[i][8]) author_top = author_top10(lst2) country_tweet, country_retweet = country(lst2) # print(tweet_top) # print(retweet_top10_necessary) # print(author_top) data_for_client = [['Popular words', 'Number of words']] data_for_client.extend(tweet_top) data_for_client.extend([]) data_for_client.extend([['Tweet content', 'author', 'RT']]) data_for_client.extend(retweet_top10_necessary) data_for_client.extend([['author', 'followers']]) data_for_client.extend(author_top) data_for_client.extend([['country_tweet'], country_tweet]) data_for_client.extend([['country_retweet'], country_retweet]) # print(data_for_client) message = pickle.dumps(data_for_client) size = len(message) conn.sendall((str(size)).encode("utf8")) time.sleep(1) conn.sendall(message) if data_com[0].upper() == 'ENTI': nlp = StanfordCoreNLP('http://localhost:9000') pos = [] for i in lst2: text = i[6].replace('\n',' ') # print(i[6]) result = nlp.annotate( text, properties = {'annotators': 'ner', 'outputFormat': 'json', 'timeout': 100000, }) # print(result["sentences"][0]) for word in result["sentences"][0]["tokens"]: pos.append('{} ({})'.format(word["word"], word["ner"])) # print(pos) # print('') # print(text) string = " ".join(pos) # print(pos) message = pickle.dumps(string) size = len(message) conn.sendall((str(size)).encode("utf8")) time.sleep(1) conn.sendall(message) conn.close()
from pycorenlp import StanfordCoreNLP
import json
from nltk.tree import Tree
from SentimentModelFunctions import *

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')

    # 14, 37, 58, 97, 99
    text = ["Hai"]
    for t in text:
        print("Text: {}".format(t))
        output = nlp.annotate(t, properties={
            'annotators': 'tokenize,ssplit,parse',
            'outputFormat': 'json',
            'parse.model': 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        })
        print(type(output))
        print(type("CoreNLP request timed out. Your document may be too long."))
        print(json.dumps(output, indent=4))
        for i in range(len(output['sentences'])):
            tokenized_sent = [token_json['word']
                              for token_json in output['sentences'][i]['tokens']]
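# Several snippets above guard against nlp.annotate() returning a plain string
# (for example "CoreNLP request timed out. Your document may be too long.")
# instead of parsed JSON. A minimal, illustrative helper for that pattern;
# annotate_or_raise is not part of the original code:
from pycorenlp import StanfordCoreNLP

def annotate_or_raise(nlp, text, properties):
    output = nlp.annotate(text, properties=properties)
    if isinstance(output, str):
        # the server answered with an error message rather than JSON
        raise RuntimeError("CoreNLP did not return JSON: {}".format(output))
    return output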
class SentimentAnnotator: def __init__(self): self.nlp_wrapper = None self.settings = { 'annotators': 'sentiment', 'outputFormat': 'json', 'timeout': 1000000, } # --------------------------------------------------------------------- # Start CoreNLP server before using sentiment annotator # cd stanford-corenlp-full-2018-10-05/ # java -mx1g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer # note: change the 2g to 1g if space requirements too high # --------------------------------------------------------------------- # os.chdir("./stanford-corenlp-full-2018-10-05/") # os.system('java -mx1g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer') def sentence_level(self, sentence_tokens: list) -> list: """ Given a sentence as a list of tokens, return the breakdown of sentiment values in the sentence :param sentence_tokens: A list of tokens :return: list of sentiment counts e.g. [0.1, 0.3, 0.2, 0.3, 0.1] counts[0:4] --> frequency of tokens with sentiment values 0 - 4 """ counts = [0] * 5 for token in sentence_tokens: sentiment_val = int( self.nlp_wrapper.annotate(token, properties=self.settings) ["sentences"][0]["sentimentValue"]) counts[sentiment_val] += 1 return [count / len(sentence_tokens) for count in counts] def transform(self, string: str) -> list: """ Given a string, decompose it into sentences and annotate the sentiments of each sentence :param string: string of data :return: list of sentiment count averages across all sentences """ if self.nlp_wrapper is None: self.nlp_wrapper = StanfordCoreNLP('http://localhost:9000') all_sentences = self.nlp_wrapper.annotate( string, properties=self.settings)["sentences"] sentiment_values = [] for sentence in all_sentences: token_list = [ token['originalText'] for token in sentence['tokens'] ] sentiment_values.append( self.sentence_level(token_list) + [int(sentence["sentimentValue"])]) # print(self.sentence_level(token_list) + [int(sentence["sentimentValue"])]) return list(np.mean(sentiment_values, axis=0)) def fit_transform(self, data: pd.Series): return data.apply(lambda x: self.transform(x))
class Preprocess(): def __init__(self, argv): self.input = "" self.output_folder = "" # output has to be a folder self.input_type = "" # Start Stanford CoreNLP Server self.nlp = StanfordCoreNLP('http://localhost:9000') # Read User Command Line opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="]) for opt, arg in opts: if opt == '-h': print("Type 'python3.5 text_preprocessing/preprocess.py -i <inputfile> -o <outputfile>' \ in run_source_code.sh file") sys.exit() elif opt in ("-i", "--ifile"): self.input = arg if os.path.exists(arg) == False: print("Input doesn't exist") sys.exit() if os.path.isdir(arg) == True: self.input_type = "dir" elif os.path.isfile(arg) == True: self.input_type = "file" elif opt in ("-o", "--ofile"): self.output_folder = arg print("Input: " + self.input +", " + self.input_type) print("Output: " + self.output_folder) def sentence_parsing(self, row_string): parsed_json = self.nlp.annotate(row_string, properties={ 'annotators': 'tokenize,ssplit,pos', 'outputFormat': 'json' }) return parsed_json def output_preprocessed_data(self, json_input, file_name): rows = [] for sent in json_input['sentences']: parsed_sent = " ".join([t['originalText'] + "/" + t['pos'] for t in sent['tokens']]) rows.append(parsed_sent) output_file_path = self.output_folder + file_name with open(output_file_path, 'a') as preprocessed_out: for r in rows: preprocessed_out.write(r + "\n") def pos_tagging(self): if self.input_type == "file": input_path_elems = self.input.split("/") file_name = "" if input_path_elems[-1] != "/": file_name = input_path_elems[-1] else: file_name = input_path_elems[-2] text_string = "" with open(self.input, 'rb') as file_input: for r in file_input: text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')]) print(self.input) parsed_json = self.sentence_parsing(text_string) self.output_preprocessed_data(parsed_json, file_name) elif self.input_type == "dir": for file_name in os.listdir(self.input): input_file_path = self.input + file_name text_string = "" with open(input_file_path, 'rb') as file_input: for r in file_input: text_string = " ".join([text_string, r.strip().decode('utf-8', 'backslashreplace')]) parsed_json = self.sentence_parsing(text_string) print(input_file_path) self.output_preprocessed_data(parsed_json, file_name)
if __name__ == '__main__':
    args = parse_args()
    args.output_dir = "../../data/copa/"
    args.train_file = "../../data/copa/train.jsonl"
    args.predict_file = "../../data/copa/val.jsonl"
    args.test_file = "../../data/copa/test.jsonl"

    # make output directory if it does not exist
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # register corenlp server
    nlp = StanfordCoreNLP('http://localhost:9753')

    # load train and dev datasets
    trainset = read_json_lines(args.train_file)
    devset = read_json_lines(args.predict_file)
    testset = read_json_lines(args.test_file)

    for dataset, path, name in zip(
            (trainset, devset, testset),
            (args.train_file, args.predict_file, args.test_file),
            ('train', 'dev', 'test')):
        output_path = os.path.join(
            args.output_dir, "{}.tagged.jsonl".format(os.path.basename(path)[:-6]))
        tagging(dataset, nlp, output_path)
        # output_path = os.path.join(args.output_dir, "{}.tagged.jsonl".format(os.path.basename(path)[:-6]))
class DataProcessor: def __init__(self, files=None): self.sources = files self.triples = [] self.news = "" self.nlp = StanfordCoreNLP('http://localhost:9000') def add_source(self, files): self.sources = files def generate_triples(self): for source in self.sources: if os.path.exists(source): with open(source) as file: lines = file.readlines() for line in lines: [h, r, t] = line.split() triple = self._create_triple(h, t, r) self.triples.append(triple) return self.triples def analyse_input(self): # nlp = StanfordCoreNLP('http://localhost:9000') # text = "'Tom be 42 years old, Tom be a teacher'" output = self.nlp.annotate( self.news, properties={ 'annotators': 'tokenize, ssplit, pos, depparse, parse, openie', 'outputFormat': 'json' }) self.triples = [] try: for item in output['sentences'][0]['openie']: tmp = item['subject'].replace(" ", "_") + "\t" \ + self._format_relation(item['relation']) + "\t" \ + item["object"].replace(" ", "_") [h, r, t] = tmp.split() triple = self._create_triple(h, t, r) self.triples.append(triple) # triple.append(tmp) except: # traceback.print_exc() pass # print(triple) # def solve_sentence(self, sentence): # sentence = "Trump_campaign_spokeswoman willPutFirst America" # nhead = "Trump_campaign_spokeswoman" # nrelation = "willPutAt" # ntail = "America" # return self._create_triple(nhead, ntail, nrelation) def _create_triple(self, nhead="", ntail="", nrelation=""): head = Entity(nhead.replace("\'", "`")) relation = Relation(nrelation.replace("\'", "`"), re.sub('[^a-zA-Z \n_]', '', nrelation)) tail = Entity(ntail.replace("\'", "`")) return Triple(head, relation, tail) def _format_relation(self, str): arr = [pos for pos, char in enumerate(str) if char == " "] result = "" for index, item in enumerate(str): if (index - 1) in arr: result += item.upper() else: result += item result = result.replace(" ", "") return result
raw_tweets = []
with open('tweet_data_01.csv', 'r', encoding='mac_roman') as csvfile:
    f_reader = csv.reader(csvfile, delimiter=',')
    for row in f_reader:
        # date, text, retweet, favorite
        tweet_tuple = (row[0], row[1], row[2], row[3])
        raw_tweets.append(tweet_tuple)

nlp = StanfordCoreNLP('http://localhost:9000')
cnt = 0
for tweet in raw_tweets:
    cnt += 1
    if cnt == 1:
        continue  # skip the CSV header row
    text = tweet[1].strip()                  # raw tweet text
    text = re.sub(r'http\S+\s+', '', text)   # strip URLs from the tweet
    res = nlp.annotate(text, properties={
        'annotators': 'sentiment',
        'outputFormat': 'json',
        'timeout': 10000,
    })
    if len(res) > 1 or len(res) < 0:
import json
import os
import subprocess
import time

from pycorenlp import StanfordCoreNLP

os.chdir("/home/gowtham/Documents/stanford-corenlp-full-2017-06-09/")
# Launch the CoreNLP server in the background; os.system() would block
# until the server process exits, so Popen is used here instead.
subprocess.Popen(
    'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer '
    '-port 9011 -timeout 15000',
    shell=True)
time.sleep(5)  # give the server a moment to start accepting requests

nlp = StanfordCoreNLP('http://localhost:9011')

text = 'Pusheen and Smitha walked along the beach. '
l = text.split(".")
print(l)

output = nlp.annotate(l[0], properties={
    'annotators': 'sentiment',
    'outputFormat': 'json'
})
print(output['sentences'][0]['sentiment'])

with open("output.json", "w+") as f:
    f.write(json.dumps(output, indent=4, sort_keys=True, ensure_ascii=False))
def __init__(self):
    self.nlp = StanfordCoreNLP(NLP_SERVER)
fopPOSNLTK = fopInputAfterTranslation + 'pos_nltk/'
fopSortedBySimScore = fopInputAfterTranslation + 'sortBySimilarityScore/'
fpSortedSource = fopSortedBySimScore + 'source.txt'
fopSortedPOSStanford = fopSortedBySimScore + 'pos_stanford/'
fopSortedPOSNLTK = fopSortedBySimScore + 'pos_nltk/'
createDirIfNotExist(fopSortedPOSStanford)
createDirIfNotExist(fopSortedPOSNLTK)

strParseResultsType = "<class 'pyparsing.ParseResults'>"
strStrType = "<class 'str'>"

# Download and load the BLLIP reranking parser model
nltk.download('bllip_wsj_no_aux')
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)

strServerPort = '9000'
nlpObj = StanfordCoreNLP('http://localhost:' + strServerPort)

fpAppendNLTKText = fopSortedBySimScore + 'appendPOS.nltk.text.txt'
fpAppendNLTKPOS = fopSortedBySimScore + 'appendPOS.nltk.pos.txt'
fpAppendStanfordText = fopSortedBySimScore + 'appendPOS.stanford.text.txt'
fpAppendStanfordPOS = fopSortedBySimScore + 'appendPOS.stanford.pos.txt'
fpV2AppendNLTKText = fopSortedBySimScore + 'v2.appendPOS.nltk.text.txt'
fpV2AppendNLTKPOS = fopSortedBySimScore + 'v2.appendPOS.nltk.pos.txt'
fpV2AppendStanfordText = fopSortedBySimScore + 'v2.appendPOS.stanford.text.txt'
fpV2AppendStanfordPOS = fopSortedBySimScore + 'v2.appendPOS.stanford.pos.txt'

f1 = open(fpAppendStanfordText, 'r')
arrStanfordText = f1.read().strip().split('\n')
f1.close()
f1 = open(fpAppendStanfordPOS, 'r')
def __init__(self, annotators='tokenize,ssplit,pos,parse'):  # depparse
    self.annotators = annotators
    self.nlp = StanfordCoreNLP('http://localhost:9000')
class AMRInputPreprocessor(object):
    def __init__(self, url=core_nlp_url):
        self.nlp = StanfordCoreNLP(url)
        self.joints_map = self.readJoints()
        self.number_texts = {
            "hundred", "thousand", "million", "billion", "trillion",
            "hundreds", "thousands", "millions", "billions", "trillions"
        }
        self.slashedNumber = re.compile(r'-*\d+-\d+')

    def readJoints(self):
        joints_map = {}
        with open("data/joints.txt", 'r') as f:
            line = f.readline()
            while line.strip() != '':
                line = f.readline()
                compounds = line.split()
                past = ""
                for w in compounds:
                    joints_map.setdefault(past[:-1], []).append(w)
                    past = past + w + "-"
        return joints_map

    def combine_number(self, data):
        # Combine multi-word numbers, e.g. "40 thousand" -> "40,thousand"
        def combinable_number(n1, n2):
            return n2 in self.number_texts and n1 != "-"

        def combinable(i, m):
            return len(lemma) > 0 and m == "CD" \
                and pos[-1] == "CD" and combinable_number(lemma[-1], data["lem"][i])

        lemma = []
        ner = []
        tok = []
        pos = []
        for i, m in enumerate(data["pos"]):
            if combinable(i, m):
                lemma[-1] = lemma[-1] + "," + data["lem"][i]
                tok[-1] = tok[-1] + "," + data["tok"][i]
                pos[-1] = "CD"
                # ner[-1] = ner[-1]
            else:
                lemma.append(data["lem"][i])
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])
                ner.append(data["ner"][i])

        data["lem"] = lemma
        data["ner"] = ner
        data["pos"] = pos
        data["tok"] = tok
        return data

    def tag_url_and_split_number(self, data):
        lemma = []
        ner = []
        tok = []
        pos = []
        for i, le in enumerate(data["lem"]):
            if "http" in le or "www." in le:
                ner.append("URL")
                lemma.append(data["lem"][i])
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])
            elif re.match(self.slashedNumber, le) and data["ner"][i] == "DATE":
                les = le.replace("-", " - ").split()
                toks = data["tok"][i].replace("-", " - ").split()
                assert len(les) == len(toks), data
                for l in les:
                    if l != "-":
                        pos.append(data["pos"][i])
                        ner.append(data["ner"][i])
                    else:
                        pos.append(":")
                        ner.append("0")
                lemma = lemma + les
                tok = tok + toks
            else:
                ner.append(data["ner"][i])
                lemma.append(data["lem"][i])
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])

        data["lem"] = lemma
        data["ner"] = ner
        data["pos"] = pos
        data["tok"] = tok
        return data

    def combine_phrase(self, data):
        # Combine fixed multi-word phrases listed in joints.txt,
        # e.g. "make up" -> "make-up"
        lemma = []
        ner = []
        tok = []
        pos = []
        skip = False
        for i, le in enumerate(data["lem"]):
            if skip:
                skip = False
            elif len(lemma) > 0 and le in self.joints_map.get(lemma[-1], []):
                lemma[-1] = lemma[-1] + "-" + le
                tok[-1] = tok[-1] + "-" + data["tok"][i]
                pos[-1] = "COMP"
                ner[-1] = "0"
            elif len(lemma) > 0 and le == "-" and i < len(data["lem"]) - 1 \
                    and data["lem"][i + 1] in self.joints_map.get(lemma[-1], []):
                lemma[-1] = lemma[-1] + "-" + data["lem"][i + 1]
                tok[-1] = tok[-1] + "-" + data["tok"][i + 1]
                pos[-1] = "COMP"
                ner[-1] = "0"
                skip = True
            else:
                lemma.append(le)
                tok.append(data["tok"][i])
                pos.append(data["pos"][i])
                ner.append(data["ner"][i])

        data["lem"] = lemma
        data["ner"] = ner
        data["pos"] = pos
        data["tok"] = tok
        return data

    def featureExtract(self, src_text, whiteSpace=False):
        data = {}
        output = self.nlp.annotate(
            src_text.strip(),
            properties={
                'annotators': "tokenize,ssplit,pos,lemma,ner",
                "tokenize.options": "splitHyphenated=true,normalizeParentheses=false",
                "tokenize.whitespace": whiteSpace,
                'ssplit.isOneSentence': True,
                'outputFormat': 'json'
            })
        snt = output['sentences'][0]["tokens"]
        data["ner"] = []
        data["tok"] = []
        data["lem"] = []
        data["pos"] = []
        for snt_tok in snt:
            data["ner"].append(snt_tok['ner'])
            data["tok"].append(snt_tok['word'])
            data["lem"].append(snt_tok['lemma'])
            data["pos"].append(snt_tok['pos'])
        # if whiteSpace is False:
        #     return self.featureExtract(" ".join(data["tok"]), True)
        asserting_equal_length(data)
        return data

    def preprocess(self, src_text):
        data = self.featureExtract(src_text)
        data = self.combine_phrase(data)  # phrases from the fixed joints.txt file
        data = self.combine_number(data)
        data = self.tag_url_and_split_number(data)
        asserting_equal_length(data)
        return data
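A small sketch of how the preprocessor above might be exercised, assuming core_nlp_url points at a running CoreNLP server and data/joints.txt is present; the sentence is illustrative.

# Illustrative only: core_nlp_url and data/joints.txt come from the
# surrounding project and must be available for this to run.
preprocessor = AMRInputPreprocessor()
data = preprocessor.preprocess("He made up about 40 thousand stories.")
print(data["tok"])  # tokens, with joined phrases and combined numbers
print(data["pos"])  # POS tags aligned with the tokens
print(data["ner"])  # NER tags aligned with the tokens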
#!/usr/bin/python
import cgi
import cgitb
import json

cgitb.enable()  # for troubleshooting

# the cgi library gets vars from the submitted HTML form
data = cgi.FieldStorage()

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

text = data['text'].value
annotators = data['annotators'].value

output = nlp.annotate(text, properties={'annotators': annotators,
                                        'outputFormat': 'json'})

# this is the actual output
print("Content-Type: text/html\n")
print(json.dumps(output))
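One way to exercise the CGI script above from another process, assuming it is deployed at a hypothetical URL such as http://localhost/cgi-bin/annotate.py; adjust the URL to wherever the script actually lives.

# Hypothetical client for the CGI script above; the URL is a placeholder.
import requests

resp = requests.post('http://localhost/cgi-bin/annotate.py',
                     data={'text': 'The quick brown fox jumps over the lazy dog.',
                           'annotators': 'tokenize,ssplit,pos'})
print(resp.text)  # JSON produced by nlp.annotate on the server side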
def brat_to_conll(input_folder, output_filepath, tokenizer, language):
    '''
    Assumes '.txt' and '.ann' files are in the input_folder.
    Checks the compatibility between the .txt and .ann files at the same time.
    '''
    use_pos = False
    if tokenizer == 'spacy':
        spacy_nlp = spacy.load(language)
    elif tokenizer == 'stanford':
        core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    elif tokenizer == 'pos':
        use_pos = True
    else:
        raise ValueError("tokenizer should be 'spacy', 'stanford' or 'pos'.")
    verbose = False

    dataset_type = os.path.basename(input_folder)
    print("Formatting {0} set from BRAT to CONLL... ".format(dataset_type), end='')

    text_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.txt')))
    output_file = codecs.open(output_filepath, 'w', 'utf-8')

    for text_filepath in text_filepaths:
        base_filename = os.path.splitext(os.path.basename(text_filepath))[0]
        annotation_filepath = os.path.join(os.path.dirname(text_filepath),
                                           base_filename + '.ann')
        # create the annotation file if it does not exist
        if not os.path.exists(annotation_filepath):
            codecs.open(annotation_filepath, 'w', 'UTF-8').close()

        if use_pos:
            annotation_filepath2 = os.path.join(os.path.dirname(text_filepath),
                                                base_filename + '.ann2')
            # create the annotation file if it does not exist
            if not os.path.exists(annotation_filepath2):
                codecs.open(annotation_filepath2, 'w', 'UTF-8').close()

        text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
        entities = sorted(entities, key=lambda entity: entity["start"])

        if use_pos:
            pos_tags = get_pos_tags_from_brat(text, annotation_filepath2)
            sentences = get_sentences_and_tokens_from_pos_tagger(pos_tags)
        else:
            if tokenizer == 'spacy':
                sentences = get_sentences_and_tokens_from_spacy(text, spacy_nlp)
            elif tokenizer == 'stanford':
                sentences = get_sentences_and_tokens_from_stanford(text, core_nlp)

        if use_pos:
            token_counter = 0
            rep_pos_max = 0
            rep_pos_counter = 0
            rep_pos = False

        for sentence in sentences:
            inside = False
            previous_token_label = 'O'
            for token in sentence:
                token['label'] = 'O'
                for entity in entities:
                    if entity['start'] <= token['start'] < entity['end'] or \
                       entity['start'] < token['end'] <= entity['end'] or \
                       token['start'] < entity['start'] < entity['end'] < token['end']:
                        # the .ann format doesn't support tags with '-' in them
                        token['label'] = entity['type'].replace('-', '_')
                        break
                    elif token['end'] < entity['start']:
                        break
                if len(entities) == 0:
                    entity = {'end': 0}

                if token['label'] == 'O':
                    gold_label = 'O'
                    inside = False
                elif inside and token['label'] == previous_token_label:
                    gold_label = 'I-{0}'.format(token['label'])
                else:
                    inside = True
                    gold_label = 'B-{0}'.format(token['label'])
                if token['end'] == entity['end']:
                    inside = False
                previous_token_label = token['label']

                if use_pos:
                    pos_tag = pos_tags[token_counter]['type']
                    if not rep_pos and len(pos_tags[token_counter]['text'].split()) > 1:
                        rep_pos = True
                        rep_pos_max = len(pos_tags[token_counter]['text'].split())
                        rep_pos_counter = 0
                    elif rep_pos:
                        rep_pos_counter += 1
                        if rep_pos_counter >= rep_pos_max:
                            rep_pos = False
                            rep_pos_counter = 0
                    else:
                        token_counter += 1
                    if len('{0} {1} {2} {3} {4} {5}\n'.format(
                            token['text'], base_filename, token['start'],
                            token['end'], pos_tag, gold_label).split()) != 6:
                        continue
                    # debug pause: show the line that is about to be written
                    input('{0} {1} {2} {3} {4} {5}\n'.format(
                        token['text'], base_filename, token['start'],
                        token['end'], pos_tag, gold_label))
                    if verbose:
                        print('{0} {1} {2} {3} {4} {5}\n'.format(
                            token['text'].split()[0], base_filename,
                            token['start'], token['end'], pos_tag, gold_label))
                    output_file.write('{0} {1} {2} {3} {4} {5}\n'.format(
                        token['text'].split()[0], base_filename, token['start'],
                        token['end'] - (len(token['text']) - len(token['text'].split()[0])),
                        pos_tag, gold_label))
                else:
                    if verbose:
                        print('{0} {1} {2} {3} {4}\n'.format(
                            token['text'], base_filename, token['start'],
                            token['end'], gold_label))
                    output_file.write('{0} {1} {2} {3} {4}\n'.format(
                        token['text'], base_filename, token['start'],
                        token['end'], gold_label))
            if verbose:
                print('\n')
            output_file.write('\n')

    output_file.close()
    print('Done.')
    if not use_pos:
        if tokenizer == 'spacy':
            del spacy_nlp
        elif tokenizer == 'stanford':
            del core_nlp
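A hedged example call for the converter above; the folder and output path are placeholders, and the 'stanford' tokenizer requires a CoreNLP server listening on port 9000.

# Hypothetical invocation of brat_to_conll; paths are illustrative only.
brat_to_conll('data/brat/train/', 'data/conll/train.txt',
              tokenizer='stanford', language='en')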
def __init__(self):
    self.load_data()
    self.nlp = StanfordCoreNLP(config.StanfordCoreNLP_Path)
'''
#from corenlp import *
import sys
import csv
from SBAR_parser import *
import codecs
from HelpingFunctions_2 import *
from lists import *
import ast
import json
#import simplejson as json

#corenlp = StanfordCoreNLP()
from pycorenlp import StanfordCoreNLP
corenlp = StanfordCoreNLP('http://localhost:9000')

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only: force UTF-8 as the default encoding

###
### Core function
###
def FeatureExtractor(data_nlp, id):
    features = {}
    # Feature 1: whether a MODAL_VERBS entry occurs in the sentence
    # Feature 2: whether a MODAL_VERBS entry is tagged as MD (modal auxiliary) or VB in the sentence
    # Feature 3: whether a MODAL_VERBS entry is in an aux relationship with "be"/"feel"
    # Feature 4: whether a MODAL_VERBS entry is followed by "have" + VBN