def splitCol8toWords(): ## using corenlp to do split up job ## corenlp setting corenlp_dir = "stanford-corenlp-full-2014-08-27/" corenlp = StanfordCoreNLP(corenlp_dir) ## load dataset with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t: trainTup = json.load(t) fres = open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.txt','w') split_res = [] for num,tup in enumerate(trainTup): ## after modify col8 and save, col8 now may be empty.. if not tup[8]: continue ## use corenlp to splitup res = corenlp.parse(tup[8]) par = json.loads(res) slist = par["sentences"][0]['words'] temp = [] for s in slist: temp.append(s[0]) split_res.append([tup[0],tup[1],tup[6],temp]) fres.write(tup[0]+'\t'+tup[1]+'\t'+tup[6]+'\t'+','.join(temp)+'\n') print 'No.', num,tup[6] print tup[8] print [tup[0],tup[1],tup[6],temp] ## record new dataset with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json','w') as f: json.dump(split_res,f) fres.close()
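The parse-then-collect-tokens pattern above recurs throughout these snippets; a minimal helper capturing it might look like the sketch below, assuming the same corenlp-python wrapper whose parse() returns a JSON string (tokenize_with_corenlp and the paths are illustrative names, not part of the original code).

import json
from corenlp import StanfordCoreNLP

def tokenize_with_corenlp(corenlp, text):
    # corenlp.parse() returns a JSON string; the tokens of the (single)
    # sentence live under sentences[0]['words'] as (form, attributes) pairs.
    parsed = json.loads(corenlp.parse(text))
    return [word[0] for word in parsed["sentences"][0]["words"]]

# Usage sketch (directory path and sentence are assumptions):
# corenlp = StanfordCoreNLP("stanford-corenlp-full-2014-08-27/")
# print(tokenize_with_corenlp(corenlp, "Bell makes and distributes computer products."))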
def stanford_parse(corpus_path):
    """Parse a directory (recursively) with the Stanford parser..."""
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except ImportError:
        raise ValueError("CoreNLP not installed.")
    path_part, corpus_name = os.path.split(corpus_path)
    new_corpus_folder = 'parsed_%s' % corpus_name
    new_corpus_path = os.path.join(path_part, new_corpus_folder)
    if not os.path.exists(new_corpus_path):
        os.makedirs(new_corpus_path)
    corenlp = StanfordCoreNLP()
    for root, dirs, files in os.walk(corpus_path, topdown=True):
        # mirror each subfolder of the corpus as a subcorpus of the output
        subcorpus_name = os.path.basename(root)
        for name in files:
            filepath = os.path.join(root, name)
            f = open(filepath)
            raw = f.read()
            parsed_text = ast.literal_eval(corenlp.parse(raw))
            for index, sent in enumerate(parsed_text['sentences']):
                syntax_tree = sent['parsetree']
                plain_text = sent['text']
            subcorpus_path = os.path.join(new_corpus_path, subcorpus_name)
            if not os.path.exists(subcorpus_path):
                os.makedirs(subcorpus_path)
def afterModifyCol8_splitCol8(): col8_splitup = [] with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t: trainTup = json.load(t) corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1" corenlp = StanfordCoreNLP(corenlp_dir) for num, tup in enumerate(trainTup): print 'No.',num print 'orin: ',tup[8] res = corenlp.parse(tup[8]) par = json.loads(res) print par slist = par["sentences"][0]['words'] # print slist temp = [] for s in slist: temp.append(s[0]) col8_splitup.append(temp) print temp ## check dependencies split dlist = par['sentences'][0]['dependencies'] demp = [] for d in dlist: demp.append(d) print demp if num == 4: break
def afterModifyCol8_splitCol8(): col8_splitup = [] with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t: trainTup = json.load(t) corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1" # corenlp_dir = "stanford-corenlp-full-2015-01-29" # corenlp_dir = "stanford-corenlp-full-2013-06-20" corenlp = StanfordCoreNLP(corenlp_dir) # res = corenlp.parse("Bell, a company which is based in LA, makes and distributes computer products. I hate you.") # par = json.loads(res) # for i in par["sentences"][0]['dependencies']: # print i for num, tup in enumerate([trainTup[1853]]): print 'No.',num print 'orin: ',tup[8] res = corenlp.parse(tup[8]) par = json.loads(res) # print par slist = par["sentences"][0]['words'] # print slist temp = [] for s in slist: temp.append(s[0]) col8_splitup.append(temp) print temp ## check dependencies split dlist = par['sentences'][0]['dependencies'] demp = [] for d in dlist: demp.append(d) print demp if num == 4: break
def f1f2f3f4f5f6f7(file_,file2_): ## using corenlp to do split up job ## corenlp setting corenlp_dir = "stanford-corenlp-full-2014-08-27/" corenlp = StanfordCoreNLP(corenlp_dir) ## load dataset with open(file_) as t: trainTup = json.load(t) ## data structure to hold fea1 to fea7 a list feaLst = [] for num,tup in enumerate(trainTup): ## after modify col8 and save, col8 now may be empty.. if not tup[8]: continue print "No. %d tup in processing.." % (num) ## use corenlp to splitup res = corenlp.parse(tup[8]) par = json.loads(res) print tup[8] ## use corenlp to get lemma and pos for p,word in enumerate(par["sentences"][0]['words']): print str(p)+'th w in tupl '+str(num) tmp = {} tmp['Word'] = word[0] tmp['Lemma'] = word[1]['Lemma'] tmp['POS'] = word[1]['PartOfSpeech'] feaLst.append(tmp) ## add pos-1,pos+1,pos-2 and pos+2 slen = len(feaLst) for ind,val in enumerate(feaLst): if (ind-1) >= 0 and (ind-1) <= slen-1: val['POS-1'] = feaLst[ind-1]['POS'] else: val['POS-1'] = "NA" if (ind+1) >= 0 and (ind+1) <= slen -1: val['POS+1'] = feaLst[ind+1]['POS'] else: val['POS+1'] = "NA" if (ind-2) >= 0 and (ind-2) <= slen -1: val['POS-2'] = feaLst[ind-2]['POS'] else: val['POS-2'] = "NA" if (ind+2) >=0 and (ind+2) <= slen -1: val['POS+2'] = feaLst[ind+2]['POS'] else: val['POS+2'] = "NA" for i in feaLst: print 'w:',i['Word'],' lemma:',i['Lemma'],' pos-2:',i['POS-2'],' pos-1:',i['POS-1'],' pos:',i['POS'],' pos+1:',i['POS+1'],' pos+2:',i['POS+2'] with open(file2_,'w') as o: json.dump(feaLst,o) print len(feaLst) print len(trainTup)
def corenlpLemmaPOS_stanfordparserDependency_split_equalChecking(): ## corenlp setting corenlp_dir = "stanford-corenlp-full-2014-08-27/" corenlp = StanfordCoreNLP(corenlp_dir) ## stanfordDependencies setting sd = StanfordDependencies.get_instance(backend="subprocess", version="3.4.1") os.environ["STANFORD_PARSER"] = "stanford-parser-full-2014-08-27/" os.environ["STANFORD_MODELS"] = "stanford-parser-full-2014-08-27/" parser = stanford.StanfordParser( model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ) with open("../../dataclean_Nov8_2015/train_afterdataclean_modifiedcleanedTupleNov8.json") as t: trainTup = json.load(t) for num, tup in enumerate(trainTup): ## after modify col8 and save, col8 now may be empty.. if not tup[8]: continue ## use corenlp to split sentence print "No.", num print tup[8] res = corenlp.parse(tup[8]) par = json.loads(res) slist = par["sentences"][0]["words"] print slist temp = [] for s in slist: temp.append(s[0]) print temp ## use stanfordDependencies to do split sentence sentences = parser.raw_parse(tup[8]) s = "" for line in sentences: for sentence in line: s += str(sentence) sent = sd.convert_tree(s) print sent detemp = [] for t in sent: detemp.append(t[1]) print detemp for di, ti in zip(detemp, temp): if di == ti: pass else: if ( (ti == "(" and di == "-LRB-") or (ti == ")" and di == "-RRB-") or (ti == "[" and di == "-LSB-") or (ti == "]" and di == "-RSB-") ): print "diff in parenthesis" pass else: print "{", di, " ,", ti, " }"
def checkCoreNLPSplit_DependencySplit(file_): with open(file_) as f: tset = json.load(f) ## corenlp setting corenlp_dir = "stanford-corenlp-full-2014-08-27/" corenlp = StanfordCoreNLP(corenlp_dir) ## stanfordDependencies setting sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1') os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/' os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/' parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") for num, tup in enumerate(tset): print num if not tup[8]: continue ## use corenlp to splitup res = corenlp.parse(tup[8]) par = json.loads(res) slist = par["sentences"][0]['words'] temp = [] for s in slist: temp.append(s[0]) ## use stanfordDependencies to do split sentence sentences = parser.raw_parse(tup[8]) s="" for line in sentences: for sentence in line: s+=str(sentence) sent = sd.convert_tree(s) detemp = [] for t in sent: detemp.append(t[1]) ## check if same for di,ti in zip(detemp,temp): if di == ti: pass else: if (ti == '(' and di == '-LRB-') or (ti == ')' and di == '-RRB-') or (ti == '[' and di == '-LSB-') or (ti == ']' and di == '-RSB-'): print "diff in parenthesis" pass else: print "!!!" print "{",di,' ,',ti," }"
def parse(sentence):
    from corenlp import StanfordCoreNLP
    parser = StanfordCoreNLP()
    data = parser.parse(sentence)
    #print data
    open_file = open("data.json", "wb")
    open_file.write(data)
    open_file.close()
def request_features_from_stanford(data_dir, flag):
    all_sentences, _ = read_tsv(path.join(data_dir, flag + '.tsv'))
    sentences_str = []
    for sentence in all_sentences:
        sentence = ['·' if i == '.' else i for i in sentence]
        if sentence[-1] == '·':
            sentence[-1] = '.'
        sentences_str.append(' '.join(sentence))
    all_data = []
    with StanfordCoreNLP(FULL_MODEL, lang='en') as nlp:
        for sentence in tqdm(sentences_str):
            props = {
                'timeout': '5000000',
                'annotators': 'pos, parse, depparse',
                'pipelineLanguage': 'en',
                'outputFormat': 'json'
            }
            results = nlp.annotate(sentence, properties=props)
            # results = nlp.request(annotators='deparser', data=sentence)
            # results = nlp.request(annotators='pos', data=sentence)
            # result = results['sentences'][0]
            all_data.append(results)
    # assert len(all_data) == len(sentences_str)
    with open(path.join(data_dir, flag + '.stanford.json'), 'w', encoding='utf8') as f:
        for data in all_data:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
def request_features_from_stanford(data_dir, flag): all_sentences = read_txt(path.join(data_dir, flag + '.txt')) sentences_str = [] for sentence, tags in all_sentences: sentence = [change(i) for i in sentence] sentences_str.append([sentence, tags]) all_data = [] with StanfordCoreNLP(FULL_MODEL, lang='en') as nlp: for sentence, tags in tqdm(sentences_str): props = { 'timeout': '5000000', 'annotators': 'pos, parse, depparse', 'tokenize.whitespace': 'true', 'ssplit.eolonly': 'true', 'pipelineLanguage': 'en', 'outputFormat': 'json' } results = nlp.annotate(' '.join(sentence), properties=props) results["tags"] = tags results["word"] = sentence all_data.append(results) assert len(all_data) == len(sentences_str) with open(path.join(data_dir, flag + '.stanford.json'), 'w', encoding='utf8') as f: for data in all_data: json.dump(data, f, ensure_ascii=False) f.write('\n')
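The writers above emit one JSON object per line; a matching reader is sketched below, assuming the file was produced exactly as written above (load_stanford_features is a hypothetical helper name).

import json
from os import path

def load_stanford_features(data_dir, flag):
    # Each line of <flag>.stanford.json holds one annotated sentence,
    # written by json.dump(...) followed by a newline (a dict in this setup).
    results = []
    with open(path.join(data_dir, flag + '.stanford.json'), encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                results.append(json.loads(line))
    return results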
class CoreNLP(object):
    '''Connect CoreNLP server'''
    _NLP = StanfordCoreNLP(os.environ.get('CORENLP_URL') or 'http://localhost:9000')
    _LOCAL_DEMO_PROP = {
        'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse, openie, coref',
        "openie.resolve_coref": "true",
        'outputFormat': 'json'
    }
    _ONLINE_DEMO_PROP = {
        "annotators": "tokenize,ssplit,pos,ner,depparse,openie,coref",
        "coref.md.type": "dep",
        "coref.mode": "statistical",
        'outputFormat': 'json'
    }

    @staticmethod
    def annotate(text):
        '''Get result from CoreNLP via JSON'''
        try:
            return CoreNLP.nlp().annotate(text, properties=CoreNLP._ONLINE_DEMO_PROP)
        except UnicodeError:
            pprint(text)

    @staticmethod
    def nlp():
        '''Return CoreNLP Server'''
        return CoreNLP._NLP
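A minimal usage sketch for the wrapper above, assuming a CoreNLP server is reachable at CORENLP_URL or localhost:9000; the sample sentence is illustrative.

import json

result = CoreNLP.annotate("Bell, based in Los Angeles, makes and distributes computer products.")
# With 'outputFormat': 'json' the server replies with one entry per sentence;
# depending on the wrapper version, annotate() may return the parsed dict or
# the raw JSON string, so normalise before use.
if isinstance(result, str):
    result = json.loads(result)
if result:
    print(len(result["sentences"]))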
class NLPParser(object):
    """
    NLP parse, including Part-Of-Speech tagging and dependency parse.

    Attributes
    ==========
    parser: StanfordCoreNLP
        the Stanford Core NLP parser
    """
    def __init__(self, corenlp_dir):
        self.parser = StanfordCoreNLP(corenlp_dir)

    def parse(self, sent):
        """
        Part-Of-Speech tagging and dependency parse.
        :param sent: string
        :return: a list of tuple (word, pos, dependency)
        """
        result = self.parser.raw_parse(sent)
        tuples = []
        for s in result['sentences']:
            word, pos, dependency = [], [], []
            for dep in s['dependencies']:
                dependency.append({'type': dep[0], 'dep': int(dep[2]) - 1, 'gov': int(dep[4]) - 1})
            for w in s['words']:
                word.append(w[0])
                pos.append(w[1]['PartOfSpeech'])
            tuples.append((word, pos, dependency))
        return tuples
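A short usage sketch for the class above; the CoreNLP directory path and sentence are placeholders, not values from the original code.

parser = NLPParser("stanford-corenlp-full-2014-08-27/")  # path is an assumption
for words, pos_tags, deps in parser.parse("The cat sat on the mat."):
    # Each tuple pairs the sentence's tokens with their POS tags and a list of
    # {'type', 'gov', 'dep'} dicts holding 0-based token indices.
    for word, tag in zip(words, pos_tags):
        print(word + "/" + tag)
    print(deps)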
class Parser():
    def __init__(self):
        #corenlp_dir = "/export/data/ghpaetzold/simpatico/server_simplifiers/core_nlp/stanford-corenlp-full-2016-10-31/"
        corenlp_dir = "/export/data/cscarton/simpatico/stanford-corenlp-full-2016-10-31/"
        self.corenlp = StanfordCoreNLP(corenlp_dir, memory="4g", properties='galician.myproperties.properties')

    def process(self, sentence):
        #sentences = open(self.doc, "r").read().strip().split("\n")
        #sentences = [l.strip().split(' ') for l in f_read]
        #dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        return self.corenlp.raw_parse(sentence)['sentences'][0]

    def transform(self, parsed):
        dict_dep = {}
        for rel, _, head, word, n in parsed['dependencies']:
            n = int(n)
            head = int(head)
            if head not in dict_dep.keys():
                dict_dep[head] = {}
            if rel not in dict_dep[head].keys():
                dict_dep[head][rel] = []
            dict_dep[head][rel].append(n)
        return dict_dep
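To make the shape of transform()'s input and output concrete, here is a small hand-worked data example in the 5-tuple dependency layout the method above unpacks; the values are illustrative, not real parser output.

# One 'dependencies' entry per relation, unpacked above as
# (relation, ignored, governor index, dependent word, dependent index).
parsed = {'dependencies': [
    ('root', '_', '0', 'sat', '2'),
    ('nsubj', '_', '2', 'cat', '1'),
    ('det', '_', '1', 'The', '3'),
]}
# Parser.transform(parsed) groups dependent indices by governor and relation:
# {0: {'root': [2]}, 2: {'nsubj': [1]}, 1: {'det': [3]}}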
def request_features_from_stanford(data_path, do_predict=False): data_dir = data_path[:data_path.rfind('/')] flag = data_path[data_path.rfind('/') + 1:data_path.rfind('.')] if os.path.exists(path.join(data_dir, flag + '.stanford.json')): print('The Stanford data file for %s already exists!' % str(data_path)) return None print('Requesting Stanford results for %s' % str(data_path)) if do_predict: all_sentences, _ = read_sentence(data_path) else: all_sentences, _ = read_tsv(data_path) sentences_str = [] for sentence in all_sentences: sentences_str.append(''.join(sentence)) all_data = [] with StanfordCoreNLP(FULL_MODEL, lang='zh') as nlp: for sentence in tqdm(sentences_str): results = nlp.request(annotators='parse,depparse', data=sentence) # result = results['sentences'][0] result = merge_results(results['sentences']) all_data.append(result) # assert len(all_data) == len(sentences_str) with open(path.join(data_dir, flag + '.stanford.json'), 'w', encoding='utf8') as f: for data in all_data: json.dump(data, f, ensure_ascii=False) f.write('\n')
def sentToParse(Res, num_sents):
    # load corenlp
    sys.path.insert(0, osp.join(ROOT_DIR, 'pyutils', 'corenlp'))
    from corenlp import StanfordCoreNLP
    parser_path = osp.join(ROOT_DIR, 'pyutils', 'corenlp', 'stanford-corenlp-full-2015-01-30')
    stanfordParser = StanfordCoreNLP(parser_path)
    print 'stanford parser loaded.'
    # start parsing
    num_sents = len(Res) if num_sents < 0 else num_sents
    for i in range(num_sents):
        ref_id, sent = Res[i]['ref_id'], Res[i]['sent']
        parse = stanfordParser.raw_parse(sent)['sentences'][0]
        Res[i]['parse'] = parse
        print '%s/%s sent is parsed.' % (i + 1, num_sents)
def getClient():
    socket = TSocket.TSocket('localhost', port)
    socket.setTimeout(10000)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = StanfordCoreNLP.Client(protocol)
    transport.open()
    return client
def remove_tuples0MoreThan1BiasedWord_fromOriginalTuple(): ## using corenlp to do split up job ## corenlp setting corenlp_dir = "stanford-corenlp-full-2014-08-27/" corenlp = StanfordCoreNLP(corenlp_dir) ## load dataset with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t: trainTup = json.load(t) # with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_stripPuncNum_Dec9.json') as a: # verify = json.load(a) b = open('../../devDataclean_Dec8_2015/dev_biasword0ormorethan1_modifiedFile_dec11.txt','w') res2 = [] for num,tup in enumerate(trainTup): print num ## after modify col8 and save, col8 now may be empty.. if not tup[8]: continue ## use corenlp to splitup res = corenlp.parse(tup[8]) par = json.loads(res) slist = par["sentences"][0]['words'] temp = [] for s in slist: temp.append(s[0]) ## count of biased word cnum = temp.count(tup[6]) if cnum == 1: ## verify if the qualified sent is the same as the split col8 file: dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json # if (verify[num][2] == tup[6]) and (verify[num][0] == tup[0]): res2.append(tup) # else: # print "two file are diff" # print verify[num] # print tup # sys.exit() else: b.write(str(tup)+'\n') with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_elimBiasWord0orMoreThanOne_fullTup_Dec11.json','w') as f: json.dump(res2,f) b.close()
class NLPParser(object): """ NLP parse, including Part-Of-Speech tagging. Attributes ========== parser: StanfordCoreNLP the Staford Core NLP parser """ def __init__(self, corenlp_dir): self.parser = StanfordCoreNLP(corenlp_dir) #self.parser = POSTagger(corenlp_dir+'/models/english-bidirectional-distsim.tagger', corenlp_dir+'/stanford-postagger.jar') def parse(self, sent): """ Part-Of-Speech tagging :param sent: string :return: a list of tuple (tokens, pos) """ """ tokens = [] pos = [] result = self.parser.tag(sent.split()) for entry in result: tokens.append(entry[0]) pos.append(entry[1]) tuples = [tokens, pos] return tuples """ result = self.parser.raw_parse(sent) tuples = [] word, pos = [], [] for s in result['sentences']: for w in s['words']: word.append(w[0]) pos.append(w[1]['PartOfSpeech']) pattern = re.compile('\[Text=') tokenpattern = re.compile('\[Text=[^\s]+\s') pospattern = re.compile('PartOfSpeech=[^\s]+\s') startIdxed = [] for t in re.finditer(pattern, s['parsetree']): startIdxed.append(t.start()) for i in range(len(startIdxed)): start = startIdxed[i] if i < len(startIdxed) - 1: end = startIdxed[i+1] else: end = -1 token = s['parsetree'][start:end] text = re.findall(tokenpattern, token) partOfSpeech = re.findall(pospattern, token) word.append(text[0][6:-1]) pos.append(partOfSpeech[0][13:-1]) tuples.append((word, pos)) #print tuples return tuples
def stanford_parse(data, corpus_name = 'corpus'): from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) print "\n%s: Initialising CoreNLP... \n" % thetime import os import ast try: from corenlp import StanfordCoreNLP except: raise ValueError("CoreNLP not installed.") from corpkit.progressbar import ProgressBar corenlp = StanfordCoreNLP() if not os.path.exists(corpus_name): os.makedirs(corpus_name) p = ProgressBar(len(data)) for index, datum in enumerate(data): p.animate(index) text = datum[0] metadata = datum[1] number_of_zeroes = len(str(len(data))) - 1 filename = str(index).zfill(number_of_zeroes) + '.txt' file_data = [] parsed_text = ast.literal_eval(corenlp.parse(text)) trees = [] raw_texts = [] for index, sent in enumerate(parsed_text['sentences']): syntax_tree = sent['parsetree'] plain_text = sent['text'] trees.append(syntax_tree) raw_texts.append(plain_text) #subcorpus_path = os.path.join(new_corpus_path, subcorpus_name) file_data = ['<raw>' + '\n'.join(raw_texts) + '\n</raw>', '<parse>' + '\n'.join(trees) + '\n</parse>', ] if not os.path.exists(os.path.join(corpus_name, metadata)): os.makedirs(os.path.join(corpus_name, metadata)) try: fo=open(os.path.join(corpus_name, metadata, filename),"w") except IOError: print "Error writing file." fo.write('\n'.join(file_data)) fo.close() p.animate(len(data)) print 'Done!'
def parse_file():
    sentence_file = open('sentences.txt', 'w')
    dep_file = open('deps.txt', 'w')
    tree_file = open('trees.txt', 'w')
    abstracts = [line.strip() for line in open('relabs.txt', 'r')]
    corenlp = StanfordCoreNLP()
    for abstract in abstracts:
        parse = corenlp.parse(abstract)
        xml = json.loads(parse)
        sentences = xml['sentences']
        for sentence in sentences:
            # Write sentence
            sentence_file.write(sentence['text'] + "\n")
            # Write parse tree
            tree_file.write(sentence['parsetree'] + "\n")
            # Write dependencies
            for dep in sentence['dependencies']:
                dep_file.write('@'.join(dep) + "\t")
            dep_file.write("\n")
    dep_file.close()
    tree_file.close()
    sentence_file.close()
def split_one(params): from corenlp import StanfordCoreNLP index, fn, start, end = params # skip 'start' line lno = 0 fin = open(fn) while lno < start: fin.readline() lno += 1 ret = [] parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/", properties="default.properties", serving=False) for i in xrange(start, end): line = fin.readline() ll = line.decode('utf8').strip().split('\t') """pay attention to here !!!""" # if len(ll) != 3: # continue # if not ll[2].endswith('@en'): # continue # text = ll[2][1:-4] if len(ll) != 2: continue text = ll[1] text = text.replace('\\n', ' ').replace('\\r', ' ') try: rsp = json.loads(parser.parse(text)) sentences = [] for s in rsp['sentences']: sentences.append(s['text']) ret.append(('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8')) except Exception as e: print e fin.close() return ret
def local_split_description(fn_in, fn_out): from corenlp import StanfordCoreNLP parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/", properties="default.properties", serving=False) with open(fn_out, 'w') as fout: with open(fn_in) as fin: for line in fin: ll = line.decode('utf8').strip().split('\t') if len(ll) != 3: continue if not ll[2].endswith('@en'): continue text = ll[2][1:-4] text = text.replace('\\n', ' ').replace('\\r', ' ') try: rsp = json.loads(parser.parse(text)) sentences = [] for s in rsp['sentences']: sentences.append(s['text']) print >> fout, ('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8') except Exception as e: print e.message
def run(self):
    import sys
    from corenlp import StanfordCoreNLP
    import jsonrpc
    sys.__stdin__ = sys.__stdout__
    server = jsonrpc.Server(
        jsonrpc.JsonRpc20(),
        jsonrpc.TransportTcpIp(addr=("0.0.0.0", int(self.port))))
    nlp = StanfordCoreNLP()
    server.register_function(nlp.parse)
    server.register_function(nlp.parse_file)
    print "registering parse_file"
    server.register_function(lambda *a, **k: 'pong', 'ping')
    try:
        server.serve()
    except KeyboardInterrupt:
        print("%d exiting" % self.port)
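A matching client sketch, assuming the bundled jsonrpc module also exposes the ServerProxy, JsonRpc20, and TransportTcpIp client classes used in corenlp-python's examples; the address, port, and sentence are assumptions.

import json
import jsonrpc

client = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))
# parse() was registered above and returns the usual JSON string.
result = json.loads(client.parse("Hello world. It is so beautiful."))
print(result["sentences"][0]["text"])
# The 'ping' function registered above works as a cheap liveness check.
print(client.ping())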
def request_features_from_stanford(data_dir, flag): all_sentences, _ = read_txt(path.join(data_dir, flag + '.txt')) sentences_str = [] for sentence in all_sentences: sentence = [change(i) for i in sentence] # if sentence[-1] == '·': # sentence[-1] = '.' sentences_str.append(' '.join(sentence)) all_data = [] with StanfordCoreNLP(FULL_MODEL, lang='zh', port=randint(38400, 38596)) as nlp: for sentence in tqdm(sentences_str): props = { 'timeout': '5000000', 'annotators': 'pos, parse, depparse', 'tokenize.whitespace': 'true', 'ssplit.eolonly': 'true', 'pipelineLanguage': 'en', 'outputFormat': 'json' } results = nlp.annotate(sentence, properties=props) # results = nlp.request(annotators='deparser', data=sentence) # results = nlp.request(annotators='pos', data=sentence) # result = results['sentences'][0] all_data.append(results) # assert len(all_data) == len(sentences_str) print(all_data) with open(path.join(data_dir, flag + '.stanford.json'), 'w', encoding='utf8') as f: for data in all_data: json.dump(data, f, ensure_ascii=False) f.write('\n')
#!/usr/bin/env python
import sys, bz2
sys.path.insert(0, '/Users/timpalpant/Documents/Workspace/corenlp-python')
import nltk
from nltk.tree import Tree
from corenlp import StanfordCoreNLP
from remove_random_word import remove_random_word

print("Booting StanfordCoreNLP")
nlp = StanfordCoreNLP()

print("Initializing train file")
train = bz2.BZ2File('../data/train_v2.txt.bz2')
for line in train:
    rline = remove_random_word(line)
    lparse = nlp.raw_parse(line)
    ltree = Tree.fromstring(lparse['sentences'][0]['parsetree'])
    rparse = nlp.raw_parse(rline)
    rtree = Tree.fromstring(rparse['sentences'][0]['parsetree'])
    print(ltree)
    print(rtree)
def stanfordParse(text, corenlpDir='corenlp/stanford-corenlp-full-2014-01-04'):
    global stanford
    if stanford is None:
        stanford = StanfordCoreNLP(corenlpDir)
    return stanford.raw_parse(text)
class MyExtract(object): ''' classdocs ''' def __init__(self): ''' constructor ''' self.rawcorpus = None self.corpus = [] self.pars = [] self.wordspace = None self.docspace = None self.stop = set(stopwords.words('english')) self.parser = None self.prelations = [] self.nrelations = [] def buildRawCorpus(self, myfile): ''' extract text from xml files ''' corpus = "" for txtfile in glob.glob(devdata + myfile): print "reading " + txtfile xmldoc = minidom.parse(txtfile) itemlist = xmldoc.getElementsByTagName('text') for s in itemlist: text = s.firstChild.data if "." in text: corpus = corpus + " " + text self.rawcorpus = corpus.encode("utf-8") def buildCorpus(self): ''' preprocess raw text (tokenize, remove stopwords) ''' sents = self.rawcorpus.split(".") for sent in sents: toks = [ w.lower() for w in nltk.word_tokenize(sent.decode('utf-8')) if w.lower() not in self.stop ] self.corpus.append(toks) def tokenizeAbs(self, parag): ''' preprocess raw text (tokenize, remove stopwords) ''' toks = [ w.lower() for w in nltk.word_tokenize(parag) if w.lower() not in self.stop ] return toks def buildRawSents(self, myfile): for txtfile in glob.glob(devdata + myfile): xmldoc = minidom.parse(txtfile) itemlist0 = xmldoc.getElementsByTagName('document') count = 0 for it0 in itemlist0: parag = "" itemlist = it0.getElementsByTagName('text') for item in itemlist: if '.' in item.firstChild.data: parag = parag + " " + item.firstChild.data toks = self.tokenizeAbs(parag.encode("utf-8").decode('utf-8')) lab = [txtfile + '_' + ` count `] self.pars.append(doc2vec.LabeledSentence(words=toks, tags=lab)) count = count + 1 def exploreCDRCorpus(self, myfile, maxsize): ''' extract entities + relations from xml ''' diseases = {} chemicals = {} relations = [] xmldoc = minidom.parse(myfile) itemlist0 = xmldoc.getElementsByTagName('document') count = 0 for it0 in itemlist0: print "\t- processing abstract " + ` count ` parsed = self.docspace.docvecs[myfile + "_" + ` count `] itemlist1 = it0.getElementsByTagName('annotation') print "\t\t+ " + ` len(itemlist1) ` + " entities" for it1 in itemlist1: itemlist2 = it1.getElementsByTagName('infon') typ = itemlist2[0].firstChild.data mesh = itemlist2[len(itemlist2) - 1].firstChild.data text = it1.getElementsByTagName( 'text')[0].firstChild.data.lower() codes = mesh.split('|') for code in codes: ent = MyEntity(text, code, typ) if (typ == 'Chemical'): chemicals[code] = ent if (typ == 'Disease'): diseases[code] = ent itemlist3 = it0.getElementsByTagName('relation') print "\t\t+ " + ` 2 * len( itemlist3) ` + " positive and negative relations" print "\t\t\t* extracting features for positive relations" print "\t\t\t* extracting features for negative relations" for it3 in itemlist3: itemlist4 = it3.getElementsByTagName('infon') key1 = itemlist4[1].firstChild.data key2 = itemlist4[2].firstChild.data e1 = chemicals[key1] e2 = diseases[key2] e1.bow = self.avgBOW(e1.text) e2.bow = self.avgBOW(e2.text) rel = MyRelation(e1, e2, '1') rel.abs = parsed self.prelations.append(rel) relations.append(key1 + "_" + key2) num = 0 for key1 in chemicals.keys(): for key2 in diseases.keys(): if key1 + "_" + key2 not in relations: if num < len(itemlist3): e1 = chemicals[key1] e2 = diseases[key2] e1.bow = self.avgBOW(e1.text) e2.bow = self.avgBOW(e2.text) rel = MyRelation(e1, e2, '-1') rel.abs = parsed self.nrelations.append(rel) num = num + 1 count = count + 1 if (count == maxsize): break def exploreDDICorpus(self, myfile, maxsize, ftyp): ''' extract entities + relations from xml ''' #print(myfile) xmldoc = 
minidom.parse(myfile) itemlist0 = xmldoc.getElementsByTagName('document') count = 0 for it0 in itemlist0: # abstract with annotations print "\t- processing abstract " + ` count ` drugs = {} # entities itemlist1 = it0.getElementsByTagName('annotation') print "\t\t+ " + ` len(itemlist1) ` + " entities" for it1 in itemlist1: itemlist2a = it1.getElementsByTagName('infon') typ = itemlist2a[0].firstChild.data print typ itemlist2b = it1.getElementsByTagName('text') text = itemlist2b[0].firstChild.data.lower() print text ent = MyEntity(text, "", typ) ent.bow = self.avgBOW(ent.text) drugs[text] = ent # abstract itemlist3 = it0.getElementsByTagName('text') abstract = "" for it3 in itemlist3: if (len(it3.firstChild.data.split()) > 3): abstract = abstract + it3.firstChild.data # parse abstract parsed = self.parseSentence(abstract) #stanford docvec = self.docspace.docvecs[myfile + "_" + ` count `] #doc2vec #print len(drugs.keys()) if (len(drugs.keys()) > 1): e1 = drugs[drugs.keys()[0]] e2 = drugs[drugs.keys()[1]] e1.bow = self.avgBOW(e1.text) e2.bow = self.avgBOW(e2.text) #print(ftyp) if (ftyp == "positive"): #print(parsed) rel = MyRelation(e1, e2, '1') rel.abs = docvec rel.parse = parsed.encode("utf-8") self.prelations.append(rel) if (ftyp == "negative"): #print(docvec) rel = MyRelation(e1, e2, '-1') rel.abs = docvec rel.parse = parsed.encode("utf-8") self.nrelations.append(rel) # increment counter count = count + 1 if (count == maxsize): break def avgBOW(self, entity): bow = [] ents = entity.split(" ") i = 0 while i < self.wordspace.layer1_size: v = 0 for ent in ents: if ent in self.wordspace.vocab: v = v + self.wordspace[ent][i] bow.append(v / len(ents)) i = i + 1 return np.array(bow) def buildWordSpace(self, modelfile): ''' compute distributional model ''' model = Word2Vec(self.corpus, min_count=1, size=20, iter=100, workers=4) model.save(modelfile) self.wordspace = model def buildDocSpace(self, modelfile): ''' compute distributional model ''' model = doc2vec.Doc2Vec(self.pars, min_count=5, size=20, iter=100, workers=4) model.save(modelfile) self.docspace = model def loadWordSpace(self, modelfile): ''' compute distributional model ''' self.wordspace = Word2Vec.load(devdata + modelfile) def loadDocSpace(self, modelfile): ''' compute distributional model ''' self.docspace = doc2vec.Doc2Vec.load(devdata + modelfile) def loadParser(self): corenlp_dir = os.environ['STANFORD'] self.parser = StanfordCoreNLP(corenlp_dir + "/") # wait a few minutes... def parseSentence(self, sentence): parsed = self.parser.raw_parse(sentence)['sentences'][0]['parsetree'] return parsed
def run(debug, host, port, close, memory, input, output, arango, user, project, limit, pictures, summary, relations, corefs, newgraph, documentedges): uwsgi.cache_update('busy', b'1') if debug: logging.basicConfig(level=logging.DEBUG) logging.debug("Debug on.") else: logging.basicConfig(level=logging.INFO) nlp_bytes = None nlp_bytes = uwsgi.cache_get('nlp') # Set progress bar start parameters if nlp_bytes: init_time = 2 else: init_time = 10 if pictures or summary: nlp_time = 60 else: nlp_time = 80 yield "data:1\n\n" # If standford corenlp server host and port given use that, otherwise start a new instance through python wrapper if host and port: if nlp_bytes: temp_nlp = pickle.loads(nlp_bytes) temp_nlp.close() nlp = StanfordCoreNLP(host, port) uwsgi.cache_update('nlp', pickle.dumps(nlp)) logging.debug("nlp to cache: host {}".format(uwsgi.cache_get('nlp'))) elif nlp_bytes: nlp = pickle.loads(nlp_bytes) logging.debug("nlp from cache: {}".format(uwsgi.cache_get('nlp'))) else: nlp = StanfordCoreNLP(r'../deps/stanford-corenlp/', memory=memory, timeout=200000, quiet=not debug) uwsgi.cache_update('nlp', pickle.dumps(nlp)) logging.debug("nlp to cache: file {}".format(uwsgi.cache_get('nlp'))) DOC_CHUNK_SIZE = 10000 # Initialise corenlp properties, s3 bucket connection, and doc count for progress bar data, n_items, properties, s3 = init(input, output, nlp, relations=relations, corefs=corefs, chunk_size=DOC_CHUNK_SIZE, limit=limit) logging.debug("items to process: {}".format(n_items)) logging.debug("Loading CoreNLP models...") # Load corenlp models in separate thread to allow to send regular pings to the frontend server_init_thread = Thread(target=nlp.annotate, args=("", properties)) server_init_thread.start() while server_init_thread.is_alive(): time.sleep(30) yield "data:1\n\n" else: server_init_thread.join() yield "data:" + str(init_time) + "\n\n" # Create or load existing networkx graph object for this project graph_path = os.path.join(output, user, project, "nlp_outputs", 'graph_temp.pkl') if not newgraph: if output[:5] == 's3://' and s3.exists(graph_path): with s3.open(graph_path, 'rb') as f: logging.debug("Reading existing graph...") G = nx.read_gpickle(f) elif os.path.isfile(graph_path): G = nx.read_gpickle(graph_path) else: G = nx.MultiGraph() else: if arango: r = requests.delete("http://" + arango + "/ingest/" + user + "/" + project + "/") G = nx.MultiGraph() # Main NLP parsing loop. Run corenlp annotator pipeline, resolve coreferences and extract relations. 
Then load into networkx graph i = 0 for document in parse_docs(data, input, output, user, project, nlp, properties, chunk_size=DOC_CHUNK_SIZE, limit=limit, s3=s3): yield "data:" + str(int(i / n_items * nlp_time) + init_time) + "\n\n" if corefs: resolve_coreferences(document[1]) yield "data:" + str(int(i / n_items * nlp_time) + init_time) + "\n\n" for r in make_entity_relationships(document[0], document[1], document[2], document[3], relations=relations, documentedges=documentedges): key_suffix = r.semantic_type or "" G.add_edge(r.entity1._key, r.entity2._key, key=r.type + key_suffix, source_file=r.source_file, word_dist=r.word_dist, document_id=r.document_id, document_date=r.document_date, from_char_offset=(r.e1_char_start, r.e1_char_end), to_char_offset=(r.e2_char_start, r.e2_char_end), semantic_type=r.semantic_type, label_first=r.entity1.label_orig, label_second=r.entity2.label_orig) nodes = [] elements1 = r.entity1.__dict__ nodes.append((r.entity1._key, elements1)) elements2 = r.entity2.__dict__ nodes.append((r.entity2._key, elements2)) G.add_nodes_from(nodes) yield "data:" + str(int(i / n_items * nlp_time) + init_time) + "\n\n" i += 1 # Close the NLP server if required. Keep open to avoid model loading next time if close: nlp.close() uwsgi.cache_del('nlp') logging.debug("Calculating same sentence centrality...") set_type_centrality(G, "same_sentence") if documentedges: yield "data:" + str(init_time + nlp_time + 2) + "\n\n" set_type_centrality(G, "same_document") yield "data:" + str(init_time + nlp_time + 5) + "\n\n" else: yield "data:" + str(init_time + nlp_time + 5) + "\n\n" # Write graph object to JSON representation out_data = json_graph.node_link_data(G) # Serialise and write the graph object for use in next upload if output[:5] == 's3://': with s3.open(graph_path, 'wb') as f: nx.write_gpickle(G, f) else: nx.write_gpickle(G, graph_path) del G # remove and rename output variables to fit data api requirements out_data.pop('directed') out_data.pop('multigraph') out_data['vertices'] = out_data.pop('nodes') out_data['edges'] = out_data.pop('links') # Run wikipedia lookups of thumbnail urls and article summaries if pictures or summary: processes = [] with ThreadPoolExecutor(max_workers=None) as executor: for idx, v in enumerate(out_data['vertices']): v.pop('id') if v['_key'].split("_")[-1] not in ('LOCATION', 'MISC', 'ORGANIZATION', 'PERSON', 'COREF'): url = 'https://en.wikipedia.org/wiki/' + v['_key'] processes.append( executor.submit(getWikiImageSummary, url, pictures, summary, idx)) i = 0 for task in as_completed(processes): logging.debug( "Finished processing vertex: {} out of {}".format( i + 1, len(processes))) imageurl, summarytext, idx = task.result() out_data['vertices'][idx]['image_url'], out_data['vertices'][ idx]['summary'] = imageurl, summarytext if i % 10 == 0: yield "data:" + str( int(i / len(processes) * (80 - nlp_time)) + nlp_time + init_time + 5) + "\n\n" i += 1 # More renaming to fit data api requirements for e in out_data['edges']: e['_from'] = "vertices/" + clean_label(e.pop('source')) e['_to'] = "vertices/" + clean_label(e.pop('target')) e['type'] = e.pop('key')[:13] e['_key'] = str(uuid.uuid4()) yield "data:96\n\n" # Either load data into arango db, or save json representation to file system or s3 LINE_LIMIT = 100000 if arango: logging.debug("sending: {}, {}, {}".format(arango, user, project)) send_to_arango(out_data, arango, user, project, LINE_LIMIT, doc_type="vertices") yield "data:97\n\n" send_to_arango(out_data, arango, user, project, LINE_LIMIT, 
doc_type="same_sentence") yield "data:98\n\n" if documentedges: logging.debug("adding document edges") send_to_arango(out_data, arango, user, project, LINE_LIMIT, doc_type="same_document") else: edges_ss = [ e for e in out_data['edges'] if e['type'] == "same_sentence" ] if documentedges: edges_sd = [ e for e in out_data['edges'] if e['type'] == "same_document" ] write_list_in_chunks(out_data['vertices'], LINE_LIMIT // 10, output, user, project, 'vertices') yield "data:97\n\n" write_list_in_chunks(edges_ss, LINE_LIMIT, output, user, project, 'edges_ss') yield "data:98\n\n" if documentedges: write_list_in_chunks(edges_sd, LINE_LIMIT, output, user, project, 'edges_sd') uwsgi.cache_del('busy') yield "data:100\n\n"
class BasicStanfordCoreNLP(UtteranceProcessor): ''' Basic version doesn't do anything with coref, const. and depend. parses produced by analysis. For now, words from all sentences found in the utterance are put at the top level of the utterance -- sentences are throw away, but could be used later for e.g. paragraph-level utterances. If merge_clitics, merge e.g. I 'll -> single word I'll Add spaces back in where there is no punctuation as points at which silence can be inserted during alignment Add reduced POS as well as Stanford POS ''' def load(self): self.target_nodes = self.config.get('target_nodes', '//utt') self.input_attribute = self.config.get('input_attribute', 'norm_text') self.merge_clitics = self.config.get('merge_clitics', 'True') ## string, not bool ## check tools exist: corenlp_location = os.path.join(self.voice_resources.path[c.BIN], '..', \ 'corenlp-python', 'corenlp') assert os.path.isdir(corenlp_location) sys.path.append(corenlp_location) from corenlp import StanfordCoreNLP corenlp_dir = os.path.join(corenlp_location, '..', 'stanford-corenlp-full-2014-06-16') ## Each document is to be treated as one sentence, no sentence splitting at all. ## Write config for this if necessary: corenlp_conf_name = 'no_sentence_split.properties' corenlp_conf_file = os.path.join(corenlp_location, corenlp_conf_name) if not os.path.isfile(corenlp_conf_file): data = ['annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref', \ 'ssplit.isOneSentence = true'] writelist(data, corenlp_conf_file) print 'Loading stanford corenlp modules from %s ...'%(corenlp_dir) print 'Takes a while (~20-30 seconds)...' self.models = StanfordCoreNLP(corenlp_dir, properties=corenlp_conf_name) def process_utterance(self, utt): ## _END_ node end_node = Element('token') end_node.set(self.input_attribute, '_END_') utt.append(end_node) for node in utt.xpath(self.target_nodes): assert node.has_attribute(self.input_attribute) input = node.get(self.input_attribute) analysis = self.models.raw_parse(input) ## analysis looks like this: # {'coref': ... # 'sentences': [{'parsetree': ... } # 'text': # 'dependencies': # 'indexeddependencies': # 'words': [('and', {'NamedEntityTag': 'O', \ # 'CharacterOffsetEnd': '3', 'Lemma': 'and', \ # 'PartOfSpeech': 'CC', 'CharacterOffsetBegin': '0'}), ... ] # } # ] # } ## preprocess the analysis: add spaces back between words where there is no ## punc (to use as potential silence insertion points for alignment), and ## possibly merge clitics (he 's -> he's, i ll' -> i'll) ## MERGE SUCCESSIVE PUNCTUATION TOKENS new_analysis = {} new_analysis['sentences'] = [] for sentence in analysis['sentences']: #new_sentence = copy.deepcopy(sentence) #new_sentence['words'] = [] new_words = [] for word in sentence['words']: # is there a previous word? if len(new_words) > 0: # if both space / punct: if self.all_space_or_punc(new_words[-1][0]) and self.all_space_or_punc(word[0]): prev_word = new_words.pop(-1) combined = self.merge_words(prev_word, word) new_words.append(combined) else: new_words.append(word) else: new_words.append(word) sentence['words'] = new_words new_analysis['sentences'].append(sentence) analysis = new_analysis ## MERGE CLITICS ## This also merges e.g. . '' --> .'' (given by norm scripts from ." ) at sentence ends. 
if self.merge_clitics == 'True': ## string not bool new_analysis = {} new_analysis['sentences'] = [] for sentence in analysis['sentences']: #print sentence new_sentence = copy.deepcopy(sentence) new_sentence['words'] = [] i = 0 while i < (len(sentence['words'])-1): this_word = sentence['words'][i] next_word = sentence['words'][i+1] if next_word[0].startswith("'") or next_word[0] == "n't": merged = self.merge_words(this_word, next_word) new_sentence['words'].append(merged) i += 2 else: new_sentence['words'].append(this_word) i += 1 last_word = sentence['words'][-1] if not(last_word[0].startswith("'") or last_word[0] == "n't"): new_sentence['words'].append(last_word) new_analysis['sentences'].append(new_sentence) analysis = new_analysis ## ADD SPACES: new_analysis = {} new_analysis['sentences'] = [] for sentence in analysis['sentences']: new_sentence = copy.deepcopy(sentence) new_sentence['words'] = [] ## For now, ignore parsetree, dependencies, indexeddependencies (sentence level) previous_lemma = '_NONE_' for word in sentence['words']: (text, word_attributes) = word this_lemma = word_attributes['Lemma'] ## Add whitespace back in to tokens to use for silence insertion in alignment later. ## Don't add it where either neighbour is punctuation, or at start of ## utt (where previous_lemma is '_NONE_': if not (self.all_space_or_punc(previous_lemma) or \ self.all_space_or_punc(this_lemma)): if previous_lemma != '_NONE_': new_sentence['words'].append((' ', {'NamedEntityTag': ' ', \ 'PartOfSpeech': ' ', 'Lemma': ' '})) previous_lemma = this_lemma new_sentence['words'].append(word) new_analysis['sentences'].append(new_sentence) analysis = new_analysis ## combine all sentences to one for now: all_words = [] for sentence in analysis['sentences']: all_words.extend(sentence['words']) ## Add stuff into the target node (probably utt): for word in all_words: (text, word_attributes) = word word_node = Element('token') ## also includes punctuation etc. word_node.set(self.input_attribute, text) ## see above at sentence level about 'text' ## For now, ignore CharacterOffsetBegin, CharacterOffsetEnd (word level) word_node.set('ne', word_attributes['NamedEntityTag']) word_node.set('pos', word_attributes['PartOfSpeech']) word_node = self.add_reduced_POS(word_node) word_node.set('lemma', word_attributes['Lemma']) utt.append(word_node) ## _END_ node end_node = Element('token') end_node.set(self.input_attribute, '_END_') utt.append(end_node) def add_reduced_POS(self, node): full_POS = node.attrib['pos'] if '|' in full_POS: full_POS = full_POS.split('|')[0] ## add coarse POS (content/function) and reduced (adj,noun,adv,etc.) 
map = dict([('IN', 'function'), ('TO', 'function'), ('DT', 'function'), \ ('PDT', 'function'), ('MD', 'function'), ('CC', 'function'), \ ('WP', 'function'), ('PP$', 'function'), ('EX', 'function'), \ ('POS', 'function'), ('PP', 'function'), ('WDT', 'function'), \ ('PRP', 'function'), ('PRP$', 'function'), ('RP', 'function'), \ ('WP$', 'function'), ('WRB', 'function'), ('LS', 'function'),\ ('NN', 'noun'), ('NNS', 'noun'), \ ('NP', 'noun'), ('NNP', 'noun'), ('NPS', 'noun'), ('NNPS', 'noun'), ('FW', 'noun'), \ ('VBG', 'verb'), ('VBN', 'verb'), \ ('VB', 'verb'), ('VBD', 'verb'), ('VBP', 'verb'), ('VBZ', 'verb'), \ ('JJ', 'adj'), ('JJR', 'adj'), ('JJS', 'adj'), ('CD', 'adj'), \ ('RB', 'adv'), ('RBR', 'adv'), ('RBS', 'adv'), ('UH', 'interj')]) ## NOTE: # FW -- foreign word -> noun # LS -- list item -> function if full_POS not in map: if full_POS == ' ': red_pos = 'space' elif self.all_space_or_punc(full_POS): red_pos = 'punc' else: print 'MISSING POS: %s'%(full_POS) red_pos = 'other' else: red_pos = map[full_POS] node.set('coarse_pos', red_pos) return node def all_space_or_punc(self, token): '''Use regex to match unicode properties to see if token is all punctuation or space This duplicates later work by e.g. token classifier.''' space_or_punc = '[\p{Z}||\p{C}||\p{P}||\p{S}]' return regex.match('\A' + space_or_punc + '+\Z', token) def merge_words(self, word1, word2): merged_form = word1[0] + word2[0] merged_POS = word1[1]['PartOfSpeech'] + '|' + word2[1]['PartOfSpeech'] merged_lemma = word1[1]['Lemma'] ## first word's lemma merged_NER = word1[1]['NamedEntityTag'] ## first words NE tag merged = (merged_form, \ {'PartOfSpeech': merged_POS, \ 'Lemma': merged_lemma, \ 'NamedEntityTag': merged_NER}) return merged
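To illustrate what the clitic merge in process_utterance() produces, here is a small hand-worked example of merge_words() on two word tuples in the corenlp-python layout used above; the token values are illustrative.

# Word tuples in the (form, attributes) layout consumed by process_utterance():
he = ('he', {'PartOfSpeech': 'PRP', 'Lemma': 'he', 'NamedEntityTag': 'O'})
s = ("'s", {'PartOfSpeech': 'VBZ', 'Lemma': 'be', 'NamedEntityTag': 'O'})
# With merge_clitics == 'True', merge_words(he, s) concatenates the surface
# forms, joins the POS tags with '|', and keeps the first token's lemma and
# NE tag, so the utterance ends up with a single token:
merged = ("he's", {'PartOfSpeech': 'PRP|VBZ', 'Lemma': 'he', 'NamedEntityTag': 'O'})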
import os
from nltk.tokenize import sent_tokenize
from corenlp import StanfordCoreNLP

# The directory in which the stanford core NLP .jar is located -- you have to
# download this from their website.
CORE_NLP_DIR = "stanford-corenlp-dir/"
PARSER = StanfordCoreNLP(CORE_NLP_DIR)

def write_parse_products(parse):
    words = parse['words']
    word_objects = []
    text = ""
    for i, word_info in enumerate(words):
        properties = word_info[1]
        token = word_info[0].lower().strip()
        surface = word_info[0].strip()

in_file = "sentences.txt"
text = open(in_file, 'r').read()
sentences = sent_tokenize(text)  # Break the text into sentences.
for i, sentence in enumerate(sentences):
    try:
        parse = PARSER.raw_parse(sentence)
        if i % 50 == 0:
            print " Entered sentence " + str(i) + " of " + str(len(sentences))
        write_parse_products(parse['sentences'][0])
    except Exception:
        print "Error on sentence:\n\t " + sentence + " \n "
        pass
class StringProcessor(object): """Tokenize or parse a string. """ def __init__(self, project): """Instantiate and ready the parser. Note that readying the parser takes some time. """ self.parser = StanfordCoreNLP(app.config["CORE_NLP_DIR"]) self.project = project logger = logging.getLogger(__name__) global project_logger project_logger = ProjectLogger(logger, project) def tokenize(self, txt): """Turn a string of one or more ``Sentence``\s into a list of ``Sentence`` objects. This method will also tokenize each word in txt, find its PoS, lemma, and space_before. :param str txt: One or more sentences, in a string format. :return list: A list of document.Sentence objects. """ sentences = [] for sentence_text in split_sentences(txt): sentence = self.parse_with_error_handling(sentence_text) sentences.extend(tokenize_from_raw(sentence, sentence_text, self.project)) return sentences def parse(self, sentence, relationships=None, dependencies=None, max_length=30): """Parse a ``Sentence`` and extract dependencies, parse trees, etc. Note that for max_length, a "word" is defined as something with a space on at least one side. This is not the typical definition of "word". This is done so that length can be checked before resources are committed to processing a very long sentence. :param Sentence sentence: The ``Sentence`` object. :param int max_length: The most amount of words to process. """ parsed = self.parse_with_error_handling(sentence.text) # If the parse was unsuccessful, exit if parsed == None: return parsed_sentence = parsed["sentences"][0] if len(parsed["sentences"]) > 1: project_logger.warning("More than one sentence passed in to" " StringProcessor.parse().") parsed_sentence["text"] += parsed["sentences"][1]["text"] for dependency in parsed_sentence["dependencies"]: # We don't want to make a dependency involving ROOT if int(dependency[2]) > 0 and int(dependency[4]) > 0: governor = dependency[1] dependent = dependency[3] governor_index = int(dependency[2]) - 1 dependent_index = int(dependency[4]) - 1 governor_pos = parsed_sentence["words"][governor_index][1]\ ["PartOfSpeech"] governor_lemma = parsed_sentence["words"][governor_index][1]\ ["Lemma"] dependent_pos = parsed_sentence["words"][dependent_index][1]\ ["PartOfSpeech"] dependent_lemma = parsed_sentence["words"][dependent_index][1]\ ["Lemma"] grammatical_relationship = dependency[0] # If dictionaries are present, run with duplication handling if relationships != None and dependencies != None: key = grammatical_relationship if key in relationships.keys(): relationship = relationships[key] else: try: relationship = GrammaticalRelationship.query.\ filter_by(name = grammatical_relationship).\ one() except(MultipleResultsFound): project_logger.error("duplicate records found " "for: %s", str(key)) except(NoResultFound): relationship = GrammaticalRelationship( name = grammatical_relationship) relationships[key] = relationship # Read the data for the governor, and find the # corresponding word governor = Word.query.filter_by( word = governor, lemma = governor_lemma, part_of_speech = governor_pos ).first() # Same as above for the dependent in the relationship dependent = Word.query.filter_by( word = dependent, lemma = dependent_lemma, part_of_speech = dependent_pos ).first() try: governor.id dependent.id except: project_logger.error("Governor or dependent not " "found; giving up on parse. 
This likely indicates" " an error in the preprocessing; rerunning the " "preprocessor is recommended.") project_logger.info(sentence) return sentence key = (relationship.name, governor.id, dependent.id) if key in dependencies.keys(): dependency = dependencies[key] else: try: dependency = Dependency.query.filter_by( grammatical_relationship = relationship, governor = governor, dependent = dependent ).one() except(MultipleResultsFound): self.logg_error(("duplicate records found for: %s", str(key))) except(NoResultFound): dependency = Dependency( grammatical_relationship = relationship, governor = governor, dependent = dependent ) dependencies[key] = dependency # Add the dependency to the sentence sentence.add_dependency( dependency = dependency, governor_index = governor_index, dependent_index = dependent_index, project = self.project, force = False ) dependency.save(False) else: # TODO: fill pass return sentence def parse_with_error_handling(self, text): """Run the parser and handle errors properly. Also checks the sentence text for irregularities that may break the parser and handles it before proceeding. Any failure will cause this method to return None :param str text: The text of the sentence to check """ # Check for non-string if not isinstance(text, str) and not isinstance(text, unicode): project_logger.warning("Parser got a non-string argument: %s", text) return None # Check for non-unicode if not isinstance(text, unicode): # Try to convert the string to unicode if possible # Unit test: should fail with this example: # http://stackoverflow.com/questions/6257647/convert-string-to-unicode try: text = unicode(text) except(UnicodeDecodeError): project_logger.warning("The following sentence text is " "not unicode; convertion failed.") project_logger.info(text) # Skip sentence if flag is True if app.config["SKIP_SENTENCE_ON_ERROR"]: return None else: # Try to parse the sentence anyway project_logger.warning("Attempting to parse " "non-unicode sentence.") # Check for empty or nonexistent text if text == "" or text == None: return None # Check for irregular characters # TODO: what are considered irregular characters? # Try to parse, catch errors parsed_text = None try: parsed_text = self.parser.raw_parse(text) # TODO: handle all errors properly # ProcessError, TimeoutError, OutOfMemoryError except TimeoutError as e: project_logger.error("Got a TimeoutError: %s", str(e)) return None except ProcessError as e: project_logger.error("Got a ProcessError: %s", str(e)) return None except: project_logger.error("Unknown error") return None # Parse successful, return parsed text return parsed_text
class Nlp_persistence(object): """Persistence layer for having fast access to information produced by the StanfordCoreNLP tool.""" def __init__(self, fallback=False): self.FILE = "nlp_infos.p" self.data = None self.data_length = None self.corenlp_dir = "helper/stanfordnlp/corenlp-python/stanford-corenlp-full-2013-11-12/" if fallback: try: self.corenlp = StanfordCoreNLP(self.corenlp_dir) except TIMEOUT: print "Stanford CoreNLP Timeout" def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def close(self): # When exiting, update pickle file with new sentences and kill StanfordCoreNLP before so we definitely have enough memory for that try: del(self.corenlp) except AttributeError: # There was a timeout pass # Write only if we added something to self.data if self.data_length < len(self.data): self._write() def create_persistence(self, relations): try: # Trying to load data data = pickle.load(open(self.FILE, "rb")) except (IOError, EOFError): # No data so far print "Could not open cache. Create new." logging.info("Could not find %s. Create new data.", self.FILE) data = {} # Create nlp information for all relevant sentences for relation in relations: if not relation.source.sentence in data: self._update_data(relation.source, data) else: print "Sentence is already in data" if not relation.target.sentence in data: self._update_data(relation.target, data) else: print "Sentence is already in data" print "Done!" logging.info("Successfully loaded all nlp information to persistence file.") # Save data to a file pickle.dump(data, open(self.FILE, "wb"), protocol=-1) def _update_data(self, entity, data): sentence_obj = entity.sentence try: tree = self._get_tree(sentence_obj) except RPCInternalError: logging.error("Could not process the following sentence from text %s: %s", sentence_obj.filename, sentence_obj.text) # Return without updating data return print "--- " + sentence_obj.filename print sentence_obj.text data.update({sentence_obj: tree}) def load(self): data = {} if self.data is None: try: data = pickle.load(open(self.FILE, "rb")) except (IOError, EOFError): logging.warning("No cached nlp data.") finally: self.data = data self.data_length = len(data) else: # Data is already there - there is nothing to do pass def get_info_for_sentence(self, sentence): if type(self.data) is dict: try: return self.data[sentence] except KeyError: logging.error("Nlp_persistence: This sentence is not a key/Is not available in the Nlp persistence layer.") logging.info("Nlp_persistence fallback to CoreNLP server") # Fallback: Try to get tree from CoreNLP server tree = self._get_tree(sentence) # Drive by caching self.data.update({sentence: tree}) return tree else: logging.error("You have to use Nlp_persistence.load() before you can get the information of a sentence") return None def get_collapsed_dependencies(self, sentence): info = self.get_info_for_sentence(sentence) return info['sentences'][0]['dependencies'] def get_parse_tree(self, sentence): info = self.get_info_for_sentence(sentence) return info['sentences'][0]['parsetree'] def _write(self): # Save data to a file pickle.dump(self.data, open(self.FILE, "wb")) def _get_tree(self, sentence): tree = self.corenlp.raw_parse(sentence.text) return tree def get_pos_tag_for_word(self, sentence, word): """Returns the POS tag for a word in a sentence. 
If the word is not in the sentence raise WordNotInSentence error.""" info_sentence = self.get_info_for_sentence(sentence) words = info_sentence['sentences'][0]['words'] for w in words: if w[0] in word: return w[1]["PartOfSpeech"] else: raise PosTagNotFound(sentence, word) def get_lemma_for_word(self, sentence, word): """Returns the lemma for a word in sentence.""" info_sentence = self.get_info_for_sentence(sentence) words = info_sentence['sentences'][0]['words'] for w in words: if w[0] in word: return w[1]["Lemma"] else: raise LemmaNotFound(sentence, word) def is_main_verb(self, sentence, word): """Returns true if word is a main verb of sentence and not an aux.""" info_sentence = self.get_info_for_sentence(sentence) dependencies = info_sentence['sentences'][0]['dependencies'] for dependency in dependencies: if dependency[0] == "aux" and dependency[2] == word: return False else: return True def get_all_aux_for_verb(self, sentence, verb): """Returns all distinct aux for verb as strings in order of the sentence.""" info_sentence = self.get_info_for_sentence(sentence) dependencies = info_sentence['sentences'][0]['dependencies'] aux = [] for dependency in dependencies: if (dependency[0] == "aux" or dependency[0] == "auxpass") and dependency[1] == verb: aux.append(dependency[2]) return aux def get_verb_for_aux(self, sentence, aux): """Returns the governing verb for the aux as string.""" info_sentence = self.get_info_for_sentence(sentence) dependencies = info_sentence['sentences'][0]['dependencies'] for dependency in dependencies: if dependency[0] == "aux" and dependency[2] == aux: return dependency[1] else: raise AuxNotFound(aux) def find_all_verb_pos_tags(self, sentence, verb): """Returns all pos tags for all verbs based on the dependencies relation of the sentence.""" if self.is_main_verb(sentence, verb): # verb is not an aux main_verb = verb else: # verb is aux (this should normally not happen due to the data) main_verb = self.get_verb_for_aux(sentence, verb) auxes = self.get_all_aux_for_verb(sentence, main_verb) verb_pos = self.get_pos_tag_for_word(sentence, main_verb) aux_pos = map(lambda aux: self.get_pos_tag_for_word(sentence, aux), auxes) return aux_pos + [verb_pos] def get_governing_verb(self, event): sentence = event.sentence # info = [verb, aux, pos verb, pos aux, index_of_verb] info = self.get_info_on_governing_verb(event.text, event.index, sentence) if info is None: raise CouldNotFindGoverningVerb else: if info[0] is None: raise CouldNotFindGoverningVerb else: return (info[0], info[4]) def is_root(self, event): sentence = event.sentence info_sentence = self.get_info_for_sentence(sentence) collapsed_dependencies = info_sentence['sentences'][0]['dependencies'] for dependency in collapsed_dependencies: dependency_type = dependency[0] dependent = dependency[2] if dependency_type == "root" and dependent == event.text: return True else: return False def get_info_on_governing_verb(self, non_verb, index, sentence): """This method returns information about the governing verb of a non-verb. 
It returns an array with the following format: [verb, aux, POS of verb, POS of aux, index_of_verb] """ info = self.get_info_for_sentence(sentence) if info: # Search for non_verb governing_verb, index = self._get_governing_verb(non_verb, index, info) info_on_governing_verb = [governing_verb, None, None, None, index] # Set POS of main verb pos_verb = self._get_pos_of_verb(governing_verb, info) info_on_governing_verb[2] = pos_verb # Searching for an Aux for the governing verb aux = self._get_aux_of_verb(governing_verb, info) info_on_governing_verb[1] = aux # If there is an aux, get it's POS if aux: pos_aux = self._get_pos_of_verb(aux, info) info_on_governing_verb[3] = pos_aux return info_on_governing_verb else: return None def _get_aux_of_verb(self, verb, info): dependencies = info['sentences'][0]['dependencies'] sources = [x[1] for x in dependencies] # Find index of verb in targets index = None for i, source in enumerate(sources): if source == verb and dependencies[i][0] == "aux": index = i # Get aux if index is None: # Not every verb has an aux return None else: aux = dependencies[index][2] return aux def _get_pos_of_verb(self, verb, info): info_on_words = info['sentences'][0]['words'] for word in info_on_words: if word[0] == verb: return word[1]['PartOfSpeech'] def _find_governing_word(self, word, dependencies): for dependency in dependencies: if dependency[2] == word: return dependency[1] else: return None def _find_governing_word_index(self, word, index, index_dependencies): word = word + "-" + str(index) for dependency in index_dependencies: if dependency[2] == word: # Remove governor with index appended return dependency[1] else: return None def _remove_index_from_token(self, token): if token: token = token.split("-")[:-1] return "-".join(token) else: return None def _get_index_from_token(self, token): if token: index = token.split("-")[-1] return index else: return None def _get_governing_verb(self, non_verb, index, info): index_dependencies = info['sentences'][0]['indexeddependencies'] # Try to find a governor for non_verb governor = self._find_governing_word_index(non_verb, index, index_dependencies) # Search through tree as long we find a verb and until we can go further up while not self._is_verb(self._remove_index_from_token(governor), info) and governor is not None: old_governor = governor governor = self._find_governing_word_index(self._remove_index_from_token(governor), self._get_index_from_token(governor), index_dependencies) if governor == old_governor: # Detected circle (does not happen often, but happens. Not sure why.) governor = None break if governor: # Remove index from governor string return (self._remove_index_from_token(governor), int(self._get_index_from_token(governor))) else: # Examples when this is allowed to happen: # Example for when it happens: "And in Hong Kong, a three percent drop." <- no verb # Other example: "One exception was the swine flu pandemic of 2009-2010, when 348 children died." and "pandemic". "pandemic" is the root of the sentence and is not governed by anything # Other corner case: "And the dominant flu strain early in the season was one that tends to cause more severe illness." for "season" raise CouldNotFindGoverningVerb(non_verb, index) def _is_verb(self, text, info): """Checks if text has the POS tag of a verb.""" if not text: return False words = info['sentences'][0]['words'] for word in words: if word[0] == text: if word[1]['PartOfSpeech'] in ['VBG', 'VBD', 'VB', 'VBN', 'VBP', 'VBZ']: return True return False
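A minimal usage sketch for the Nlp_persistence layer above. The `relations` iterable (objects whose .source and .target entities carry a .sentence) is assumed from the surrounding project and is not defined here; everything else uses only the methods shown in the class:

# One-off: run CoreNLP over every relevant sentence and pickle the results.
nlp_cache = Nlp_persistence(fallback=True)
nlp_cache.create_persistence(relations)

# Later: read from the pickle, falling back to the CoreNLP server on misses.
with Nlp_persistence(fallback=True) as nlp_cache:
    nlp_cache.load()
    some_sentence = relations[0].source.sentence
    print nlp_cache.get_parse_tree(some_sentence)
    print nlp_cache.get_collapsed_dependencies(some_sentence)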
#!/usr/bin/env python
import os
import sys
import csv
import json

sys.path.append(os.path.expanduser('~/github/stanford-corenlp-python'))
from corenlp import StanfordCoreNLP

corenlp = StanfordCoreNLP()

# I propose this csv dialect for all our bolt csv files
# as a first step into standardizing our data files
csv.register_dialect('bolt', quoting=csv.QUOTE_ALL, doublequote=False,
                     escapechar='\\', lineterminator='\n')


def parse_sentence(s):
    """Returns a dictionary with the parse results returned by
    the Stanford parser for the provided sentence."""
    return json.loads(corenlp.parse(s, verbose=False))['sentences'][0]


if __name__ == '__main__':
    with open('sentences.csv', 'rb') as fi, open('sentences2.csv', 'wb') as fo:
        reader = csv.reader(fi)
        writer = csv.writer(fo, 'bolt')
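The __main__ block above stops after creating the reader and writer. A sketch of one way the loop inside the with-block could continue; the column layout (sentence text in the first column, a joined lemma column appended) is hypothetical and not part of the original script:

        for row in reader:
            parse = parse_sentence(row[0])  # hypothetical: sentence text in column 0
            lemmas = [word[1]['Lemma'] for word in parse['words']]
            writer.writerow(row + [' '.join(lemmas)])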
from corenlp import StanfordCoreNLP
import pprint

if __name__ == '__main__':
    nlp = StanfordCoreNLP('http://localhost:9000')
    # text = "Non tolerance was gandhijis weapon."
    # text = ("We went to pitapit,it can be expensive but not hygienic.")
    # text = ("The dishes at Alkareem are highly recommended.")
    text = ("The sitting which is mostly outdoor is the prettiest you can come across in CP")
    # text = ('I loved The Crispy Vegetables but found the Wontons to be devoid of any flavor')
    # text = ("delicious veg manchurian.")
    # text = ('London is good at studies but bad at sports.')
    # text = ("The tiger prawns here,it doesn't get better.")
    # text = ('Check out the pics to find out who greeted me on my first visit to Bercos CP branch, it can be expensive but not hygienic.')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse,ner',
        'outputFormat': 'json'
    })
    # pprint.pprint(output)
    tree = output['sentences'][0]['parse']
    print tree
    x = output['sentences'][0]['collapsed-dependencies']
    # pprint.pprint(x)
    print '-------------------------------------------------'
    for i in range(len(x)):
        print x[i]['dep'] + '-->' + x[i]['governorGloss'] + '-' + str(x[i]['governor']) + ' ' + x[i]['dependentGloss'] + '-' + str(x[i]['dependent'])
    # print(output['sentences'][0]['parse'])
    # output = nlp.tokensregex(text, pattern='/Pusheen|Smitha/', filter=False)
    # print(output)
    # output = nlp.semgrex(text, pattern='{tag: VBD}', filter=False)
def get_data_raw(self):
    snlp = StanfordCoreNLP(
        corenlp_path=os.path.dirname(self.files()[0].path))
    return snlp
def __init__(self, corenlp_dir):
    self.parser = StanfordCoreNLP(corenlp_dir)
# Get the corpus-file open
corpusjson = 'protest.json'
jsonobject = json.load(codecs.open(corpusjson))

# Get and clean the text:
texts = (clean_html(parse(StringIO(obj[4].replace("\n", " ")))).getroot().text_content()
         for obj in jsonobject)
print "Story text generator object created."
# Turn it into a string object, then an html object, then back into string...
#texts = clean_html(parse(StringIO(text))).getroot().text_content()

print "Setting up parser: "
# Set up the parser
stanford_parser = StanfordCoreNLP()

print "Creating parser generator object: "
# Parse dat.
parsed_texts = (stanford_parser.parse(unicode(text)) for text in texts)

# Save the result to a file
# Not sure how enumerate() works with generators; ostensibly a wrapper which
# retains laziness, but I don't wanna risk it and introduce more variables.
i = 0  # So, it's gross. Whatever.
for story in parsed_texts:
    i += 1
    with codecs.open(str(i) + ".json", 'w') as fh:
        json.dump(json.loads(story), fh, indent=2)
def typedependencies(sent_list, neg_words, compound_word_list): pos_dict = {} depend_dict = {} depend_list = [] proper_names = [] # neg_words = [] compound_dic = {} nlp = StanfordCoreNLP('http://localhost:9000') for i in range(len(sent_list)): compound_list = [] print sent_list[i] output = nlp.annotate(sent_list[i], properties={ 'annotators': 'tokenize,ssplit,pos,depparse,parse,ner', 'outputFormat': 'json' }) # pprint.pprint(output) x = output['sentences'][0]['basic-dependencies'] # pprint.pprint(output['sentences'][0]['parse']) # pprint.pprint(x) # print '-------------------------------------------------' for j in range(len(x)): if 'compound' in x[j]['dep']: # compound_word(x[j]) ll = [ x[j]['governorGloss'], x[j]['governor'], x[j]['dependentGloss'], x[j]['dependent'] ] compound_dic[x[j]['governor']] = x[j]['governorGloss'] compound_dic[x[j]['dependent']] = x[j]['dependentGloss'] # compound_list.append(ll) d = [ x[j]['dep'], x[j]['governorGloss'], str(x[j]['governor']), x[j]['dependentGloss'], str(x[j]['dependent']) ] depend_list.append(d) # getting the negative words.. if 'neg' in x[j]['dep']: x1 = x[j]['governorGloss'].lower() x2 = x[j]['dependentGloss'].lower() if x1 not in stopwords: neg_words.append([x1, x[j]['governor']]) else: neg_words.append([x2, x[j]['dependent']]) if 'conj' in x[j]['dep']: x1 = x[j]['governorGloss'].lower() x2 = x[j]['dependentGloss'].lower() if x1 in neg_prefix: neg_words.append([x2, x[j]['dependent']]) # elif (x2 == 'not' or x2 == 'nor' or x2 == 'non'): # neg_words.append(x1) elif x2 in neg_prefix: neg_words.append([x1, x[j]['governor']]) print(x[j]['dep'] + '-->' + x[j]['governorGloss'] + '-' + str(x[j]['governor']) + ' ' + x[j]['dependentGloss'] + '-' + str(x[j]['dependent'])) print '===================================' for key, value in sorted(compound_dic.items()): compound_list.append([key, value]) # print compound_word(compound_list) compound_dic.clear() y = output['sentences'][0]['tokens'] for k in range(len(y)): pos_dict[y[k]['word']] = y[k]['pos'] if 'NNP' in y[k]['pos']: proper_names.append(y[k]['word']) depend_dict[i] = depend_list depend_list = [] if len(compound_list) > 0: w = compound_word(compound_list) else: w = [] for jj in range(len(w)): if w[jj] != '': print w[jj] compound_word_list.append(w[jj]) print '--------NAMES------' + str(proper_names) print '--------NEGATIVE----' + str(neg_words) return depend_dict, pos_dict, proper_names
from nltk.tokenize import sent_tokenize
from nltk.tag.stanford import NERTagger
from nltk.parse.stanford import StanfordParser
from corenlp import StanfordCoreNLP

wsj = open('wsj_0063.txt')

# extract named entities
nerTagger = NERTagger('stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                      'stanford-ner-2014-08-27/stanford-ner.jar')
ner = []
for line in wsj:
    ner.append(nerTagger.tag(unicode(line, errors='ignore').split()))

# parse sentences
wsj.seek(0)  # rewind: the NER loop above has already consumed the file
paragraph = ""
for line in wsj:
    paragraph += line.replace('\n', ' ')
sentences = sent_tokenize(paragraph)
parser = StanfordParser('stanford-parser-full-2014-10-31/stanford-parser.jar',
                        'stanford-parser-full-2014-10-31/stanford-parser-3.5.0-models.jar')
parsed = parser.raw_parse_sents(sentences)

# coreference
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
# batch_parse() expects a directory of text files; raw_parse() takes a string
corenlp.raw_parse(paragraph)
wsj.close()
# -*- coding: utf-8 -*-
import jsonrpc
from simplejson import loads
from socket import error as SocketError
import errno
from corenlp import StanfordCoreNLP
import sys
reload(sys)
sys.setdefaultencoding('utf8')

corenlp_dir = "stanford-corenlp-full-2014-08-27/"
#server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080),timeout=200.0))
server = StanfordCoreNLP(corenlp_dir)

orig_file = open('pubmed_1000.csv', 'r')
new_file = open('coref-1000.csv', 'w')

count = 0
gotdata = 1
result = []
for line in orig_file.readlines():
    cols = line.split('\t')
    message = cols[2]
    simplemessage = "Stanford University is located in California. It is a great university."
    print "Sending line: " + str(count)
    data = server.parse(message)
    '''
    while not gotdata:
        try:
            print "Sending line: " + str(count)
            data = server.parse(message)
def scrape_func(address, website):
    """
    Function to scrape various RSS feeds. Uses the 'keep' and 'ignore'
    iterables to define which words should be used in the text search.

    Inputs
    ------
    address : address for the RSS feed to scrape. String.

    website : name of the website to scrape to be used in the filepath
              for the output. String.

    database : name of the MongoDB database that contains the collections.
               String? pymongo connection object?
    """
    connection = MongoClient()
    db = connection.atrocities_data
    collection = db[website]
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    corenlp_dir = 'stanford-corenlp/'
    corenlp_parse = StanfordCoreNLP(corenlp_dir)
    log = open('log_file.txt', 'a')
    results = pattern.web.Newsfeed().search(address, count=100, cached=False)
    log1 = 'There are %d results from %s \n' % (len(results), website)
    log.write(log1)
    for result in results:
        # The per-site branches in the original were identical except for
        # xinhua, whose URLs need cleaning up before scraping; they are
        # collapsed into a single code path here without changing behaviour.
        if website == 'xinhua':
            page_url = result.url.encode('ascii')
            page_url = page_url.replace('"', '')
            text = pages_scrape.scrape(page_url, result.title)
        elif website in ('nyt', 'bbc', 'reuters', 'ap', 'upi', 'google'):
            text = pages_scrape.scrape(result.url, result.title)
        else:
            continue
        head_sentences = sent_detector.tokenize(text.strip())[:4]
        joined_sentences = ' '.join(head_sentences)
        parsed = corenlp_parse.raw_parse(joined_sentences)
        entry_id = mongo_connection.add_entry(collection, text, parsed,
                                              result.title, result.url,
                                              result.date, website)
        if entry_id:
            log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                          str(entry_id))
            log.write(log2)
        else:
            log2 = 'Result from %s already in database \n' % (result.url)
            log.write(log2)
    interupt = '+' * 70
    log3 = '%s\nScrape %s once at %s!\n%s\n' % (interupt, website,
                                                datetime.datetime.now(),
                                                interupt)
    log.write(log3)
    log.close()
from corenlp import StanfordCoreNLP
import simplejson as json

corenlp_dir = "/home/clai/lubbock/repos-3rd/stanford-corenlp-python/stanford-corenlp-full-2015-04-20/"

print "loading..."
corenlp = StanfordCoreNLP(corenlp_dir)

results = corenlp.raw_parse("Hello world. It's a wonderful day.")
print results
print json.dumps(results, indent=4)
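For reference, a small sketch of how the returned dictionary is typically unpacked; the 'sentences', 'words', 'PartOfSpeech', and 'parsetree' keys are assumed to match the corenlp-python wrapper output used in the other snippets in this collection:

for sentence in results['sentences']:
    tokens = [word[0] for word in sentence['words']]                    # surface forms
    pos_tags = [word[1]['PartOfSpeech'] for word in sentence['words']]  # POS tags
    print tokens
    print pos_tags
    print sentence['parsetree']                                         # bracketed constituency parse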
# #     paragraph += re.sub(r'\([^)]*\)', '',d['line'])+' '
# episodeLines.append(paragraph.replace('\n','').replace(' ',' '))
outfile = open('scripts/' + e + '.txt', 'w')
paragraph = ""
for d in searcher.documents(episode=e):
    outfile.writelines(d['line'].encode('utf-8') + ' ')
    # outfile.writelines((d['speaker']+': '+d['line']).encode('utf-8')+' ')
    # paragraph += d['speaker']+': '+d['line']+' '
    # # paragraph += re.sub(r'\([^)]*\)', '',d['line'])+' '
# paragraph = paragraph.replace('\n','').replace(' ',' ')
# outfile.writelines(paragraph.encode('utf-8'))
outfile.close()

parsed = []
corenlp_dir = "stanford-corenlp-full-2014-08-27"
corenlp = StanfordCoreNLP(corenlp_dir)
for e in episodeNum:
    for d in searcher.documents(episode=e):
        parsed.append(corenlp.raw_parse(d['line']))  # parse the line text; d itself is a dict-like hit

# sentClient = StanfordNLPSentimentClient('http://localhost:8080')
# sentiment = []
# for t in text:
#     sentiment.append(sentClient.classify(t))

# mask = imread("friends.gif")
wc = WordCloud(max_words=30,
               stopwords=STOPWORDS | {'s', 't', 'm', 're', 'oh', 'right', 'don', 'know', 'well', 'hey',
                                      'gonna', 'okay', 'yeah', 'go', 'really', 'think', 'hi', 'uh', 'look',
                                      'god', 'mean', 'one', 'ye', 'guy', 'y', 'got', 'come', 'now'},
               font_path='/Users/elaine/Library/Fonts/Berlin.ttf')
for c in mainChars:
    wc.generate(lines[uniqueSpeakers.index(c)])
    wc.to_file(c + ".png")
def __init__(self):
    corenlp_dir = "corenlp-python/stanford-corenlp-full-2014-08-27/"
    self.corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...
    print("corenlp object initiated")
def compress(sentence): global parser if not parser: parser = StanfordCoreNLP(corenlp_dir) text = sentence.simple words = word_tokenize(text) w_features = [dict() for w in words] stemmed = [None for w in words] labels = list() # add basic features # first/last words for i in range(1,6): if i < len(words): for x in range(i): w_features[x]["infirst"+str(i)] = True w_features[-1-x]["inlast"+str(i)] = True #pos = [ x[1] for x in nltk.pos_tag(a.o_words) ] for i in range(len(words)): w = words[i] features = w_features[i] #capitalization if w.isupper(): features["isupper"] = True elif w[0].isupper(): features["firstupper"] = True w = w.lower() #word class if w in negation: features["negation"] = True elif w in punct: features["punct"] = True elif w in stopWords: features["stopWords"] = True #pos #a.posfeatures[i]["pos_"+pos[i]] = True # compute the basic term frequencies of all words in paragraphs # for use in building corpus-wide quarry term frequency if w not in model.idf.stopWords: termFreq[w] += 1 stem = stemmer.stem(w) suffix = "" if len(stem) < len(w) and w.startswith(stem): suffix = w[len(stem):] stemmed[i] = (stem, suffix) features["stem_"+stemmed[i][0]] = True features["affix_"+stemmed[i][1]] = True #Stanford tree features text = text.encode('ascii', 'ignore') tree = None dependencies = None try: results = parser.raw_parse(text) tree = [] dependencies = [] for s in results['sentences']: tree.append(tree_re.search(s['parsetree']).group(0)) dependencies += s['dependencies'] except: print(text) print( "Unexpected error:", sys.exc_info()[0]) #print(a.tree) if tree: tree = Tree.fromstring(tree[0].encode('ascii', 'ignore')) #print(str(tree)) paths = list(getPathsToLeaves(tree)) #print(paths) for i in range(min(len(paths), len(words))): #print(paths[i][1]) w_features[i]["tree_depth_"+str(len(paths[i][1]))] = True for x in range(0,2): w_features[i][str(x)+"_up_"+paths[i][1][-1-x]] = True for n in paths[i][1]: w_features[i]["tree_"+n] = True w_features[i][str(paths[i][2])+"_from_left"] = True #print(a.treefeatures[0]) if dependencies: #make a tree out of it d_tree = defaultdict(list) mother_relations = defaultdict(list) daughter_relations = defaultdict(list) for dep in dependencies: d_tree[dep[1]].append((dep[0], dep[2])) mother_relations[dep[1]].append(dep[0]) daughter_relations[dep[2]].append(dep[0]) #now we can check depth and such #print(d_tree) depths = getDepths(d_tree, u'ROOT', dict(), 0) #print(depths) for i in range(len(words)): w = words[i] treefeatures = w_features[i] if w in depths: w_depth = depths[w] treefeatures["dep_depth_"+str(w_depth)] = True if w_depth > 3: treefeatures["dep_depth_over_3"] = True if w_depth > 5: treefeatures["dep_depth_over_5"] = True if w in mother_relations: for rel in mother_relations[w]: treefeatures["dep_mother_"+rel] = True if w in daughter_relations: for rel in daughter_relations[w]: treefeatures["dep_daughter_"+rel] = True # get max tfidf for scaling maxtfidf = max( tf*idf.idf[w] for w, tf in termFreq.items() ) partitions = 5 # now add tfidf threshold features for i in range(len(words)): w = words[i].lower() if w not in stopWords and w not in punct: features = w_features[i] tfidf = termFreq[w] * idf.idf[w] scaled = tfidf / maxtfidf * partitions for x in range(1,partitions): if tfidf > x: features[str(x*100/partitions)+"percenttfidf"] = True #for f in w_features: # print(f) # add previous features and classify for i in range(len(words)): f = w_features[i].copy() for prev in range(2): if i > prev: prevstring = "prev"+str(prev)+"_" 
f[prevstring+labels[-1-prev]] = True prevfeatures = w_features[i-1-prev] for k,v in prevfeatures.items(): if not k.startswith("in"): f[prevstring+k] = v #print("with prev:") #print(f) # classify vector = vec.transform(f) vector = selector.transform(vector) result = classifier.predict(vector) l = result[0] #print(l) labels.append(l) # use labels to clear out print(labels) retained_words = list() for i in range(len(labels)): if labels[i] != 'O': retained_words.append(words[i]) newsentence = "" for i in range(len(retained_words)): if i != 0 and retained_words[i] not in punct and retained_words[i-1] not in ["``"]: newsentence += " " newsentence += retained_words[i] sentence.simple = newsentence return sentence
class StanforExtractor(object): def __init__(self): corenlp_dir = "corenlp-python/stanford-corenlp-full-2014-08-27/" self.corenlp = StanfordCoreNLP(corenlp_dir) # wait a few minutes... print("corenlp object initiated") def tag_text(self, text): """ :param text: :return: """ assert type(text) == str sents = self.corenlp.raw_parse(text) return sents def expand_rels_double(self, rel_words, sent): """ :param rel_words: [wrd1,wrd2] :param sent: in tagged_text['sentences'], ['dependencies'] for each sent :return: """ assert type(rel_words) == list assert type(sent) == list assert len(rel_words) == 2 rel_tmp = [rel_words[0], rel_words[1]] for rel_1 in sent: if rel_1[1] == rel_words[0] and rel_1[2] == rel_words[1]: continue rel_1 = list(rel_1) # print(rel_1) # if prep_ or prepc_ is the tag # appos_tag = 1 neg_tag = 0 if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"): middle_word = rel_1[0][rel_1[0].find("_") + 1 :] rel_1 = [rel_1[1], middle_word, rel_1[2]] elif rel_1[0] == u"appos": rel_1 = [rel_1[1], rel_1[2]] # appos_tag = -1 elif rel_1[0] == u"neg": # neg_tag = 1 rel_1 = [rel_1[1], rel_1[2]] else: continue # rel_1 = [rel_1[1],rel_1[2]] if rel_words[0] in rel_1: append_start = 1 rel_1.remove(rel_words[0]) elif rel_words[1] in rel_1: append_start = -1 rel_1.remove(rel_words[1]) else: continue # append_start = append_start*appos_tag # if neg_tag == 1: # if append_start == 1: rel_tmp = [" ".join(rel_1)] + rel_tmp else: rel_tmp = rel_tmp + [" ".join(rel_1)] return rel_tmp def expand_rels_wordlist(self, rel_words, sent): """ :param rel_words: [wrd1,wrd2,..] :param sent: in tagged_text['sentences'], ['dependencies'] for each sent :return: """ assert type(rel_words) == list assert type(sent) == list rel_tmp = [] for rel_1 in sent: # for each word in sentence, rel_1 is the relation mapper from stanford tagger dependencies # if rel_1[1] in rel_words and rel_1[2] in rel_words: # continue rel_1 = list(rel_1) # print(rel_1) # if prep_ or prepc_ is the tag # appos_tag = 1 neg_tag = 0 if rel_1[0].startswith(u"prep_") or rel_1[0].startswith(u"prepc_"): middle_word = rel_1[0][rel_1[0].find("_") + 1 :] rel_1 = [rel_1[1], middle_word, rel_1[2]] elif rel_1[0] == u"appos": rel_1 = [rel_1[1], rel_1[2]] # appos_tag = -1 elif rel_1[0] == u"neg": # what to do here? 
# neg_tag = 1 rel_1 = [rel_1[1], rel_1[2]] else: continue wrd_present = False for wrd in rel_1: if wrd in rel_words: rel_1.remove(wrd) wrd_present = True if wrd_present: # pdb.set_trace() if len(rel_1) > 0: rel_tmp.append(" ".join(rel_1)) return " ".join(rel_tmp) def expand_rels(self, tmp_rels, sent): """ add relevant sents to start or end of tmp_rels :param tmp_rels: :param sent: :return: """ # pdb.set_trace() print("sent", sent) final_rels = [] for rel_full in tmp_rels: rel_words = [rel_full[1], rel_full[2]] rel_tmp = self.expand_rels_double(rel_words, sent) final_rels.append(rel_tmp) # print('final_res:',final_rels) return final_rels def identify_rels(self, tagged_text): """ :param tagged_text: :return: """ assert "sentences" in tagged_text.keys() assert "dependencies" in tagged_text["sentences"][0].keys() all_rels = [] for sent in tagged_text["sentences"]: tmp_rels = [] for rel in sent["dependencies"]: if rel[0] in [u"nn", u"dobj"]: tmp_rels.append(rel) if len(tmp_rels) > 0: final_rels = self.expand_rels(tmp_rels, sent["dependencies"]) all_rels.append(final_rels) return all_rels def identify_word_rels(self, all_words, tagged_text): """ :param all_words: list of words/phrases :param tagged_text: :return: """ assert "sentences" in tagged_text.keys() assert "dependencies" in tagged_text["sentences"][0].keys() words_rels = {} # pdb.set_trace() for wrd in all_words: wrd_rels = [] for sent in tagged_text["sentences"]: rel_frm_sent = self.expand_rels_wordlist(wrd.split(), sent["dependencies"]) if len(rel_frm_sent) > 0: wrd_rels.append(rel_frm_sent) words_rels[wrd] = ",".join(wrd_rels) return words_rels def identify_time(self, text): """ :param text: :return: """ time_strs = [] text_tag = self.tag_text(text) for sent in text_tag["sentences"]: words = sent["words"] prev_wrd_tag = False for wrd in words: wrd_tag = wrd[1] assert type(wrd_tag) == dict # if u'Timex' in wrd_tag: # timex_string = wrd_tag['Timex'] # new_end = timex_string.rfind('</TIMEX3>') # timex_string = timex_string[:new_end] # new_start = timex_string.rfind('>') # time_word = timex_string[new_start+1:] # time_strs.append(time_word) if u"NamedEntityTag" in wrd_tag: if wrd_tag[u"NamedEntityTag"] in [u"DATE", u"TIME"]: if not prev_wrd_tag: time_strs.append(wrd[0]) else: prev_wrd = time_strs.pop() new_wrd = prev_wrd + " " + wrd[0] time_strs.append(new_wrd) prev_wrd_tag = True else: prev_wrd_tag = False else: prev_wrd_tag = False time_final = [] for wrd in time_strs: if wrd not in time_final: time_final.append(wrd) return time_final def ret_time_rels(self, text): """ :param text: :return: """ tagged_text = self.tag_text(text) all_times = self.identify_time(text) time_rels = self.identify_word_rels(all_times, tagged_text) return time_rels def return_rels(self, text): """ :param text: :return: """ text_tag = self.tag_text(text) rels_all = self.identify_rels(text_tag) return rels_all def identify_name(self, text): """ :param text: :return: """ name_strs = [] text_tag = self.tag_text(text) for sent in text_tag["sentences"]: words = sent["words"] prev_wrd_tag = False for wrd in words: wrd_tag = wrd[1] assert type(wrd_tag) == dict # if u'Timex' in wrd_tag: # timex_string = wrd_tag['Timex'] # new_end = timex_string.rfind('</TIMEX3>') # timex_string = timex_string[:new_end] # new_start = timex_string.rfind('>') # time_word = timex_string[new_start+1:] # time_strs.append(time_word) if u"NamedEntityTag" in wrd_tag: if wrd_tag[u"NamedEntityTag"] in [u"PERSON"]: if not prev_wrd_tag: name_strs.append(wrd[0]) else: prev_wrd = name_strs.pop() 
new_wrd = prev_wrd + " " + wrd[0] name_strs.append(new_wrd) prev_wrd_tag = True else: prev_wrd_tag = False else: prev_wrd_tag = False names_final = [] for wrd in name_strs: if wrd not in names_final: names_final.append(wrd) return names_final
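A brief usage sketch for the extractor class above; the sample sentence is arbitrary, and the comments only summarise what the class's own methods do:

extractor = StanforExtractor()
text = "Barack Obama visited Paris on 6 June 2014 and met the French president."
print(extractor.return_rels(text))    # nn/dobj relations, expanded with prep_/appos/neg context
print(extractor.ret_time_rels(text))  # DATE/TIME entities mapped to their related words
print(extractor.identify_name(text))  # PERSON entities, with multi-word names merged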
def stanfordParse(text, corenlpDir='stanford-corenlp-full-2013-11-12/'):
    global stanford
    if stanford is None:
        stanford = StanfordCoreNLP(corenlpDir)
    return stanford.raw_parse(text)
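A short usage sketch for the lazily initialised helper above; it assumes the module defines a `stanford = None` global (implied by the `global stanford` statement) and that raw_parse() returns the usual 'sentences' dictionary used elsewhere in these snippets:

stanford = None  # module-level cache used by stanfordParse()

first = stanfordParse("The parser is started on the first call.")
second = stanfordParse("Subsequent calls reuse the same StanfordCoreNLP instance.")
print first['sentences'][0]['parsetree']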
# -*- coding: utf-8 -*-
from corenlp import StanfordCoreNLP
import time

local_corenlp_path = './tmp'

# Simple usage
nlp = StanfordCoreNLP(local_corenlp_path)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

del nlp
time.sleep(10)

# Other human languages support, e.g. Chinese
nlp = StanfordCoreNLP(local_corenlp_path, lang='zh', quiet=False)

sentence = '清华大学位于北京。'
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))

del nlp
time.sleep(10)

# General Stanford CoreNLP API
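The snippet stops at the "General Stanford CoreNLP API" comment. A hedged continuation sketch, assuming the wrapper exposes an annotate() call that accepts a CoreNLP properties dict and returns the raw JSON response (some wrapper versions return an already-parsed dict, in which case the json.loads() below is unnecessary):

import json

nlp = StanfordCoreNLP(local_corenlp_path)
text = 'Guangdong University of Foreign Studies is located in Guangzhou.'
raw = nlp.annotate(text, properties={
    'annotators': 'tokenize,ssplit,pos,lemma,ner',
    'outputFormat': 'json'
})
doc = json.loads(raw) if isinstance(raw, str) else raw
for token in doc['sentences'][0]['tokens']:
    print(token['word'], token['pos'], token['ner'])
del nlp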
def loadParser(self):
    corenlp_dir = os.environ['STANFORD']
    self.parser = StanfordCoreNLP(corenlp_dir + "/")  # wait a few minutes...
            u'.']

ahs_test = "And be it further enacted, That the seat of government of said Territory is hereby located temporarily at Fort Leavenworth; and that such portions of the public buildings as may not be actually used and needed for military purposes, may be occupied and used, under the direction of the Governor and Legislative Assembly, for such public purposes as may be required under the provisions of this act."

# Make socket
transport = TSocket.TSocket(server, port)

# Buffering is critical. Raw sockets are very slow
transport = TTransport.TBufferedTransport(transport)

# Wrap in a protocol
protocol = TBinaryProtocol.TBinaryProtocol(transport)

# Create a client to use the protocol encoder
client = StanfordCoreNLP.Client(protocol)

# Connect!
transport.open()

# This list is for options for how we'd like the output formatted. See README.md for the full list of possible options.
# Note that the DEFAULT is what you would get if you specified "oneline" on the command line, or "None" here.
# You have to pass in something, and unfortunately it doesn't seem like that something can be None or an empty list.
# See http://diwakergupta.github.io/thrift-missing-guide/#_defining_structs for a possible explanation as to why...
# So, the following examples are VALID values for the second argument to these parse_* methods.
# (There are, of course, many more valid combinations depending on what the Stanford Parser supports.)
#outputOptions = ["-outputFormat", "typedDependencies,penn", "-outputFormatOptions", "basicDependencies"]
outputOptions = ["-outputFormat", "oneline"]
#outputOptions = ["-outputFormat", "typedDependencies"]

'''
try:
from corenlp import StanfordCoreNLP

corenlp_dir = "../../Scripts/stanford-corenlp-full-2014-08-27/"
corenlp = StanfordCoreNLP(corenlp_dir)  # wait a few minutes...

result = corenlp.raw_parse(
    "What is birth date of the wife of the first black president of the United States?"
)
print((result['sentences'][0]['dependencies']))
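The dependencies printed above are (relation, governor, dependent) triples in this wrapper, as the other snippets in this collection assume; a small sketch for walking them and grouping dependents under their governor:

from collections import defaultdict

children = defaultdict(list)
for rel, governor, dependent in result['sentences'][0]['dependencies']:
    children[governor].append((rel, dependent))
for governor, deps in children.items():
    print governor, '->', deps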