def splitCol8toWords():
    ## use corenlp to do the split-up job
    ## corenlp settings
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## load dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t:
        trainTup = json.load(t)
    fres = open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.txt', 'w')
    split_res = []
    for num, tup in enumerate(trainTup):
        ## after modifying and saving col8, col8 may now be empty
        if not tup[8]:
            continue
        ## use corenlp to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        split_res.append([tup[0], tup[1], tup[6], temp])
        fres.write(tup[0] + '\t' + tup[1] + '\t' + tup[6] + '\t' + ','.join(temp) + '\n')
        print 'No.', num, tup[6]
        print tup[8]
        print [tup[0], tup[1], tup[6], temp]
    ## record the new dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json', 'w') as f:
        json.dump(split_res, f)
    fres.close()
def stanford_parse(corpus_path):
    """Parse a directory (recursively) with the Stanford parser..."""
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except ImportError:
        raise ValueError("CoreNLP not installed.")
    path_part, corpus_name = os.path.split(corpus_path)
    new_corpus_folder = 'parsed_%s' % corpus_name
    new_corpus_path = os.path.join(path_part, new_corpus_folder)
    if not os.path.exists(new_corpus_path):
        os.makedirs(new_corpus_path)
    corenlp = StanfordCoreNLP()
    for root, dirs, files in os.walk(corpus_path, topdown=True):
        for name in files:
            filepath = os.path.join(root, name)
            f = open(filepath)
            raw = f.read()
            parsed_text = ast.literal_eval(corenlp.parse(raw))
            for index, sent in enumerate(parsed_text['sentences']):
                syntax_tree = sent['parsetree']
                plain_text = sent['text']
            # assumption: each subcorpus is named after the directory being
            # walked (the original line referenced an undefined subcorpus_name)
            subcorpus_name = os.path.basename(root)
            subcorpus_path = os.path.join(new_corpus_path, subcorpus_name)
            if not os.path.exists(subcorpus_path):
                os.makedirs(subcorpus_path)
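# Hedged sketch, not part of the original snippet: the function above breaks
# off before anything is written. A plausible write step, modeled on the
# <raw>/<parse> file layout used by the stanford_parse(data, corpus_name)
# variant further down. The helper name and signature are assumptions.
import os

def write_parsed_file(subcorpus_path, name, raw_texts, trees):
    file_data = ['<raw>' + '\n'.join(raw_texts) + '\n</raw>',
                 '<parse>' + '\n'.join(trees) + '\n</parse>']
    with open(os.path.join(subcorpus_path, name), 'w') as fo:
        fo.write('\n'.join(file_data))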
def afterModifyCol8_splitCol8():
    col8_splitup = []
    with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t:
        trainTup = json.load(t)
    corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1"
    corenlp = StanfordCoreNLP(corenlp_dir)
    for num, tup in enumerate(trainTup):
        print 'No.', num
        print 'orig:', tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        print par
        slist = par["sentences"][0]['words']
        # print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        col8_splitup.append(temp)
        print temp
        ## check the dependency split
        dlist = par['sentences'][0]['dependencies']
        demp = []
        for d in dlist:
            demp.append(d)
        print demp
        if num == 4:
            break
def afterModifyCol8_splitCol8():
    col8_splitup = []
    with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npovTrail2/Nov8data/train_afterdataclean_modifiedcleanedTupleNov8.json') as t:
        trainTup = json.load(t)
    corenlp_dir = "stanford-corenlp-full-2014-08-27V3.4.1"
    # corenlp_dir = "stanford-corenlp-full-2015-01-29"
    # corenlp_dir = "stanford-corenlp-full-2013-06-20"
    corenlp = StanfordCoreNLP(corenlp_dir)
    # res = corenlp.parse("Bell, a company which is based in LA, makes and distributes computer products. I hate you.")
    # par = json.loads(res)
    # for i in par["sentences"][0]['dependencies']:
    #     print i
    for num, tup in enumerate([trainTup[1853]]):
        print 'No.', num
        print 'orig:', tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        # print par
        slist = par["sentences"][0]['words']
        # print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        col8_splitup.append(temp)
        print temp
        ## check the dependency split
        dlist = par['sentences'][0]['dependencies']
        demp = []
        for d in dlist:
            demp.append(d)
        print demp
        if num == 4:
            break
def f1f2f3f4f5f6f7(file_, file2_):
    ## use corenlp to do the split-up job
    ## corenlp settings
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## load dataset
    with open(file_) as t:
        trainTup = json.load(t)
    ## data structure to hold features fea1 to fea7 as a list
    feaLst = []
    for num, tup in enumerate(trainTup):
        ## after modifying and saving col8, col8 may now be empty
        if not tup[8]:
            continue
        print "No. %d tup in processing.." % (num)
        ## use corenlp to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        print tup[8]
        ## use corenlp to get lemma and POS
        for p, word in enumerate(par["sentences"][0]['words']):
            print str(p) + 'th w in tupl ' + str(num)
            tmp = {}
            tmp['Word'] = word[0]
            tmp['Lemma'] = word[1]['Lemma']
            tmp['POS'] = word[1]['PartOfSpeech']
            feaLst.append(tmp)
    ## add POS-1, POS+1, POS-2 and POS+2 context features
    slen = len(feaLst)
    for ind, val in enumerate(feaLst):
        if 0 <= ind - 1 <= slen - 1:
            val['POS-1'] = feaLst[ind - 1]['POS']
        else:
            val['POS-1'] = "NA"
        if 0 <= ind + 1 <= slen - 1:
            val['POS+1'] = feaLst[ind + 1]['POS']
        else:
            val['POS+1'] = "NA"
        if 0 <= ind - 2 <= slen - 1:
            val['POS-2'] = feaLst[ind - 2]['POS']
        else:
            val['POS-2'] = "NA"
        if 0 <= ind + 2 <= slen - 1:
            val['POS+2'] = feaLst[ind + 2]['POS']
        else:
            val['POS+2'] = "NA"
    for i in feaLst:
        print 'w:', i['Word'], ' lemma:', i['Lemma'], ' pos-2:', i['POS-2'], ' pos-1:', i['POS-1'], ' pos:', i['POS'], ' pos+1:', i['POS+1'], ' pos+2:', i['POS+2']
    with open(file2_, 'w') as o:
        json.dump(feaLst, o)
    print len(feaLst)
    print len(trainTup)
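# Hedged toy illustration, separate from the original function above: how the
# POS-window fields behave at sequence edges, using made-up (word, POS) pairs.
def window_features(tagged):
    slen = len(tagged)
    feats = []
    for ind, (word, pos) in enumerate(tagged):
        f = {'Word': word, 'POS': pos}
        for off in (-2, -1, 1, 2):
            j = ind + off
            # out-of-range neighbours get "NA", as in the function above
            f['POS%+d' % off] = tagged[j][1] if 0 <= j <= slen - 1 else 'NA'
        feats.append(f)
    return feats

# window_features([('a', 'DT'), ('small', 'JJ'), ('town', 'NN')])[1] gives
# {'Word': 'small', 'POS': 'JJ', 'POS-2': 'NA', 'POS-1': 'DT', 'POS+1': 'NN', 'POS+2': 'NA'}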
def corenlpLemmaPOS_stanfordparserDependency_split_equalChecking():
    ## corenlp settings
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## StanfordDependencies settings
    sd = StanfordDependencies.get_instance(backend="subprocess", version="3.4.1")
    os.environ["STANFORD_PARSER"] = "stanford-parser-full-2014-08-27/"
    os.environ["STANFORD_MODELS"] = "stanford-parser-full-2014-08-27/"
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )
    with open("../../dataclean_Nov8_2015/train_afterdataclean_modifiedcleanedTupleNov8.json") as t:
        trainTup = json.load(t)
    for num, tup in enumerate(trainTup):
        ## after modifying and saving col8, col8 may now be empty
        if not tup[8]:
            continue
        ## use corenlp to split the sentence
        print "No.", num
        print tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]["words"]
        print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        print temp
        ## use StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)
        sent = sd.convert_tree(s)
        print sent
        detemp = []
        for tok in sent:
            detemp.append(tok[1])
        print detemp
        for di, ti in zip(detemp, temp):
            if di == ti:
                pass
            else:
                if ((ti == "(" and di == "-LRB-") or (ti == ")" and di == "-RRB-")
                        or (ti == "[" and di == "-LSB-") or (ti == "]" and di == "-RSB-")):
                    print "diff in parenthesis"
                else:
                    print "{", di, " ,", ti, " }"
def checkCoreNLPSplit_DependencySplit(file_):
    with open(file_) as f:
        tset = json.load(f)
    ## corenlp settings
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## StanfordDependencies settings
    sd = StanfordDependencies.get_instance(backend="subprocess", version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    for num, tup in enumerate(tset):
        print num
        if not tup[8]:
            continue
        ## use corenlp to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        ## use StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)
        sent = sd.convert_tree(s)
        detemp = []
        for tok in sent:
            detemp.append(tok[1])
        ## check whether the two splits agree
        for di, ti in zip(detemp, temp):
            if di == ti:
                pass
            else:
                if (ti == '(' and di == '-LRB-') or (ti == ')' and di == '-RRB-') or (ti == '[' and di == '-LSB-') or (ti == ']' and di == '-RSB-'):
                    print "diff in parenthesis"
                else:
                    print "!!!"
                    print "{", di, ' ,', ti, " }"
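# Hedged refactoring sketch, not in the original code: the bracket
# special-casing in both checkers above can be factored into a small
# normalization map. CoreNLP here yields literal brackets while
# StanfordDependencies yields Penn Treebank escape tokens.
PTB_BRACKETS = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']'}

def same_token(dep_tok, corenlp_tok):
    return PTB_BRACKETS.get(dep_tok, dep_tok) == corenlp_tok

# same_token('-LRB-', '(') -> True; same_token('house', 'house') -> True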
def parse(sentence):
    from corenlp import StanfordCoreNLP

    parser = StanfordCoreNLP()
    data = parser.parse(sentence)
    #print data
    open_file = open("data.json", "wb")
    open_file.write(data)
    open_file.close()
def remove_tuples0MoreThan1BiasedWord_fromOriginalTuple():
    ## use corenlp to do the split-up job
    ## corenlp settings
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## load dataset
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTupleDec9.json') as t:
        trainTup = json.load(t)
    # with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_stripPuncNum_Dec9.json') as a:
    #     verify = json.load(a)
    b = open('../../devDataclean_Dec8_2015/dev_biasword0ormorethan1_modifiedFile_dec11.txt', 'w')
    res2 = []
    for num, tup in enumerate(trainTup):
        print num
        ## after modifying and saving col8, col8 may now be empty
        if not tup[8]:
            continue
        ## use corenlp to split it up
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        ## count occurrences of the biased word
        cnum = temp.count(tup[6])
        if cnum == 1:
            ## verify that the qualifying sentence matches the split-col8 file:
            ## dev_afterdataclean_modifiedcleanedTuple_splitTitleNumBW_Dec9.json
            # if (verify[num][2] == tup[6]) and (verify[num][0] == tup[0]):
            res2.append(tup)
            # else:
            #     print "two files are diff"
            #     print verify[num]
            #     print tup
            #     sys.exit()
        else:
            b.write(str(tup) + '\n')
    with open('../../devDataclean_Dec8_2015/dev_afterdataclean_modifiedcleanedTuple_elimBiasWord0orMoreThanOne_fullTup_Dec11.json', 'w') as f:
        json.dump(res2, f)
    b.close()
def stanford_parse(data, corpus_name='corpus'):
    from time import localtime, strftime
    thetime = strftime("%H:%M:%S", localtime())
    print "\n%s: Initialising CoreNLP... \n" % thetime
    import os
    import ast
    try:
        from corenlp import StanfordCoreNLP
    except ImportError:
        raise ValueError("CoreNLP not installed.")
    from corpkit.progressbar import ProgressBar
    corenlp = StanfordCoreNLP()
    if not os.path.exists(corpus_name):
        os.makedirs(corpus_name)
    p = ProgressBar(len(data))
    for index, datum in enumerate(data):
        p.animate(index)
        text = datum[0]
        metadata = datum[1]
        number_of_zeroes = len(str(len(data))) - 1
        filename = str(index).zfill(number_of_zeroes) + '.txt'
        parsed_text = ast.literal_eval(corenlp.parse(text))
        trees = []
        raw_texts = []
        for index, sent in enumerate(parsed_text['sentences']):
            syntax_tree = sent['parsetree']
            plain_text = sent['text']
            trees.append(syntax_tree)
            raw_texts.append(plain_text)
            #subcorpus_path = os.path.join(new_corpus_path, subcorpus_name)
        file_data = ['<raw>' + '\n'.join(raw_texts) + '\n</raw>',
                     '<parse>' + '\n'.join(trees) + '\n</parse>',
                     ]
        if not os.path.exists(os.path.join(corpus_name, metadata)):
            os.makedirs(os.path.join(corpus_name, metadata))
        try:
            fo = open(os.path.join(corpus_name, metadata, filename), "w")
        except IOError:
            print "Error writing file."
        fo.write('\n'.join(file_data))
        fo.close()
    p.animate(len(data))
    print 'Done!'
def parse_file():
    sentence_file = open('sentences.txt', 'w')
    dep_file = open('deps.txt', 'w')
    tree_file = open('trees.txt', 'w')
    abstracts = [line.strip() for line in open('relabs.txt', 'r')]
    corenlp = StanfordCoreNLP()
    for abstract in abstracts:
        parse = corenlp.parse(abstract)
        xml = json.loads(parse)
        sentences = xml['sentences']
        for sentence in sentences:
            # Write sentence
            sentence_file.write(sentence['text'] + "\n")
            # Write parse tree
            tree_file.write(sentence['parsetree'] + "\n")
            # Write dependencies
            for dep in sentence['dependencies']:
                dep_file.write('@'.join(dep) + "\t")
            dep_file.write("\n")
    dep_file.close()
    tree_file.close()
    sentence_file.close()
def split_one(params):
    from corenlp import StanfordCoreNLP
    index, fn, start, end = params
    # skip the first 'start' lines
    lno = 0
    fin = open(fn)
    while lno < start:
        fin.readline()
        lno += 1
    ret = []
    parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/",
                             properties="default.properties", serving=False)
    for i in xrange(start, end):
        line = fin.readline()
        ll = line.decode('utf8').strip().split('\t')
        # NOTE: this block must match the input format; the three-column
        # '@en'-suffixed variant is commented out, this version expects two columns
        # if len(ll) != 3:
        #     continue
        # if not ll[2].endswith('@en'):
        #     continue
        # text = ll[2][1:-4]
        if len(ll) != 2:
            continue
        text = ll[1]
        text = text.replace('\\n', ' ').replace('\\r', ' ')
        try:
            rsp = json.loads(parser.parse(text))
            sentences = []
            for s in rsp['sentences']:
                sentences.append(s['text'])
            ret.append(('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8'))
        except Exception as e:
            print e
    fin.close()
    return ret
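# Hypothetical driver, not part of the original code: split_one is shaped for
# fanning disjoint line ranges of one TSV file out to worker processes, each
# of which builds its own CoreNLP instance (hence the in-function import
# above). The function name, file name, and chunking here are assumptions.
from multiprocessing import Pool

def split_all(fn, total_lines, n_workers=4):
    step = total_lines // n_workers + 1
    params = [(i, fn, i * step, min((i + 1) * step, total_lines))
              for i in xrange(n_workers)]
    pool = Pool(n_workers)
    chunks = pool.map(split_one, params)
    pool.close()
    pool.join()
    # flatten the per-chunk results in order
    return [line for chunk in chunks for line in chunk]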
def local_split_description(fn_in, fn_out):
    from corenlp import StanfordCoreNLP
    parser = StanfordCoreNLP("stanford-corenlp-full-2015-01-29/",
                             properties="default.properties", serving=False)
    with open(fn_out, 'w') as fout:
        with open(fn_in) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) != 3:
                    continue
                if not ll[2].endswith('@en'):
                    continue
                text = ll[2][1:-4]
                text = text.replace('\\n', ' ').replace('\\r', ' ')
                try:
                    rsp = json.loads(parser.parse(text))
                    sentences = []
                    for s in rsp['sentences']:
                        sentences.append(s['text'])
                    print >> fout, ('%s\t%s' % (ll[0], '\t'.join(sentences))).encode('utf8')
                except Exception as e:
                    print e.message
class StoreParser():

    def __init__(self):
        self.corenlp = StanfordCoreNLP()
        self.treemanipulator = TreeManipulator()
        self.stemmer = Stemmer()
        self.blacklist = [line.strip() for line in open("blacklist.txt", 'r')]

    def parse_abstract(self, id):
        count_dict = dict()
        if id in self.blacklist:
            print 'Id {0} is blacklisted.'.format(id)
            return count_dict
        path = "NPs/" + id + ".nps"
        try:
            with open(path, 'r') as file:
                for line in file:
                    line = line.strip()
                    split = line.split("\t")
                    stemmed_np = split[0]
                    count = split[1]
                    unstemmed = dict()
                    for i in xrange(2, len(split), 2):
                        unstemmed[split[i]] = int(split[i + 1])
                    count_dict[stemmed_np] = [int(count), unstemmed]
        except IOError:
            try:
                print "Parsing " + str(id)
                try:
                    abstract = medeley_fetch.get_abstract_for_id(id)
                except Exception as e:
                    if e.args[0] == "TooManyRequestsException":
                        print "Skipping due to server overload, consider halting program..."
                    elif e.args[0] == "PaperHasNoAbstractException":
                        print "Object has no abstract, probably not a paper..."
                    else:
                        print "Unknown exception occurred when fetching paper..."
                    # Can happen due to server overload, but apparently for other reasons as well
                    return count_dict
                parse = self.corenlp.parse(abstract)
                document = json.loads(parse)
                with open("Parses/" + id + ".json", 'w') as file:
                    file.write(parse)
                # Extract all the NPs from the parse trees
                # TODO: Not that important, I guess
                for sentence in document['sentences']:
                    parse_tree = sentence['parsetree']
                    nltk_tree = Tree(parse_tree)
                    nps = self.treemanipulator.get_all_np_variations(nltk_tree)
                    for original_np in nps:
                        if original_np != "":
                            stemmed_np = self.stemmer.stem_string(original_np)
                            if stemmed_np in count_dict.keys():
                                count_dict[stemmed_np][0] += 1
                                count_dict[stemmed_np][1][original_np] += 1
                            else:
                                count_dict[stemmed_np] = [1, defaultdict(int)]
                                count_dict[stemmed_np][1][original_np] = 1
                with open(path, 'w') as file:
                    for key in count_dict.iterkeys():
                        file.write(str(key) + "\t" + str(count_dict[key][0]) + "\t")
                        for original_np in count_dict[key][1].iterkeys():
                            file.write(str(original_np) + "\t" + str(count_dict[key][1][original_np]) + "\t")
                        file.write("\n")
            except pexpect.ExceptionPexpect:
                print "Timeout during parsing. Verify that the content is rubbish, and add to the blacklist..."
                exit()
        return count_dict
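# Hedged usage sketch, not part of the original class (the id is made up):
# parse_abstract returns {stemmed NP: [total count, {surface form: count}]}.
parser = StoreParser()
counts = parser.parse_abstract('some-mendeley-id')
for stemmed_np, (total, variants) in counts.iteritems():
    print stemmed_np, total, dict(variants)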
corenlp_dir = "stanford-corenlp-full-2014-08-27/" #server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080),timeout=200.0)) server = StanfordCoreNLP(corenlp_dir) orig_file = open('pubmed_1000.csv', 'r') new_file = open('coref-1000.csv', 'w') count = 0 gotdata = 1 result = [] for line in orig_file.readlines(): cols = line.split('\t') message = cols[2] simplemessage = "Stanford University is located in California. It is a great university." print "Sending line: " + str(count) data = server.parse(message) ''' while not gotdata: try: print "Sending line: " + str(count) data = server.parse(message) gotdata = 1 except: print "The connection got reseted." break ''' result = loads(data) #gotdata = 0 if 'coref' in result.keys(): for coref in result['coref']: for one in coref:
# -*- coding: utf-8 -*-
from corenlp import StanfordCoreNLP
import time

local_corenlp_path = './tmp'

# Simple usage
nlp = StanfordCoreNLP(local_corenlp_path)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

del nlp
time.sleep(10)

# Other human languages support, e.g. Chinese
nlp = StanfordCoreNLP(local_corenlp_path, lang='zh', quiet=False)

sentence = '清华大学位于北京。'
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))

del nlp
time.sleep(10)

# General Stanford CoreNLP API
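# Hedged sketch of what this trailing section presumably demonstrates: if the
# wrapper here is the stanfordcorenlp package, it also exposes a general
# annotate() call that forwards pipeline properties to the server. The
# property values and memory setting below are assumptions, not taken from
# the original snippet.
nlp = StanfordCoreNLP(local_corenlp_path, memory='8g')
text = 'Guangdong University of Foreign Studies is located in Guangzhou.'
props = {'annotators': 'tokenize,ssplit,pos',
         'pipelineLanguage': 'en',
         'outputFormat': 'json'}
print(nlp.annotate(text, properties=props))
nlp.close()  # shut down the background server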
import pdb

# Get the corpus file open
corpusjson = 'protest.json'
jsonobject = json.load(codecs.open(corpusjson))

# Get and clean the text:
texts = (clean_html(parse(StringIO(obj[4].replace("\n", " ")))).getroot().text_content()
         for obj in jsonobject)
print "Story text generator object created."

# Turn it into a string object, then an html object, then back into string...
#texts = clean_html(parse(StringIO(text))).getroot().text_content()

print "Setting up parser: "
# Set up the parser
stanford_parser = StanfordCoreNLP()

print "Creating parser generator object: "
# Parse dat.
parsed_texts = (stanford_parser.parse(unicode(text)) for text in texts)

# Save the result to a file
# Not sure how enumerate() works with generators; ostensibly a wrapper which
# retains laziness, but I don't wanna risk it and introduce more variables.
i = 0  # So, it's gross. Whatever.
for story in parsed_texts:
    i += 1
    with codecs.open(str(i) + ".json", 'w') as fh:
        json.dump(json.loads(story), fh, indent=2)