def main(arg):
    dir = os.path.dirname(__file__)
    filename = os.path.join(dir, 'stanford-corenlp-python/stanford-corenlp-full-2014-08-27/*')
    configFileLoc = os.path.join(dir, 'config.ini')
    proc = CoreNLP(configfile=configFileLoc, corenlp_jars=[filename])

    with open(arg, "r") as file:
        data = removeHeadings(file)
    parsed = proc.parse_doc(data)

    data = []
    for s in parsed[u'sentences']:
        sent = str(' '.join(s[u'tokens']))
        data.append(sent.translate(string.maketrans("", ""), string.punctuation))

    data1 = ".".join(data)
    data1 = data1.replace("..", ".")
    data1 = data1.replace("  ", " ")
    data1 = data1.replace(" .", ". ")

    data2 = " ".join(data)
    data2 = data2.replace("  ", " ")

    file_train1 = open("data/a1_train1.txt", "w")
    file_train1.write(data1)
    file_train1.close()

    file_train2 = open("data/a1_train2.txt", "w")
    file_train2.write(data2)
    file_train2.close()

    file_test1 = open("data/a1_test1.txt", "w")
    file_test1.write(clean1(data1))
    file_test1.close()

    file_test2 = open("data/a1_test2.txt", "w")
    file_test2.write(clean(data2))
    file_test2.close()

def __parse_text(self):
    if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
        self.__load_parse_result()
        return
    ss = CoreNLP('parse', corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
    self.parsed = ss.parse_doc(self.sentences)
    ss.cleanup()

def __init__(self):
    self.proc = CoreNLP('parse')
    self.ner = json.load(open('../data/delexicalization/ner_dict.json'))
    self.semcategory = json.load(open('../data/delexicalization/delex_dict.json'))
    self.descriptions = json.load(open('../data/delexicalization/descriptions.json'))

def phrases():
    # stopwords is the list of tokens we'd like to discard in our output
    stopwords = [".", "?", "!", ',']
    proc = CoreNLP("nerparse", corenlp_jars=[java])
    p = []
    i = 1
    print "#### Processing and formatting the extracted questions ####"
    with open(quest, 'r') as inp:
        for line in inp:
            print "processing line " + str(i)
            p.append(proc.parse_doc(line))
            i += 1
    with open('./output/phrases.txt', 'w') as outp:
        with open('./output/ressources1.txt', 'w') as outr:
            for elmt in p:
                for tok in elmt["sentences"][0]["lemmas"]:
                    if not tok in stopwords:
                        a = tok
                        print a
                        outr.write(a.encode('utf-8') + '\n')
                outr.write('\n')
                for tok in elmt["sentences"][0]["tokens"]:
                    if not tok in stopwords:
                        outp.write(tok.encode('utf-8') + '\n')
                outp.write('\n')

def get_test_references():
    de, en = [], []
    proc = CoreNLP('ssplit')

    # Insert test references in training data
    entries = Entry.objects(set='test')
    for entry in entries:
        for triple in entry.triples:
            agent = triple.agent.name
            patient = triple.patient.name

            de.append(agent)
            name = ' '.join(agent.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '
            en.append(text.strip())

            de.append(patient)
            name = ' '.join(patient.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '
            en.append(text.strip())
    return de, en

def run(self, fin, fout):
    self.proc = CoreNLP('ssplit')

    entity_maps = p.load(open(os.path.join(fin, 'eval1.cPickle')))

    f = open(os.path.join(fin, 'eval1.bpe.de.output.postprocessed.dev'))
    texts = f.read().lower().split('\n')
    f.close()

    print len(texts), len(entity_maps)
    for i, text in enumerate(texts[:-1]):
        entity_map = entity_maps[i]
        for tag in entity_map:
            name = ' '.join(entity_map[tag].name.lower().replace('\'', '').replace('\"', '').split('_'))
            texts[i] = texts[i].replace(tag.lower(), str(name))

    f = open(fout, 'w')
    for text in texts:
        out = self.proc.parse_doc(text)['sentences']
        text = []
        for i, snt in enumerate(out):
            text.extend(snt['tokens'])
        text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-', ')').strip()
        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()

def __init__(self, fdev, ftest):
    self.proc = CoreNLP('ssplit')
    self.get_results(fdev, ftest)

    # DEV
    dev_order, dev_gold = [], []
    DEV_DIR = u'../data/dev'
    for dir in os.listdir(DEV_DIR):
        if dir != u'.DS_Store':
            f = os.path.join(DEV_DIR, dir)
            for fname in os.listdir(f):
                if fname != u'.DS_Store':
                    print os.path.join(f, fname)
                    _order, _gold = self.order(os.path.join(f, fname), u'dev')
                    dev_order.extend(_order)
                    dev_gold.extend(_gold)
    self.write_hyps(dev_order, fdev + '.ordered')
    utils.write_references('results/gold/dev.en', dev_gold)

    # TEST
    test_order, test_gold = [], []
    TEST_FILE = u'../data/test/triples/test.xml'
    _order, _gold = self.order(TEST_FILE, u'test')
    test_order.extend(_order)
    self.write_hyps(test_order, ftest + '.ordered')

    # save previous orders
    self.save_prev_order()

def __init__(self, in_train, in_dev):
    self.proc = CoreNLP('ssplit')
    self.parser = CoreNLP('parse')

    self.in_train = in_train
    self.in_dev = in_dev

    self.text_id = 0

    self.trainset()
    self.testset()

def __init__(self, _set='train', save_references=True):
    self._set = _set
    self.proc = CoreNLP('coref')
    self.proc_parse = CoreNLP('parse')

    self.e2f = utils.get_e2f('../data/lex.e2f')

    self.save_references = save_references

    # referring expressions per entity
    self.refexes = {}

def __init__(self, in_train, in_dev, out_vocab, out_train, out_dev, out_test):
    self.proc = CoreNLP('ssplit')
    self.parser = CoreNLP('parse')

    self.in_train = in_train
    self.in_dev = in_dev

    self.out_vocab = out_vocab
    self.out_train = out_train
    self.out_dev = out_dev
    self.out_test = out_test

    self.text_id = 0

    self.trainset()
    self.testset()

def __init__(self, fname, _set='train'):
    self.proc = CoreNLP('parse')
    self._set = _set

    f = open(fname)
    doc = f.read()
    f.close()

    doc = doc.split((50 * '*') + '\n')
    print 'Doc size: ', len(doc)

    for entry in doc:
        entry = entry.split('\n\n')
        _, entryId, size, semcategory = entry[0].replace('\n', '').split()
        entity_map = dict(map(lambda entity: entity.split(' | '),
                              entry[2].replace('\nENTITY MAP\n', '').split('\n')))
        lexEntries = entry[3].replace('\nLEX\n', '').split('\n-')[:-1]

        for lex in lexEntries:
            if lex[0] == '\n':
                lex = lex[1:]
            lex = lex.split('\n')

            lexId = lex[0]
            text = lex[1].replace('TEXT: ', '').strip()
            template = lex[2].replace('TEMPLATE: ', '')
            correct = lex[3].replace('CORRECT: ', '').strip()
            comment = lex[4].replace('COMMENT: ', '').strip()

            if comment in ['g', 'good']:
                print template
                print 10 * '-'
                self.update_template(entryId, size, semcategory, _set, lexId, template)
                references = self.process_references(text, template, entity_map)
                self.save_references(references)
            elif correct != '' and comment != 'wrong':
                print correct
                print 10 * '-'
                self.update_template(entryId, size, semcategory, _set, lexId, correct)
                references = self.process_references(text, correct, entity_map)
                self.save_references(references)

def lemmatize(l):
    result = []
    from stanford_corenlp_pywrapper import CoreNLP
    proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"])
    for doc_words in l:
        single_dict = proc.parse_doc(doc_words)
        row = []
        for each_dict in single_dict['sentences']:
            for word in each_dict['lemmas']:
                row.append(word)
        result.append(row)
    return result

def write_hyps(hyps, fname):
    proc = CoreNLP('ssplit')
    f = open(fname, 'w')
    for hyp in hyps:
        out = proc.parse_doc(hyp)
        text = ''
        for snt in out['sentences']:
            text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
            text += ' '
        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()

def lemmaMapper(itr):
    pipeline = CoreNLP(configdict={'annotators': "tokenize,ssplit,pos,lemma"},
                       corenlp_jars=["./stanford-corenlp-full-2015-04-20/*"])
    return map(lambda tc: (tc[0], plainTextToLemmas(tc[1], stopWords, pipeline)), itr)

def main():
    print 'Initializing...'
    proc = CoreNLP("coref")
    verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb('data/morph-verbalization-v1.01.txt')
    sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt')
    aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb, sub2word, proc)

    corpora = ['LDC2015E86', 'LDC2016E25']
    dir = 'data/LDC2016E25/data/alignments/split'

    print 'Processing...'
    train_set, dev_set, test_set = [], [], []
    train, dev, test = run(dir, aligner)
    train_set.extend(train)
    dev_set.extend(dev)
    test_set.extend(test)

    print 'Writing...'
    write('data/alignments/training', train_set)
    write('data/alignments/dev', dev_set)
    write('data/alignments/test', test_set)

def get_parser():
    corenlp = CoreNLP(configdict={'annotators': 'tokenize,ssplit,pos,lemma,ner'},
                      output_types=['ssplit', 'ner'],
                      corenlp_jars=[config.STANFORD_CORENLP_DIR])
    return corenlp

def __init__(self, analysisType):
    self.analysisType = analysisType
    coreNLPPath = os.path.join(os.path.dirname(__file__), '../../lib/stanfordCoreNLP.jar')
    coreNLPModelsPath = os.path.join(os.path.dirname(__file__), '../../lib/stanfordCoreNLPModels.jar')
    if StanfordCoreNLP.proc is None:
        StanfordCoreNLP.proc = CoreNLP(
            configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, dcoref'},
            corenlp_jars=[coreNLPPath, coreNLPModelsPath])

def start_corenlp():
    proc = CoreNLP("pos",
                   corenlp_jars=[osp.join(this_dir, "3rdparty/stanford-corenlp-full-2015-04-20/*")],
                   comm_mode='SOCKET')
    return proc

def __init__(self, analysisType):
    self.analysisType = analysisType
    # print("ANALYSIS: " + str(analysisType))
    if StanfordCoreNLP.proc is None:
        StanfordCoreNLP.proc = CoreNLP(
            configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, sentiment, dcoref, relation, natlog, openie'},
            corenlp_jars=[os.path.join(os.path.dirname(__file__), '../../lib/*')])  # , comm_mode='PIPE')

def __init__(self): proc = CoreNLP("coref") verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb( 'data/morph-verbalization-v1.01.txt') sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt') self.aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb, sub2word, proc)
class StanfordNLP:
    def __init__(self):
        # self.server = ServerProxy(JsonRpc20(),
        #                           TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/*"
        self.server = CoreNLP(
            configdict={'annotators': 'tokenize,ssplit,pos,depparse,lemma,ner',
                        'depparse.model': 'edu/stanford/nlp/models/parser/nndep/english_SD.gz'},
            corenlp_jars=[corenlp_dir])

    def parse(self, text):
        return self.server.parse_doc(text)

def entity_ner():
    '''
    Named entity types of the entities
    :return:
    '''
    def get_stats(dataset, setname):
        stats = []
        for text, refex in dataset:
            refex_tokens = refex.split()

            out = proc.parse_doc(text)
            tokens, ners = [], []
            for snt in out['sentences']:
                tokens.extend(snt['tokens'])
                ners.extend(snt['ner'])

            for i, token in enumerate(tokens):
                found = True
                if refex_tokens[0] == token:
                    for j, refex_token in enumerate(refex_tokens):
                        if refex_token != tokens[i + j]:
                            found = False
                            break
                    if found:
                        ner = ners[i]
                        stats.append(ner)
                        break

        print setname
        freq = dict(nltk.FreqDist(stats))
        total = sum(freq.values())
        for name, freq in freq.iteritems():
            print name, freq, float(freq) / total
        print 10 * '-'

    proc = CoreNLP('ner')

    train_data = p.load(open(TRAIN_REFEX_FILE))
    dev_data = p.load(open(DEV_REFEX_FILE))
    test_data = p.load(open(TEST_REFEX_FILE))

    train_refex = map(lambda x: (x['text'], x['refex'].replace('eos', '').strip()), train_data)
    dev_refex = map(lambda x: (x['text'], x['refex'].replace('eos', '').strip()), dev_data)
    test_refex = map(lambda x: (x['text'], x['refex'].replace('eos', '').strip()), test_data)

    get_stats(train_refex, 'TRAIN')
    get_stats(dev_refex, 'DEV')
    get_stats(test_refex, 'TEST')

def split_and_tokenize(doc):
    '''
    Reads a text document, splits sentences and tokenizes them with the Python
    wrapper of the Stanford CoreNLP.
    More info: https://github.com/brendano/stanford_corenlp_pywrapper
    :param doc: name of the document to process
    :return:
    '''
    parse_mode = "ssplit"  # tokenization and sentence splitting
    coreNlpPath = "/Users/ana/workspace/stanford_corenlp_pywrapper/stanford-corenlp-full-2017-06-09/*"
    parser = CoreNLP(parse_mode, corenlp_jars=[coreNlpPath])

    json_name = "database.mpqa.2.0/docs/" + doc.split("\n")[0] + ".json"
    if not os.path.exists(json_name):
        doc_path = "database.mpqa.2.0/docs/" + doc.split("\n")[0]
        document = codecs.open(doc_path, "r", encoding="utf-8").read()
        data_source_parse = parser.parse_doc(document)
        with open(json_name, 'w') as fp:
            json.dump(data_source_parse, fp, sort_keys=True, indent=2)

def main():
    if not os.path.exists(IN_FILE + '_rf'):
        print('First reformatting file...')
        out_format = open(IN_FILE + '_rf', 'w')
        with open(IN_FILE) as handle:
            for line in tqdm(handle):
                tline = line.strip()
                if tline == '':
                    out_format.write('\n')
                else:
                    out_format.write(tline + ' ')
        out_format.close()

    print('Sentence tokenizer!')
    print('Loading Stanford CoreNLP...')
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit',
                               'tokenize.options': 'ptb3Escaping=False'},
                   output_types=['tokenize,ssplit'],
                   corenlp_jars=[CORENLP_PATH])

    out_file = open(IN_FILE + '_sts', 'w')
    sentence_count = 0
    print('Opening file ' + IN_FILE + '_rf' + '...')
    with open(IN_FILE + '_rf') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            the_text = line.strip()
            # Use Stanford instead
            parsed = proc.parse_doc(the_text)
            sentence_count += len(parsed['sentences'])
            for sent in parsed['sentences']:
                the_tokens = [i.replace(' ', '') for i in sent['tokens']]
                the_sent = ' '.join(the_tokens)
                assert len(the_sent.split(' ')) == len(sent['tokens'])
                out_file.write(the_sent.encode('utf-8') + '\n')
            print('Number of sentences so far: ' + '{:,}'.format(sentence_count))
    out_file.close()

class StanfordPreprocessor(object):
    def __init__(self, homedir='./'):
        from stanford_corenlp_pywrapper import CoreNLP
        self.corenlp = CoreNLP(
            configdict={'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'},
            output_types=['pos', 'lemma', 'parse', 'ner'],
            corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])

    def parse(self, document):
        return self.corenlp.parse_doc(document)

class SentenceDelimiter():
    def __init__(self, corenlp_path):
        self.proc = CoreNLP("ssplit", corenlp_jars=[os.path.join(corenlp_path, '*')])

    def get_sentences(self, text):
        res = self.proc.parse_doc(text)
        for sentence in res['sentences']:
            sentence_text = ' '.join(sentence['tokens']).encode('utf8')
            sentence_text = ' '.join(sentence_text.split())
            sentence_text = sentence_text.replace('-LRB-', '(').replace('-RRB-', ')')
            sentence_text = sentence_text.replace('-LSB-', '[').replace('-RSB-', ']')
            sentence_text = sentence_text.replace('-LCB-', '{').replace('-RCB-', '}')
            yield escape(sentence_text)

def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path(), '*')])

    def tokenize_context(context):
        parsed = proc.parse_doc(context)
        tokens = []
        char_offsets = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
            char_offsets += sentence['char_offsets']
        return tokens, char_offsets

    return tokenize_context

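# A hedged usage sketch for the closure above, assuming CoreNLP_path() resolves
# to a local CoreNLP install; the sample sentence is illustrative only.
tokenize_context = CoreNLP_tokenizer()
tokens, char_offsets = tokenize_context("Hello world. How are you?")
# tokens is one flat token list across sentences, e.g. [u'Hello', u'world', u'.', ...]
# char_offsets holds one [start, end] pair per token, indexing into the input string
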
class BioNLPEnrichment(BaseEnrichment):
    """ """
    def __init__(self):
        """ Load and initialize any external models or data here """
        self.corenlp = CoreNLP("pos", corenlp_jars=["./enrichments/stanford-corenlp-full-2015-12-09/*"])

    def enrichment_value(self, tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["actor"]["summary"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return "Stanford core NLP applied to user bio"

class kawata_corenlp_handler:
    def __init__(self):
        # self.proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'},
        #                     corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
        self.proc = CoreNLP(
            configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'},
            corenlp_jars=["/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
                          "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"])

    def __join_text_date(self, text, date):
        ''' Join text and date. '''
        date_s = dt.date2str(date)
        return '[<date>{0}</date>]\n{1}'.format(date_s, text)

    def get_words(self, text, date):
        n_text = unidecode.unidecode(text)
        joint_text = self.__join_text_date(n_text, date)
        joint_text = n_text
        p = self.proc.parse_doc(joint_text)["sentences"][0]
        # print p
        words = list()
        words = zip(p["ner"], p["tokens"], p["ner"])
        stop = stopwords.words("english")
        words = filter(lambda x: x[1] not in stop, words)
        words = map(lambda x: (x[0], x[1].lower(), x[2]), words)
        # I cannot understand what is most suitable in the above line.
        ws = list()
        w = ("", "", "")
        for v in words:
            if v[0] != 'O' and v[0] == w[0]:
                w = (w[0], w[1] + " " + v[1], w[2])
            else:
                ws.append(w)
                w = v
        if w[0] != "":
            ws.append(w)
        words = ws
        return words[1:]

def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path, '*')])

    def tokenize_with_offset(context):
        parsed = proc.parse_doc(context)
        return [(sentence['tokens'],
                 sentence['char_offsets'][0][0],
                 sentence['char_offsets'][-1][-1])
                for sentence in parsed['sentences']]

    def tokenize(sentence):
        parsed = proc.parse_doc(sentence)
        tokens = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
        return tokens

    return tokenize_with_offset, tokenize

class BodyNLPEnrichment(BaseEnrichment):
    """ """
    def __init__(self):
        """ Load and initialize any external models or data here """
        self.corenlp = CoreNLP("pos", corenlp_jars=["/home/jkolb/stanford-corenlp-full-2015-04-20/*"])

    def enrichment_value(self, tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["body"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return "Stanford core NLP applied to tweet body"

class StanfordParser(object):
    """ Stanford parser """
    def __init__(self, corenlp_jars):
        self.proc = CoreNLP("parse", corenlp_jars=corenlp_jars)

    def parse(self, text):
        # parse_doc returns JSON-safe data structures, e.g.:
        # {u'sentences':
        #   [
        #     {u'parse': u'(ROOT (S (VP (NP (INTJ (UH hello)) (NP (NN world)))) (. !)))',
        #      u'tokens': [u'hello', u'world', u'.'],
        #      u'lemmas': [u'hello', u'world', u'.'],
        #      u'pos': [u'UH', u'NN', u'.'],
        #      u'char_offsets': [[0, 5], [6, 11], [11, 12]]
        #     },
        #     ...
        #   ]
        # }
        json_rst = self.proc.parse_doc(text)
        if json_rst:
            for sent in json_rst['sentences']:
                parse_tree = sent['parse']
                yield parse_tree

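# A hedged usage sketch for StanfordParser above; the jar path is an assumption,
# and nltk (if installed) is used only to render the bracketed trees it yields.
from nltk.tree import Tree

parser = StanfordParser(corenlp_jars=["/path/to/stanford-corenlp-full-2015-04-20/*"])
for tree_str in parser.parse("hello world!"):
    Tree.fromstring(tree_str).pretty_print()  # draw the constituency tree
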
# key: name, value: index number
map_name_index = {name: index for (name, index) in zip(doc_names, range(len(doc_names)))}

# create dictionary; key: doc name, value: document
doc_dic = {name: doc for (name, doc) in zip(doc_names, documents)}

# save documents
for name in doc_dic:
    f = open(os.path.join(out_file_bios_folder, name + ".txt"), "w")
    f.write(doc_dic[name])
    f.close()

#%% Text Processing ##########################################################
proc = CoreNLP("pos", corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

# You can also specify the annotators directly. For example, say we want to
# parse but don't want lemmas. This can be done with the configdict option:
p = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, parse'},
            output_types=['pos', 'parse'],
            corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

doc_dic_normalized = {}  # key: document name, value: list of lemmas
# note: remove stopwords, punctuation, numbers, websites, -lrb-, -rrb-
# The pattern below only matches words like "data" or "data-driven", so it
# ignores punctuation, numbers, parentheses (-rrb-, -lrb-) and special characters.
# Use match instead of search: match determines whether the RE matches at the
# beginning of the string (^ = beginning, $ = end), so e.g. https://www.coursera.org is ignored.

from __future__ import print_function
from stanford_corenlp_pywrapper import CoreNLP
from nltk import *
import os

proc = CoreNLP("parse", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

# correct subdirectory by coded type goes here
# comment all this out to do a single text file instead of a directory
path = 'data/engelhard/A/'
for filename in os.listdir(path):
    print(filename)
    with open(path + filename, 'rU') as f:
        engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    trees = proc.parse_doc(engelhard2)
    print(engelhard2)

# This is set to "parse", but you can also change it to different options, like:
#   ssplit   for tokenization and sentence splitting
#   pos      for POS and lemmas
#   ner      for POS, NER and lemmas
#   parse    for POS, lemmas, trees, dependencies
#   nerparse for parsing with NER, POS, lemmas, dependencies
#   coref    for coreference, including constituent parsing

# comment this out to do coref
trees = proc.parse_doc(engelhard2)
# print(trees)

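# A small sketch of the mode list above, under the same jar-path assumption:
# the wrapper class is the same for every pipeline, only the mode string changes.
# Note that each CoreNLP(...) call starts its own Java pipeline, so build one
# instance per mode and reuse it.
ssplit_proc = CoreNLP("ssplit", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])
parsed = ssplit_proc.parse_doc("One sentence here. And a second one.")
print(len(parsed['sentences']))          # 2 sentences after splitting
print(parsed['sentences'][0]['tokens'])  # tokens of the first sentence
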
# Note: in Stanford NLP output, parentheses are -rrb- and -lrb-.

# Stopwords list; add "I" since Stanford NLP does not lowercase I but the
# stopwords list from nltk only includes "i".
stop = set(stopwords.words('english'))
stop.add("I")

# regex: only keep words composed of alphanumeric characters, words joined by
# "-" (e.g. keep data-driven), or runs of ! and ?.
# Ignore parentheses (-rrb-, -lrb-), so use match instead of search.
# match: Determine if the RE matches at the beginning of the string.
pattern = re.compile(r'^(?:[A-Za-z0-9]+[- ][A-Za-z0-9]+|[A-Za-z0-9]+|[?!]+)$')
# pattern_parenthesis = re.compile("-rrb-|-lrb-")

proc = CoreNLP("pos", corenlp_jars=["/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

# You can also specify the annotators directly. For example, say we want to
# parse but don't want lemmas. This can be done with the configdict option.
# (output_types no longer needs to be specified; the outputs to include are
# inferred from the annotators setting.)
p = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, parse, lemma, ner, entitymentions, dcoref'},
            # output_types=['pos', 'parse'],
            corenlp_jars=["/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

data_lemmas = copy.deepcopy(data_names)  # deep copy, otherwise we would change data_clean, since it is a list of objects

# lemmatize quotes and description
for row in data_lemmas:
    # Now it's ready to parse documents. You give it a string and it returns
    # JSON-safe data structures: a dictionary with key 'sentences' and value a list of sentences

def __init__(self): """ Load and initialize any external models or data here """ self.corenlp = CoreNLP("pos", corenlp_jars=["/home/jkolb/stanford-corenlp-full-2015-04-20/*"])
def __init__(self):
    global CACHEDIR
    CoreNLP.__init__(self, "parse", corenlp_jars=[CORENLP_JARS_DIR + "*"])

class ner(object):
    def __init__(self, lang='en', en_ner=False):
        # feature parameters
        self.lang = lang
        # [NLTK wrapper for Stanford NER] (too slow, results so-so)
        if en_ner == 'nltk':
            self.entity_cols = ['PERSON', 'ORG', 'LOCATION', 'FACILITY', 'GPE']
            self.sner_root = '/home/marsan/workspace/stanford_nlp/stanford-ner-2015-04-20'
            self.sner_classifier = self.sner_root + '/classifiers/english.all.3class.distsim.crf.ser.gz'
            self.sner_main = self.sner_root + '/stanford-ner.jar'
            self.st = NERTagger(self.sner_classifier, self.sner_main, encoding='utf-8')
        # [Stanford CoreNLP pywrapper] (still slow, results too noisy)
        if en_ner == 'corenlp':
            self.entity_cols = ['LOCATION', 'TIME', 'PERSON', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE']
            self.snlp = CoreNLP("ner", corenlp_jars=["%s/stanford-corenlp-full-2015-04-20/*" % snlp_path])

    #===========================================
    # Stanford CoreNLP pywrapper
    #===========================================
    def get_ner_stanford_corenlp(self, txt):
        tree = self.snlp.parse_doc(txt.upper())
        results = [list(zip(r['ner'], r['tokens'])) for r in tree['sentences']]
        results = [(k[0], k[1].lower()) for v in results for k in v if k[0] in self.entity_cols]
        ners = {k: [] for k in self.entity_cols}
        for k, v in results:
            ners[k].append(v)
        ners = {k: list(set(v)) for k, v in ners.items()}
        return ners

    #===========================================
    # Stanford NER tagger (slow but better)
    #===========================================
    def get_ner_tags(self, text):
        ners = {}
        terms = [(k, v) for k, v in self.st.tag(text.split()) if v != 'O']
        for t in self.entity_cols:
            ners[t] = list(set([re.sub('[^0-9a-zA-Z]+', ' ', k.lower()) for k, v in terms if v == t]))
        return ners

    #===========================================
    # NLTK NER (very bad accuracy, a lot of garbage)
    #===========================================
    def get_ner_nltk(self, text):
        sents = nltk.sent_tokenize(text)  # sentences
        tokenized_sents = [nltk.word_tokenize(s) for s in sents]
        tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
        chunked_sents = [x for x in nltk.ne_chunk_sents(tagged_sents)]
        raw = self.traverseTree(chunked_sents)
        ners = {n: [] for n in self.entity_cols}
        for k, v in raw:
            ners[k].append(v.lower())
        for n in self.entity_cols:
            ners[n] = list(set(ners[n]))
        return ners

    def traverseTree(self, tree):
        result = []
        for subtree in tree:
            if type(subtree) == nltk.tree.Tree:
                if subtree.label() in self.entity_cols:
                    result += [(subtree.label(), subtree[0][0])]
                else:
                    result += self.traverseTree(subtree)
        return result

        elif char == ')' or char == ']':
            right += 1
            continue
        if left == right:
            outputList.append(char)
    output = ''.join(outputList)
    return output

PRPList = ["He", "he", "She", "she", "His", "his", "Her", "him", "her",
           "him,", "him.", "her,", "her."]
monthElement = "january|february|march|april|may|june|july|august|september|october|november|december"
dateElement = "1|2|3|4|5|6|7|8|9|0"
monthPattern = re.compile(monthElement, re.IGNORECASE)
datePattern = re.compile(dateElement, re.IGNORECASE)

procCOR = CoreNLP("coref", corenlp_jars=[jar_path])
readFile = (open(file_path)).read()
filteredFile = bracketProcess(readFile)
dictCOR = procCOR.parse_doc(filteredFile)
entitiesCOR = dictCOR['entities']
sentencesCOR = dictCOR['sentences']

replaceList = []
for i in entitiesCOR:
    mentionList = i['mentions']
    if not len(mentionList) == 1:
        catchList = []
        for j in mentionList:
            item = [j['sentence']]
            item.append(j['tokspan_in_sentence'])

def main(): proc = CoreNLP("pos", corenlp_jars=["/home/is/daiki-ku/opt/stanford-corenlp-full-2016-10-31/*"]) proc.parse_doc("hello world. how are you?")
"""
TODO: Implement different parameters for the phrase extraction from sentences.
Current parameters are: He, the usual NER from Stanford CoreNLP and the
unigram model without stopwords.
"""
# import re
import numpy as np
from stanford_corenlp_pywrapper import CoreNLP

# Loading the Stanford CoreNLP lib
data = "./extracted-quest/quest-en.txt"
loc = "/people/panou/Stage/projet/stanford-corenlp-full-2015-04-20/*"

# stopwords is the list of tokens we'd like to discard in our output
stopwords = [".", "?", "!", ',']
proc = CoreNLP("nerparse", corenlp_jars=[loc])
p = []
i = 1
with open(data, 'r') as inp:
    for line in inp:
        print "processing line " + str(i)
        p.append(proc.parse_doc(line))
        i += 1

with open('./phrases.txt', 'w') as out:
    for elmt in p:
        # print elmt["sentences"][0]["tokens"]
        for tok in elmt["sentences"][0]["lemmas"]:
            if not tok in stopwords:
                out.write(tok.encode('utf-8') + '\n')
        out.write('\n')

#!/usr/bin/python
# This script will extract a single article (all paragraphs) on the launcher grid.
"""
118238@10\tSen.~^~Barack~^~Obama~^~and~^~his~^~wife~^~,~^~Michelle~^~Obama~^~,~^~have~^~released~^~eight~^~years~^~of~^~joint~^~returns~^~.\tO~^~PERSON~^~PERSON~^~O~^~O~^~O~^~O~^~PERSON~^~PERSON~^~O~^~O~^~O~^~DURATION~^~DURATION~^~O~^~O~^~O~^~O
"""
from stanford_corenlp_pywrapper import CoreNLP
import os
import sys

# Prepare the parser
proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, parse, lemma, ner'},
               output_types=["pos", "parse"],
               corenlp_jars=["/work/02092/vsochat/wrangler/SOFTWARE/stanford-corenlp-full-2015-04-20/*"])

input_file = sys.argv[1]
output_file = sys.argv[2]

# Any errors will have entries written to an error file for inspection
error_file = output_file.replace(".txt", ".err")

filey = open(input_file, "rb")
lines = filey.readlines()[0]
filey.close()

# Format expected to be:
# "12345|<text><p>hello this is text, sentence one!</p><p>sentence two!</p></text>"
article_id, text = lines.split("|")
text = text.replace("</text>", "").replace("<text>", "").strip("\n").replace('"', "")
paragraphs = text.split("<p>")

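# A hedged sketch of how the example line in the docstring above could be built
# from parse_doc output, assuming 'ner' is among the returned fields; the helper
# name and the "@" sentence numbering are illustrative assumptions, not part of
# the original script.
def format_sentence_line(article_id, sent_no, sentence):
    # tokens and NER tags are joined with the "~^~" separator, tab-delimited fields
    tokens = "~^~".join(t.encode('utf-8') for t in sentence['tokens'])
    ner_tags = "~^~".join(sentence['ner'])
    return "%s@%s\t%s\t%s" % (article_id, sent_no, tokens, ner_tags)

# parsed = proc.parse_doc(paragraph)
# for n, sent in enumerate(parsed['sentences']):
#     print format_sentence_line(article_id, n, sent)
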
in_file_name = 'classbios.txt'
split = in_file_name.split(".")
out_file_name_lines = split[0] + "_lines." + split[1]
out_file_name_normalized_line = split[0] + "_normalized_line." + split[1]
out_file_name_normalized_sentence = split[0] + "_normalized_sentence." + split[1]
out_file_name_normalized_tokens = split[0] + "_normalized_tokens." + split[1]

k = 20  # top k unigrams and bigrams
out_file_name_unigrams = "{0}_top{1}{2}.json".format(split[0], k, "Unigrams")
out_file_name_bigrams = "{0}_top{1}{2}.json".format(split[0], k, "Bigrams")

os.chdir(path)
os.listdir('.')  # see if file is in directory

proc = CoreNLP("pos", corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

# You can also specify the annotators directly. For example, say we want to
# parse but don't want lemmas. This can be done with the configdict option:
p = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, parse'},
            output_types=['pos', 'parse'],
            corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

#%% Functions ##############################################################
def getFrequency(ls, ignore=set(), pattern=re.compile(r'.')):
    """Gets the frequency of elements in a list, ignoring elements in the
    ignore set and keeping only elements matching the pattern.

    Args:

from stanford_corenlp_pywrapper import CoreNLP
import os

proc = CoreNLP("ner", corenlp_jars=["/Users/Jerry/Downloads/stanford-corenlp-full-2015-12-09/*"])

input_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dataset'
output_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dictionary/name.tsv'

# parse files
output = open(output_path, 'w')
for filename in os.listdir(input_path):
    try:
        input_file = open(os.path.join(input_path, filename), 'r')
        x = input_file.read()
        out = proc.parse_doc(x)
        ner_tags = out['sentences'][0]['ner']
        num_tokens = len(ner_tags)
        lemmas = out['sentences'][0]['lemmas']
        first_indexes = (i for i in xrange(num_tokens)
                         if ner_tags[i] == "PERSON" and (i == 0 or ner_tags[i - 1] != "PERSON"))
        for begin_index in first_indexes:
            # find the end of the PERSON phrase (consecutive tokens tagged as PERSON)
            end_index = begin_index + 1
            while end_index < num_tokens and ner_tags[end_index] == "PERSON":
                end_index += 1
            end_index -= 1
            mention_text = " ".join(map(lambda i: lemmas[i], xrange(begin_index, end_index + 1)))
            print("%s %s" % (filename, mention_text))
            output.write("%s\n" % mention_text)
    except IndexError:
        pass

def __init__(self): """ Load and initialize any external models or data here """ self.corenlp = CoreNLP("pos", corenlp_jars=["./enrichments/stanford-corenlp-full-2015-12-09/*"])
from stanford_corenlp_pywrapper import CoreNLP
from pprint import pprint
import glob

proc = CoreNLP("ssplit", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

path = 'data/engelhard/0/'
for filename in glob.glob(path + '*.txt'):
    print(filename)
    with open(filename, 'rU') as f:
        engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    print(engelhard2)
    a = proc.parse_doc(engelhard2)
    pprint(a['sentences'][0]['tokens'])

class NER:
    def __init__(self, lang):
        self.lang = lang
        self.config = ner_config

    def start_server(self):
        self.corenlp = CoreNLP(
            corenlp_jars=[
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_jar"]),
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_models_jar"]),
            ],
            server_port=self.config[self.lang]["port"],
            configdict=self.config[self.lang]["properties"],
        )
        print "Serving on http://%s:%s" % ("localhost", self.config[self.lang]["port"])

    # text = [paragraphs] (one per line)
    def query(self, text):
        if self.lang == "CMN":
            return self.stanford_ner(text)
        if self.lang == "SPA":
            return self.freeling_ner(text)
        if self.lang == "ENG":
            return self.stanford_ner(text)

    def stanford_ner(self, text):
        mentions = []
        for paragraph in text:
            paragraph_mentions = []
            response = self.corenlp.parse_doc(paragraph)
            sentences = response["sentences"]
            # print '\n\n', paragraph
            for sentence in sentences:
                paragraph_mentions.extend(self.process_stanford_sentence(sentence))
            mentions.append(paragraph_mentions)
        return mentions

    def process_stanford_sentence(self, sentence):
        mentions = []
        for index, word in enumerate(sentence["tokens"]):
            ner_type = sentence["ner"][index]
            if ner_type in stanford_good_entity_types:
                if index > 0 and sentence["ner"][index - 1] == ner_type:
                    # concatenate this token with the previous mention
                    # TODO: this is buggy; think of a better way (perhaps using
                    # the offsets and sentence.substring(start, end))
                    mentions[-1].word += " " + word
                    mentions[-1].end = sentence["char_offsets"][index][1]
                else:
                    mentions.append(Mention(
                        word,
                        sentence["char_offsets"][index][0],
                        sentence["char_offsets"][index][1],
                        ner_type,
                        "name",
                        "link",
                    ))
        return mentions

    def freeling_ner(self, text):
        print "\n\nINPUT TEXT:", text
        entities = get_entities(text)
        mentions = []
        # build Mentions
        for (form, count, classification) in entities:
            print "FREELING FOUND: %s: %d | %s" % (form, count, classification)
            # word, begin, end, ner, name, link
            mentions.append(Mention(form, 0, 1, classification, "name", "link"))
        return mentions

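# A hedged sketch of the offset-based merge the TODO above suggests: instead of
# joining tokens with a space, slice the original paragraph between the first
# and last character offset of the run. "paragraph" and the helper name are
# illustrative assumptions, not part of the original class.
def merge_mention_span(paragraph, sentence, start_index, end_index):
    # char_offsets are [begin, end] pairs into the text passed to parse_doc,
    # so slicing preserves the source spacing and punctuation exactly
    begin = sentence["char_offsets"][start_index][0]
    end = sentence["char_offsets"][end_index][1]
    return paragraph[begin:end], begin, end
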
files = dict()
os.makedirs("bio_output")
os.chdir(os.path.join(os.getcwd(), "bio_output"))
for i in range(len(doc)):
    files[file_names[i]] = doc[i].replace("\n", " ")
    file = open(file_names[i] + ".txt", "w")
    file.write(doc[i])
    file.close()

#%% check set of non-word characters and stopwords
os.chdir("/Users/apple/Documents/MSiA/Fall 2015/Text analytics/HW/hw3")
proc = CoreNLP("pos", corenlp_jars=["/Users/apple/corenlp/stanford-corenlp-full-2015-04-20/*"])
for i in files.keys():
    text = files[i]
    parsed = proc.parse_doc(text)
    to_flat = [x["lemmas"] for x in parsed["sentences"]]
    words = [item for sublist in to_flat for item in sublist]
    files[i] = words

#%%
# import stopwords from nltk; add 'I' since nltk's list only has lowercase 'i'
stopWord = set(stopwords.words('english'))
stopWord.add("I")

all_words = []
nonWords = re.compile(r"^\b[a-zA-Z]+-?[a-zA-Z]+$")
for i in files.keys():

""" Input is multiple text files. Each text file represents one document. Output is just as many text files, with the ".anno" extension instead. Each output file consists of one JSON object. USAGE proc_text_files.py MODE [files...] e.g. python proc_text_files.py pos *.txt """ import sys, re mode = sys.argv[1] from stanford_corenlp_pywrapper import CoreNLP ss = CoreNLP(mode, corenlp_jars=["/Users/Doctor_Einstein/Documents/stockMartket/analysis/nlp/stanford/*"]) for filename in sys.argv[2:]: outfile = re.sub(r'\.txt$',"", filename) + ".anno" print>>sys.stderr, "%s -> %s" % (filename, outfile) text = open(filename).read().decode('utf8', 'replace') jdoc = ss.parse_doc(text, raw=True) with open(outfile, 'w') as fp: print>>fp, jdoc
import csv
import json
import sys
import os
import fstphrases
from stanford_corenlp_pywrapper import CoreNLP
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--corpus', help='the thing in the middle of corpus/{}/raw', required=True)
parser.add_argument('--nlpjar', help='where is core nlp?', required=True)
parser.add_argument('--tagset', help='np fst tag set', required=False)
args = parser.parse_args()

proc = CoreNLP("parse", corenlp_jars=[args.nlpjar + '/*'])

try:
    os.remove('corpora/' + args.corpus + '/processed/all.anno_plus')
except OSError:
    pass

if args.tagset is None:
    print "[*] using default tagset for npfst"

def get_phrases(tags, toks):
    '''extract phrases with npfst'''
    phrases = fstphrases.extract_from_poses(tags, 'NP', tagset=args.tagset)
    phrases_deets = []