def __init__(self, in_train, in_dev):
    self.proc = CoreNLP('ssplit')
    self.parser = CoreNLP('parse')
    self.in_train = in_train
    self.in_dev = in_dev
    self.text_id = 0
    self.trainset()
    self.testset()

def __init__(self, _set='train', save_references=True):
    self._set = _set
    self.proc = CoreNLP('coref')
    self.proc_parse = CoreNLP('parse')
    self.e2f = utils.get_e2f('../data/lex.e2f')
    self.save_references = save_references

    # referring expressions per entity
    self.refexes = {}

def __init__(self, in_train, in_dev, out_vocab, out_train, out_dev, out_test):
    self.proc = CoreNLP('ssplit')
    self.parser = CoreNLP('parse')
    self.in_train = in_train
    self.in_dev = in_dev
    self.out_vocab = out_vocab
    self.out_train = out_train
    self.out_dev = out_dev
    self.out_test = out_test
    self.text_id = 0
    self.trainset()
    self.testset()

def __init__(self):
    self.proc = CoreNLP('parse')
    self.ner = json.load(open('../data/delexicalization/ner_dict.json'))
    self.semcategory = json.load(
        open('../data/delexicalization/delex_dict.json'))
    self.descriptions = json.load(
        open('../data/delexicalization/descriptions.json'))

def get_parser():
    corenlp = CoreNLP(
        configdict={'annotators': 'tokenize,ssplit,pos,lemma,ner'},
        output_types=['ssplit', 'ner'],
        corenlp_jars=[config.STANFORD_CORENLP_DIR])
    return corenlp

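# Usage sketch (not part of the original project; assumes
# config.STANFORD_CORENLP_DIR points at an extracted CoreNLP distribution):
# parse_doc() returns a dict with one entry per sentence.
parser = get_parser()
parsed = parser.parse_doc("Stanford University is located in California.")
for sentence in parsed['sentences']:
    print sentence['tokens'], sentence['ner']
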
def get_test_references():
    de, en = [], []
    proc = CoreNLP('ssplit')

    # Insert test references in training data
    entries = Entry.objects(set='test')
    for entry in entries:
        for triple in entry.triples:
            agent = triple.agent.name
            patient = triple.patient.name

            de.append(agent)
            name = ' '.join(
                agent.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())

            de.append(patient)
            name = ' '.join(
                patient.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())

    return de, en

def main():
    print 'Initializing...'
    proc = CoreNLP("coref")
    verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb(
        'data/morph-verbalization-v1.01.txt')
    sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt')
    aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb, sub2word,
                      proc)

    corpora = ['LDC2015E86', 'LDC2016E25']
    dir = 'data/LDC2016E25/data/alignments/split'

    print 'Processing...'
    train_set, dev_set, test_set = [], [], []
    train, dev, test = run(dir, aligner)
    train_set.extend(train)
    dev_set.extend(dev)
    test_set.extend(test)

    print 'Writing...'
    write('data/alignments/training', train_set)
    write('data/alignments/dev', dev_set)
    write('data/alignments/test', test_set)

def lemmaMapper(itr):
    pipeline = CoreNLP(
        configdict={'annotators': "tokenize,ssplit,pos,lemma"},
        corenlp_jars=["./stanford-corenlp-full-2015-04-20/*"])
    return map(
        lambda tc: (tc[0], plainTextToLemmas(tc[1], stopWords, pipeline)),
        itr)

def __init__(self, analysisType):
    self.analysisType = analysisType
    coreNLPPath = os.path.join(os.path.dirname(__file__),
                               '../../lib/stanfordCoreNLP.jar')
    coreNLPModelsPath = os.path.join(os.path.dirname(__file__),
                                     '../../lib/stanfordCoreNLPModels.jar')
    if StanfordCoreNLP.proc is None:
        StanfordCoreNLP.proc = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, ner, parse, dcoref'
            },
            corenlp_jars=[coreNLPPath, coreNLPModelsPath])

def run(self, fin, fout):
    self.proc = CoreNLP('ssplit')

    entity_maps = p.load(open(os.path.join(fin, 'eval1.cPickle')))

    f = open(os.path.join(fin, 'eval1.bpe.de.output.postprocessed.dev'))
    texts = f.read().lower().split('\n')
    f.close()

    print len(texts), len(entity_maps)
    for i, text in enumerate(texts[:-1]):
        entity_map = entity_maps[i]
        for tag in entity_map:
            name = ' '.join(entity_map[tag].name.lower().replace(
                '\'', '').replace('\"', '').split('_'))
            texts[i] = texts[i].replace(tag.lower(), str(name))

    f = open(fout, 'w')
    for text in texts:
        out = self.proc.parse_doc(text)['sentences']

        text = []
        for i, snt in enumerate(out):
            text.extend(snt['tokens'])

        text = ' '.join(text).replace('-LRB- ', '(').replace(
            ' -RRB-', ')').strip()
        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()

def __init__(self, fdev, ftest):
    self.proc = CoreNLP('ssplit')

    self.get_results(fdev, ftest)

    # DEV
    dev_order, dev_gold = [], []
    DEV_DIR = u'../data/dev'
    for dir in os.listdir(DEV_DIR):
        if dir != u'.DS_Store':
            f = os.path.join(DEV_DIR, dir)
            for fname in os.listdir(f):
                if fname != u'.DS_Store':
                    print os.path.join(f, fname)
                    _order, _gold = self.order(os.path.join(f, fname), u'dev')
                    dev_order.extend(_order)
                    dev_gold.extend(_gold)
    self.write_hyps(dev_order, fdev + '.ordered')
    utils.write_references('results/gold/dev.en', dev_gold)

    # TEST
    test_order, test_gold = [], []
    TEST_FILE = u'../data/test/triples/test.xml'
    _order, _gold = self.order(TEST_FILE, u'test')
    test_order.extend(_order)
    self.write_hyps(test_order, ftest + '.ordered')

    # save previous orders
    self.save_prev_order()

def __init__(self, homedir='./'):
    from stanford_corenlp_pywrapper import CoreNLP
    self.corenlp = CoreNLP(
        configdict={
            'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'
        },
        output_types=['pos', 'lemma', 'parse', 'ner'],
        corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])

def start_corenlp():
    proc = CoreNLP(
        "pos",
        corenlp_jars=[
            osp.join(this_dir, "3rdparty/stanford-corenlp-full-2015-04-20/*")
        ],
        comm_mode='SOCKET')
    return proc

def __init__(self): proc = CoreNLP("coref") verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb( 'data/morph-verbalization-v1.01.txt') sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt') self.aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb, sub2word, proc)
def __init__(self, analysisType):
    self.analysisType = analysisType
    # print("ANALYSIS: " + str(analysisType))
    if StanfordCoreNLP.proc is None:
        StanfordCoreNLP.proc = CoreNLP(
            configdict={
                'annotators':
                'tokenize, ssplit, pos, lemma, ner, parse, sentiment, dcoref, relation, natlog, openie'
            },
            corenlp_jars=[
                os.path.join(os.path.dirname(__file__), '../../lib/*')
            ])  # , comm_mode='PIPE')

def __parse_text(self):
    if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
        self.__load_parse_result()
        return

    ss = CoreNLP(
        'parse',
        corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
    self.parsed = ss.parse_doc(self.sentences)
    ss.cleanup()

def __init__(self):
    # self.proc = CoreNLP(
    #     configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'},
    #     corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
    self.proc = CoreNLP(
        configdict={
            'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'
        },
        corenlp_jars=[
            "/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
            "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"
        ])

def entity_ner():
    '''
    Named entity types of the entities
    :return:
    '''
    def get_stats(dataset, setname):
        stats = []
        for text, refex in dataset:
            refex_tokens = refex.split()

            out = proc.parse_doc(text)
            tokens, ners = [], []
            for snt in out['sentences']:
                tokens.extend(snt['tokens'])
                ners.extend(snt['ner'])

            for i, token in enumerate(tokens):
                found = True
                if refex_tokens[0] == token:
                    for j, refex_token in enumerate(refex_tokens):
                        if refex_token != tokens[i + j]:
                            found = False
                            break
                    if found:
                        ner = ners[i]
                        stats.append(ner)
                        break

        print setname
        freq = dict(nltk.FreqDist(stats))
        total = sum(freq.values())
        for name, freq in freq.iteritems():
            print name, freq, float(freq) / total
        print 10 * '-'

    proc = CoreNLP('ner')

    train_data = p.load(open(TRAIN_REFEX_FILE))
    dev_data = p.load(open(DEV_REFEX_FILE))
    test_data = p.load(open(TEST_REFEX_FILE))

    train_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        train_data)
    dev_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        dev_data)
    test_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        test_data)

    get_stats(train_refex, 'TRAIN')
    get_stats(dev_refex, 'DEV')
    get_stats(test_refex, 'TEST')

def __init__(self):
    # self.server = ServerProxy(JsonRpc20(),
    #                           TransportTcpIp(addr=("127.0.0.1", 8080)))
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/*"
    self.server = CoreNLP(
        configdict={
            'annotators': 'tokenize,ssplit,pos,depparse,lemma,ner',
            'depparse.model':
                'edu/stanford/nlp/models/parser/nndep/english_SD.gz'
        },
        corenlp_jars=[corenlp_dir])

def __init__(self, fname, _set='train'):
    self.proc = CoreNLP('parse')
    self._set = _set

    f = open(fname)
    doc = f.read()
    f.close()

    doc = doc.split((50 * '*') + '\n')
    print 'Doc size: ', len(doc)

    for entry in doc:
        entry = entry.split('\n\n')

        _, entryId, size, semcategory = entry[0].replace('\n', '').split()

        entity_map = dict(
            map(lambda entity: entity.split(' | '),
                entry[2].replace('\nENTITY MAP\n', '').split('\n')))

        lexEntries = entry[3].replace('\nLEX\n', '').split('\n-')[:-1]
        for lex in lexEntries:
            if lex[0] == '\n':
                lex = lex[1:]
            lex = lex.split('\n')

            lexId = lex[0]
            text = lex[1].replace('TEXT: ', '').strip()
            template = lex[2].replace('TEMPLATE: ', '')
            correct = lex[3].replace('CORRECT: ', '').strip()
            comment = lex[4].replace('COMMENT: ', '').strip()

            if comment in ['g', 'good']:
                print template
                print 10 * '-'
                self.update_template(entryId, size, semcategory, _set, lexId,
                                     template)
                references = self.process_references(text, template,
                                                     entity_map)
                self.save_references(references)
            elif correct != '' and comment != 'wrong':
                print correct
                print 10 * '-'
                self.update_template(entryId, size, semcategory, _set, lexId,
                                     correct)
                references = self.process_references(text, correct,
                                                     entity_map)
                self.save_references(references)

def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path(), '*')])

    def tokenize_context(context):
        parsed = proc.parse_doc(context)
        tokens = []
        char_offsets = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
            char_offsets += sentence['char_offsets']
        return tokens, char_offsets

    return tokenize_context

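# Usage sketch (hypothetical text; assumes CoreNLP_path() resolves to a local
# CoreNLP install): the factory above is called once, so the returned closure
# reuses a single long-lived CoreNLP process for every context.
tokenize_context = CoreNLP_tokenizer()
tokens, char_offsets = tokenize_context("John lives in Boston. He works there.")
print tokens        # flat token list across both sentences
print char_offsets  # one (start, end) character span per token
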
def write_hyps(hyps, fname):
    proc = CoreNLP('ssplit')
    f = open(fname, 'w')
    for hyp in hyps:
        out = proc.parse_doc(hyp)
        text = ''
        for snt in out['sentences']:
            text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                '-RRB-', ')')
            text += ' '
        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()

def lemmatize(l):
    result = []
    from stanford_corenlp_pywrapper import CoreNLP
    proc = CoreNLP("pos",
                   corenlp_jars=["stanford-corenlp-full-2015-04-20/*"],
                   UnicodeDecodeError='skip')
    for doc_words in l:
        single_dict = proc.parse_doc(doc_words)
        row = []
        for each_dict in single_dict['sentences']:
            for word in each_dict['lemmas']:
                row.append(word)
        result.append(row)
    return result

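# Usage sketch (hypothetical documents; assumes the CoreNLP jars sit in
# ./stanford-corenlp-full-2015-04-20): lemmatize() returns one flat list of
# lemmas per input document.
docs = ["The cats were sleeping.", "Dogs bark loudly."]
print lemmatize(docs)  # e.g. [['the', 'cat', 'be', 'sleep', '.'], ...]
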
def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path, '*')])

    def tokenize_with_offset(context):
        parsed = proc.parse_doc(context)
        return [(sentence['tokens'], sentence['char_offsets'][0][0],
                 sentence['char_offsets'][-1][-1])
                for sentence in parsed['sentences']]

    def tokenize(sentence):
        parsed = proc.parse_doc(sentence)
        tokens = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
        return tokens

    return tokenize_with_offset, tokenize

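# Usage sketch (hypothetical text): tokenize_with_offset() keeps per-sentence
# character spans, while tokenize() flattens the tokens of all sentences.
tokenize_with_offset, tokenize = CoreNLP_tokenizer()
for tokens, start, end in tokenize_with_offset("A first sentence. A second one."):
    print tokens, start, end
print tokenize("A first sentence. A second one.")
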
def split_and_tokenize(doc):
    '''
    Reads a text document, splits sentences and tokenizes them with the Python
    wrapper of Stanford CoreNLP.
    More info: https://github.com/brendano/stanford_corenlp_pywrapper
    :param doc: document name, resolved under database.mpqa.2.0/docs/
    :return:
    '''
    parse_mode = "ssplit"  # tokenization and sentence splitting
    coreNlpPath = "/Users/ana/workspace/stanford_corenlp_pywrapper/stanford-corenlp-full-2017-06-09/*"
    parser = CoreNLP(parse_mode, corenlp_jars=[coreNlpPath])

    json_name = "database.mpqa.2.0/docs/" + doc.split("\n")[0] + ".json"
    if not os.path.exists(json_name):
        doc_path = "database.mpqa.2.0/docs/" + doc.split("\n")[0]
        document = codecs.open(doc_path, "r", encoding="utf-8").read()
        data_source_parse = parser.parse_doc(document)
        with open(json_name, 'w') as fp:
            json.dump(data_source_parse, fp, sort_keys=True, indent=2)

def main():
    if not os.path.exists(IN_FILE + '_rf'):
        print('First reformatting file...')
        out_format = open(IN_FILE + '_rf', 'w')
        with open(IN_FILE) as handle:
            for line in tqdm(handle):
                tline = line.strip()
                if tline == '':
                    out_format.write('\n')
                else:
                    out_format.write(tline + ' ')
        # close so the reformatted file is flushed before it is re-read below
        out_format.close()

    print('Sentence tokenizer!')
    print('Loading Stanford CoreNLP...')
    proc = CoreNLP(configdict={
        'annotators': 'tokenize,ssplit',
        'tokenize.options': 'ptb3Escaping=False'
    },
                   output_types=['tokenize,ssplit'],
                   corenlp_jars=[CORENLP_PATH])

    out_file = open(IN_FILE + '_sts', 'w')
    sentence_count = 0
    print('Opening file ' + IN_FILE + '_rf' + '...')
    with open(IN_FILE + '_rf') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            the_text = line.strip()
            # Use Stanford instead
            parsed = proc.parse_doc(the_text)
            sentence_count += len(parsed['sentences'])
            for sent in parsed['sentences']:
                the_tokens = [i.replace(' ', '') for i in sent['tokens']]
                the_sent = ' '.join(the_tokens)
                assert len(the_sent.split(' ')) == len(sent['tokens'])
                out_file.write(the_sent.encode('utf-8') + '\n')
            print('Number of sentences so far: ' +
                  '{:,}'.format(sentence_count))
    out_file.close()

# CoreNLP jar path; earlier (commented) variants, local and server:
# coreNlpPath = "/home/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-corenlp-full-2015-12-09/*;/home/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-srparser-2014-10-23-models.jar"
# coreNlpPath = "/home/mitarb/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-corenlp-full-2015-12-09/*"
coreNlpPath = "/Users/mihaylov/research/libs/corenlp_executables/stanford-corenlp-full-2015-12-09/*"
if len(sys.argv) > 3:
    coreNlpPath = sys.argv[3]
print "coreNlpPath:%s" % coreNlpPath

parse_mode = "pos"  # "pos", "parse"
parser = CoreNLP(parse_mode, corenlp_jars=coreNlpPath.split(';'))

print("Processing %s input files.." % len(input_files_in_dir))
for fid, file_name in enumerate(input_files_in_dir):
    print "-" * 10
    print "--- " + file_name + " ---"
    print "-" * 10
    try:
        file_base_name = get_file_base_name(file_name)
        print("File %s of %s:%s" % (fid + 1, len(input_files_in_dir),
                                    file_name))

        output_dir_file = os.path.join(output_dir, file_base_name + "_prep")
        if not os.path.exists(output_dir_file):
            os.makedirs(output_dir_file)

import sys

from settings import *
from stanford_corenlp_pywrapper import CoreNLP

with open('EECS_annotated_samples_anonymized') as handle:
    lines = handle.readlines()
lines = [line.strip() for line in lines]

proc = CoreNLP('pos', corenlp_jars=[PATH_TO_STANFORD_CORENLP])

out_file = open('crf-input-data', 'w')
cur_line, cur_parsed, cur_mapped, cur_pos = [None] * 4
current_nonos = 0
in_annotations = False
in_type = None
for line in lines:
    if line == '':
        current_nonos += sum([1 for tok in cur_mapped if tok != 'O'])
        # print('Non-Os is now: ' + str(current_nonos))
        # print(cur_mapped)
        assert len(cur_mapped) == len(cur_pos) and len(cur_pos) == len(
            cur_parsed)
        for i in range(0, len(cur_mapped)):
            out_file.write(cur_parsed[i] + '\t' + cur_pos[i] + '\t' +
                           cur_mapped[i] + '\n')
        out_file.write('\n')
        in_annotations = False
        continue

# stopwords list; add "I" since Stanford NLP does not lowercase "I" but the
# stopwords list from nltk includes "i"
stop = set(stopwords.words('english'))
stop.add("I")

# regex: only keep words composed of alphanumeric characters, alphanumeric
# words joined by "-" or " " (e.g. keep "data-driven"), or runs of ! / ?
# ignore the parenthesis tokens -rrb-, -lrb-, so use match instead of search
# match: determine whether the RE matches at the beginning of the string
pattern = re.compile(r'^(?:[A-Za-z0-9]+[- ][A-Za-z0-9]+|[A-Za-z0-9]+|[?!]+)$')
# pattern_parenthesis = re.compile("-rrb-|-lrb-")

proc = CoreNLP(
    "pos",
    corenlp_jars=[
        "/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"
    ])

# You can also specify the annotators directly with the configdict option;
# there is no longer a need to specify output_types (the outputs to include
# are inferred from the annotators setting).
p = CoreNLP(
    configdict={
        'annotators':
        'tokenize, ssplit, pos, parse, lemma, ner, entitymentions, dcoref'
    },
    # output_types=['pos', 'parse'],
    corenlp_jars=[
        "/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"
    ])

# -*- coding: utf-8 -*-
import SocketServer

from stanford_corenlp_pywrapper import CoreNLP


class MyTCPHandler(SocketServer.BaseRequestHandler):
    def handle(self):
        self.data = self.request.recv(1024).strip()
        print self.data
        if not isinstance(self.data, unicode):
            document = unicode(self.data, 'utf-8')
        else:
            # already unicode; without this branch `document` would be unbound
            document = self.data
        jdoc = ss.parse_doc(document, raw=True)
        self.request.sendall(jdoc)


if __name__ == "__main__":
    HOST, PORT = "localhost", 9998

    # Enter FULL path to folder containing extracted Stanford Core NLP
    ss = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, parse'},
                 corenlp_jars=["stanford-corenlp-full-2015-01-29/*"])
    print "model loaded"

    server = SocketServer.TCPServer((HOST, PORT), MyTCPHandler)
    server.serve_forever()
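
# Client-side sketch (an assumption, not part of the original script): send a
# UTF-8 document to the server above and read back the raw JSON annotation.
import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(("localhost", 9998))
sock.sendall(u"Stanford parsers are useful.".encode("utf-8"))
print sock.recv(65536)
sock.close()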