def run_entity_annotator(self, doctag, annotator):
    """
    Classify a document using an entity annotator and insert the results into the database.
    :param doctag: tag of the document
    :param annotator: name of the annotator to run
    :return: JSON string mapping entity IDs to their string representations
    """
    sentences = self.get_sentences(doctag)
    data = bottle.request.json  # request payload (currently unused)
    output = {}
    for a in self.entity_annotators:  # a is (annotator_name, annotator_engine, annotator_etype)
        if a[0] == annotator:
            for s in sentences:
                # s row: s[1]=sid, s[2]=text, s[3]=offset, s[4]=CoreNLP output
                sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
                # sentence.process_sentence(self.corenlp)
                sentence.process_corenlp_output(ast.literal_eval(s[4]))
                sentence_text = " ".join([t.text for t in sentence.tokens])
                sentence_output = self.entity_annotators[a].annotate_sentence(sentence_text)
                sentence_entities = self.entity_annotators[a].process_sentence(sentence_output, sentence)
                for e in sentence_entities:
                    sentence_entities[e].normalize()
                    self.add_entity(sentence_entities[e], annotator)
                    output[e] = str(sentence_entities[e])
    return json.dumps(output)

def run_relation_annotator(self, doctag, annotator):
    """
    Extract relations from a document using an annotator and insert the results into the database.
    :param doctag: tag of the document
    :param annotator: name of the annotator to run
    :return: JSON string mapping relation IDs to their string representations
    """
    # process the whole document instead of sentence by sentence
    sentences = self.get_sentences(doctag)
    data = bottle.request.json  # request payload (currently unused)
    output = {}
    for a in self.relation_annotators:  # a is (annotator_name, annotator_engine, annotator_etype)
        if a[0] == annotator:
            input_sentences = []
            for s in sentences:
                sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
                sentence.process_corenlp_output(ast.literal_eval(s[4]))
                sentence = self.get_entities(sentence)
                input_sentences.append(sentence)
            sentence_results = self.relation_annotators[a].annotate_sentences(input_sentences)
            for sentence in input_sentences:
                if a[1] == "jsre":
                    # jsre results are keyed by sentence ID
                    pred, original = sentence_results[sentence.sid]
                    sentence_relations = self.relation_annotators[a].process_sentence(pred, original, sentence)
                elif a[1] == "smil":
                    sentence_relations = self.relation_annotators[a].process_sentence(sentence)
                for p in sentence_relations:
                    self.add_relation(p, annotator)
                    output[p.pid] = str(p)
    return json.dumps(output)

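# Both handlers above read bottle.request, so they are presumably bound to
# bottle routes elsewhere in this class. A hypothetical sketch of such a
# binding (the route paths here are assumptions, not the project's actual URLs):
#
# bottle.route("/entities/<doctag>/<annotator>", method="POST")(self.run_entity_annotator)
# bottle.route("/relations/<doctag>/<annotator>", method="POST")(self.run_relation_annotator)
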
def get_document(self, doctag):
    # return the document entry with this doctag as JSON
    cur = self.db_conn.cursor()
    query = """SELECT DISTINCT id, doctag, title, doctext
               FROM document
               WHERE doctag = %s;"""
    cur.execute(query, (doctag,))
    res = cur.fetchone()
    if res is not None:
        result = {'docID': res[1],
                  'title': res[2],
                  'docText': res[3],
                  'abstract': {'sentences': []}}
        sentences = self.get_sentences(doctag)
        for s in sentences:
            sentence = Sentence(s[2], offset=s[3], sid=s[1], did=doctag)
            sentence.process_corenlp_output(ast.literal_eval(s[4]))
            sentence = self.get_entities(sentence)
            result['abstract']['sentences'].append(sentence.get_dic("all"))
        return json.dumps(result)
    else:
        return json.dumps({'error': 'could not find document {}'.format(doctag)})

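# Shape of the JSON returned by get_document (illustrative sketch; the exact
# per-sentence dictionaries depend on what Sentence.get_dic("all") produces):
#
# {
#   "docID": "PMID123",                      <- hypothetical doctag
#   "title": "...",
#   "docText": "...",
#   "abstract": {"sentences": [{...}, {...}]}
# }
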
def sentence_tokenize(self, doctype):
    """
    Split the document text into sentences and add them to the self.sentences list.
    :param doctype: can be used in the future to choose different tokenization methods
    """
    # the first sentence should be the title, if it exists
    if self.title:
        sid = self.did + ".s0"
        self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
    # inputtext = clean_whitespace(self.text)
    inputtext = self.text
    # write the text to a temporary file and run the GENIA sentence splitter on it
    with codecs.open("/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
        geniainput.write(inputtext)
    current_dir = os.getcwd()
    os.chdir(geniass_path)
    geniaargs = ["./geniass", "/tmp/geniainput.txt", "/tmp/geniaoutput.txt"]
    Popen(geniaargs, stdout=PIPE, stderr=PIPE).communicate()
    os.chdir(current_dir)
    # read the splitter output back, tracking each sentence's character offset
    offset = 0
    with codecs.open("/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
        for l in geniaoutput:
            stext = l.strip()
            if stext == "":
                offset = self.get_space_between_sentences(offset)
                continue
            sid = self.did + ".s" + str(len(self.sentences))
            self.sentences.append(Sentence(stext, offset=offset, sid=sid, did=self.did))
            offset += len(stext)
            offset = self.get_space_between_sentences(offset)

def sentence_tokenize(self, doctype):
    """
    Split the document text into sentences and add them to the self.sentences list.
    Windows variant: runs geniass.exe through the shell instead of Popen.
    :param doctype: can be used in the future to choose different tokenization methods
    """
    # the first sentence should be the title, if it exists
    if self.title:
        sid = self.did + ".s0"
        self.sentences.append(Sentence(self.title, sid=sid, did=self.did))
    # inputtext = clean_whitespace(self.text)
    inputtext = self.text
    if not os.path.exists(geniass_path + '/tmp/'):
        os.mkdir(geniass_path + '/tmp/')
    with codecs.open(geniass_path + "/tmp/geniainput.txt", 'w', 'utf-8') as geniainput:
        geniainput.write(inputtext)
    current_dir = os.getcwd()
    os.chdir(geniass_path)
    geniacmd = "geniass.exe tmp/geniainput.txt tmp/geniaoutput.txt"
    call(geniacmd, shell=True)
    os.chdir(current_dir)
    # read the splitter output back, tracking each sentence's character offset
    offset = 0
    with codecs.open(geniass_path + "/tmp/geniaoutput.txt", 'r', "utf-8") as geniaoutput:
        for l in geniaoutput:
            stext = l.strip()
            if stext == "":
                offset = self.get_space_between_sentences(offset)
                continue
            sid = self.did + ".s" + str(len(self.sentences))
            self.sentences.append(Sentence(stext, offset=offset, sid=sid, did=self.did))
            offset += len(stext)
            offset = self.get_space_between_sentences(offset)

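# Both sentence_tokenize variants above rely on a get_space_between_sentences
# helper that is not shown here. A minimal sketch of what it presumably does,
# assuming its only job is to advance the offset past the whitespace that
# separated two sentences in the original text (hypothetical reconstruction;
# the real implementation may differ):
def get_space_between_sentences(self, offset):
    # move the offset forward past any whitespace so that the next
    # sentence's offset points at its first character in self.text
    while offset < len(self.text) and self.text[offset].isspace():
        offset += 1
    return offset
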
def generate_data(self, corpus, modelname, pairtypes):
    # TODO: remove old model
    pcount = 0
    truepcount = 0
    ns = 0
    for did in corpus.documents:
        doc_entities = corpus.documents[did].get_entities("goldstandard")
        examplelines = []
        # consider every ordered pair of gold-standard entities in the document
        for pair in itertools.permutations(doc_entities, 2):
            # sentence numbers are parsed from sids of the form "<did>.s<n>"
            sn1 = int(pair[0].sid.split(".")[-1][1:])
            sn2 = int(pair[1].sid.split(".")[-1][1:])
            # skip pairs in different sentences and pairs that share a boundary
            if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                continue
            # keep only pairs whose entity types match this pair type
            if (pair[0].type in config.pair_types[self.pairtype]["source_types"]
                    and pair[1].type in config.pair_types[self.pairtype]["target_types"]):
                pid = did + ".p" + str(pcount)
                if sn1 != sn2:
                    # cross-sentence pair: merge the two sentences into one
                    # (only reachable if the sentence-distance filter above is relaxed)
                    sentence1 = corpus.documents[did].get_sentence(pair[0].sid)
                    sentence2 = corpus.documents[did].get_sentence(pair[1].sid)
                    sentence = Sentence(text=sentence1.text + " " + sentence2.text,
                                        offset=sentence1.offset)
                    sentence.tokens = sentence1.tokens + sentence2.tokens
                    for t in pair[1].tokens:
                        t.order += len(sentence1.tokens)
                else:
                    sentence = corpus.documents[did].get_sentence(pair[0].sid)
                f, label = self.generate_features(sentence, pair)
                self.features.append(f)
                self.labels.append(label)
                self.pairs.append(pair)
                pcount += 1

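# Worked example of the sid arithmetic above: for an entity with
# sid == "PMID123.s4" (hypothetical ID), sid.split(".")[-1] gives "s4" and
# [1:] strips the leading "s", so int("4") == 4 is the sentence number.
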
def load_corpus(self, corenlpserver, process=True):
    # each line of the corpus file is "<did> <document text>"
    total_lines = sum(1 for line in open(self.path))
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines, redirect_stdout=True).start()
    with codecs.open(self.path, 'r', "utf-8") as trainfile:
        current = 0
        for line in trainfile:
            x = line.strip().split(" ")
            did = x[0]
            doctext = " ".join(x[1:])
            newdoc = Document(doctext, process=False, did=did)
            # the whole text is kept as a single sentence
            sid = did + ".s0"
            newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
            if process:
                newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            pbar.update(current + 1)
            current += 1
    pbar.finish()

def load_corpus(self, corenlpserver, process=True):
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    # count the input lines to size the progress bar; nlines ends up
    # holding the index of the last line
    nlines = 0
    with open(self.path) as f:
        for nlines, l in enumerate(f):
            pass
    print(nlines)
    pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
    with codecs.open(self.path, 'r', "utf-8") as corpusfile:
        doc_text = ""
        sentences = []
        for i, l in enumerate(corpusfile):
            if l.startswith("###"):  # start of a new document
                if doc_text != "":
                    # finish the previous document
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                did = "JNLPBA" + l.strip().split(":")[-1]
                logging.debug("starting new document:" + did)
                sentence_text = ""
                doc_offset = 0
                sentences = []
            elif l.strip() == "" and sentence_text != "":  # blank line ends a sentence
                sid = did + ".s" + str(len(sentences))
                this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
                doc_offset += len(sentence_text) + 1
                doc_text += sentence_text + " "
                sentences.append(this_sentence)
                if i == nlines:
                    # last line of the file: finish the final document
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # start a new sentence
                sentence_text = ""
            else:
                # token line: the first tab-separated field is the token text
                t = l.strip().split("\t")
                if sentence_text != "":
                    sentence_text += " "
                #if t[1] == "B-protein"
                sentence_text += t[0]
            pbar.update(i)
    pbar.finish()

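# Sketch of the JNLPBA-style input the loader above expects: one token per
# line with its IOB tag in a tab-separated second column, a blank line
# between sentences, and a "###MEDLINE:<id>" line opening each document
# (illustrative excerpt, not real corpus data):
#
# ###MEDLINE:95338244
# IL-2\tB-protein
# gene\tI-protein
# expression\tO
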
def generate_data(self, corpus, modelname, pairtypes):
    # SeeDev variant of generate_data: pair types come from config.seedev_types
    # TODO: remove old model
    pcount = 0
    truepcount = 0
    ns = 0
    for did in corpus.documents:
        doc_entities = corpus.documents[did].get_entities("goldstandard")
        examplelines = []
        for pair in itertools.permutations(doc_entities, 2):
            # sentence numbers are parsed from sids of the form "<did>.s<n>"
            sn1 = int(pair[0].sid.split(".")[-1][1:])
            sn2 = int(pair[1].sid.split(".")[-1][1:])
            # skip pairs in different sentences and pairs that share a boundary
            if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                continue
            # keep only pairs whose entity types match this pair type
            if (pair[0].type in config.seedev_types.pair_types[self.pairtype]["source_types"]
                    and pair[1].type in config.seedev_types.pair_types[self.pairtype]["target_types"]):
                pid = did + ".p" + str(pcount)
                if sn1 != sn2:
                    # cross-sentence pair: merge the two sentences into one
                    # (only reachable if the sentence-distance filter above is relaxed)
                    sentence1 = corpus.documents[did].get_sentence(pair[0].sid)
                    sentence2 = corpus.documents[did].get_sentence(pair[1].sid)
                    sentence = Sentence(text=sentence1.text + " " + sentence2.text,
                                        offset=sentence1.offset)
                    sentence.tokens = sentence1.tokens + sentence2.tokens
                    for t in pair[1].tokens:
                        t.order += len(sentence1.tokens)
                else:
                    sentence = corpus.documents[did].get_sentence(pair[0].sid)
                f, label = self.generate_features(sentence, pair)
                self.features.append(f)
                self.labels.append(label)
                self.pairs.append(pair)
                pcount += 1

def pos_tag(text):
    """
    Tokenize a given text and generate a list of Sentence objects,
    with the appropriate POS tags added.
    :param text: the text to tokenize
    :return: a list of Sentence objects representing the sentences in the text
    """
    sentences = []
    for count, sentence in enumerate(nltk.sent_tokenize(text)):
        tokens = OrderedDict()
        # get the tokens and POS tags; note that because tokens are keyed by
        # their text, a word repeated in a sentence keeps only its last tag
        for word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
            tokens[word] = tag
        # sentence is now tokenized and tokens have POS tags
        sentences.append(Sentence(count, tokens))
    return sentences

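# Example use of pos_tag (assumes the NLTK "punkt" and
# "averaged_perceptron_tagger" data packages have been downloaded,
# e.g. with nltk.download()):
#
# >>> sents = pos_tag("Aspirin inhibits COX-1. It also reduces fever.")
# >>> len(sents)
# 2
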
def load_corpus(self, corenlpserver, process=True):
    # parse the GENIA corpus file with BeautifulSoup
    soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
    docs = soup.find_all("article")
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
    n_lines = 1
    time_per_abs = []
    for doc in docs:
        t = time.time()  # time each abstract individually
        did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
        title = doc.title.sentence.get_text()
        sentences = doc.abstract.find_all("sentence")
        doc_sentences = []
        doc_text = title + " "
        doc_offset = 0
        for si, s in enumerate(sentences):
            stext = s.get_text()
            sid = did + ".s" + str(si)
            doc_text += stext + " "
            this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
            doc_offset = len(doc_text)
            doc_sentences.append(this_sentence)
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(n_lines)
        n_lines += 1
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)

def load_corpus(self, corenlpserver):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.xml')]
    total = len(trainfiles)
    current = 0
    time_per_abs = []
    for f in trainfiles:
        logging.debug('%s:%s/%s', f, current + 1, total)
        current += 1
        with open(f, 'r') as xml:
            # parse this DDI corpus file
            t = time.time()
            root = ET.fromstring(xml.read())
            doctext = ""
            did = root.get('id')
            doc_sentences = []  # the sentences of this document
            doc_offset = 0  # offset of the current sentence relative to the document
            for sentence in root.findall('sentence'):
                sid = sentence.get('id')
                text = sentence.get('text')
                text = text.replace('\r\n', ' ')
                doctext += " " + text  # generate the full text of this document
                this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doctext)
                doc_sentences.append(this_sentence)
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)

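# Sketch of the per-file DDI corpus XML structure the loader above reads
# (only the attributes the code uses are shown; IDs and text are illustrative):
#
# <document id="DDI-DrugBank.d0">
#     <sentence id="DDI-DrugBank.d0.s0" text="Aspirin may increase ..."/>
#     <sentence id="DDI-DrugBank.d0.s1" text="..."/>
# </document>
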
def load_corpus(self, corenlpserver, process=True):
    # self.path is a single file containing every document
    time_per_abs = []
    with open(self.path, 'r') as xml:
        root = ET.fromstring(xml.read())
        all_docs = root.findall("document")
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
        for i, doc in enumerate(all_docs):
            t = time.time()  # time each document individually
            doctext = ""
            did = doc.get('id')
            doc_sentences = []  # the sentences of this document
            doc_offset = 0  # offset of the current sentence relative to the document
            for sentence in doc.findall('sentence'):
                sid = sentence.get('id')
                text = sentence.get('text')
                #text = text.replace('\r\n', ' ')
                doctext += " " + text  # generate the full text of this document
                this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doctext)
                doc_sentences.append(this_sentence)
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            pbar.update(i + 1)
        pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)

def load_corpus(self, corenlpserver, process=True):
    with codecs.open(self.path, 'r', "utf-8") as trainfile:
        for line in trainfile:
            # each document is an "ID\t<did>" line followed by a "sentence\t<text>" line
            if line.startswith("ID"):
                did = line.strip().split("\t")[1]
                print(did)
            elif line.startswith("sentence"):
                doctext = line.strip().split("\t")[1]
                newdoc = Document(doctext, process=False, did=did)
                # the whole text is kept as a single sentence
                sid = did + ".s0"
                newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
                if process:
                    newdoc.process_document(corenlpserver)
                self.documents[newdoc.did] = newdoc

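# Illustrative excerpt of the tab-separated input this loader expects
# (hypothetical ID and text, derived from the parsing code above):
#
# ID\tdoc1
# sentence\tThis is the document text.
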