def load_corpus(self, corenlpserver, process=True):
    total_lines = sum(1 for line in open(self.path))
    widgets = [pb.Percentage(), " ", pb.Bar(), " ", pb.AdaptiveETA(), " ", pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines, redirect_stdout=True).start()
    time_per_abs = []
    with codecs.open(self.path, "r", "utf-8") as trainfile:
        current = 0
        for line in trainfile:
            # each line is a document: the first token is its ID, the rest is the text
            x = line.strip().split(" ")
            did = x[0]
            doctext = " ".join(x[1:])
            newdoc = Document(doctext, process=False, did=did)
            # the whole document is kept as a single sentence
            sid = did + ".s0"
            newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
            if process:
                newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            pbar.update(current + 1)
            current += 1
    pbar.finish()
def load_corpus(self, corenlpserver, process=True): """Load the CHEMDNER corpus file on the dir element""" # open filename and parse lines total_lines = sum(1 for line in open(self.path)) widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()] pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start() n_lines = 1 time_per_abs = [] with io.open(self.path, 'r', encoding="utf-8") as inputfile: for line in inputfile: t = time.time() # each line is PMID title abs tsv = line.split('\t') doctext = tsv[1].strip().replace("<", "(").replace(">", ")") + " " doctext += tsv[2].strip().replace("<", "(").replace(">", ")") newdoc = Document(doctext, process=False, did=tsv[0], title=tsv[1].strip() + ".") newdoc.sentence_tokenize("biomedical") if process: newdoc.process_document(corenlpserver, "biomedical") self.documents[newdoc.did] = newdoc abs_time = time.time() - t time_per_abs.append(abs_time) pbar.update(n_lines) n_lines += 1 pbar.finish() abs_avg = sum(time_per_abs)*1.0/len(time_per_abs) logging.info("average time per abstract: %ss" % abs_avg)
def process_documents():
    corpus = Corpus("corpora/Thaliana/pubmed")
    final_text = []
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open("corpora/Thaliana/documents.txt", 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            # skip documents that start with the same 20 characters as one already seen
            if l[:20] in starts:
                continue
            lcount += 1
            starts.add(l[:20])
            newdoc = Document(l.strip())
            newdoc.process_document(corenlp_client)
            for sentence in newdoc.sentences:
                print [t.text for t in sentence.tokens]
            corpus.documents["d" + str(lcount)] = newdoc
            # checkpoint the corpus every 1000 documents
            if lcount % 1000 == 0:
                corpus.save("corpora/Thaliana/thaliana-documents_{}.pickle".format(str(lcount / 1000)))
def process_documents(corpus_path):
    corpus = Corpus(corpus_path)
    final_text = []
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            # skip abstracts whose first 10 characters were already seen
            if l[:10] in starts:
                print "repeated abstract:", l[:10]
                continue
            lcount += 1
            starts.add(l[:10])
            # each line is PMID <tab> abstract text
            values = l.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            newdoc.did = "PMID" + pmid
            corpus.documents["PMID" + pmid] = newdoc
            # checkpoint every 1000 abstracts and start a fresh corpus
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
                corpus = Corpus(corpus_path)
    corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
def load_corpus(self, corenlpserver, process=True):
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    # count the number of lines to set up the progress bar
    nlines = 0
    with open(self.path) as f:
        for nlines, l in enumerate(f):
            pass
    print nlines
    pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
    with codecs.open(self.path, 'r', "utf-8") as corpusfile:
        doc_text = ""
        sentences = []
        for i, l in enumerate(corpusfile):
            if l.startswith("###"):  # new doc
                if doc_text != "":
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                did = "JNLPBA" + l.strip().split(":")[-1]
                logging.debug("starting new document:" + did)
                sentence_text = ""
                doc_offset = 0
                sentences = []
            elif l.strip() == "" and sentence_text != "":  # new sentence
                sid = did + ".s" + str(len(sentences))
                this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
                doc_offset += len(sentence_text) + 1
                doc_text += sentence_text + " "
                sentences.append(this_sentence)
                if i == nlines:
                    # last line of the file: close the current document
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # start new sentence
                sentence_text = ""
            else:
                # token line: the first column is the token text
                t = l.strip().split("\t")
                if sentence_text != "":
                    sentence_text += " "
                sentence_text += t[0]
            pbar.update(i)
    pbar.finish()
def load_corpus(self, corenlpserver, process=True):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        print '{}:{}/{}'.format(f, current + 1, total)
        did = f.split(".")[0]
        t = time.time()
        with open(f, 'r') as txt:
            doctext = txt.read()
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path)]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        print '{}:{}/{}'.format(f, current + 1, total)
        did = f
        t = time.time()
        # wrap the file contents in a root element so it parses as a single XML document
        with open(f, 'r') as afile:
            article = "<Article>" + afile.read() + "</Article>"
        soup = BeautifulSoup(article, 'xml')
        title = soup.ArticleTitle.get_text()
        abstract = soup.AbstractText.get_text()
        doc_text = title + " " + abstract
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver):
    # self.path is the base directory of the files of this corpus
    # if more than one file: opens all files in the folder (see config file)
    trainfiles = [self.path + f for f in os.listdir(self.path) if not f.endswith('~')]
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(trainfiles)).start()
    for i, openfile in enumerate(trainfiles):
        with open(openfile, 'r') as inputfile:
            newdoc = Document(inputfile.read(), process=False, did=os.path.basename(openfile),
                              title="titulo_" + os.path.basename(openfile))
        newdoc.process_document(corenlpserver, "biomedical")  # process_document calls the tokenizer
        valid = True
        invalid_sids = []
        for s in newdoc.sentences:
            # sentences between [start section id="X"] and [end section id="X"] markers
            # of invalid sections are flagged as invalid
            if s.text in ['[start section id="{}"]'.format(section) for section in self.invalid_sections]:
                valid = False
            if not valid:
                invalid_sids.append(s.sid)
            if s.text in ['[end section id="{}"]'.format(section) for section in self.invalid_sections]:
                valid = True
            if (s.text.startswith("[") and s.text.endswith("]")) or s.text.istitle():
                newdoc.title_sids.append(s.sid)
        newdoc.invalid_sids = invalid_sids
        logging.debug("invalid sentences: {}".format(invalid_sids))
        logging.debug("title sentences: {}".format(newdoc.title_sids))
        self.documents[newdoc.did] = newdoc
        pbar.update(i + 1)
def load_corpus(self, corenlpserver, process=True):
    # self.path is just one file with every document
    time_per_abs = []
    with open(self.path, 'r') as xml:
        root = ET.fromstring(xml.read())
        all_docs = root.findall("document")
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
        for i, doc in enumerate(all_docs):
            t = time.time()
            doctext = ""
            did = doc.get('id')
            doc_sentences = []  # the sentences of this document
            doc_offset = 0  # offset of the current sentence relative to the document
            for sentence in doc.findall('sentence'):
                sid = sentence.get('id')
                text = sentence.get('text')
                doctext += " " + text  # generate the full text of this document
                this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doctext)
                doc_sentences.append(this_sentence)
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            pbar.update(i + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver):
    docs = self.get_docs(self.path)
    total = len(docs)
    current = 0
    time_per_abs = []
    ts = set()
    for f in docs:
        logging.debug('%s:%s/%s', f[0], current + 1, total)
        current += 1
        # parse DDI corpus file
        t = time.time()
        # TODO: actually each paragraph should be its own document, that should help offset issues
        docid = f[0]
        doctext = ""
        doc_sentences = []  # the sentences of this document
        doc_offset = 0  # offset of the current sentence relative to the document
        sents = self.get_paragraphs(f)
        for p in sents:
            logging.debug("processing {}".format(p[0]))
            # concatenate the paragraph text with the text of its <ne> elements and their tails
            senttext = p[1].text.replace("\n", " ")
            for ne in p[1].findall("ne"):
                senttext += ne.text
                if ne.tail:
                    senttext += ne.tail.replace("\n", " ")
            doctext += senttext + "\n"
            doc_offset = len(doctext)
        newdoc = Document(doctext, process=False, did=docid, ssplit=True)
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
    docs = soup.find_all("article")
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
    n_lines = 1
    time_per_abs = []
    for doc in docs:
        t = time.time()
        did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
        title = doc.title.sentence.get_text()
        sentences = doc.abstract.find_all("sentence")
        doc_sentences = []
        doc_text = title + " "
        doc_offset = 0
        for si, s in enumerate(sentences):
            stext = s.get_text()
            sid = did + ".s" + str(si)
            doc_text += stext + " "
            this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
            doc_offset = len(doc_text)
            doc_sentences.append(this_sentence)
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(n_lines)
        n_lines += 1
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def create_sentences(self, doctag, text):
    # Create sentence entries based on text from document doctag
    cur = self.db_conn.cursor()
    newdoc = Document(text, process=False, did=doctag)
    newdoc.sentence_tokenize("biomedical")
    for i, sentence in enumerate(newdoc.sentences):
        corenlpres = sentence.process_sentence(self.corenlp)
        query = """INSERT INTO sentence(senttag, doctag, senttext, sentoffset, corenlp)
                   VALUES (%s, %s, %s, %s, %s);"""
        try:
            cur.execute(query, (sentence.sid, doctag, sentence.text.encode("utf8"),
                                sentence.offset, str(corenlpres).encode("utf8")))
            self.db_conn.commit()
        except MySQLdb.MySQLError as e:
            self.db_conn.rollback()
            logging.debug(e)
def load_corpus(self, corenlpserver):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.xml')]
    total = len(trainfiles)
    current = 0
    time_per_abs = []
    for f in trainfiles:
        logging.debug('%s:%s/%s', f, current + 1, total)
        current += 1
        with open(f, 'r') as xml:
            # parse DDI corpus file
            t = time.time()
            root = ET.fromstring(xml.read())
            doctext = ""
            did = root.get('id')
            doc_sentences = []  # the sentences of this document
            doc_offset = 0  # offset of the current sentence relative to the document
            for sentence in root.findall('sentence'):
                sid = sentence.get('id')
                text = sentence.get('text')
                text = text.replace('\r\n', ' ')
                doctext += " " + text  # generate the full text of this document
                this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doctext)
                doc_sentences.append(this_sentence)
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    total_lines = sum(1 for line in open(self.path))
    time_per_abs = []
    with codecs.open(self.path, 'r', "utf-8") as trainfile:
        current = 0
        ddi = ""
        for line in trainfile:
            # each document is given as an "ID<tab>..." line followed by a "sentence<tab>..." line
            if line.startswith("ID"):
                did = line.strip().split("\t")[1]
                print did
            elif line.startswith("sentence"):
                doctext = line.strip().split("\t")[1]
                newdoc = Document(doctext, process=False, did=did)
                sid = did + ".s0"
                newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
                if process:
                    newdoc.process_document(corenlpserver)
                self.documents[newdoc.did] = newdoc
def load_corpus(self, corenlpserver, process=True): """Load the CHEMDNER corpus file on the dir element""" # open filename and parse lines total_lines = sum(1 for line in open(self.path)) widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()] pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start() n_lines = 1 time_per_abs = [] with codecs.open(self.path, 'r', "utf-8") as inputfile: for line in inputfile: t = time.time() # each line is PMID title abs tsv = line.split('\t') doctext = tsv[2].strip().replace("<", "(").replace(">", ")") newdoc = Document(doctext, process=False, did=tsv[0], title=tsv[1].strip()) newdoc.sentence_tokenize("biomedical") if process: newdoc.process_document(corenlpserver, "biomedical") self.documents[newdoc.did] = newdoc n_lines += 1 abs_time = time.time() - t time_per_abs.append(abs_time) pbar.update(n_lines+1) pbar.finish() abs_avg = sum(time_per_abs)*1.0/len(time_per_abs) logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        print '{}:{}/{}'.format(f, current + 1, total)
        did = f.split(".")[0].split("/")[-1]
        t = time.time()
        with io.open(f, 'r', encoding='utf8') as txt:
            doctext = txt.read()
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def add_more_sentences(self, corpuspath):
    """
    Load sentences with relations from another corpus
    :param corpuspath: corpus path
    :return:
    """
    corpus2 = pickle.load(open(corpuspath, 'rb'))
    for did in corpus2.documents:
        for sentence in corpus2.documents[did].sentences:
            if any([len(e.targets) > 1 for e in sentence.entities.elist["goldstandard"]]):
                print("found sentence with relations:", sentence.sid)
                self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
    self.save("corpora/Thaliana/seedev-extended.pickle")
def generate_corpus(self, text):
    """
    Create a corpus object from the input text.
    :param text:
    :return:
    """
    test_corpus = Corpus("")
    newdoc = Document(text, process=False, did="d0", title="Test document")
    newdoc.sentence_tokenize("biomedical")
    newdoc.process_document(self.corenlp, "biomedical")
    test_corpus.documents["d0"] = newdoc
    return test_corpus
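# --- Usage sketch (not part of the original code) ---
# How generate_corpus might be used to inspect the tokens produced for a small piece of text.
# Hedged assumptions: "annotator" is a hypothetical name for whatever object exposes
# generate_corpus with a self.corenlp client already set; the attributes accessed below
# (documents, sentences, tokens, token.text) are the same ones used elsewhere in this file.
def print_tokens(annotator, text):
    test_corpus = annotator.generate_corpus(text)
    for did in test_corpus.documents:
        for sentence in test_corpus.documents[did].sentences:
            print([t.text for t in sentence.tokens])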