def load_corpus(self, corenlpserver, process=True):
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    # first pass: count the lines so the progress bar knows its maximum value
    nlines = 0
    with open(self.path) as f:
        for nlines, l in enumerate(f):
            pass
    print(nlines)
    pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
    with codecs.open(self.path, 'r', "utf-8") as corpusfile:
        doc_text = ""
        sentences = []
        for i, l in enumerate(corpusfile):
            if l.startswith("###"):  # new doc
                if doc_text != "":
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    # logging.info(len(newdoc.sentences))
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                did = "JNLPBA" + l.strip().split(":")[-1]
                logging.debug("starting new document:" + did)
                sentence_text = ""
                doc_offset = 0
                sentences = []
            elif l.strip() == "" and sentence_text != "":  # new sentence
                # logging.debug("creating new sentence: {}".format(sentence_text))
                sid = did + ".s" + str(len(sentences))
                this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
                doc_offset += len(sentence_text) + 1
                doc_text += sentence_text + " "
                sentences.append(this_sentence)
                if i == nlines:  # last line of the file: flush the final document
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    # logging.info(len(newdoc.sentences))
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # start new sentence
                sentence_text = ""
            else:
                # logging.debug(str(i) + "/" + str(l))
                # token line: first column is the token, second its BIO tag
                t = l.strip().split("\t")
                if sentence_text != "":
                    sentence_text += " "
                # if t[1] == "B-protein"
                sentence_text += t[0]
            pbar.update(i)
    pbar.finish()
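# The loader above consumes JNLPBA-style token-per-line input: one token (and its
# BIO tag) per line, blank lines between sentences, and "###...:<id>" lines between
# documents. The helper below is a minimal, self-contained sketch of that grouping
# logic only; the function name and sample data are illustrative, not part of the
# corpus class itself.

def split_token_per_line(lines):
    """Group token-per-line input into (doc_id, [sentence_text, ...]) pairs."""
    docs, sentences, tokens, doc_id = [], [], [], None
    for line in lines:
        if line.startswith("###"):            # document boundary
            if doc_id is not None:
                docs.append((doc_id, sentences))
            doc_id, sentences, tokens = line.strip().split(":")[-1], [], []
        elif line.strip() == "":              # sentence boundary
            if tokens:
                sentences.append(" ".join(tokens))
                tokens = []
        else:                                 # token line: "token<TAB>tag"
            tokens.append(line.strip().split("\t")[0])
    if tokens:
        sentences.append(" ".join(tokens))
    if doc_id is not None:
        docs.append((doc_id, sentences))
    return docs

# Illustrative input/output:
# >>> split_token_per_line(["###MEDLINE:123\n", "IL-2\tB-protein\n", "gene\tI-protein\n", "\n"])
# [('123', ['IL-2 gene'])]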
def load_corpus(self, corenlpserver, process=True):
    # self.path is just one file with every document
    time_per_abs = []
    with open(self.path, 'r') as xml:
        root = ET.fromstring(xml.read())
        all_docs = root.findall("document")
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
        for i, doc in enumerate(all_docs):
            t = time.time()  # time each document separately
            doctext = ""
            did = doc.get('id')
            doc_sentences = []  # the sentences of this document
            doc_offset = 0  # offset of the current sentence relative to the document
            for sentence in doc.findall('sentence'):
                sid = sentence.get('id')
                # logging.info(sid)
                text = sentence.get('text')
                # text = text.replace('\r\n', ' ')
                doctext += " " + text  # generate the full text of this document
                this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doctext)
                doc_sentences.append(this_sentence)
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            pbar.update(i + 1)
        pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
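# This loader assumes a single XML file whose root contains <document> elements,
# each with an "id" attribute and <sentence> children carrying "id" and "text"
# attributes. A minimal sketch of that expected shape follows; the root element
# name, ids, and sentence text are invented for illustration only.

import xml.etree.ElementTree as ET

sample_xml = """
<corpus>
  <document id="d1">
    <sentence id="d1.s0" text="Caffeine increases heart rate."/>
    <sentence id="d1.s1" text="It is a mild stimulant."/>
  </document>
</corpus>
"""
sample_root = ET.fromstring(sample_xml)
for sample_doc in sample_root.findall("document"):
    # prints: d1 ['Caffeine increases heart rate.', 'It is a mild stimulant.']
    print(sample_doc.get("id"), [s.get("text") for s in sample_doc.findall("sentence")])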
def load_corpus(self, corenlpserver, process=True):
    soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
    docs = soup.find_all("article")
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
    n_lines = 1
    time_per_abs = []
    for doc in docs:
        t = time.time()  # time each abstract separately
        did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
        title = doc.title.sentence.get_text()
        sentences = doc.abstract.find_all("sentence")
        doc_sentences = []
        doc_text = title + " "
        doc_offset = 0
        for si, s in enumerate(sentences):
            stext = s.get_text()
            sid = did + ".s" + str(si)
            doc_text += stext + " "
            this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
            doc_offset = len(doc_text)
            doc_sentences.append(this_sentence)
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        # logging.info(len(newdoc.sentences))
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(n_lines)
        n_lines += 1
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
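# The GENIA loader relies on BeautifulSoup tree navigation: each <article> is
# expected to hold <articleinfo><bibliomisc>MEDLINE:<id></bibliomisc>, a <title>
# with one <sentence>, and an <abstract> with further <sentence> elements. Below is
# a toy document in that shape; the id and sentence content are made up to show how
# the navigation calls used above behave.

from bs4 import BeautifulSoup

sample_genia = """
<article>
  <articleinfo><bibliomisc>MEDLINE:95369245</bibliomisc></articleinfo>
  <title><sentence>IL-2 gene expression.</sentence></title>
  <abstract>
    <sentence>Activation of the IL-2 gene requires NF-kappa B.</sentence>
  </abstract>
</article>
"""
art = BeautifulSoup(sample_genia, "html.parser").find("article")
print(art.articleinfo.bibliomisc.text)                       # MEDLINE:95369245
print(art.title.sentence.get_text())                         # title sentence
print([s.get_text() for s in art.abstract.find_all("sentence")])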
def load_corpus(self, corenlpserver):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.xml')]
    total = len(trainfiles)
    current = 0
    time_per_abs = []
    for f in trainfiles:
        logging.debug('%s:%s/%s', f, current + 1, total)
        current += 1
        with open(f, 'r') as xml:
            # parse DDI corpus file
            t = time.time()
            root = ET.fromstring(xml.read())
            doctext = ""
            did = root.get('id')
            doc_sentences = []  # the sentences of this document
            doc_offset = 0  # offset of the current sentence relative to the document
            for sentence in root.findall('sentence'):
                sid = sentence.get('id')
                # logging.info(sid)
                text = sentence.get('text')
                text = text.replace('\r\n', ' ')
                doctext += " " + text  # generate the full text of this document
                this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                doc_offset = len(doctext)
                doc_sentences.append(this_sentence)
            # logging.info(len(doc_sentences))
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            # logging.info(len(newdoc.sentences))
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
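# All of these loaders share the same calling convention: build the corpus object
# with the path to its data, pass it a CoreNLP server handle, and read the parsed
# documents back from self.documents. The sketch below is kept as comments because
# the class name (DDICorpus) and the server handle (corenlp_client) are hypothetical
# placeholders, not names defined in this module.
#
#     corpus = DDICorpus("corpora/ddi/train")    # hypothetical corpus class wrapping load_corpus
#     corpus.load_corpus(corenlp_client)         # corenlp_client: a CoreNLP server wrapper
#     for did, doc in corpus.documents.items():
#         print(did, len(doc.sentences))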