Example #1
 def load_corpus(self, corenlpserver, process=True):
     total_lines = sum(1 for line in open(self.path))
     widgets = [pb.Percentage(), " ", pb.Bar(), " ", pb.AdaptiveETA(), " ", pb.Timer()]
     pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines, redirect_stdout=True).start()
     time_per_abs = []
     with codecs.open(self.path, "r", "utf-8") as trainfile:
         current = 0
         for line in trainfile:
             # logging.debug('%s:%s/%s', f, current + 1, total)
             x = line.strip().split(" ")
             did = x[0]
             doctext = " ".join(x[1:])
             newdoc = Document(doctext, process=False, did=did)
             # newdoc.sentence_tokenize("biomedical")
             sid = did + ".s0"
             newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
             if process:
                 newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             # abs_time = time.time() - t
             # time_per_abs.append(abs_time)
             # logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
             pbar.update(current + 1)
             current += 1
     pbar.finish()
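A minimal driver for loaders like this one, assuming a `Corpus` subclass and the `StanfordCoreNLP('http://localhost:9000')` client used in the later examples (the subclass name and path are invented for illustration):

    # Hypothetical usage sketch; LineCorpus and the path are invented names
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    corpus = LineCorpus("corpora/sample/train.txt")  # one "docid text..." line per document
    corpus.load_corpus(corenlp_client, process=True)
    print len(corpus.documents), "documents loaded"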
Example #2
 def load_corpus(self, corenlpserver, process=True):
     """Load the CHEMDNER corpus file on the dir element"""
     # open filename and parse lines
     total_lines = sum(1 for line in open(self.path))
     widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
     pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start()
     n_lines = 1
     time_per_abs = []
     with io.open(self.path, 'r', encoding="utf-8") as inputfile:
         for line in inputfile:
             t = time.time()
             # each line is PMID  title   abs
             tsv = line.split('\t')
             doctext = tsv[1].strip().replace("<", "(").replace(">", ")") + " "
             doctext += tsv[2].strip().replace("<", "(").replace(">", ")")
             newdoc = Document(doctext, process=False,
                               did=tsv[0], title=tsv[1].strip() + ".")
             newdoc.sentence_tokenize("biomedical")
             if process:
                 newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             pbar.update(n_lines)
             n_lines += 1
     pbar.finish()
     abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
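Each input line is expected to be tab-separated PMID, title, and abstract, as the inline comment notes. A constructed sample (the values are invented):

    # Invented PMID<TAB>title<TAB>abstract line in the format parsed above
    line = u"23433219\tA sample title\tA sample abstract with <brackets>.\n"
    tsv = line.split('\t')
    assert len(tsv) == 3 and tsv[0] == u"23433219"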
Example #3
def process_documents():
    corpus = Corpus("corpora/Thaliana/pubmed")
    final_text = []
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open("corpora/Thaliana/documents.txt", 'r',
                     'utf-8') as docfile:
        for l in docfile:
            print lcount
            if l[:20] in starts:
                continue
            lcount += 1
            starts.add(l[:20])

            newdoc = Document(l.strip())
            newdoc.process_document(corenlp_client)
            for sentence in newdoc.sentences:
                print [t.text for t in sentence.tokens]
            newtext = ""
            corpus.documents["d" + str(lcount)] = newdoc
            """for s in newdoc.sentences:
                for t in s.tokens:
                    newtext += t.text + " "
            final_text.append(newtext)"""
            # if lcount > 10:
            #     break
            if lcount % 1000 == 0:
                corpus.save(
                    "corpora/Thaliana/thaliana-documents_{}.pickle".format(
                        str(lcount / 1000)))
Example #4
def process_documents(corpus_path):
    corpus = Corpus(corpus_path)
    final_text = []
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            if l[:10] in starts:
                print "repeated abstract:", l[:10]
                continue
            lcount += 1
            starts.add(l[:10])
            values = l.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            #for sentence in newdoc.sentences:
            #    print [t.text for t in sentence.tokens]
            newtext = ""
            newdoc.did = "PMID" + pmid
            corpus.documents["PMID" + pmid] = newdoc
            """for s in newdoc.sentences:
                for t in s.tokens:
                    newtext += t.text + " "
            final_text.append(newtext)"""
            # if lcount > 10:
            #     break
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path, str(lcount/1000)))
                corpus = Corpus(corpus_path)
    corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
Example #5
 def load_corpus(self, corenlpserver, process=True):
     widgets = [
         pb.Percentage(), ' ',
         pb.Bar(), ' ',
         pb.ETA(), ' ',
         pb.Timer()
     ]
     nlines = 0
     with open(self.path) as f:
         for nlines, l in enumerate(f):
             pass
     print nlines
     pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
     with codecs.open(self.path, 'r', "utf-8") as corpusfile:
         doc_text = ""
         sentences = []
         for i, l in enumerate(corpusfile):
             if l.startswith("###"):  # new doc
                 if doc_text != "":
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     # logging.info(len(newdoc.sentences))
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 did = "JNLPBA" + l.strip().split(":")[-1]
                 logging.debug("starting new document:" + did)
                 sentence_text = ""
                 doc_offset = 0
                 sentences = []
             elif l.strip() == "" and sentence_text != "":  # new sentence
                 #logging.debug("creating mew sentence: {}".format(sentence_text))
                 sid = did + ".s" + str(len(sentences))
                 this_sentence = Sentence(sentence_text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 doc_offset += len(sentence_text) + 1
                 doc_text += sentence_text + " "
                 sentences.append(this_sentence)
                 if i == nlines:
                     logging.debug("creating document: {}".format(doc_text))
                     newdoc = Document(doc_text, process=False, did=did)
                     newdoc.sentences = sentences[:]
                     newdoc.process_document(corenlpserver, "biomedical")
                     # logging.info(len(newdoc.sentences))
                     self.documents[newdoc.did] = newdoc
                     doc_text = ""
                 # start new sentence
                 sentence_text = ""
             else:
                 #logging.debug(str(i) + "/" + str(l))
                 t = l.strip().split("\t")
                 if sentence_text != "":
                     sentence_text += " "
                 #if t[1] == "B-protein"
                 sentence_text += t[0]
             pbar.update(i)
         pbar.finish()
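This loader expects CoNLL-style JNLPBA input: a `###...:<id>` header line per document, one `token<TAB>tag` pair per line, and a blank line between sentences. A constructed fragment (the ID and tags are invented):

    # Invented fragment in the shape the branches above parse
    sample = ("###MEDLINE:97050199\n"
              "IL-2\tB-protein\n"
              "expression\tO\n"
              "\n"
              "It\tO\n"
              "works\tO\n")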
Example #6
 def load_corpus(self, corenlpserver, process=True):
     # self.path is the base directory of the files of this corpus
     trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
     total = len(trainfiles)
     widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
     pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
     time_per_abs = []
     for current, f in enumerate(trainfiles):
         #logging.debug('%s:%s/%s', f, current + 1, total)
         print '{}:{}/{}'.format(f, current + 1, total)
         did = f.split(".")[0]
         t = time.time()
         with open(f, 'r') as txt:
             doctext = txt.read()
         newdoc = Document(doctext, process=False, did=did)
         newdoc.sentence_tokenize("biomedical")
         if process:
             newdoc.process_document(corenlpserver, "biomedical")
         self.documents[newdoc.did] = newdoc
         abs_time = time.time() - t
         time_per_abs.append(abs_time)
         #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
         pbar.update(current+1)
     pbar.finish()
     abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #7
    def load_corpus(self, corenlpserver, process=True):
        trainfiles = [self.path + '/' + f for f in os.listdir(self.path)]
        total = len(trainfiles)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
        time_per_abs = []
        for current, f in enumerate(trainfiles):
            #logging.debug('%s:%s/%s', f, current + 1, total)
            print '{}:{}/{}'.format(f, current + 1, total)
            did = f
            t = time.time()
            with open(f, 'r') as txt:  # avoid shadowing the loop variable f
                article = "<Article>" + txt.read() + "</Article>"
            soup = BeautifulSoup(article, 'xml')
            #doc = soup.find_all("article")
            title = soup.ArticleTitle.get_text()
            abstract = soup.AbstractText.get_text()
            doc_text = title + " " + abstract

            newdoc = Document(doc_text, process=False, did=did)
            newdoc.sentence_tokenize("biomedical")
            newdoc.process_document(corenlpserver, "biomedical")
            #logging.info(len(newdoc.sentences))
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
            pbar.update(current + 1)
        pbar.finish()
        abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #8
    def load_corpus(self, corenlpserver):
        # self.path is the base directory of the files of this corpus

        # if more than one file:
        trainfiles = [self.path + f for f in os.listdir(self.path) if not f.endswith('~')]  # opens all files in folder (see config file)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(trainfiles)).start()
        for i, openfile in enumerate(trainfiles):
            # print("file: "+openfile)
            with open(openfile, 'r') as inputfile:
                newdoc = Document(inputfile.read(), process=False, did=os.path.basename(openfile), title = "titulo_"+os.path.basename(openfile))
            newdoc.process_document(corenlpserver, "biomedical")  # process_document calls the tokenizer
            valid = True
            invalid_sids = []
            for s in newdoc.sentences:
                if s.text in ['[start section id="{}"]'.format(section) for section in self.invalid_sections]:
                    valid = False
                if not valid:
                    invalid_sids.append(s.sid)
                if s.text in ['[end section id="{}"]'.format(section) for section in self.invalid_sections]:
                    valid = True
                if (s.text.startswith("[") and s.text.endswith("]")) or s.text.istitle():
                    newdoc.title_sids.append(s.sid)
            newdoc.invalid_sids = invalid_sids
            logging.debug("invalid sentences: {}".format(invalid_sids))
            logging.debug("title sentences: {}".format(newdoc.title_sids))
            self.documents[newdoc.did] = newdoc
            pbar.update(i+1)
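Downstream code can then skip the flagged sentences; a sketch using the attributes set above (the filtering itself is illustrative, not part of the corpus class):

    # Illustrative: keep only sentences outside invalid sections
    usable = [s for s in newdoc.sentences if s.sid not in newdoc.invalid_sids]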
Example #9
 def load_corpus(self, corenlpserver, process=True):
     # self.path is just one file with every document
     time_per_abs = []
     with open(self.path, 'r') as xml:
         root = ET.fromstring(xml.read())
         all_docs = root.findall("document")
         widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
         pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
         for i, doc in enumerate(all_docs):
             t = time.time()  # reset per document so abs_time is per-abstract
             doctext = ""
             did = doc.get('id')
             doc_sentences = [] # get the sentences of this document
             doc_offset = 0 # offset of the current sentence relative to the document
             for sentence in doc.findall('sentence'):
                 sid = sentence.get('id')
                 #logging.info(sid)
                 text = sentence.get('text')
                 #text = text.replace('\r\n', '  ')
                 doctext += " " + text # generate the full text of this document
                 this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
                 doc_offset = len(doctext)
                 doc_sentences.append(this_sentence)
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             pbar.update(i+1)
         pbar.finish()
     abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
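The loader assumes DDI-style XML: a root element containing `document` elements with `id` attributes, each holding `sentence` elements with `id` and `text` attributes. A minimal constructed instance, using the same `ET` alias as the snippet (the IDs and text are invented):

    # Invented minimal corpus file in the shape the loader above expects
    sample_xml = """<corpus>
      <document id="DDI-DrugBank.d0">
        <sentence id="DDI-DrugBank.d0.s0" text="Aspirin may potentiate warfarin."/>
      </document>
    </corpus>"""
    root = ET.fromstring(sample_xml)
    print len(root.findall("document"))  # prints 1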
Example #10
 def load_corpus(self, corenlpserver):
     docs = self.get_docs(self.path)
     total = len(docs)
     current = 0
     time_per_abs = []
     ts = set()
     for f in docs:
         logging.debug('%s:%s/%s', f[0], current + 1, total)
         current += 1
         #parse DDI corpus file
         t = time.time()
         #print root.tag
         docid = f[0]  # TODO: actually each paragraph should be its own document, which should help with offset issues
         doctext = ""
         doc_sentences = []  # get the sentences of this document
         doc_offset = 0  # offset of the current sentence relative to the document
         sents = self.get_paragraphs(f)
         for p in sents:
             logging.debug("processing {}".format(p[0]))
             senttext = p[1].text.replace("\n", " ")
             for ne in p[1].findall("ne"):
                 #doctext += ne.text
                 senttext += ne.text
                 if ne.tail:
                     #doctext += ne.tail
                     senttext += ne.tail.replace("\n", " ")
             #logging.debug(senttext)
             #this_sentence = Sentence(senttext, offset=doc_offset, sid=p[0], did=docid)
             doctext += senttext + "\n"
             doc_offset = len(doctext)
             #doc_sentences.append(this_sentence)
             #logging.info(len(doc_sentences))
         newdoc = Document(doctext, process=False, did=docid, ssplit=True)
         #newdoc.sentences = doc_sentences[:]
         newdoc.process_document(corenlpserver, "biomedical")
         #logging.info(len(newdoc.sentences))
         self.documents[newdoc.did] = newdoc
         #for s in self.documents[newdoc.did].sentences:
         #    logging.debug("sentence {} has {} tokens".format(s.sid, len(s.tokens)))
         #    logging.debug([(t.start, t.end) for t in s.tokens])
         abs_time = time.time() - t
         time_per_abs.append(abs_time)
         logging.info("%s sentences, %ss processing time" %
                      (len(newdoc.sentences), abs_time))
     abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #11
    def load_corpus(self, corenlpserver, process=True):

        soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"),
                             'html.parser')
        docs = soup.find_all("article")
        widgets = [
            pb.Percentage(), ' ',
            pb.Bar(), ' ',
            pb.ETA(), ' ',
            pb.Timer()
        ]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
        n_lines = 1
        time_per_abs = []
        for doc in docs:
            t = time.time()  # time the whole document, not just the last sentence
            did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
            title = doc.title.sentence.get_text()
            sentences = doc.abstract.find_all("sentence")
            doc_sentences = []
            doc_text = title + " "
            doc_offset = 0
            for si, s in enumerate(sentences):
                stext = s.get_text()
                sid = did + ".s" + str(si)
                doc_text += stext + " "
                this_sentence = Sentence(stext,
                                         offset=doc_offset,
                                         sid=sid,
                                         did=did)
                doc_offset = len(doc_text)
                doc_sentences.append(this_sentence)
            newdoc = Document(doc_text, process=False, did=did)
            newdoc.sentences = doc_sentences[:]
            newdoc.process_document(corenlpserver, "biomedical")
            #logging.info(len(newdoc.sentences))
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.debug("%s sentences, %ss processing time" %
                          (len(newdoc.sentences), abs_time))
            pbar.update(n_lines)
            n_lines += 1
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #12
 def create_sentences(self, doctag, text):
     # Create sentence entries based on text from document doctag
     cur = self.db_conn.cursor()
     newdoc = Document(text, process=False, did=doctag)
     newdoc.sentence_tokenize("biomedical")
     for i, sentence in enumerate(newdoc.sentences):
         corenlpres = sentence.process_sentence(self.corenlp)
         query = """INSERT INTO sentence(senttag, doctag, senttext, sentoffset, corenlp) VALUES (%s, %s, %s, %s, %s);"""
         try:
             cur.execute(query, (sentence.sid, doctag, sentence.text.encode("utf8"), sentence.offset,
                                 str(corenlpres).encode("utf8")))
             self.db_conn.commit()
             #inserted_id = cur.lastrowid
             #return str(inserted_id)
         except MySQLdb.MySQLError as e:
             self.db_conn.rollback()
             logging.debug(e)
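The INSERT assumes a `sentence` table with the five columns named in the query. A schema along these lines would satisfy it (the column types are guesses, not taken from the source):

    # Hypothetical table definition matching the INSERT above
    create_sentence_table = """CREATE TABLE sentence (
        senttag VARCHAR(100),
        doctag VARCHAR(100),
        senttext TEXT,
        sentoffset INT,
        corenlp LONGTEXT
    );"""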
Example #13
 def load_corpus(self, corenlpserver):
     # self.path is the base directory of the files of this corpus
     trainfiles = [
         self.path + '/' + f for f in os.listdir(self.path)
         if f.endswith('.xml')
     ]
     total = len(trainfiles)
     current = 0
     time_per_abs = []
     for f in trainfiles:
         logging.debug('%s:%s/%s', f, current + 1, total)
         current += 1
         with open(f, 'r') as xml:
             #parse DDI corpus file
             t = time.time()
             root = ET.fromstring(xml.read())
             doctext = ""
             did = root.get('id')
             doc_sentences = []  # get the sentences of this document
             doc_offset = 0  # offset of the current sentence relative to the document
             for sentence in root.findall('sentence'):
                 sid = sentence.get('id')
                 #logging.info(sid)
                 text = sentence.get('text')
                 text = text.replace('\r\n', '  ')
                 doctext += " " + text  # generate the full text of this document
                 this_sentence = Sentence(text,
                                          offset=doc_offset,
                                          sid=sid,
                                          did=did)
                 doc_offset = len(doctext)
                 doc_sentences.append(this_sentence)
             #logging.info(len(doc_sentences))
             newdoc = Document(doctext, process=False, did=did)
             newdoc.sentences = doc_sentences[:]
             newdoc.process_document(corenlpserver, "biomedical")
             #logging.info(len(newdoc.sentences))
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             logging.info("%s sentences, %ss processing time" %
                          (len(newdoc.sentences), abs_time))
     abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #14
 def load_corpus(self, corenlpserver, process=True):
     total_lines = sum(1 for line in open(self.path))
     time_per_abs = []
     with codecs.open(self.path, 'r', "utf-8") as trainfile:
         current = 0
         ddi = ""
         for line in trainfile:
             #logging.debug('%s:%s/%s', f, current + 1, total)
             if line.startswith("ID"):
                 did = line.strip().split("\t")[1]
                 print did
             elif line.startswith("sentence"):
                 doctext = line.strip().split("\t")[1]
                 newdoc = Document(doctext, process=False, did=did)
                 sid = did + ".s0"
                 newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
                 if process:
                     newdoc.process_document(corenlpserver)
                 self.documents[newdoc.did] = newdoc
Example #15
 def load_corpus(self, corenlpserver, process=True):
     """Load the CHEMDNER corpus file on the dir element"""
     # open filename and parse lines
     total_lines = sum(1 for line in open(self.path))
     widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
     pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start()
     n_lines = 1
     time_per_abs = []
     with codecs.open(self.path, 'r', "utf-8") as inputfile:
         for line in inputfile:
             t = time.time()
             # each line is PMID  title   abs
             tsv = line.split('\t')
             doctext = tsv[2].strip().replace("<", "(").replace(">", ")")
             newdoc = Document(doctext, process=False,
                               did=tsv[0], title=tsv[1].strip())
             newdoc.sentence_tokenize("biomedical")
             if process:
                 newdoc.process_document(corenlpserver, "biomedical")
             self.documents[newdoc.did] = newdoc
             abs_time = time.time() - t
             time_per_abs.append(abs_time)
             pbar.update(n_lines)
             n_lines += 1
     pbar.finish()
     abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #16
 def load_corpus(self, corenlpserver, process=True):
     # self.path is the base directory of the files of this corpus
     trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
     total = len(trainfiles)
     widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
     pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
     time_per_abs = []
     for current, f in enumerate(trainfiles):
         #logging.debug('%s:%s/%s', f, current + 1, total)
         print '{}:{}/{}'.format(f, current + 1, total)
         did = f.split(".")[0].split("/")[-1]
         t = time.time()
         with io.open(f, 'r', encoding='utf8') as txt:
             doctext = txt.read()
         newdoc = Document(doctext, process=False, did=did)
         newdoc.sentence_tokenize("biomedical")
         if process:
             newdoc.process_document(corenlpserver, "biomedical")
         self.documents[newdoc.did] = newdoc
         abs_time = time.time() - t
         time_per_abs.append(abs_time)
         #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
         pbar.update(current+1)
     pbar.finish()
     abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
     logging.info("average time per abstract: %ss" % abs_avg)
Example #17
 def add_more_sentences(self, corpuspath):
     """
     Load sentences with relations from another corpus
     :param corpuspath: corpus path
     :return:
     """
     corpus2 = pickle.load(open(corpuspath, 'rb'))
     for did in corpus2.documents:
         for sentence in corpus2.documents[did].sentences:
             if any([len(e.targets) > 1 for e in sentence.entities.elist["goldstandard"]]):
                 print "found sentence with relations:", sentence.sid
                 self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
     self.save("corpora/Thaliana/seedev-extended.pickle")
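A hedged usage sketch: load a pickled corpus and pull in relation-bearing sentences from a second one (both pickle paths are invented):

    # Illustrative call; both pickle paths are invented
    base_corpus = pickle.load(open("corpora/Thaliana/seedev.pickle", 'rb'))
    base_corpus.add_more_sentences("corpora/Thaliana/more-relations.pickle")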
Example #18
 def generate_corpus(self, text):
     """
     Create a corpus object from the input text.
     :param text:
     :return:
     """
     test_corpus = Corpus("")
     newdoc = Document(text, process=False, did="d0", title="Test document")
     newdoc.sentence_tokenize("biomedical")
     newdoc.process_document(self.corenlp, "biomedical")
     test_corpus.documents["d0"] = newdoc
     return test_corpus
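This helper is handy for tagging ad-hoc text; a sketch, assuming `self` above belongs to some pipeline object with its `corenlp` client already set (the variable name and input text are invented):

    # Illustrative call to generate_corpus; "pipeline" is an invented name
    test_corpus = pipeline.generate_corpus(u"Aspirin inhibits cyclooxygenase.")
    print test_corpus.documents["d0"].sentences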