Example #1
    def load_corpus(self, corenlpserver, process=True):
        # self.path is the base directory of the files of this corpus
        # (assumes module-level imports: os, time, logging, progressbar as pb,
        # and the project's Document class)
        trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
        total = len(trainfiles)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
        time_per_abs = []
        for current, f in enumerate(trainfiles):
            print('{}:{}/{}'.format(f, current + 1, total))
            # document id: the file path minus the extension
            did = f.split(".")[0]
            t = time.time()
            with open(f, 'r') as txt:
                doctext = txt.read()
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentence_tokenize("biomedical")
            if process:
                newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            pbar.update(current + 1)
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #2
    def load_corpus(self, corenlpserver, process=True):
        trainfiles = [self.path + '/' + f for f in os.listdir(self.path)]
        total = len(trainfiles)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
        time_per_abs = []
        for current, f in enumerate(trainfiles):
            print('{}:{}/{}'.format(f, current + 1, total))
            did = f  # the file path doubles as the document id
            t = time.time()
            # do not shadow the loop variable f with the file handle
            with open(f, 'r') as xml_file:
                article = "<Article>" + xml_file.read() + "</Article>"
            soup = BeautifulSoup(article, 'xml')
            title = soup.ArticleTitle.get_text()
            abstract = soup.AbstractText.get_text()
            doc_text = title + " " + abstract

            newdoc = Document(doc_text, process=False, did=did)
            newdoc.sentence_tokenize("biomedical")
            # note: this variant ignores the process flag and always processes
            newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
            pbar.update(current + 1)
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #3
    def load_corpus(self, corenlpserver, process=True):
        # self.path is the base directory of the files of this corpus
        trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
        total = len(trainfiles)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
        time_per_abs = []
        for current, f in enumerate(trainfiles):
            print('{}:{}/{}'.format(f, current + 1, total))
            # document id: the file name without directory or extension
            did = f.split(".")[0].split("/")[-1]
            t = time.time()
            with codecs.open(f, 'r', 'utf-8') as txt:
                doctext = txt.read()
            doctext = doctext.replace("\n", " ")
            newdoc = Document(doctext, process=False, did=did)
            newdoc.sentence_tokenize("biomedical")
            if process:
                newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            abs_time = time.time() - t
            time_per_abs.append(abs_time)
            pbar.update(current + 1)
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #4
    def load_corpus(self, corenlpserver, process=True):
        """Load the CHEMDNER corpus file located at self.path."""
        # count the lines first so the progress bar knows the total
        with open(self.path) as countfile:
            total_lines = sum(1 for line in countfile)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start()
        n_lines = 1
        time_per_abs = []
        with io.open(self.path, 'r', encoding="utf-8") as inputfile:
            for line in inputfile:
                t = time.time()
                # each line is PMID <tab> title <tab> abstract
                tsv = line.split('\t')
                # replace angle brackets to avoid markup issues downstream
                doctext = tsv[1].strip().replace("<", "(").replace(">", ")") + " "
                doctext += tsv[2].strip().replace("<", "(").replace(">", ")")
                newdoc = Document(doctext, process=False,
                                  did=tsv[0], title=tsv[1].strip() + ".")
                newdoc.sentence_tokenize("biomedical")
                if process:
                    newdoc.process_document(corenlpserver, "biomedical")
                self.documents[newdoc.did] = newdoc
                time_per_abs.append(time.time() - t)
                pbar.update(n_lines)
                n_lines += 1
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
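For reference, each line consumed by this loader is a tab-separated record of PMID, title, and abstract, as the inline comment notes. A minimal illustration of what the parsing step expects; the PMID and text are invented for demonstration, not a real CHEMDNER record:

# Illustrative only -- the values below are made up.
line = u"23456789\tAspirin and platelet function.\tWe studied aspirin in vivo."
pmid, title, abstract = line.split("\t")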
Example #5
    def load_corpus(self, corenlpserver, process=True):
        """Load the CHEMDNER corpus file located at self.path."""
        # count the lines first so the progress bar knows the total
        with open(self.path) as countfile:
            total_lines = sum(1 for line in countfile)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start()
        n_lines = 1
        time_per_abs = []
        with io.open(self.path, 'r', encoding="utf-8") as inputfile:
            for line in inputfile:
                t = time.time()
                # each line is PMID <tab> title <tab> abstract
                tsv = line.split('\t')
                # replace angle brackets to avoid markup issues downstream, and
                # rewrite sentence-final periods inside the title so the
                # tokenizer keeps the title as a single sentence
                doctext = tsv[1].strip().replace("<", "(").replace(">", ")").replace(". ", ", ") + ". "
                doctext += tsv[2].strip().replace("<", "(").replace(">", ")")
                newdoc = Document(doctext, process=False,
                                  did=tsv[0], title=tsv[1].strip() + ".")
                newdoc.sentence_tokenize("biomedical")
                if process:
                    newdoc.process_document(corenlpserver, "biomedical")
                self.documents[newdoc.did] = newdoc
                time_per_abs.append(time.time() - t)
                pbar.update(n_lines)
                n_lines += 1
        pbar.finish()
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss" % abs_avg)
Example #6
    def generate_corpus(self, text):
        """
        Create a corpus object from the input text.
        :param text: raw text of the document to be processed
        :return: a Corpus containing a single processed Document
        """
        test_corpus = Corpus("")
        newdoc = Document(text, process=False, did="d0", title="Test document")
        newdoc.sentence_tokenize("biomedical")
        newdoc.process_document(self.corenlp, "biomedical")
        test_corpus.documents["d0"] = newdoc
        return test_corpus
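A hypothetical usage sketch: `pipeline` stands in for an instance of whatever class defines generate_corpus, with a running CoreNLP client in its corenlp attribute; neither name is confirmed by the source.

# `pipeline` is a stand-in for an object exposing generate_corpus().
corpus = pipeline.generate_corpus("Aspirin inhibits platelet aggregation.")
doc = corpus.documents["d0"]
for sentence in doc.sentences:
    print(sentence.text)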
Example #7
    def create_sentences(self, doctag, text):
        # Create sentence entries based on the text of document doctag
        cur = self.db_conn.cursor()
        newdoc = Document(text, process=False, did=doctag)
        newdoc.sentence_tokenize("biomedical")
        for sentence in newdoc.sentences:
            corenlpres = sentence.process_sentence(self.corenlp)
            query = """INSERT INTO sentence(senttag, doctag, senttext, sentoffset, corenlp)
                       VALUES (%s, %s, %s, %s, %s);"""
            try:
                cur.execute(query, (sentence.sid, doctag, sentence.text.encode("utf8"),
                                    sentence.offset, str(corenlpres).encode("utf8")))
                self.db_conn.commit()
            except MySQLdb.MySQLError as e:
                # undo the failed insert and log the error
                self.db_conn.rollback()
                logging.debug(e)
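The INSERT above presumes an existing sentence table with five columns. The source does not show the schema, but a minimal sketch consistent with the query could look like this; the column types are assumptions, only the names come from the statement itself.

def ensure_sentence_table(db_conn):
    # Column names are taken from the INSERT in create_sentences();
    # the types are guesses and may differ from the real schema.
    cur = db_conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS sentence (
            senttag    VARCHAR(100),
            doctag     VARCHAR(100),
            senttext   TEXT,
            sentoffset INT,
            corenlp    LONGTEXT
        );""")
    db_conn.commit()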