Beispiel #1
0
    def load_corpus(self, corenlpserver):
        """Read every file under ``self.path`` and store it as a processed document.

        Each directory entry whose name does not end in ``~`` is read in full
        and wrapped in a ``Document`` (created with ``process=False``) whose
        ``did`` is the file's base name and whose title is ``"titulo_"`` plus
        that base name. The document is then processed with
        ``process_document(corenlpserver, "biomedical")``.

        After processing, the sentences are scanned in order:

        * every sentence from a ``[start section id="X"]`` marker through the
          matching ``[end section id="X"]`` marker (both markers included),
          for any ``X`` in ``self.invalid_sections``, has its ``sid`` recorded
          in ``newdoc.invalid_sids``;
        * every sentence that is wrapped in square brackets or is title-cased
          has its ``sid`` appended to ``newdoc.title_sids``.

        The document is finally stored in ``self.documents`` keyed by its
        ``did``.

        :param corenlpserver: passed through unchanged to
            ``Document.process_document``.
        """
        # self.path is the base directory of the files of this corpus.
        # os.path.join keeps the paths correct whether or not self.path ends
        # with a separator; names ending in '~' are skipped.
        trainfiles = [os.path.join(self.path, f)
                      for f in os.listdir(self.path) if not f.endswith('~')]

        # The marker texts depend only on self.invalid_sections, so build them
        # once instead of twice per sentence.
        start_markers = {'[start section id="{}"]'.format(section)
                         for section in self.invalid_sections}
        end_markers = {'[end section id="{}"]'.format(section)
                       for section in self.invalid_sections}

        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(trainfiles)).start()
        for i, openfile in enumerate(trainfiles):
            file_name = os.path.basename(openfile)
            with open(openfile, 'r') as inputfile:
                newdoc = Document(inputfile.read(), process=False, did=file_name,
                                  title="titulo_" + file_name)
            # process_document calls the tokenizer
            newdoc.process_document(corenlpserver, "biomedical")

            valid = True
            invalid_sids = []
            for s in newdoc.sentences:
                if s.text in start_markers:
                    valid = False
                # Recorded before the end-marker check below, so the end
                # marker sentence itself is still counted as invalid.
                if not valid:
                    invalid_sids.append(s.sid)
                if s.text in end_markers:
                    valid = True
                if (s.text.startswith("[") and s.text.endswith("]")) or s.text.istitle():
                    newdoc.title_sids.append(s.sid)
            newdoc.invalid_sids = invalid_sids
            logging.debug("invalid sentences: {}".format(invalid_sids))
            logging.debug("title sentences: {}".format(newdoc.title_sids))
            self.documents[newdoc.did] = newdoc
            pbar.update(i+1)
        # Close the progress bar so later output starts on a fresh line.
        pbar.finish()