def extractAndCleanCorpus(self):
    """Strip the corpus tag from each corpus file and save the result.

    Every path inside ``self.dirCorpus`` that matches the glob ``*xml`` is
    read as UTF-8, passed to ``FormatEval.strip_tags`` together with
    ``self.corpusTag``, and the returned value is handed to
    ``self._saveFile`` along with ``self.dirLabel`` and the base name of
    the source file.
    """
    # NOTE(review): the pattern has no dot, so it matches any name ending
    # in "xml", not only "*.xml" -- confirm this is intended.
    pattern = os.path.join(self.dirCorpus, "*xml")
    for path in glob.glob(pattern):
        with open(path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        cleaned = FormatEval.strip_tags(raw, self.corpusTag)
        self._saveFile(cleaned, self.dirLabel, os.path.basename(path))
def partition(self):
    """Create the partition folders, then generate the evaluation files.

    Calls ``self.createPartitionFolders`` with the corpus directory, test
    percentage and partition count held on the instance, collects the
    result of ``FormatEval.get_list_of_tag_from_dir`` for the corpus
    directory, and passes everything on to ``self.createEvaluationfiles``.
    """
    self.createPartitionFolders(
        self.dirCorpus, self.testPercentage, self.numberOfPartition
    )
    tagged_entries = FormatEval.get_list_of_tag_from_dir(self.dirCorpus)

    # Original author's outline of the intended algorithm:
    #   - build a list of every bibl in the files: [(file_name, bibl_index)]
    #   - shuffle that list (label/train)
    #   - sort the list by file
    #   - for each file, delete the bibl entries that are not in the index
    self.createEvaluationfiles(
        self.dirCorpus,
        self.testPercentage,
        self.numberOfPartition,
        tagged_entries,
    )
def createEvaluationfiles(self, dirCorpus, testPercentage, numberOfPartition, allBibl):
    """Write train, clean-test and evaluation files for every partition.

    For each directory returned by ``self.getDirPartitionNames`` the corpus
    in ``allBibl`` is split with ``FormatEval.getShuffledCorpus`` and three
    files are written through ``self.saveListToFile``:

    * ``train.xml`` in the train directory (the train part),
    * ``test_clean.xml`` in the annotate directory (the test part after
      ``FormatEval.stripTags``),
    * ``test.xml`` in the test directory (the test part joined with
      newlines and passed through ``FormatEval.getBiblList``).

    ``dirCorpus`` and ``numberOfPartition`` are accepted but not read here.

    NOTE(review): a second ``createEvaluationfiles`` is defined right after
    this one. If both live in the same class, the later definition rebinds
    the name and this version is never called -- confirm and remove one.
    """
    for partitionDir in self.getDirPartitionNames():
        # The 4th and 5th entries of the tuple are not needed here.
        annotateDir, testDir, trainDir, _modelDir, _ = self.getDirTestNames(partitionDir)
        heldOut, training = FormatEval.getShuffledCorpus(allBibl, testPercentage)

        trainPath = os.path.join(trainDir, 'train.xml')
        self.saveListToFile(training, trainPath)

        stripped = FormatEval.stripTags(heldOut)
        cleanPath = os.path.join(annotateDir, 'test_clean.xml')
        self.saveListToFile(stripped, cleanPath)

        # Per the original author: test.xml needs <bibl> duplicated inside
        # <bibl> so that evaluation sees the same data, because Bilbo does
        # not format the "same" data equally between train and annotation.
        evalPath = os.path.join(testDir, 'test.xml')
        evalEntries = FormatEval.getBiblList("\n".join(heldOut))
        self.saveListToFile(evalEntries, evalPath)
def createEvaluationfiles(self, dirCorpus, testPercentage, numberOfPartition, bibl_list):
    """Populate the train, test and annotate folders of every partition.

    For each directory returned by ``self.getDirPartitionNames``,
    ``bibl_list`` is split with ``FormatEval.getShuffledCorpus`` and
    ``FormatEval.copy_files_for_eval`` is called three times, always with
    ``self.dirCorpus`` as the source:

    * train part -> train directory,
    * test part -> test directory,
    * test part -> annotate directory, with ``'bibl'`` and ``strip=True``.

    NOTE(review): the ``dirCorpus`` and ``numberOfPartition`` parameters
    are not read; the source directory comes from ``self.dirCorpus``.
    Confirm whether the parameter was meant to be used instead.
    """
    for partitionDir in self.getDirPartitionNames():
        # The 4th and 5th entries of the tuple are not needed here.
        annotateDir, testDir, trainDir, _modelDir, _ = self.getDirTestNames(partitionDir)
        heldOut, training = FormatEval.getShuffledCorpus(bibl_list, testPercentage)

        # Files used for training (the part of the corpus not held out).
        FormatEval.copy_files_for_eval(self.dirCorpus, trainDir, training)

        # Files used for evaluation, annotations kept.
        FormatEval.copy_files_for_eval(self.dirCorpus, testDir, heldOut)

        # Files used for evaluation with the annotations stripped; per the
        # original author these are the ones Bilbo will label.
        FormatEval.copy_files_for_eval(
            self.dirCorpus, annotateDir, heldOut, 'bibl', strip=True
        )
def getAndSaveAllBibl(self, dirCorpus):
    """Collect the bibl entries of a directory, save them, and return them.

    The value returned by ``FormatEval.getBiblFromDir(dirCorpus)`` is
    written through ``self.saveListToFile`` to ``all_bibl.xml`` inside the
    directory given by ``self.getDirEvalName()``, then returned unchanged.
    """
    entries = FormatEval.getBiblFromDir(dirCorpus)
    target = os.path.join(self.getDirEvalName(), 'all_bibl.xml')
    self.saveListToFile(entries, target)
    return entries