def formatTrain(self):
    """
    Extract CRF evaluation data from the corpus held in self.dirCorpus.

    Switches the shared Bilbo options to T = True / L = False, builds a
    Bilbo instance on self.dirResult, extracts the type-1 references
    tagged with self.corpusTag and passes them to the CRF data
    preparation step with the file name "evaldata_CRF.txt".
    """
    options = self.bilboOptions
    # NOTE(review): T / L are presumably the train / label switches -- confirm.
    options.T = True
    options.L = False

    # Original author's note: "To save tmpFiles in testDir".
    bilbo = Bilbo(self.dirResult, options, "crf_model_simple")

    corpus = Corpus(self.dirCorpus, options)
    corpus.extract(1, self.corpusTag)

    # CRF training data extraction
    bilbo.crf.prepareTrain(corpus, 1, "evaldata_CRF.txt", 1, 1)
def annotate(self, dirCorpus, dirModel, typeCorpus, external=0):
    """
    Automatically annotate the references found in a corpus directory.

    Parameters
    ----------
    dirCorpus : string
        directory containing the references to annotate
    dirModel : string
        directory where the learned CRF model and SVM model have been saved
    typeCorpus : int, {1, 2, 3}
        1 : corpus 1, 2 : corpus 2...
        Only 1 and 2 trigger an annotation here; for any other value the
        loop just cleans the corpus files.
    external : int, {1, 0}
        1 : the references are external data (not CLEO), 0 : CLEO data.
        Used to decide whether an SVM classification is called; it is
        only forwarded for corpus 2.
    """
    corpus = Corpus(dirCorpus, self.options)
    self.crf.setDirModel(dirModel)

    # The file list is split by _list_split(..., 50) and handled piece by
    # piece (presumably chunks of 50 files -- confirm against _list_split).
    chunks = self._list_split(corpus.getFiles(), 50)
    for chunk in chunks:
        if typeCorpus == 2:
            corpus = self.annotateCorpus2(dirModel, corpus, chunk, external)
        elif typeCorpus == 1:
            corpus = self.annotateCorpus1(dirModel, corpus, chunk)
        # Corpus files are dropped after every chunk, annotated or not.
        corpus.deleteAllFiles()

    self.deleteTmpFiles()
def annotate(self):
    """
    Annotate every partition, then extract its CRF evaluation data.

    For each entry of self.dirPartitions the directory names are obtained
    from self.partitions.getDirTestNames and two steps are run:

    1. annotation of the tag-stripped test data, written under the
       result directory using the model directory;
    2. extraction of the "bibl" references of the test directory into
       "evaldata_CRF.txt" through a Bilbo instance built on the train
       directory.
    """
    for partition in self.dirPartitions:
        dirNames = self.partitions.getDirTestNames(partition)
        annotateDir, testDir, trainDir, modelDir, resultDir = dirNames

        # Step 1: annotation of the test data stripped of its tags
        self._setBilboAnnotate()
        self._del_tmp_file(resultDir)
        engine = Bilbo(resultDir, self.bilboOptions, "crf_model_simple")
        engine.annotate(annotateDir, modelDir, 1)

        # Step 2: train with test data for evaluation
        self._setBilboTrain()
        self._del_tmp_file(trainDir)
        # Original author's note: "To save tmpFiles in testDir".
        engine = Bilbo(trainDir, self.bilboOptions, "crf_model_simple")
        testCorpus = Corpus(testDir, self.bilboOptions)
        testCorpus.extract(1, "bibl")
        # CRF training data extraction
        engine.crf.prepareTrain(testCorpus, 1, "evaldata_CRF.txt", 1, 1)
def train(self, dirCorpus, dirModel, typeCorpus):
    """
    Learn the CRF model (corpus 1 and 2) and the SVM model (corpus 2).

    A Corpus object is declared on dirCorpus and the CRF model directory
    is set before any extraction starts.

    Parameters
    ----------
    dirCorpus : string
        directory containing the training references (notes)
    dirModel : string
        directory where the CRF and SVM models are saved
    typeCorpus : int, {1, 2, 3}
        type of corpus
        1 : corpus 1, 2 : corpus 2...
        Values other than 1 and 2 only trigger the temporary file cleanup.
    """
    # Progress messages use the parenthesised single-argument print form,
    # which prints exactly the same text under Python 2's print statement.
    corpus = Corpus(dirCorpus, self.options)
    self.crf.setDirModel(dirModel)

    crfDataFile = "trainingdata_CRF.txt"
    wapitiDataFile = "trainingdata_CRF_Wapiti.txt"

    if typeCorpus == 1:
        print("Extract references...")
        corpus.extract(1, "bibl")

        print("crf training data extraction...")
        # CRF training data extraction
        self.crf.prepareTrain(corpus, 1, crfDataFile, 1, 1)
        # CRF model learning
        self.crf.runTrain(dirModel, wapitiDataFile, self.crfmodelname)

    elif typeCorpus == 2:
        print("Extract notes...")
        corpus.extract(2, "note")

        useSvm = self.options.s
        # Equality test kept on purpose: only values equal to True enable
        # the SVM steps.
        if useSvm == True:
            print("svm source data extraction...")
            # Source data extraction for SVM note classification
            self.crf.prepareTrain(corpus, 2, "data04SVM_ori.txt", 1)
            print("svm training data extraction...")
            # Training data extraction for SVM note classification
            self.svm.prepareTrain(corpus)
            print("svm training...")
            # SVM model learning
            self.svm.runTrain(dirModel)

        print("crf training data extraction...")
        # CRF training data extraction
        self.crf.prepareTrain(corpus, 2, crfDataFile, 1, 1, useSvm)
        # CRF model learning
        self.crf.runTrain(dirModel, wapitiDataFile, self.crfmodelname)
        # A second run on "trainingdata_CRF_nega_Wapiti.txt" (model
        # "revueswapiti_nega", 0.0000001) was disabled by the original
        # author: it does not work, the data is too homogeneous.
        print("")

    self.deleteTmpFiles()