Example #1
0
	def formatTrain(self):
		"""
		Prepare the CRF evaluation data from the corpus in self.dirCorpus.

		Sets self.bilboOptions.T to True and self.bilboOptions.L to False
		(presumably "train" on / "label" off -- TODO confirm against the
		option parser), extracts the type-1 references tagged with
		self.corpusTag, and passes them to crf.prepareTrain together with
		the file name "evaldata_CRF.txt".
		"""
		self.bilboOptions.T = True
		self.bilboOptions.L = False

		# Bilbo is rooted at self.dirResult (presumably where its temporary
		# files are written -- verify in Bilbo.__init__).
		trainer = Bilbo(
			self.dirResult,
			self.bilboOptions,
			"crf_model_simple"
		)

		references = Corpus(self.dirCorpus, self.bilboOptions)
		references.extract(1, self.corpusTag)

		# CRF training data extraction
		trainer.crf.prepareTrain(references, 1, "evaldata_CRF.txt", 1, 1)
Example #2
0
File: Bilbo.py Project: Unaah/bilbo
    def annotate(self, dirCorpus, dirModel, typeCorpus, external=0):
        """
        Automatically annotate the references found in a corpus directory.

        The corpus file list is handed to self._list_split with the value 50
        (presumably producing batches of at most 50 files -- TODO confirm),
        and every resulting batch is annotated in turn. After each batch
        corpus.deleteAllFiles() is called, and self.deleteTmpFiles() is
        called once all batches are done.

        Parameters
        ----------
        dirCorpus : string
            directory where the references to be annotated are
        dirModel : string
            directory where the learned CRF model and SVM model have been saved
        typeCorpus : int, {1, 2, 3}
            1 : corpus 1, 2 : corpus 2...
            Only 1 and 2 trigger an annotation here; for any other value
            the batch is skipped but corpus.deleteAllFiles() still runs.
        external : int, {1, 0}
            1 : if the references are external data (not from CLEO),
            0 : if they come from CLEO.
            Only forwarded to annotateCorpus2; per the original author it
            decides whether an SVM classification is called or not.
        """
        corpus = Corpus(dirCorpus, self.options)
        self.crf.setDirModel(dirModel)

        batches = self._list_split(corpus.getFiles(), 50)
        for batch in batches:
            # Each annotateCorpusN call returns the corpus object to keep using.
            if typeCorpus == 2:
                corpus = self.annotateCorpus2(dirModel, corpus, batch, external)
            elif typeCorpus == 1:
                corpus = self.annotateCorpus1(dirModel, corpus, batch)
            corpus.deleteAllFiles()

        self.deleteTmpFiles()
Example #3
0
	def annotate(self):
		"""
		Annotate every partition and prepare its CRF evaluation data.

		For each directory in self.dirPartitions, the five working
		directories are obtained from self.partitions.getDirTestNames and
		two steps are run:

		1. the data in the annotate directory is annotated with the model
		   stored in the model directory (corpus type 1);
		2. the references tagged "bibl" in the test directory are extracted
		   and passed to crf.prepareTrain with the file name
		   "evaldata_CRF.txt".
		"""
		for partitionDir in self.dirPartitions:
			dirNames = self.partitions.getDirTestNames(partitionDir)
			toAnnotateDir, taggedTestDir, evalTrainDir, crfModelDir, outputDir = dirNames

			# Step 1: annotation of the test data stripped of its tags
			self._setBilboAnnotate()
			self._del_tmp_file(outputDir)
			annotator = Bilbo(outputDir, self.bilboOptions, "crf_model_simple")
			annotator.annotate(toAnnotateDir, crfModelDir, 1)

			# Step 2: "train" on the tagged test data to get evaluation data
			self._setBilboTrain()
			self._del_tmp_file(evalTrainDir)
			# Bilbo is rooted at the train directory here (presumably where
			# its temporary files are saved -- verify in Bilbo.__init__).
			evaluator = Bilbo(evalTrainDir, self.bilboOptions, "crf_model_simple")
			taggedCorpus = Corpus(taggedTestDir, self.bilboOptions)
			taggedCorpus.extract(1, "bibl")
			# CRF training data extraction
			evaluator.crf.prepareTrain(taggedCorpus, 1, "evaldata_CRF.txt", 1, 1)
Example #4
0
File: Bilbo.py Project: Unaah/bilbo
    def train(self, dirCorpus, dirModel, typeCorpus):
        """
        Learn the CRF model (corpus 1 and 2) and the SVM model (corpus 2).

        A Corpus object is built on dirCorpus, the CRF model directory is
        set to dirModel, and the training steps matching typeCorpus are
        run. self.deleteTmpFiles() is called at the end whatever the
        value of typeCorpus.

        Parameters
        ----------
        dirCorpus : string
            directory where training references (notes) are
        dirModel : string
            directory where CRF and SVM models are saved
        typeCorpus : int, {1, 2, 3}
            type of corpus
            1 : corpus 1, 2 : corpus 2...
            Only 1 and 2 trigger a training in this method.
        """
        crfDataFile = "trainingdata_CRF.txt"
        wapitiDataFile = "trainingdata_CRF_Wapiti.txt"

        corpus = Corpus(dirCorpus, self.options)
        self.crf.setDirModel(dirModel)

        # Single-argument parenthesised prints produce the same output as
        # the Python 2 print statement used throughout this project.
        if typeCorpus == 1:
            print("Extract references...")
            corpus.extract(1, "bibl")

            print("crf training data extraction...")
            self.crf.prepareTrain(corpus, 1, crfDataFile, 1, 1)
            # CRF model learning
            self.crf.runTrain(dirModel, wapitiDataFile, self.crfmodelname)

        elif typeCorpus == 2:
            print("Extract notes...")
            corpus.extract(2, "note")

            optsvm = self.options.s
            # Explicit comparison kept: only values equal to True enable
            # the SVM steps.
            if optsvm == True:
                # Source data extraction for SVM note classification
                print("svm source data extraction...")
                self.crf.prepareTrain(corpus, 2, "data04SVM_ori.txt", 1)

                # Training data extraction for SVM note classification
                print("svm training data extraction...")
                self.svm.prepareTrain(corpus)

                # SVM model learning
                print("svm training...")
                self.svm.runTrain(dirModel)

            print("crf training data extraction...")
            self.crf.prepareTrain(corpus, 2, crfDataFile, 1, 1, optsvm)
            # CRF model learning. The original author noted that training a
            # second model on "trainingdata_CRF_nega_Wapiti.txt" did not
            # work because the data was too homogeneous.
            self.crf.runTrain(dirModel, wapitiDataFile, self.crfmodelname)
            print("")  # blank line on the output

        self.deleteTmpFiles()