Exemple #1
0
	def prepareTrain(self, corpus, typeCorpus, fileRes, tr=-1, extOption=-1, optsvm=True):
		"""
		Prepare CRF training data
		
		Parameters
		----------
		corpus : Corpus
		typeCorpus : int, {1, 2, 3}
			1 : corpus 1, 2 : corpus 2...
		fileRes : string
			output file name
		tr : int, {1, 0, -1, -2} (default -1)
			check if training or test data
			1 : train, 0 : test without label, -1 : test with label, -2 : test only label 
		extOption : int, {-1, 1, ...} (default -1)
			extra option for crf training/test data format
			check if data is internal data, if yes we'll use a modified index for corpus type 2
			-1 : data format for SVM 
			1 : data format for normal CRF training/test data 
			2-5 : (not yet provided)
		"""	
		listReferences = corpus.getListReferences(typeCorpus)
		newListReferences = ListReferences(listReferences, typeCorpus)
		extractor = Extract_crf(self.options)
		nbRef = corpus.nbReference(typeCorpus)

		'generation of training index for each reference'
		extractor.randomgen(newListReferences, 1)
				
		'if corpus type 2 and extOption=1, we use a modified index list' #!!!!!!!!!!
		if typeCorpus == 2 and extOption == 1:
			'modify the indices to eliminate the reference (or not print the reference) classified as non-bibl BY SVM'
			if optsvm == True : #if not, do not modify
				extractor.extractIndices(self.dirResult+"svm_predictions_training", newListReferences)
			extractor.extract(typeCorpus, nbRef, self.dirResult+fileRes, newListReferences, tr, extOption)
			
		else: # typeCorpus == 1 or (typeCorpus == 2 and isFrstExt == -1)
			########## SOURCE DATA EXTRACTION FOR SVM OR CORPUS 1 (BUT THESE ARE DIFFERENT !!!)
			extractor.extract(typeCorpus, nbRef, self.dirResult+fileRes, newListReferences, tr, extOption)
		
		return
Exemple #2
0
	def prepareTest(self, corpus, typeCorpus, indiceSvm = 0):
		"""
		Prepare CRF test data
		
		Parameters
		----------
		corpus : Corpus
		typeCorpus : int, {1, 2, 3}
			1 : corpus 1, 2 : corpus 2...
		indiceSvm : int, {0, -1, 2}
			0 : normal(corpus 1)
			-1 : data04SVM (corpus2),
			2 : external data => svm isn't called
		"""
		listReferences = corpus.getListReferences(typeCorpus)
		listReferencesObj = ListReferences(listReferences, typeCorpus)
		
		extractor = Extract_crf(self.options)
		nbRef = corpus.nbReference(typeCorpus)
		
		'generation of test index for each reference'
		extractor.randomgen(ListReferences(listReferencesObj.getReferences(),typeCorpus), 0)
		
		if indiceSvm == -1:
			extractor.extract(typeCorpus, nbRef, self.dirResult+"data04SVM_ori.txt", ListReferences(listReferencesObj.getReferences(),typeCorpus))
		else: 
			'file for CRF training'
			if typeCorpus == 2 and indiceSvm != 2 :
				extractor.extractIndices4new(self.dirResult+"svm_predictions_new", ListReferences(listReferencesObj.getReferences(),typeCorpus))
			
			extractor.extract(typeCorpus, nbRef, self.dirResult+"testdatawithlabel_CRF.txt",ListReferences(listReferencesObj.getReferences(),typeCorpus), -1, 1)			
			extractor.extract(typeCorpus, nbRef, self.dirResult+"testdata_CRF.txt",ListReferences(listReferencesObj.getReferences(),typeCorpus), 0, 1)

		return ListReferences(listReferencesObj.getReferences(),typeCorpus)