def prepareTrain(self, corpus, typeCorpus, fileRes, tr=-1, extOption=-1, optsvm=True):
    """
    Write the CRF training data of one corpus type to a result file.

    The references of `corpus` are wrapped in a ListReferences, a training
    index is generated for them, and the extractor writes the result to
    ``self.dirResult + fileRes``.

    Parameters
    ----------
    corpus : Corpus
        Source of the references (queried through getListReferences and
        nbReference).
    typeCorpus : int, {1, 2, 3}
        1 : corpus 1, 2 : corpus 2...
    fileRes : string
        Output file name, appended to self.dirResult.
    tr : int, {1, 0, -1, -2} (default -1)
        Forwarded to the extractor to select training or test output.
        1 : train, 0 : test without label, -1 : test with label,
        -2 : test only label
    extOption : int, {-1, 1, ...} (default -1)
        Extra option for the CRF training/test data format, forwarded to
        the extractor.
        -1 : data format for SVM
        1 : data format for normal CRF training/test data
        2-5 : (not yet provided)
    optsvm : bool (default True)
        Only consulted when typeCorpus == 2 and extOption == 1. When it
        equals True, the indices are first modified from the
        "svm_predictions_training" file so that references classified as
        non-bibl by the SVM are not printed; otherwise the indices are
        left untouched.

    Returns
    -------
    None
    """
    references = ListReferences(corpus.getListReferences(typeCorpus), typeCorpus)
    extractor = Extract_crf(self.options)
    nbRef = corpus.nbReference(typeCorpus)

    # Generate the training index for each reference.
    extractor.randomgen(references, 1)

    # For corpus type 2 with extOption == 1, a modified index list is used
    # unless the SVM filtering was switched off by the caller.
    # The explicit comparison with True is kept on purpose so that non-bool
    # arguments are treated exactly as before.
    filter_with_svm = typeCorpus == 2 and extOption == 1 and optsvm == True  # noqa: E712
    if filter_with_svm:
        extractor.extractIndices(self.dirResult + "svm_predictions_training", references)

    # The extraction call itself is the same for every corpus type; only the
    # index preparation above differs.
    extractor.extract(typeCorpus, nbRef, self.dirResult + fileRes, references, tr, extOption)
    return None
def prepareTest(self, corpus, typeCorpus, indiceSvm = 0):
    """
    Write the CRF test data of one corpus type to the result directory.

    A test index is generated for the references of `corpus`, then the
    extractor writes either the SVM input file or the two CRF test files
    (with and without labels) under self.dirResult.

    Parameters
    ----------
    corpus : Corpus
        Source of the references (queried through getListReferences and
        nbReference).
    typeCorpus : int, {1, 2, 3}
        1 : corpus 1, 2 : corpus 2...
    indiceSvm : int, {0, -1, 2} (default 0)
        0 : normal (corpus 1)
        -1 : data04SVM (corpus 2); only "data04SVM_ori.txt" is written
        2 : external data => the SVM predictions are not applied

    Returns
    -------
    ListReferences
        A new ListReferences wrapping the references of this corpus type.
    """
    wrapped = ListReferences(corpus.getListReferences(typeCorpus), typeCorpus)
    extractor = Extract_crf(self.options)
    nbRef = corpus.nbReference(typeCorpus)

    def new_wrapper():
        # Every extractor call receives its own ListReferences object built
        # from the same underlying references.
        return ListReferences(wrapped.getReferences(), typeCorpus)

    # Generate the test index for each reference.
    extractor.randomgen(new_wrapper(), 0)

    if indiceSvm == -1:
        # Data destined for the SVM: a single file, default extractor options.
        extractor.extract(typeCorpus, nbRef, self.dirResult + "data04SVM_ori.txt", new_wrapper())
        return new_wrapper()

    # Files for the CRF. For corpus type 2 (unless the data is external) the
    # indices are first updated from the SVM predictions on the new data.
    if typeCorpus == 2 and indiceSvm != 2:
        extractor.extractIndices4new(self.dirResult + "svm_predictions_new", new_wrapper())

    # First the labelled test file (tr = -1), then the unlabelled one (tr = 0).
    crf_outputs = (("testdatawithlabel_CRF.txt", -1), ("testdata_CRF.txt", 0))
    for file_name, tr_mode in crf_outputs:
        extractor.extract(typeCorpus, nbRef, self.dirResult + file_name, new_wrapper(), tr_mode, 1)

    return new_wrapper()