def __init__(self, dirResult='', options=None, crfmodelname="crf_model_simple"):
    """
    Set up the result directory and the CRF / SVM wrappers.

    Parameters
    ----------
    dirResult : string
        directory for output files; when empty, a 'Result' directory
        located under the computed root directory is used
    options : object, optional
        options handed over unchanged to CRF and SVM and stored on the
        instance; a fresh empty dict is used when omitted
    crfmodelname : string
        name of the CRF model

    Attributes
    ----------
    crf : CRF
    svm : SVM
    dirResult : string
        directory for output files (a new temporary directory created
        inside the requested result directory, with a trailing '/')
    """
    # A literal ``{}`` default would be shared by every call; build a new
    # one per instance instead.
    if options is None:
        options = {}

    # Root directory = path of this file minus its last three components.
    main = os.path.realpath(__file__).split('/')
    self.rootDir = "/".join(main[:len(main) - 3])

    # Set the default result directory
    if dirResult == '':
        dirResult = os.path.join(self.rootDir, 'Result')
    if not os.path.exists(dirResult):
        os.makedirs(dirResult)

    # Each instance works in its own temporary sub-directory.
    self.dirResult = mkdtemp(dir=dirResult) + '/'

    self.crf = CRF(self.dirResult, options)
    self.svm = SVM(self.dirResult, options)
    self.options = options
    self.crfmodelname = crfmodelname
class Bilbo(object):
    """
    A machine Bilbo that trains a CRF (and a SVM) model and automatically
    labels new references.
    """

    def __init__(self, dirResult="", options=None, crfmodelname="crf_model_simple"):
        """
        Set up the result directory and the CRF / SVM wrappers.

        Parameters
        ----------
        dirResult : string
            directory for output files; when empty, a 'Result' directory
            located under the computed root directory is used
        options : object, optional
            options handed over unchanged to CRF, SVM and Corpus; the
            methods of this class read its ``s`` and ``k`` attributes.
            A fresh empty dict is used when omitted.
        crfmodelname : string
            name of the CRF model used for training and labeling

        Attributes
        ----------
        crf : CRF
        svm : SVM
        dirResult : string
            directory for output files (a new temporary directory created
            inside the requested result directory, with a trailing '/')
        """
        # A literal ``{}`` default would be shared by every instance; build
        # a new one per instance instead.
        if options is None:
            options = {}

        # Root directory = path of this file minus its last three components.
        main = os.path.realpath(__file__).split("/")
        self.rootDir = "/".join(main[: len(main) - 3])

        # Set the default result directory
        if dirResult == "":
            dirResult = os.path.join(self.rootDir, "Result")
        if not os.path.exists(dirResult):
            os.makedirs(dirResult)

        # Each instance works in its own temporary sub-directory.
        self.dirResult = mkdtemp(dir=dirResult) + "/"

        self.crf = CRF(self.dirResult, options)
        self.svm = SVM(self.dirResult, options)
        self.options = options
        self.crfmodelname = crfmodelname

    def train(self, dirCorpus, dirModel, typeCorpus):
        """
        CRF model learning (corpus 1 and 2), SVM model learning (corpus 2)
        Corpus object declaration

        Parameters
        ----------
        dirCorpus : string
            directory where training references (notes) are
        dirModel : string
            directory where CRF and SVM models are saved
        typeCorpus : int, {1, 2, 3}
            type of corpus
            1 : corpus 1, 2 : corpus 2...
            (only 1 and 2 are handled here; any other value just triggers
            the temporary file clean-up)
        """
        corpus = Corpus(dirCorpus, self.options)
        self.crf.setDirModel(dirModel)

        if typeCorpus == 1:
            print("Extract references...")
            corpus.extract(1, "bibl")
            print("crf training data extraction...")
            # CRF training data extraction
            self.crf.prepareTrain(corpus, 1, "trainingdata_CRF.txt", 1, 1)
            # CRF model learning
            self.crf.runTrain(dirModel, "trainingdata_CRF_Wapiti.txt", self.crfmodelname)

        elif typeCorpus == 2:
            print("Extract notes...")
            corpus.extract(2, "note")
            optsvm = self.options.s
            # Explicit comparison kept on purpose so that only values equal
            # to True enable the SVM step, exactly as before.
            if optsvm == True:  # noqa: E712
                print("svm source data extraction...")
                # Source data extraction for SVM note classification
                self.crf.prepareTrain(corpus, 2, "data04SVM_ori.txt", 1)
                print("svm training data extraction...")
                # Training data extraction for SVM note classification
                self.svm.prepareTrain(corpus)
                print("svm training...")
                # SVM model learning
                self.svm.runTrain(dirModel)

            print("crf training data extraction...")
            # CRF training data extraction
            self.crf.prepareTrain(corpus, 2, "trainingdata_CRF.txt", 1, 1, optsvm)
            # CRF model learning
            self.crf.runTrain(dirModel, "trainingdata_CRF_Wapiti.txt", self.crfmodelname)
            # NOTE: training an additional model with
            #   self.crf.runTrain(dirModel, "trainingdata_CRF_nega_Wapiti.txt",
            #                     "revueswapiti_nega", 0.0000001)
            # was tried and disabled: it does not work, the data is too
            # homogeneous.
            print("")  # blank line on stdout

        self.deleteTmpFiles()

    def annotate(self, dirCorpus, dirModel, typeCorpus, external=0):
        """
        Automatic annotation of references

        Parameters
        ----------
        dirCorpus : string
            directory where the references to be annotated are
        dirModel : string
            directory where the learned CRF model and SVM model have been saved
        typeCorpus : int, {1, 2, 3}
            1 : corpus 1, 2 : corpus 2...
        external : int, {1, 0}
            1 : if the references are external data (not from CLEO),
            0 : if they are CLEO data.
            It is used to decide whether a SVM classification is run
            before the CRF labeling or not.
        """
        corpus = Corpus(dirCorpus, self.options)
        self.crf.setDirModel(dirModel)

        # Files are processed in batches of 50; the corpus content is
        # dropped after each batch.
        files = corpus.getFiles()
        filesTab = self._list_split(files, 50)
        for fname in filesTab:
            if typeCorpus == 1:
                corpus = self.annotateCorpus1(dirModel, corpus, fname)
            elif typeCorpus == 2:
                corpus = self.annotateCorpus2(dirModel, corpus, fname, external)
            corpus.deleteAllFiles()

        self.deleteTmpFiles()

    def annotateCorpus1(self, dirModel, corpus, fname):
        """
        Automatic annotation of reference type 1 (reference)

        Parameters
        ----------
        dirModel : string
            directory where the learned CRF model has been saved
        corpus : Corpus
            set of references that we want to annotate
        fname : string
            name of file to be annotated
            (``annotate`` passes one batch, i.e. a list of names)

        Returns
        -------
        Corpus
            the corpus that was passed in, after tagging
        """
        print("Extract references...")
        corpus.extract(1, "bibl", fname)
        print("crf data extraction for labeling...")
        self.crf.prepareTest(corpus, 1)
        print("crf run test for labeling...")
        self.crf.runTest(dirModel, "testdata_CRF_Wapiti.txt", self.crfmodelname)
        print("corpus add tag for labeling...")
        corpus.addTagReferences(self.dirResult, "testEstCRF.xml", "bibl", 1)

        return corpus

    def annotateCorpus2(self, dirModel, corpus, fname, external=0):
        """
        Automatic annotation of reference type 2 (note)

        Parameters
        ----------
        dirModel : string
            directory where learned CRF model and SVM model have been saved
        corpus : Corpus
            set of notes that we want to annotate
        fname : string
            name of file to be annotated
            (``annotate`` passes one batch, i.e. a list of names)
        external : int, {1, 0}
            1 : if external data, 0 : if CLEO data

        Returns
        -------
        Corpus
            the corpus that was passed in, after tagging

        See also
        --------
        Oct. 18, 2012 	SVM classification problem is fixed
                        Check the classification result of reference (reference.train) in 'addTagReferences' method
                        of 'Corpus' class that is called in 'annotateCorpus2' method of 'Bilbo' class.
        """
        print("Extract notes...")
        corpus.extract(2, "note", fname, external)

        if external == 0 and self.options.s:
            # Not external data and svm option is true: classify notes
            # with the SVM first, then label with the CRF.
            print("svm source data extraction...")
            # last argument: int, -1: prepare source data for SVM learning, default: 0
            self.crf.prepareTest(corpus, 2, -1)
            print("svm data extraction for labeling...")
            self.svm.prepareTest(corpus)
            self.svm.runTest(dirModel)

            print("crf data extraction for labeling...")
            newlistReferences = self.crf.prepareTest(corpus, 2)
            self.crf.runTest(dirModel, "testdata_CRF_Wapiti.txt", self.crfmodelname)
            self.crf.postProcessTest(
                "testEstCRF.txt", "testEstCLNblCRF.txt", newlistReferences.getReferences()
            )
            corpus.addTagReferences(
                self.dirResult, "testEstCRF.xml", "note", 2, newlistReferences.getReferences()
            )

        else:
            # External data (external=1) or svm option off: no SVM model is called.
            print("crf data extraction for labeling...")
            # indiceSvm=2 at prepareTest(self, corpus, typeCorpus, indiceSvm = 0)
            self.crf.prepareTest(corpus, 2, 2)
            print("crf run test for labeling...")
            self.crf.runTest(dirModel, "testdata_CRF_Wapiti.txt", self.crfmodelname)
            print("corpus add tag for labeling...")
            corpus.addTagReferences(self.dirResult, "testEstCRF.xml", "note", 2)

        return corpus

    def deleteTmpFiles(self):
        """
        Clean up the temporary result directory according to ``options.k``.

        * ``"all"``     : nothing is deleted, the directory is left as is.
        * ``"primary"`` : testEstCRF.xml, testEstCRF.txt and
          testdatawithlabel_CRF.txt are copied to the parent of
          ``dirResult`` before the directory is emptied and removed.
        * anything else : the directory is emptied and removed.

        Raises
        ------
        OSError
            if ``dirResult`` cannot be removed (e.g. it does not exist).
        """
        dirResultRoot = os.path.abspath(os.path.join(self.dirResult, os.path.pardir))

        toKeep = []
        if self.options.k == "primary":
            toKeep = ["testEstCRF.xml", "testEstCRF.txt", "testdatawithlabel_CRF.txt"]

        if self.options.k != "all":
            # Walk bottom-up so nested directories are already empty when
            # they are removed; os.walk reports sub-directories without a
            # trailing separator, hence os.path.join for every path.
            for dir_name, sub_dirs, files in os.walk(self.dirResult, topdown=False):
                for f in files:
                    source = os.path.join(dir_name, f)
                    if f in toKeep:
                        shutil.copyfile(source, os.path.join(dirResultRoot, f))
                    os.unlink(source)
                if dir_name != self.dirResult:
                    os.rmdir(dir_name)
            os.rmdir(self.dirResult)

    def _list_split(self, flist, size):
        """
        Split a filelist

        Parameters
        ----------
        flist : list
            list to be split; it is emptied by this call
        size : int
            new file list size

        Returns
        -------
        result : list
            new file list: consecutive sub-lists holding at most ``size``
            items each (``[[]]`` when ``flist`` is empty)
        """
        result = [[]]
        for item in flist:
            if len(result[-1]) >= size:
                result.append([])
            result[-1].append(item)
        # Callers historically received a drained list (items were popped
        # one by one); keep that side effect without the quadratic pop(0).
        del flist[:]
        return result

    # --- memory ---

    def mem(self, size="rss"):
        """Generalization; memory sizes: rss, rsz, vsz.

        Returns the last line printed by ``ps`` for the current process,
        as a raw string.
        """
        pipe = os.popen("ps -p %d -o %s | tail -1" % (os.getpid(), size))
        try:
            return pipe.read()
        finally:
            # Do not leave the pipe to the garbage collector.
            pipe.close()

    def rss(self):
        """Return ps -o rss (resident) memory in kB."""
        return self.mem("rss")

    def rsz(self):
        """Return ps -o rsz (resident + text) memory in kB."""
        return self.mem("rsz")

    def vsz(self):
        """Return ps -o vsz (virtual) memory in kB."""
        return self.mem("vsz")