Example #1
0
	def __init__(self, dirResult='', options=None, crfmodelname="crf_model_simple"):
		"""
		Prepare the output directory and create the CRF and SVM wrappers.

		Parameters
		----------
		dirResult : string
			parent directory for output files; when empty, the 'Result'
			directory under self.rootDir is used. It is created if missing.
		options : object
			option container handed unchanged to CRF and SVM and stored on
			the instance. Defaults to a fresh empty dict per instance.
		crfmodelname : string
			stored as self.crfmodelname

		Attributes
		----------
		rootDir : string
			real path of this file with its last three components removed
		dirResult : string
			temporary directory created inside the output directory,
			stored with a trailing '/'
		crf : CRF
		svm : SVM
		options : object
		crfmodelname : string
		"""
		# A literal {} default would be one dict shared by every instance
		# (and by the CRF/SVM objects it is passed to); build it per call.
		if options is None:
			options = {}

		main = os.path.realpath(__file__).split('/')
		self.rootDir = "/".join(main[:len(main)-3])

		if dirResult == '':
			dirResult = os.path.join(self.rootDir, 'Result')
		if not os.path.exists(dirResult):
			os.makedirs(dirResult)

		# Each instance works in its own temporary sub-directory.
		self.dirResult = mkdtemp(dir = dirResult) + '/'
		self.crf = CRF(self.dirResult, options)
		self.svm = SVM(self.dirResult, options)
		self.options = options
		self.crfmodelname = crfmodelname
Example #2
0
File: Bilbo.py Project: Unaah/bilbo
class Bilbo(object):
    """
    Train a CRF (and optionally an SVM) model and automatically label new references.
    """

    def __init__(self, dirResult="", options=None, crfmodelname="crf_model_simple"):
        """
        Prepare the output directory and create the CRF and SVM wrappers.

        Parameters
        ----------
        dirResult : string
            parent directory for output files; when empty, the "Result"
            directory under self.rootDir is used. It is created if missing.
        options : object
            option container handed unchanged to CRF, SVM and Corpus.
            This class itself reads ``options.s`` (SVM switch) and
            ``options.k`` (which temporary files to keep).
            Defaults to a fresh empty dict per instance.
        crfmodelname : string
            model name passed to the CRF train / test runs

        Attributes
        ----------
        rootDir : string
            real path of this file with its last three components removed
        dirResult : string
            temporary directory created inside the output directory,
            stored with a trailing "/"
        crf : CRF
        svm : SVM
        options : object
        crfmodelname : string
        """
        # A literal {} default would be one dict shared by every instance
        # (and by the CRF/SVM objects it is passed to); build it per call.
        if options is None:
            options = {}

        main = os.path.realpath(__file__).split("/")
        self.rootDir = "/".join(main[: len(main) - 3])

        if dirResult == "":
            dirResult = os.path.join(self.rootDir, "Result")
        if not os.path.exists(dirResult):
            os.makedirs(dirResult)

        # Each instance works in its own temporary sub-directory.
        self.dirResult = mkdtemp(dir=dirResult) + "/"
        self.crf = CRF(self.dirResult, options)
        self.svm = SVM(self.dirResult, options)
        self.options = options
        self.crfmodelname = crfmodelname

    def train(self, dirCorpus, dirModel, typeCorpus):
        """
        CRF model learning (corpus 1 and 2), SVM model learning (corpus 2).

        Temporary files are cleaned up afterwards through deleteTmpFiles().
        A typeCorpus other than 1 or 2 trains nothing, but the clean-up
        still runs.

        Parameters
        ----------
        dirCorpus : string
            directory where training references (notes) are
        dirModel : string
            directory where CRF and SVM models are saved
        typeCorpus : int, {1, 2, 3}
            type of corpus
            1 : corpus 1, 2 : corpus 2...
        """
        corpus = Corpus(dirCorpus, self.options)
        self.crf.setDirModel(dirModel)

        if typeCorpus == 1:
            print("Extract references...")
            corpus.extract(1, "bibl")
            print("crf training data extraction...")
            # CRF training data extraction
            self.crf.prepareTrain(corpus, 1, "trainingdata_CRF.txt", 1, 1)
            # CRF model learning
            self.crf.runTrain(dirModel, "trainingdata_CRF_Wapiti.txt", self.crfmodelname)

        elif typeCorpus == 2:
            print("Extract notes...")
            corpus.extract(2, "note")
            optsvm = self.options.s
            # Explicit comparison kept: only values equal to True enable the SVM steps.
            if optsvm == True:
                print("svm source data extraction...")
                # Source data extraction for SVM note classification
                self.crf.prepareTrain(corpus, 2, "data04SVM_ori.txt", 1)
                print("svm training data extraction...")
                # Training data extraction for SVM note classification
                self.svm.prepareTrain(corpus)
                print("svm training...")
                # SVM model learning
                self.svm.runTrain(dirModel)

            print("crf training data extraction...")
            # CRF training data extraction
            self.crf.prepareTrain(corpus, 2, "trainingdata_CRF.txt", 1, 1, optsvm)
            # CRF model learning. An extra run on
            # "trainingdata_CRF_nega_Wapiti.txt" was disabled by the original
            # author ("Do not work, too homogeneous").
            self.crf.runTrain(dirModel, "trainingdata_CRF_Wapiti.txt", self.crfmodelname)
            print("")

        self.deleteTmpFiles()

    def annotate(self, dirCorpus, dirModel, typeCorpus, external=0):
        """
        Automatic annotation of references.

        The corpus files are processed in batches of 50; after each batch
        ``corpus.deleteAllFiles()`` is called. Temporary files are cleaned
        up at the end through deleteTmpFiles().

        Parameters
        ----------
        dirCorpus : string
            directory where the references to be annotated are
        dirModel : string
            directory where the learned CRF model and SVM model have been saved
        typeCorpus : int, {1, 2, 3}
            1 : corpus 1, 2 : corpus 2...
        external : int, {1, 0}
            1 : if the references are external data (not from CLEO), 0 : if from CLEO.
            It is used to decide whether an SVM classification is run or not.
        """
        corpus = Corpus(dirCorpus, self.options)
        self.crf.setDirModel(dirModel)
        files = corpus.getFiles()
        # NOTE: _list_split empties `files` and returns [[]] for an empty
        # list, so the loop body always runs at least once.
        filesTab = self._list_split(files, 50)
        for fname in filesTab:
            if typeCorpus == 1:
                corpus = self.annotateCorpus1(dirModel, corpus, fname)
            elif typeCorpus == 2:
                corpus = self.annotateCorpus2(dirModel, corpus, fname, external)
            corpus.deleteAllFiles()

        self.deleteTmpFiles()

    def annotateCorpus1(self, dirModel, corpus, fname):
        """
        Automatic annotation of reference type 1 (reference).

        Parameters
        ----------
        dirModel : string
            directory where the learned CRF model has been saved
        corpus : Corpus
            set of references that we want to annotate
        fname : list
            batch of file names to be annotated (as produced by _list_split)

        Returns
        -------
        Corpus
            the corpus passed in, after tagging
        """
        print("Extract references...")
        corpus.extract(1, "bibl", fname)
        print("crf data extraction for labeling...")
        self.crf.prepareTest(corpus, 1)
        print("crf run test for labeling...")
        self.crf.runTest(dirModel, "testdata_CRF_Wapiti.txt", self.crfmodelname)
        print("corpus add tag for labeling...")
        corpus.addTagReferences(self.dirResult, "testEstCRF.xml", "bibl", 1)

        return corpus

    def annotateCorpus2(self, dirModel, corpus, fname, external=0):
        """
        Automatic annotation of reference type 2 (note).

        Parameters
        ----------
        dirModel : string
            directory where learned CRF model and SVM model have been saved
        corpus : Corpus
            set of notes that we want to annotate
        fname : list
            batch of file names to be annotated (as produced by _list_split)
        external : int, {1, 0}
            1 : if external data, 0 : if CLEO data

        Returns
        -------
        Corpus
            the corpus passed in, after tagging

        Notes
        -----
        Oct. 18, 2012   SVM classification problem is fixed.
                        Check the classification result of reference (reference.train)
                        in the 'addTagReferences' method of the 'Corpus' class, which is
                        called from this method.
        """
        print("Extract notes...")
        corpus.extract(2, "note", fname, external)

        if external == 0 and self.options.s:  # not external data and svm option is true
            print("svm source data extraction...")
            # last argument -1: prepare source data for the SVM (default is 0)
            self.crf.prepareTest(corpus, 2, -1)
            print("svm data extraction for labeling...")
            self.svm.prepareTest(corpus)
            self.svm.runTest(dirModel)

            print("crf data extraction for labeling...")
            newlistReferences = self.crf.prepareTest(corpus, 2)
            self.crf.runTest(dirModel, "testdata_CRF_Wapiti.txt", self.crfmodelname)
            self.crf.postProcessTest("testEstCRF.txt", "testEstCLNblCRF.txt", newlistReferences.getReferences())
            corpus.addTagReferences(self.dirResult, "testEstCRF.xml", "note", 2, newlistReferences.getReferences())

        else:  # external data (external=1) or svm option off: no SVM model is called
            print("crf data extraction for labeling...")
            # indiceSvm=2 at prepareTest(self, corpus, typeCorpus, indiceSvm = 0)
            self.crf.prepareTest(corpus, 2, 2)
            print("crf run test for labeling...")
            self.crf.runTest(dirModel, "testdata_CRF_Wapiti.txt", self.crfmodelname)
            print("corpus add tag for labeling...")
            corpus.addTagReferences(self.dirResult, "testEstCRF.xml", "note", 2)

        return corpus

    def deleteTmpFiles(self):
        """
        Remove the temporary result directory, optionally keeping some files.

        Behaviour depends on ``self.options.k``:

        - "all"     : nothing is deleted;
        - "primary" : "testEstCRF.xml", "testEstCRF.txt" and
                      "testdatawithlabel_CRF.txt" are copied to the parent of
                      self.dirResult, then self.dirResult is removed;
        - otherwise : self.dirResult is removed without keeping anything.
        """
        keepMode = self.options.k
        if keepMode == "all":
            return

        dirResultRoot = os.path.abspath(os.path.join(self.dirResult, os.path.pardir))
        toKeep = []
        if keepMode == "primary":
            toKeep = ["testEstCRF.xml", "testEstCRF.txt", "testdatawithlabel_CRF.txt"]

        for dir_name, sub_dirs, files in os.walk(self.dirResult):
            for f in files:
                if f in toKeep:
                    # os.walk only yields a trailing "/" for the top directory,
                    # so the paths must be joined rather than concatenated.
                    shutil.copyfile(os.path.join(dir_name, f), os.path.join(dirResultRoot, f))

        # rmtree also removes sub-directories, which a plain os.rmdir cannot.
        shutil.rmtree(self.dirResult)

    def _list_split(self, flist, size):
        """
        Split a file list into consecutive sub-lists of at most `size` items.

        The input list is emptied as a side effect. An empty input yields
        ``[[]]``.

        Parameters
        ----------
        flist : list
            list to be split (emptied by this call)
        size : int
            maximum length of each sub-list

        Returns
        -------
        list
            list of sub-lists, in the original order
        """
        result = [[]]
        # Single pass instead of repeated pop(0), which is O(n) per call.
        for item in flist:
            if len(result[-1]) >= size:
                result.append([])
            result[-1].append(item)
        # Callers see the list drained, as with the pop-based version.
        del flist[:]
        return result

    # memory

    def mem(self, size="rss"):
        """
        Generalization; memory sizes: rss, rsz, vsz.

        Runs ``ps`` for the current process and returns the last line of its
        output as a string.
        """
        pipe = os.popen("ps -p %d -o %s | tail -1" % (os.getpid(), size))
        try:
            return pipe.read()
        finally:
            # Close the pipe so the child process and its descriptor are released.
            pipe.close()

    def rss(self):
        """Return ps -o rss (resident) memory in kB."""
        return self.mem("rss")

    def rsz(self):
        """Return ps -o rsz (resident + text) memory in kB."""
        return self.mem("rsz")

    def vsz(self):
        """Return ps -o vsz (virtual) memory in kB."""
        return self.mem("vsz")