Example #1
0
File: utils.py Project: Unaah/bilbo
def labeling(line, modelname, options):
	"""
	Label input string. Called by simpleLabeling or detailLabeling

	The input is wrapped in a <listBibl>/<listNote> XML skeleton, written to
	<rootDir>/simpletmp/tmp.xml and handed to Bilbo.annotate(). The 'tmp.xml'
	found afterwards in the current working directory (the directory given
	to the Bilbo constructor) is returned.

	line      : text to label; unicode input is written as UTF-8, anything
	            else goes through str()
	modelname : model name passed to the Bilbo constructor
	options   : parsed options; options.t ("bibl" or "note"), options.m
	            (model sub-directory name) and options.e are read here

	Returns the content of the result file as a byte string.
	Raises ValueError if options.t is neither "bibl" nor "note".
	"""
	tmpDir = rootDir+'/simpletmp'
	resDir = os.getcwd() #current working directory

	dtype = options.t
	if dtype == "bibl" : typeCorpus = 1
	elif dtype == "note" : typeCorpus = 2
	else :
		# Fail with a clear message instead of an UnboundLocalError on typeCorpus
		raise ValueError("unknown data type %r, expected 'bibl' or 'note'" % (dtype,))
	dirModel = os.path.join(rootDir, 'model/corpus')+str(typeCorpus)+"/"+options.m+"/"

	# str() on a unicode object holding non-ASCII characters raises
	# UnicodeEncodeError, so encode unicode input explicitly.
	if isinstance(line, unicode): text = line.encode('utf8')
	else : text = str(line)

	bilbo = Bilbo(resDir, options, modelname)
	if not os.path.exists(tmpDir):
		os.makedirs(tmpDir)
	else : #delete all existing files
		for dir_name, sub_dirs, files in os.walk(tmpDir):
			for f in files : os.unlink(os.path.join(dir_name, f))

	#tmp file generation
	filename = os.path.join(tmpDir, 'tmp.xml')
	with open(filename, "w") as tmpFile:
		tmpFile.write('<list'+dtype.title()+'>\n')
		tmpFile.write('<'+dtype+'> '+text+' </'+dtype+'>')
		tmpFile.write('\n</list'+dtype.title()+'>\n')

	if dtype == "note" and options.e: bilbo.annotate(tmpDir, dirModel, typeCorpus, 1)
	else : bilbo.annotate(tmpDir, dirModel, typeCorpus)

	# 'with' closes the result file; the original left the handle open
	with open(os.path.join(resDir, 'tmp.xml')) as resFile:
		tmp_str = resFile.read()

	os.unlink(filename)
	os.rmdir(tmpDir)

	return tmp_str
Example #2
0
	def train(self):
		for dirPartition in self.dirPartitions:
			print "dirPartition", dirPartition
			(annotateDir, testDir, trainDir, modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)
			
			self._del_tmp_file(trainDir) # tmp file of test data are here
			bilbo = Bilbo(modelDir, self.bilboOptions, "crf_model_simple") # tmpFiles saved in modelDir if -k all
			bilbo.train(trainDir, modelDir, 1)
Example #3
0
	def train(self):
		for dirPartition in self.dirPartitions:
			print "dirPartition", dirPartition
			(annotateDir, testDir, trainDir, modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)
			
			#self._del_tmp_file(modelDir)
			bilbo = Bilbo(modelDir, self.bilboOptions, "crf_model_simple") # To save tmpFiles in modelDir
			bilbo.train(trainDir, modelDir, 1)
Example #4
0
def annoterCorpus(corpus, request):
	"""
	Annotate the files of 'tmp/in' with the "revues" model of the given corpus.

	corpus  : corpus number; 2 selects the "note" data type, any other value
	          selects "bibl"
	request : object that is only checked for a 'doi' attribute; when present
	          the '-d' option is added

	The Bilbo instance is created with 'tmp/out' as its directory. All paths
	are resolved against the current working directory. Returns None.
	"""
	# NOTE(review): '-T' is passed although annotate() is what gets called
	# below -- confirm the meaning of that flag.
	optStr = '-T -t note' if corpus == 2 else '-T -t bibl'
	if hasattr(request, 'doi'):
		optStr += ' -d'
	options, args = defaultOptions().parse_args(optStr.split())

	inputDir = os.path.abspath('tmp/in') + "/"
	outputDir = os.path.abspath('tmp/out') + "/"
	modelPath = os.path.abspath('../../model/corpus' + str(corpus) + "/revues/") + "/"

	annotator = Bilbo(outputDir, options, "crf_model_simple")
	annotator.annotate(inputDir, modelPath, corpus)
Example #5
0
	def annotate(self):
		"""
		For each entry of self.dirPartitions, annotate the partition's data and
		then extract CRF data from its test directory into "evaldata_CRF.txt".
		"""
		for partition in self.dirPartitions:
			(toAnnotateDir, referenceDir, trainingDir, modelPath, outputDir) = self.partitions.getDirTestNames(partition)

			# Step 1: annotation of test data striped tagged
			self._setBilboAnnotate()
			self._del_tmp_file(outputDir)
			annotator = Bilbo(outputDir, self.bilboOptions, "crf_model_simple")
			annotator.annotate(toAnnotateDir, modelPath, 1)

			# Step 2: train with test data for evaluation
			self._setBilboTrain()
			self._del_tmp_file(trainingDir)
			# Bilbo is created on the training directory so its tmpFiles are saved there
			evaluator = Bilbo(trainingDir, self.bilboOptions, "crf_model_simple")
			evalCorpus = Corpus(referenceDir, self.bilboOptions)
			evalCorpus.extract(1, "bibl")
			# CRF training data extraction
			evaluator.crf.prepareTrain(evalCorpus, 1, "evaldata_CRF.txt", 1, 1)
Example #6
0
def labeling(line, modelname, options):
	"""
	Label input string. Called by simpleLabeling or detailLabeling

	The input is wrapped in a <listBibl>/<listNote> XML skeleton and written
	to 'tmp.xml' in a fresh temporary input directory. Bilbo.annotate() is
	run on that directory, and the 'tmp.xml' found in a second temporary
	directory (the one given to the Bilbo constructor) is read back as UTF-8.
	Both temporary directories are removed before returning, also when
	annotation fails.

	line      : text to label; unicode is written as UTF-8, byte strings are
	            written unchanged (assumed to be UTF-8 already -- TODO confirm)
	modelname : model name passed to the Bilbo constructor
	options   : parsed options; options.t ("bibl" or "note"), options.m
	            (model sub-directory name) and options.e are read here

	Returns the content of the result file as a unicode string.
	Raises ValueError if options.t is neither "bibl" nor "note".
	"""
	# Validate before creating anything on disk
	dtype = options.t
	if dtype == "bibl" : typeCorpus = 1
	elif dtype == "note" : typeCorpus = 2
	else :
		# Fail with a clear message instead of an UnboundLocalError on typeCorpus
		raise ValueError("unknown data type %r, expected 'bibl' or 'note'" % (dtype,))
	dirModel = os.path.join(rootDir, 'model/corpus')+str(typeCorpus)+"/"+options.m+"/"

	# Calling .encode() on a byte string first decodes it as ASCII and fails
	# on non-ASCII data, so only unicode input is encoded.
	if isinstance(line, unicode): text = line.encode("utf8")
	else : text = line

	#It's better to have secure tmp dirs for multiple threads
	tmpDir = tempfile.mkdtemp(prefix='bilbo_labeling_tmp')
	resDir = tempfile.mkdtemp(prefix='bilbo_labeling_res_dir')
	try:
		bilbo = Bilbo(resDir, options, modelname)

		#tmp file generation (mkdtemp just created tmpDir, so it is empty)
		filename = os.path.join(tmpDir, 'tmp.xml')
		with open(filename, "w") as tmpFile:
			tmpFile.write('<list'+dtype.title()+'>\n')
			tmpFile.write('<'+dtype+'> ')
			tmpFile.write(text)
			tmpFile.write(' </'+dtype+'>')
			tmpFile.write('\n</list'+dtype.title()+'>\n')

		if dtype == "note" and options.e: bilbo.annotate(tmpDir, dirModel, typeCorpus, 1)
		else : bilbo.annotate(tmpDir, dirModel, typeCorpus)

		# One read() instead of concatenating line by line
		with codecs.open(os.path.join(resDir, 'tmp.xml'), encoding='utf8') as tmp_str_fp:
			tmp_str = tmp_str_fp.read()
	finally:
		# Best-effort cleanup that must not hide an error raised above.
		# rmtree is used because the directories may not be empty.
		shutil.rmtree(tmpDir, ignore_errors=True)
		shutil.rmtree(resDir, ignore_errors=True)

	return tmp_str
Example #7
0
File: Main.py Project: morban/bilbo
        print "\t  xml => simple xml"
        print "\t  simple => only labeled references without article contents or original tags"
        print "  -d : --doi"
        print "\t Digital object identifier (doi) extraction via crossref site, default='False'"
        print "  -e : --exterdata"
        print "\t Labeling data different from training set. Do not use svm filtering when using this option. default='False'"

        print "Arguments"
        print "  arg1 : <string>"
        print "\t input data folder where the data files are (training or labeling)"
        print "  arg2 : <string>"
        print "\t output data folder where the result files are saved\n"

    else:
        if options.g == "simple":
            bilbo = Bilbo(str(args[1]), options, "crf_model_simple")
        elif options.g == "detail":
            bilbo = Bilbo(str(args[1]), options, "crf_model_detail")

        dtype = options.t
        if dtype == "bibl":
            typeCorpus = 1
        elif dtype == "note":
            typeCorpus = 2
        dirModel = os.path.join(rootDir, "model/corpus") + str(typeCorpus) + "/" + options.m + "/"
        if not os.path.exists(dirModel):
            os.makedirs(dirModel)

        if options.T:  # training
            bilbo.train(str(args[0]), dirModel, typeCorpus)
        elif options.L:  # labeling
Example #8
0
	def annotate(self):
		"""
		Clear the T flag and set the L flag on self.bilboOptions, then run a
		"crf_model_simple" Bilbo annotation of self.dirLabel with the model in
		self.dirModel; the Bilbo instance is created on self.dirResult.
		"""
		labelOptions = self.bilboOptions
		labelOptions.T = False
		labelOptions.L = True
		annotator = Bilbo(self.dirResult, labelOptions, "crf_model_simple")
		annotator.annotate(self.dirLabel, self.dirModel, 1)