Esempio n. 1
0
def extractFormulaFeature(query):
	"""
	Turn an AsciiMath query string into a binary feature vector.

	Step 1 renders the query to MathML, step 2 extracts the semantic,
	structural, constant and variable features into a ``Formula`` object,
	step 3 normalises them with the ``*_rep`` helpers and step 4 flags
	which terms of the stored vocabularies the formula contains.

	Args:
		query: formula text passed to ``AsciiMathML.parse_string``.

	Returns:
		A flat list of 0/1 flags, one per term of
		``readFeature('semantic')``, ``readFeature('structure')``,
		``readFeature('constant')`` and ``readFeature('variable')``, in
		that order; 1 means the formula contains the term.
	"""
	def has_terms(terms):
		# An empty/missing value has nothing to normalise. '[]' is the
		# string form of an empty list and is treated as empty too.
		return bool(terms) and terms != '[]'

	# Step 1: query to MathML
	math_obj = asciitomathml.asciitomathml.AsciiMathML()
	math_obj.parse_string(query)
	mathML = math_obj.to_xml_string()
	mathML = mathML.replace('<math xmlns="http://www.w3.org/1998/Math/MathML">', "<math>")
	# Strip every '&' from the markup (presumably so entity references do
	# not upset features_extraction -- TODO confirm).
	mathML = mathML.replace("&", "")

	# Step 2: MathML to Formula object holding the four feature families
	formula = Formula()
	(sem_features, struc_features, const_features, var_features) = features_extraction(mathML)

	formula.inorder_term = ino_sem_terms(sem_features)
	formula.sorted_term = sort_sem_terms(sem_features)
	formula.structure_term = struc_features
	formula.constant_term = const_features
	formula.variable_term = var_features
	formula.status = 1

	# Step 3: normalise the raw terms, dropping those that normalise to ""
	formula.structure = []
	formula.semantic = []
	formula.constant = []
	formula.variable = []

	# Semantic terms: only the first element of sorted_term is used; each
	# of its entries packs several terms separated by '$'.
	if has_terms(formula.sorted_term):
		for packed in formula.sorted_term[0]:
			for piece in packed.split('$'):
				term = semantic_rep(piece)
				if term != "":
					formula.semantic.append(term)

	# Structure, constant and variable terms share one flat layout.
	for raw_terms, normalise, bucket in (
		(formula.structure_term, struct_rep, formula.structure),
		(formula.constant_term, const_rep, formula.constant),
		(formula.variable_term, var_rep, formula.variable),
	):
		if not has_terms(raw_terms):
			continue
		for raw in raw_terms:
			term = normalise(raw)
			if term != "":
				bucket.append(term)

	# Step 4: build the vector, one presence flag per vocabulary term
	vector = []
	for family, found in (
		('semantic', formula.semantic),
		('structure', formula.structure),
		('constant', formula.constant),
		('variable', formula.variable),
	):
		# A set gives constant-time membership instead of rescanning the
		# list for every vocabulary term.
		present = set(found)
		vector.extend(1 if term in present else 0 for term in readFeature(family))
	return vector
Esempio n. 2
0
def buildVector():
	"""
	Rebuild the on-disk feature vocabulary and per-formula term matrix.

	Writes three files below ``path + '/data/'``:

	* ``all.feature``  - JSON list: the semantic, structure, constant and
	  variable vocabularies from ``get_formula_feature_term``, concatenated
	  in that order.
	* ``data.vector``  - JSON list with one row per ``Formula``: its
	  ``indexid`` followed by a 0/1 flag for every vocabulary term.
	* ``data2.vector`` - the same rows as text, every value followed by a
	  tab, one row per line.

	Each row's ``[indexid]`` is also printed while the matrix is built.

	Returns:
		None.
	"""
	def parse_terms(stored, normalise, sub_separator=None):
		"""Return the set of non-empty normalised terms in a stored '[a,b,...]' string."""
		found = set()
		# '[]' is the stored form of "no terms"; a missing value is
		# treated the same way instead of failing on the slice below.
		if stored is None or stored == '[]':
			return found
		# Drop the surrounding brackets, then split the comma-separated body.
		for chunk in stored[1:-1].split(','):
			pieces = chunk.split(sub_separator) if sub_separator else [chunk]
			for piece in pieces:
				term = normalise(piece)
				if term != "":
					found.add(term)
		return found

	# Extract the feature vocabularies and persist their concatenation.
	semantic, structure, constant, variable = get_formula_feature_term()
	featureAll = semantic + structure + constant + variable
	with open(path + '/data/all.feature', 'w') as outfile:
		json.dump(featureAll, outfile)

	# ------------------ Build data vector ------------------------------
	# Parsing and row building happen in one pass, so nothing depends on
	# the queryset handing back the same instances on a second iteration.
	# NOTE: per-tag columns (Tag / TagDefinition) are not emitted.
	term_matrix = []
	for formula in Formula.objects.all():
		row = [int(formula.indexid)]
		print(row)
		for vocabulary, present in (
			# Semantic entries pack several terms separated by '$'.
			(semantic, parse_terms(formula.sorted_term, semantic_rep, '$')),
			(structure, parse_terms(formula.structure_term, struct_rep)),
			(constant, parse_terms(formula.constant_term, const_rep)),
			(variable, parse_terms(formula.variable_term, var_rep)),
		):
			row.extend(1 if term in present else 0 for term in vocabulary)
		term_matrix.append(row)

	with open(path + '/data/data.vector', 'w') as outfile:
		json.dump(term_matrix, outfile)
	with open(path + '/data/data2.vector', 'w') as outfile:
		for row in term_matrix:
			# Every value, including the last, is followed by a tab.
			outfile.write("".join(str(item) + "\t" for item in row) + "\n")
	return