def _rep_query_terms(raw_terms, rep):
    """Apply ``rep`` to each raw term and keep every result that is not ``""``.

    An empty or missing ``raw_terms`` (including the literal string ``'[]'``)
    yields an empty list.
    """
    if not raw_terms or raw_terms == '[]':
        return []
    return [term for term in (rep(raw) for raw in raw_terms) if term != ""]


def extractFormulaFeature(query):
    """Turn an AsciiMath query string into a binary feature vector.

    The query is converted to MathML, run through ``features_extraction``,
    normalised with the ``*_rep`` helpers and finally compared against the
    feature lists returned by ``readFeature``.

    Args:
        query: formula text handed to ``AsciiMathML.parse_string``.

    Returns:
        A flat list of 0/1 ints: one flag per entry of
        ``readFeature('semantic')``, then ``readFeature('structure')``,
        ``readFeature('constant')`` and ``readFeature('variable')``, in that
        order. A flag is 1 when the query produced that feature term.
    """
    # Step 1: query -> MathML.
    math_obj = asciitomathml.asciitomathml.AsciiMathML()
    math_obj.parse_string(query)
    mathml = math_obj.to_xml_string()
    # Drop the namespace declaration and every '&' before feature extraction.
    # NOTE(review): presumably the downstream parser cannot cope with either
    # -- confirm against features_extraction.
    mathml = mathml.replace(
        "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">", "<math>")
    mathml = mathml.replace("&", "")

    # Step 2: MathML -> the four raw feature groups.
    (sem_features, struc_features,
     const_features, var_features) = features_extraction(mathml)
    # NOTE(review): the in-order terms are not used by this function. The call
    # is kept only in case ino_sem_terms mutates sem_features before
    # sort_sem_terms sees it -- confirm and remove if it is pure.
    ino_sem_terms(sem_features)
    sorted_sem_terms = sort_sem_terms(sem_features)

    # Step 3: normalise the raw terms.
    # Only the first element of the sorted terms is used; each of its entries
    # may bundle several '$'-separated terms. The emptiness check matters:
    # indexing [0] on an empty result would raise IndexError.
    semantic = []
    if sorted_sem_terms and sorted_sem_terms != '[]':
        for group in sorted_sem_terms[0]:
            semantic.extend(_rep_query_terms(group.split('$'), semantic_rep))
    structure = _rep_query_terms(struc_features, struct_rep)
    constant = _rep_query_terms(const_features, const_rep)
    variable = _rep_query_terms(var_features, var_rep)

    # Step 4: one presence flag per known feature, group by group.
    vector = []
    for kind, present in (('semantic', semantic),
                          ('structure', structure),
                          ('constant', constant),
                          ('variable', variable)):
        vector.extend(int(feature in present) for feature in readFeature(kind))
    return vector
def _rep_nonempty(rep, raw_items):
    """Apply ``rep`` to each raw item and keep the results that are not ``""``."""
    return [term for term in (rep(raw) for raw in raw_items) if term != ""]


def _stored_items(stored):
    """Split a stored list string such as ``'[a,b]'`` on commas.

    The literal ``'[]'`` gives no items; otherwise the first and last
    characters are stripped before splitting.
    """
    if stored == '[]':
        return []
    return stored[1:-1].split(',')


def buildVector():
    """Build the binary feature matrix for every ``Formula`` and write it out.

    Files written under ``path + '/data/'``:
        all.feature   JSON list of all feature terms
                      (semantic + structure + constant + variable).
        data.vector   JSON list of rows.
        data2.vector  the same rows as text: every item is followed by a tab
                      and every row ends with a newline.

    Each row is ``int(formula.indexid)`` followed by one 0/1 flag per feature
    term, in the order semantic, structure, constant, variable.

    Returns:
        None.
    """
    semantic, structure, constant, variable = get_formula_feature_term()
    with open(path + '/data/all.feature', 'w') as outfile:
        json.dump(semantic + structure + constant + variable, outfile)

    term_matrix = []
    # Parse and vectorise each formula in a single pass, keeping the parsed
    # terms in locals rather than as ad-hoc attributes on the model instances.
    for formula in Formula.objects.all():
        # Each comma-separated semantic entry may bundle several
        # '$'-separated terms.
        sem_terms = []
        for group in _stored_items(formula.sorted_term):
            sem_terms.extend(_rep_nonempty(semantic_rep, group.split('$')))
        struct_terms = _rep_nonempty(
            struct_rep, _stored_items(formula.structure_term))
        const_terms = _rep_nonempty(
            const_rep, _stored_items(formula.constant_term))
        var_terms = _rep_nonempty(
            var_rep, _stored_items(formula.variable_term))

        row = [int(formula.indexid)]
        # Progress output; the parenthesised form prints the same text on
        # Python 2 and Python 3.
        print(row)
        for known, present in ((semantic, sem_terms),
                               (structure, struct_terms),
                               (constant, const_terms),
                               (variable, var_terms)):
            row.extend(int(term in present) for term in known)
        term_matrix.append(row)

    with open(path + '/data/data.vector', 'w') as outfile:
        json.dump(term_matrix, outfile)

    with open(path + '/data/data2.vector', 'w') as outfile:
        for row in term_matrix:
            # Trailing tab after the last item is part of the existing format.
            outfile.write("".join(str(item) + "\t" for item in row))
            outfile.write("\n")