def read_file_behrooz(filename, file_id, semantic, missing_tags=None, problem_files=None):
    """
    Load a document and parse it with ``MathExtractor.Behrooz_parse_from_xml``.

    :type  filename: string
    :param filename: path of the file to be parsed
    :param file_id: accepted for signature parity with ``read_file``; not
        used here (the content id passed to the parser is always ``1``)
    :param semantic: forwarded to the parser as ``operator``
    :param missing_tags: forwarded unchanged to the parser
    :param problem_files: forwarded unchanged to the parser

    :return: whatever ``Behrooz_parse_from_xml`` returns for the document
        (presumably the symbol trees found in the file -- confirm against
        the parser)
    """
    # Only the document body is needed; the detected extension is ignored.
    _, document = MathDocument.read_doc_file(filename)
    return MathExtractor.Behrooz_parse_from_xml(
        document,
        1,
        window=3,
        operator=semantic,
        missing_tags=missing_tags,
        problem_files=problem_files,
    )
def get_collection(self, ):
    """
    Build the tuple dictionary for every article in the collection.

    Walks ``self.collection_file_path``; every sub-directory is expected to
    hold an ``Articles`` directory whose files are parsed with
    ``MathExtractor.parse_from_xml`` (``operator`` is ``not self.read_slt``).

    :rtype:  dict
    :return: mapping ``"<ascii file name>:<formula key>"`` to the result of
        ``get_pairs(window=2, eob=True)`` for that formula

    Files that fail to load or parse are reported by printing their name and
    are otherwise skipped.
    """
    dictionary_formula_tuples = {}
    root = self.collection_file_path
    for directory in os.listdir(root):
        collection_dir = os.path.join(root, directory)
        if not os.path.isdir(collection_dir):
            continue
        articles_dir = os.path.join(collection_dir, "Articles")
        for filename in os.listdir(articles_dir):
            file_path = os.path.join(articles_dir, filename)
            # os.listdir yields bare entry names, so only the extension
            # needs stripping.
            file_name = os.path.splitext(filename)[0]
            try:
                (ext, content) = MathDocument.read_doc_file(file_path)
                formulas = MathExtractor.parse_from_xml(
                    content, 1, operator=(not self.read_slt),
                    missing_tags=None, problem_files=None)
                # Reduce the name to plain ASCII. Decoding the bytes (rather
                # than slicing their repr) keeps names containing quotes or
                # backslashes intact.
                file_name = unicodedata.normalize('NFKD', file_name) \
                    .encode('ascii', 'ignore').decode('ascii')
                for key in formulas:
                    tuples = formulas[key].get_pairs(window=2, eob=True)
                    dictionary_formula_tuples[file_name + ":" + str(key)] = tuples
            except Exception:
                # Best effort: report the offending file and keep going.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print(file_name)
    return dictionary_formula_tuples
def read_file(filename, file_id, semantic, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: content id handed to the parser
    :param semantic: truthy to parse in operator-tree mode; ``.tex`` input is
        only accepted when this is falsy
    :param missing_tags: forwarded to ``MathExtractor.parse_from_xml``
    :type  problem_files: dict or None
    :param problem_files: optional mapping of problem category to the set of
        offending file names; updated in place for unsupported files and
        forwarded to ``MathExtractor.parse_from_xml``

    :rtype:  tuple
    :return: ``([tree], 0)`` for TeX input, the ``(trees, n_err)`` pair
        produced by ``parse_from_xml`` for XML-like input, and ``([], 0)``
        for unsupported files
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex' and not semantic:
        t = MathExtractor.parse_from_tex(content, file_id)
        return [t], 0

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t, n_err = MathExtractor.parse_from_xml(content, file_id, operator=semantic,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)
        # NOTE(review): looks like leftover debugging output; kept so the
        # function's observable behaviour does not change.
        for item in t:
            print(item)
        return t, n_err

    # Unsupported input: record it (when a registry was supplied) and report.
    if ext == '.tex' and semantic:
        category = "invalid_filetype"
        message = 'invalid file format %s for %s in operator tree mode' % (ext, filename)
    else:
        category = "unknown_filetype"
        message = 'Unknown filetype %s for %s' % (ext, filename)

    # problem_files defaults to None; only record when the caller passed one.
    if problem_files is not None:
        problem_files.setdefault(category, set()).add(filename)
    print(message)
    return [], 0
def latex_to_polish_list(expression):
    """
    Lazily yield aliased symbols for a LaTeX expression.

    The expression is parsed with ``MathExtractor.parse_from_tex_opt`` and its
    ``get_pairs(window=1, eob=True)`` output is walked in order. Each pair is
    a tab-separated string whose first field looks like ``<type>!<symbol>``:

    * a first field starting with ``"V"`` is yielded every time via
      ``alias(symbol, is_var=True)``;
    * any other first field is yielded via ``alias(symbol)`` the first time
      its identifier (first field + last field) is met; the next occurrence
      of that identifier yields nothing and forgets it again.

    :type  expression: string
    :param expression: LaTeX source of the formula
    :return: generator over the values returned by ``alias``
    """
    tree = MathExtractor.parse_from_tex_opt(expression)
    pending_ops = set()
    for pair in tree.get_pairs(window=1, eob=True):
        fields = pair.split("\t")
        head = fields[0]

        if head.startswith("V"):
            # variable node
            yield alias(head.split("!")[1], is_var=True)
        else:
            # head + last field is presumed to be the operator's unique ID
            # in the operator tree
            op_id = head + fields[-1]
            if op_id in pending_ops:
                pending_ops.remove(op_id)
            else:
                yield alias(head.split("!")[1])
                pending_ops.add(op_id)
def get_query(self,):
    """
    Build the tuple dictionary for the query files ``1.html`` .. ``20.html``.

    Each file is looked up under ``self.queries_directory_path`` and parsed
    with ``MathExtractor.parse_from_xml`` (``operator`` is
    ``not self.read_slt``).

    :rtype:  dict
    :return: mapping query number to the result of
        ``get_pairs(window=2, eob=True)``; when a file holds several
        formulas, the last one iterated wins

    Queries that fail to load or parse are reported by printing their number
    and are left out of the result.
    """
    dictionary_query_tuples = {}
    for j in range(1, 21):
        temp_address = self.queries_directory_path + '/' + str(j) + '.html'
        try:
            (ext, content) = MathDocument.read_doc_file(temp_address)
            formulas = MathExtractor.parse_from_xml(
                content, 1, operator=(not self.read_slt),
                missing_tags=None, problem_files=None)
            for key in formulas:
                # Same slot for every formula of this query: later formulas
                # overwrite earlier ones.
                dictionary_query_tuples[j] = formulas[key].get_pairs(window=2, eob=True)
        except Exception:
            # Best effort: report the failing query and keep going.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print(j)
    return dictionary_query_tuples
def ConvertWikipediaToSLTTuplesNewVersion(filePathForresults, filename, dirId, lst,
                                          missing_tags=None, problem_files=None):
    """
    Write the tuples of every formula in a document to per-formula text files.

    The document is parsed with ``MathExtractor.parse_from_xml`` using
    ``operator=False``. For each formula key, the result of
    ``get_pairs(window=1, eob=True)`` is written one tuple per line to
    ``<filePathForresults>/<dirId>/<base name>:<key>.txt``.

    :type  filePathForresults: string
    :param filePathForresults: root directory for the output files
    :type  filename: string
    :param filename: path of the document to convert
    :param dirId: name of the sub-directory (under the root) to write into
    :param lst: unused; kept for interface compatibility
    :param missing_tags: forwarded to the parser
    :param problem_files: forwarded to the parser

    Any failure is reported by printing ``filename``; nothing is raised.
    """
    try:
        # Base name of the document without its extension.
        file_name = os.path.splitext(filename.split('/')[-1])[0]
        (ext, content) = MathDocument.read_doc_file(filename)
        formulas = MathExtractor.parse_from_xml(content, 1, operator=False,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)
        for key in formulas:
            tuples = formulas[key].get_pairs(window=1, eob=True)
            if not tuples:
                # NOTE(review): this stops processing the remaining formulas
                # of the document as well -- confirm that is intended.
                return
            out_path = (filePathForresults + "/" + str(dirId) + "/"
                        + file_name + ":" + str(key) + ".txt")
            # The context manager closes the file even if a write fails.
            with open(out_path, "w+") as f:
                for t in tuples:
                    f.write(t + "\n")
    except Exception:
        # Best effort: report the offending document.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print(filename)
def behrooz_queryPreparation(filename, resultFile, file_id, missing_tags=None, problem_files=None):
    """
    Write the tuples of a query document to ``resultFile``.

    The document is parsed with ``MathExtractor.parse_from_xml`` using
    ``operator=False``; for each formula key the result of
    ``get_pairs(window=1, eob=True)`` is written one tuple per line.

    :type  filename: string
    :param filename: path of the query document
    :type  resultFile: string
    :param resultFile: path of the output file; it is truncated and rewritten
        for every formula, so it ends up holding the last formula written
    :param file_id: unused; kept for interface compatibility
    :param missing_tags: forwarded to the parser
    :param problem_files: forwarded to the parser
    """
    (ext, content) = MathDocument.read_doc_file(filename)
    formulas = MathExtractor.parse_from_xml(content, 1, operator=False,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)
    for key in formulas:
        tuples = formulas[key].get_pairs(window=1, eob=True)
        if not tuples:
            # Stops at the first formula without tuples; output written for
            # earlier formulas is left in place.
            return
        # The context manager closes the file even if a write fails.
        with open(resultFile, "w+") as f:
            for t in tuples:
                f.write(t + "\n")