def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: identifier handed to the parser together with the content
    :param missing_tags: optional collector forwarded to the XML parser
    :type  problem_files: dict or None
    :param problem_files: optional dict; files with an unknown extension are
                          added to the set stored under "unknown_filetype"

    :rtype:  list(SymbolTree)
    :return: list of Symbol trees found in the file; empty list when the
             file extension is not recognised
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        # the tex parser's result is wrapped so callers always get a list
        return [MathExtractor.parse_from_tex(content, file_id)]

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content, file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # problem_files defaults to None, so only record the file when the
    # caller actually supplied a collector (subscripting None would raise)
    if problem_files is not None:
        problem_files.setdefault("unknown_filetype", set()).add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: identifier handed to the parser together with the content
    :param missing_tags: optional collector forwarded to the XML parser
    :type  problem_files: dict or None
    :param problem_files: optional dict; files with an unknown extension are
                          added to the set stored under "unknown_filetype"

    :rtype:  list(SymbolTree)
    :return: list of Symbol trees found in the file; empty list when the
             file extension is not recognised
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        # the tex parser's result is wrapped so callers always get a list
        t = MathExtractor.parse_from_tex(content, file_id)
        return [t]
    elif ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t = MathExtractor.parse_from_xml(content, file_id,
                                         missing_tags=missing_tags,
                                         problem_files=problem_files)
        return t
    else:
        # The default for problem_files is None; without this guard the
        # bookkeeping below would raise TypeError instead of reporting
        # the unknown file type.
        if problem_files is not None:
            unknown = problem_files.setdefault("unknown_filetype", set())
            unknown.add(filename)
        print('Unknown filetype %s for %s' % (ext, filename))
        return []
def find_mathml(self, docid, position):
    """
    Find a specific math expression

    :param docid: document number, or a negative value to read the query file
    :type  docid: int
    :param position: relative number of math expr within document
    :type  position: int

    :return: MathML, or None when position is past the last expression
    :rtype:  string
    """
    # hack: a negative docid redirects the lookup to the queries file
    source = self.queries if docid < 0 else self.find_doc_file(docid)
    (ext, content) = self.read_doc_file(source)

    if ext != '.tex':
        # non-tex documents may hold several expressions; pick by position
        maths = MathExtractor.math_tokens(content)
        if position < len(maths):
            return maths[position]
        print("Cannot find MathML expression: position %i too large"%position)
        return None

    # a .tex document is converted as a whole, so position is only reported
    if position > 0:
        print("Warning: .tex documents have only one expression; position %i ignored\n"%position)
    return LatexToMathML.convert_to_mathml(content)
def get(self, doc_id, location, expression):
    """
    Return the MathML stored for an expression, consulting the caches first

    :param doc_id: document the expression was found in
    :param location: position of the expression within that document
    :param expression: expression string used as the secondary cache key

    :return: cached MathML when the (doc_id, location) pair or the expression
             was seen before; otherwise the MathML read from the document,
             which is then stored in both caches
    """
    # make sure the per-document cache exists before any lookup
    doc_cache = self.cached_locations.setdefault(doc_id, {})
    if location in doc_cache:
        return doc_cache[location]

    # same expression already retrieved, but at a different location
    if expression in self.cached_expressions:
        prev_doc_id, prev_location = self.cached_expressions[expression]
        return self.cached_locations[prev_doc_id][prev_location]

    # first time the expression is seen: read it from the indexed document
    finder = MathDocument(Control(self.control_filename))
    markup = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
    if isinstance(markup, bytes):
        markup = markup.decode('UTF-8')

    # save on cache...
    doc_cache[location] = markup
    self.cached_expressions[expression] = (doc_id, location)
    return markup
def process_query_batch(args):
    """
    Given a batch of queries, search the math index for each one

    :param args: tuple (query_list, topk, math_index); query_list holds
                 (query_num, query_string) pairs
    :return: tuple (fileid, stats) where fileid is this process' pid and
             stats is the Stats object filled while processing the batch
    """
    stats = Stats()
    fileid = os.getpid()
    query_list, topk, math_index = args

    # compiled once: the same pattern is applied to every query string
    keyword_pattern = re.compile(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>")

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)
        for (query_num, query_string) in query_list:
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)

            # also need to handle keyword queries if present
            terms = keyword_pattern.findall(query_string)
            stats.num_keywords += len(terms)

            math_index.search(fileid, query_num, trees, terms, topk)
    finally:
        # close the DB even when parsing or searching raises, so a failed
        # batch does not leave the connection opened above dangling
        math_index.closeDB(fileid)

    return (fileid, stats)
def get(self, doc_id, location, expression, force_update=False):
    """
    Return the MathML stored for an expression, consulting the caches first

    :param doc_id: document the expression was found in
    :param location: position of the expression within that document
    :param expression: expression string used as the secondary cache key
    :param force_update: when true, both caches are bypassed and the MathML
                         is read again and re-stored

    :return: cached MathML when available (and force_update is false);
             otherwise the MathML read from the document
    """
    # make sure the per-document cache exists before any lookup
    doc_cache = self.cached_locations.setdefault(doc_id, {})

    if not force_update:
        if location in doc_cache:
            return doc_cache[location]

        # same expression already retrieved, but at a different location
        if expression in self.cached_expressions:
            prev_doc_id, prev_location = self.cached_expressions[expression]
            return self.cached_locations[prev_doc_id][prev_location]

    # cache miss (or forced refresh): read from the indexed document
    finder = MathDocument(Control(self.control_filename))
    markup = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
    if isinstance(markup, bytes):
        markup = markup.decode('UTF-8')

    # save on cache...
    doc_cache[location] = markup
    self.cached_expressions[expression] = (doc_id, location)
    return markup
def __init__(self, name, expression, mathml=None, initRetrievalTime='undefined'):
    """
    Set up a query from an SLT string or, when given, from MathML

    :param name: query name
    :param expression: SLT string; parsed only when no mathml is supplied
    :param mathml: optional MathML; when present the tree and the
                   expression string are derived from it instead
    :param initRetrievalTime: tuple-based retrieval time, stored as given
    """
    self.name = name
    self.mathml = mathml
    self.results = {}
    self.documents = {}

    if mathml is None:
        # parse from SLT string (no mathml information available)
        self.tree = SymbolTree.parse_from_slt(expression)
        self.expression = expression
    else:
        # parse from mathml (additional information extracted)
        self.tree = MathExtractor.convert_and_link_mathml(mathml)
        self.expression = self.tree.tostring()

    self.constraints = Query.create_default_constraints(self.tree)

    # sorted views start unset
    self.sorted_results = self.sorted_result_index = None
    self.sorted_abs_ranks = None
    self.sorted_documents = self.sorted_document_index = None

    self.elapsed_time = 0.0

    # RZ: add tuple-based retrieval time and other measures.
    self.initRetrievalTime = initRetrievalTime
    self.postings = self.matchedFormulae = self.matchedDocs = None

    # cache ...
    self.html_queryblock = {}
def __init__(self, query, expression, original_ranking, original_score, mathml=None):
    """
    Set up a result for a query from an SLT string or, when given, MathML

    :param query: query this result belongs to (its ``name`` is used in the
                  bad-conversion message)
    :param expression: SLT string; parsed only when no mathml is supplied
    :param original_ranking: ranking of the result, stored as given
    :param original_score: score of the result, stored as given
    :param mathml: optional MathML; when present the tree and the
                   expression string are derived from it instead

    :raises SystemExit: with status 1 when the SLT string does not survive a
                        parse / tostring round trip
    """
    self.query = query
    self.original_ranking = original_ranking
    self.original_score = original_score
    self.mathml = mathml
    self.new_scores = [0.0]

    if mathml is not None:
        # parse from mathml (additional information extracted)
        self.tree = MathExtractor.convert_and_link_mathml(mathml)
        self.expression = self.tree.tostring()

        # NOTE(review): "probando.txt" looks like a leftover debug dump;
        # kept to preserve behaviour, but confirm whether it can be removed.
        # The context manager closes the file even if the write fails.
        with open("probando.txt", 'w', encoding='utf-8') as out_file:
            out_file.write(self.expression)
    else:
        # parse from SLT string (no mathml information available)
        self.tree = SymbolTree.parse_from_slt(expression)
        self.expression = expression

        round_trip = self.tree.tostring()
        if round_trip != expression:
            print("Bad conversion for result for query " + query.name +
                  ": " + expression + " -> " + round_trip)
            # raise directly: the bare exit() helper is injected by the
            # site module and is not guaranteed to exist in every runtime
            raise SystemExit(1)

    self.locations = []
    self.matched_elements = []
    self.unified_elements = {}
    self.wildcard_matches = {}
    self.all_unified = []
    self.times_rendered = 0
def __init__(self, name, expression, mathml=None, initRetrievalTime='undefined', max_results=0):
    """
    Set up a query from an SLT string or, when given, from MathML

    :param name: query name
    :param expression: SLT string; parsed only when no mathml is supplied
    :param mathml: optional MathML; when present the tree and the
                   expression string are derived from it instead
    :param initRetrievalTime: tuple-based retrieval time, stored as given
    :param max_results: re-rank at most this many results, stored as given
    """
    self.name = name
    self.mathml = mathml
    self.results = {}
    self.documents = {}

    if mathml is None:
        # parse from SLT string (no mathml information available)
        self.tree = SymbolTree.parse_from_slt(expression)
        self.expression = expression
    else:
        # parse from mathml (additional information extracted)
        self.tree = MathExtractor.convert_and_link_mathml(mathml)
        self.expression = self.tree.tostring()

    self.constraints = Query.create_default_constraints(self.tree)

    # sorted views start unset
    self.sorted_results = self.sorted_result_index = None
    self.sorted_abs_ranks = None
    self.sorted_documents = self.sorted_document_index = None

    self.elapsed_time = 0.0

    # RZ: add tuple-based retrieval time and other measures.
    self.initRetrievalTime = initRetrievalTime
    self.postings = self.matchedFormulae = self.matchedDocs = None

    # Re-rank at most K results
    self.max_results = max_results

    # cache ...
    self.html_queryblock = {}
def __init__(self, query, expression, original_ranking, original_score, mathml=None):
    """
    Set up a result for a query from an SLT string or, when given, MathML

    :param query: query this result belongs to
    :param expression: SLT string; parsed only when no mathml is supplied
    :param original_ranking: ranking of the result, stored as given
    :param original_score: score of the result, stored as given
    :param mathml: optional MathML; when present the tree and the
                   expression string are derived from it instead
    """
    self.query = query
    self.original_ranking = original_ranking
    self.original_score = original_score
    self.mathml = mathml
    self.new_scores = [0.0]

    if mathml is None:
        # parse from SLT string (no mathml information available)
        self.tree = SymbolTree.parse_from_slt(expression)
        self.expression = expression
    else:
        # parse from mathml (additional information extracted)
        self.tree = MathExtractor.convert_and_link_mathml(mathml)
        self.expression = self.tree.tostring()

    # match bookkeeping starts empty
    self.locations = []
    self.matched_elements = []
    self.unified_elements = []
    self.times_rendered = 0
def process_query_batch(args):
    """
    Given a batch of queries, search the math index for each one

    :param args: tuple (system, db, run_tag, query_list, topk, math_index,
                 strategy); only query_list, topk and math_index are used
                 here. query_list holds (query_num, query_string) pairs
    :return: tuple (fileid, stats) where fileid is this process' pid and
             stats is the Stats object filled while processing the batch
    """
    stats = Stats()
    fileid = os.getpid()

    # the full 7-tuple is still unpacked so a malformed args tuple fails
    # here; underscore-prefixed slots are not used by this function
    _system, _db, _run_tag, query_list, topk, math_index, _strategy = args

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)
        for (query_num, query_string) in query_list:
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)
            math_index.search(fileid, query_num, trees)
            # also need to handle keyword queries if present
    finally:
        # close the DB even when parsing or searching raises, so a failed
        # batch does not leave the connection opened above dangling
        math_index.closeDB(fileid)

    return (fileid, stats)