Beispiel #1
0
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read a document file and parse the math expressions it contains.

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: identifier handed through to the MathExtractor parsers
    :param missing_tags: optional collector forwarded to
                         MathExtractor.parse_from_xml
    :type  problem_files: dict or None
    :param problem_files: optional dict forwarded to
                          MathExtractor.parse_from_xml; a file with an
                          unrecognised extension is added to the set stored
                          under its "unknown_filetype" key

    :rtype: list(SymbolTree)
    :return: list of Symbol trees found in the file (empty list when the
             file type is not recognised)
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        # the tex parser result is wrapped so every branch returns a list
        return [MathExtractor.parse_from_tex(content, file_id)]

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content, file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # Unknown extension: problem_files defaults to None, so only record the
    # file when the caller actually supplied a collector.
    if problem_files is not None:
        problem_files.setdefault("unknown_filetype", set()).add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
Beispiel #2
0
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: identifier passed on to the MathExtractor parsers
    :param missing_tags: optional collector passed on to
                         MathExtractor.parse_from_xml
    :type  problem_files: dict or None
    :param problem_files: optional dict passed on to
                          MathExtractor.parse_from_xml; when given, files
                          whose extension is not recognised are added to the
                          set kept under the "unknown_filetype" key

    :rtype: list(SymbolTree)
    :return: list of Symbol trees found in the file; an empty list for an
             unknown file type
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        tree = MathExtractor.parse_from_tex(content, file_id)
        # wrapped in a list to match the return shape of the XML branch
        return [tree]
    elif ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content,
                                            file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)
    else:
        # problem_files is optional (default None); calling .get() on it
        # unconditionally would raise AttributeError for such callers.
        if problem_files is not None:
            unknown = problem_files.setdefault("unknown_filetype", set())
            unknown.add(filename)
        print('Unknown filetype %s for %s' % (ext, filename))
        return []
    def find_mathml(self, docid, position):
        """
        Look up one math expression inside a document (or the query file).

        :param docid: document number; a negative value reads self.queries
                      instead of a document
        :type  docid: int
        :param position: relative number of math expr within document
        :type  position: int

        :return: MathML, or None when position is past the last expression
        :rtype: string
        """
        # a negative docid is a hack that redirects the lookup to the queries
        source = self.queries if docid < 0 else self.find_doc_file(docid)
        (ext, content) = self.read_doc_file(source)

        if ext == '.tex':
            # a .tex file is converted as a whole, so position cannot select
            if position > 0:
                print("Warning: .tex documents have only one expression; position %i ignored\n" % position)
            return LatexToMathML.convert_to_mathml(content)

        expressions = MathExtractor.math_tokens(content)
        if position < len(expressions):
            return expressions[position]

        print("Cannot find MathML expression: position %i too large" % position)
        return None
Beispiel #4
0
    def get(self, doc_id, location, expression):
        """
        Return the MathML for an expression, consulting the caches first.

        :param doc_id: document number the expression belongs to
        :param location: position of the expression within that document
        :param expression: expression key used to reuse MathML that was
                           already retrieved at another location

        :return: the cached or freshly retrieved MathML
        """
        per_doc = self.cached_locations.setdefault(doc_id, {})

        # this exact location was retrieved before
        if location in per_doc:
            return per_doc[location]

        # same expression was retrieved before, but at a different location
        if expression in self.cached_expressions:
            (seen_doc, seen_location) = self.cached_expressions[expression]
            return self.cached_locations[seen_doc][seen_location]

        # first time the expression is seen: read it from the collection,
        # using the control file name (after indexing)
        finder = MathDocument(Control(self.control_filename))
        pmml = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
        if isinstance(pmml, bytes):
            pmml = pmml.decode('UTF-8')

        # remember the result under both keys
        per_doc[location] = pmml
        self.cached_expressions[expression] = (doc_id, location)
        return pmml
def process_query_batch(args):
    """
    Run a batch of queries (math expressions plus keywords) on the math index.

    :param args: tuple ``(query_list, topk, math_index)``; query_list holds
                 ``(query_num, query_string)`` pairs, topk is forwarded to
                 openDB and search, math_index is the index object to query

    :rtype: tuple
    :return: ``(fileid, stats)`` -- the id of this process, which is also the
             id used for the index calls, and the Stats gathered for the batch
    """
    stats = Stats()
    fileid = os.getpid()

    query_list, topk, math_index = args

    # compiled once: the keyword pattern does not change between queries
    keyword_pattern = re.compile(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>")

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)

        for (query_num, query_string) in query_list:
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)

            # also need to handle keyword queries if present
            terms = keyword_pattern.findall(query_string)
            stats.num_keywords += len(terms)

            math_index.search(fileid, query_num, trees, terms, topk)
    finally:
        # close the index even when parsing or searching raises, so an
        # opened index is never left behind by a failed batch
        math_index.closeDB(fileid)

    return (fileid, stats)
Beispiel #6
0
    def get(self, doc_id, location, expression, force_update=False):
        """
        Return the MathML for an expression, using the caches unless told not to.

        :param doc_id: document number the expression belongs to
        :param location: position of the expression within that document
        :param expression: expression key used to reuse MathML retrieved
                           earlier at a different location
        :param force_update: when true, skip both cache lookups, retrieve the
                             MathML again and overwrite the cached entries

        :return: the cached or freshly retrieved MathML
        """
        doc_cache = self.cached_locations.setdefault(doc_id, {})

        if not force_update:
            # exact location already known
            if location in doc_cache:
                return doc_cache[location]

            # expression has been retrieved before but at different location
            if expression in self.cached_expressions:
                (old_doc, old_location) = self.cached_expressions[expression]
                return self.cached_locations[old_doc][old_location]

        # nothing usable in the cache (or a refresh was requested): read the
        # expression again through the control file (after indexing)
        control = Control(self.control_filename)
        found = MathDocument(control).find_mathml(doc_id, location)
        found = MathExtractor.isolate_pmml(found)
        if isinstance(found, bytes):
            found = found.decode('UTF-8')

        # save on cache
        doc_cache[location] = found
        self.cached_expressions[expression] = (doc_id, location)
        return found
Beispiel #7
0
    def __init__(self, name, expression, mathml=None, initRetrievalTime='undefined'):
        """
        Build a query from MathML when given, otherwise from an SLT string.

        :param name: stored as self.name
        :param expression: SLT string; parsed only when mathml is None
        :param mathml: optional MathML; when given the tree is built from it
                       and self.expression is taken from that tree
        :param initRetrievalTime: stored unchanged as self.initRetrievalTime
        """
        self.name = name
        self.mathml = mathml

        if mathml is None:
            # no mathml information available: parse from the SLT string
            self.tree = SymbolTree.parse_from_slt(expression)
            self.expression = expression
        else:
            # parse from mathml (additional information extracted)
            self.tree = MathExtractor.convert_and_link_mathml(mathml)
            self.expression = self.tree.tostring()

        self.constraints = Query.create_default_constraints(self.tree)

        # result containers, empty until filled in
        self.results = {}
        self.documents = {}

        # sorted views, not available yet
        self.sorted_results = None
        self.sorted_result_index = None
        self.sorted_abs_ranks = None
        self.sorted_documents = None
        self.sorted_document_index = None

        # timing: tuple-based retrieval time and elapsed time
        self.initRetrievalTime = initRetrievalTime
        self.elapsed_time = 0.0

        # other retrieval measures
        self.postings = None
        self.matchedFormulae = None
        self.matchedDocs = None

        # cache for the html query block
        self.html_queryblock = {}
Beispiel #8
0
    def __init__(self, query, expression, original_ranking, original_score, mathml=None):
        """
        Build a result for a query from MathML when given, else from SLT.

        The tree is converted back to a string and compared with
        ``expression``; on a mismatch a message is printed and the process
        exits with status 1.

        :param query: owning query; its ``name`` is used in the mismatch message
        :param expression: SLT string of the result
        :param original_ranking: stored as self.original_ranking
        :param original_score: stored as self.original_score
        :param mathml: optional MathML; when given the tree is built from it

        :raises SystemExit: when the tree string differs from expression
        """
        # local import so the block does not depend on a file-level import
        import sys

        self.query = query
        self.original_ranking = original_ranking
        self.original_score = original_score
        self.mathml = mathml
        self.new_scores = [0.0]

        if mathml is not None:
            # parse from mathml (additional information extracted)
            self.tree = MathExtractor.convert_and_link_mathml(mathml)
            tree_string = self.tree.tostring()
            self.expression = tree_string

            # NOTE(review): this looks like leftover debug output -- the file
            # is overwritten for every mathml-backed result; confirm whether
            # it is still needed. The with-block closes the file even if the
            # write fails.
            with open("probando.txt", 'w', encoding='utf-8') as out_file:
                out_file.write(tree_string)
        else:
            # parse from SLT string (no mathml information available)
            self.tree = SymbolTree.parse_from_slt(expression)
            self.expression = expression
            tree_string = self.tree.tostring()

        if tree_string != expression:
            print("Bad conversion for result for query " + query.name + ": " + expression + " -> " + tree_string)
            # sys.exit instead of the site-provided exit(), which is meant for
            # interactive use and is not guaranteed to be defined
            sys.exit(1)

        self.locations = []
        self.matched_elements = []
        self.unified_elements = {}
        self.wildcard_matches = {}
        self.all_unified = []
        self.times_rendered = 0
Beispiel #9
0
    def find_mathml(self, docid, position):
        """
        Fetch a single math expression from a document or from the queries.

        :param docid: document number or a negative value (to read query)
        :type  docid: int
        :param position: relative number of math expr within document
        :type  position: int

        :return: MathML, or None if position is beyond the expressions found
        :rtype: string
        """
        if docid >= 0:
            path = self.find_doc_file(docid)
        else:
            # hack to allow for reading queries instead
            path = self.queries
        ext, content = self.read_doc_file(path)

        if ext != '.tex':
            tokens = MathExtractor.math_tokens(content)
            if position >= len(tokens):
                print("Cannot find MathML expression: position %i too large" % position)
                return None
            return tokens[position]

        # .tex content is converted as one expression; position is not used
        if position > 0:
            print("Warning: .tex documents have only one expression; position %i ignored\n" % position)
        return LatexToMathML.convert_to_mathml(content)
Beispiel #10
0
    def __init__(self,
                 name,
                 expression,
                 mathml=None,
                 initRetrievalTime='undefined',
                 max_results=0):
        """
        Create a query from MathML if supplied, otherwise from an SLT string.

        :param name: stored as self.name
        :param expression: SLT string; used for parsing only without mathml
        :param mathml: optional MathML; when supplied, the tree is built from
                       it and self.expression comes from that tree
        :param initRetrievalTime: stored unchanged as self.initRetrievalTime
        :param max_results: re-rank at most this many results
        """
        self.name = name
        self.mathml = mathml

        # result containers, empty until filled in
        self.results = {}
        self.documents = {}

        if mathml is None:
            # parse from SLT string (no mathml information available)
            self.tree = SymbolTree.parse_from_slt(expression)
            self.expression = expression
        else:
            # parse from mathml (additional information extracted)
            self.tree = MathExtractor.convert_and_link_mathml(mathml)
            self.expression = self.tree.tostring()

        self.constraints = Query.create_default_constraints(self.tree)

        # sorted views are not available yet
        self.sorted_results = None
        self.sorted_result_index = None
        self.sorted_abs_ranks = None
        self.sorted_documents = None
        self.sorted_document_index = None

        # timing: tuple-based retrieval time and elapsed time
        self.elapsed_time = 0.0
        self.initRetrievalTime = initRetrievalTime

        # other retrieval measures
        self.postings = None
        self.matchedFormulae = None
        self.matchedDocs = None

        # Re-rank at most K results
        self.max_results = max_results

        # cache for the html query block
        self.html_queryblock = {}
Beispiel #11
0
    def __init__(self, query, expression, original_ranking, original_score, mathml=None):
        """
        Build a result for a query from MathML when given, else from SLT.

        :param query: stored as self.query
        :param expression: SLT string; parsed only when mathml is None
        :param original_ranking: stored as self.original_ranking
        :param original_score: stored as self.original_score
        :param mathml: optional MathML; when given the tree is built from it
                       and self.expression is taken from that tree
        """
        self.query = query
        self.mathml = mathml

        # ranking information as received, plus the list of new scores
        self.original_ranking = original_ranking
        self.original_score = original_score
        self.new_scores = [0.0]

        if mathml is None:
            # no mathml information available: parse from the SLT string
            self.tree = SymbolTree.parse_from_slt(expression)
            self.expression = expression
        else:
            # parse from mathml (additional information extracted)
            self.tree = MathExtractor.convert_and_link_mathml(mathml)
            self.expression = self.tree.tostring()

        # match bookkeeping, empty until filled in
        self.locations = []
        self.matched_elements = []
        self.unified_elements = []
        self.times_rendered = 0
Beispiel #12
0
def process_query_batch(args):
    """
    Run a batch of math queries against the math index.

    :param args: 7-tuple ``(system, db, run_tag, query_list, topk,
                 math_index, strategy)``; only query_list (pairs of
                 ``(query_num, query_string)``), topk and math_index are
                 used here

    :rtype: tuple
    :return: ``(fileid, stats)`` -- the id of this process, which is also the
             id used for the index calls, and the Stats gathered for the batch
    """
    stats = Stats()
    fileid = os.getpid()

    # the remaining tuple members are not needed by this function
    _system, _db, _run_tag, query_list, topk, math_index, _strategy = args

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)

        for (query_num, query_string) in query_list:
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)
            math_index.search(fileid, query_num, trees)

            # TODO: also need to handle keyword queries if present
    finally:
        # close the index even when parsing or searching raises, so an
        # opened index is never left behind by a failed batch
        math_index.closeDB(fileid)

    return (fileid, stats)