Ejemplo n.º 1
0
    def get(self, doc_id, location, expression):
        """Return the MathML stored for ``(doc_id, location)``, using the caches.

        Lookup order:

        1. ``self.cached_locations[doc_id][location]`` if present.
        2. Otherwise, if ``expression`` was fetched before (possibly at a
           different document/location), the MathML cached for that earlier
           place. The current location is *not* added to the cache here.
        3. Otherwise the MathML is read through a ``MathDocument`` built from
           ``self.control_filename``, passed through
           ``MathExtractor.isolate_pmml``, decoded from UTF-8 when it comes
           back as ``bytes``, and recorded in both caches.

        An empty per-document cache entry is created for ``doc_id`` as a side
        effect even when the answer comes from step 2.
        """
        # make sure the per-document bucket exists before any lookup
        per_doc = self.cached_locations.setdefault(doc_id, {})

        if location in per_doc:
            return per_doc[location]

        if expression in self.cached_expressions:
            # same expression already retrieved, but from another place
            known_doc, known_loc = self.cached_expressions[expression]
            return self.cached_locations[known_doc][known_loc]

        # nothing cached: go to the indexed collection
        finder = MathDocument(Control(self.control_filename))
        markup = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
        if isinstance(markup, bytes):
            markup = markup.decode('UTF-8')

        # remember the result under both keys
        per_doc[location] = markup
        self.cached_expressions[expression] = (doc_id, location)

        return markup
def find_formula_ids(tsv_results, control_filename):
    """Append the formula id of every result to its query's ``math_ids`` list.

    ``tsv_results`` is indexed by query offset; each entry must provide a
    ``"results"`` sequence of ``(doc, loc)`` pairs and a ``"math_ids"`` list.
    For each pair the MathML is fetched through a ``MathDocument`` built from
    ``control_filename`` and parsed as XML, and the root element's ``id``
    attribute is appended to ``"math_ids"``. When the root has no ``id``, an
    error line is printed and the placeholder ``"math.error"`` is appended
    instead. Progress is printed per query and after every 25th result.

    The function mutates ``tsv_results`` in place and returns ``None``.
    """
    finder = MathDocument(Control(control_filename))

    for query_offset in tsv_results:
        print("Processing Query: " + str(query_offset))
        entry = tsv_results[query_offset]
        hits = entry["results"]
        total_locs = len(hits)

        # count from 1 so the progress test needs no offset arithmetic
        for done, (doc, loc) in enumerate(hits, start=1):
            # wrap the MathML string in a file-like object for the parser
            source = io.StringIO(finder.find_mathml(doc, loc))
            root = xml.etree.ElementTree.parse(source).getroot()

            math_id = root.attrib.get("id")
            if math_id is None:
                print("ERROR: No formula id found for Query " + str(query_offset) +
                      ", doc = " + str(doc) + ", loc = " + str(loc))
                math_id = "math.error"

            entry["math_ids"].append(math_id)

            if done % 25 == 0:
                print("... done " + str(done) + " of " + str(total_locs))
Ejemplo n.º 3
0
    def get(self, doc_id, location, expression, force_update=False):
        """Return the MathML stored for ``(doc_id, location)``, using the caches.

        Unless ``force_update`` is true, the caches are consulted first:

        1. ``self.cached_locations[doc_id][location]`` if present.
        2. Otherwise, if ``expression`` was fetched before (possibly at a
           different document/location), the MathML cached for that earlier
           place. The current location is *not* added to the cache here.

        When neither cache applies, or ``force_update`` is true, the MathML is
        read through a ``MathDocument`` built from ``self.control_filename``,
        passed through ``MathExtractor.isolate_pmml``, decoded from UTF-8 when
        it comes back as ``bytes``, and written to both caches (overwriting
        any earlier entries).

        An empty per-document cache entry is created for ``doc_id`` as a side
        effect of every call.
        """
        # make sure the per-document bucket exists before any lookup
        per_doc = self.cached_locations.setdefault(doc_id, {})

        if not force_update:
            if location in per_doc:
                return per_doc[location]

            if expression in self.cached_expressions:
                # same expression already retrieved, but from another place
                known_doc, known_loc = self.cached_expressions[expression]
                return self.cached_locations[known_doc][known_loc]

        # cache miss or forced refresh: go to the indexed collection
        finder = MathDocument(Control(self.control_filename))
        markup = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
        if isinstance(markup, bytes):
            markup = markup.decode('UTF-8')

        # remember the result under both keys
        per_doc[location] = markup
        self.cached_expressions[expression] = (doc_id, location)

        return markup
Ejemplo n.º 4
0
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'

if __name__ == '__main__':

    # Command-line entry point: print the file name of a document and the
    # MathML found at one position inside it.

    # Replace each standard stream with a UTF-8 writer over its byte buffer
    # unless the stream already reports its encoding as exactly 'utf8'.
    for stream_name in ('stdout', 'stderr'):
        stream = getattr(sys, stream_name)
        if stream.encoding != 'utf8':
            setattr(sys, stream_name,
                    codecs.getwriter('utf8')(stream.buffer, 'strict'))

    # Exactly three arguments are required; "help" as the first one also
    # shows the usage text.
    wants_usage = len(argv) != 4 or argv[1] == "help"
    if wants_usage:
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print("        where (doc# < 0) => use queryfile")
        sys.exit()

    control_path, doc_arg, expr_arg = argv[1], argv[2], argv[3]

    # control file name (after indexing)
    finder = MathDocument(Control(control_path))
    docno = int(doc_arg)
    exprno = int(expr_arg)

    # document file name first, then the MathML at (doc_num, pos_num)
    doc_file = finder.find_doc_file(docno)
    print("doc " + doc_arg + ": " + doc_file)
    print(finder.find_mathml(docno, exprno))
Ejemplo n.º 5
0
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'



if __name__ == '__main__':

    # Command-line entry point: print the MathML found at one position of
    # one document.

    # Replace each standard stream with a UTF-8 writer over its byte buffer
    # unless the stream already reports its encoding as exactly 'utf8'.
    for stream_name in ('stdout', 'stderr'):
        stream = getattr(sys, stream_name)
        if stream.encoding != 'utf8':
            setattr(sys, stream_name,
                    codecs.getwriter('utf8')(stream.buffer, 'strict'))

    # Exactly three arguments are required; "help" as the first one also
    # shows the usage text.
    wants_usage = len(argv) != 4 or argv[1] == "help"
    if wants_usage:
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print("        where (doc# < 0) => use queryfile")
        sys.exit()

    # control file name (after indexing)
    finder = MathDocument(Control(argv[1]))

    doc_num = int(argv[2])
    pos_num = int(argv[3])
    print(finder.find_mathml(doc_num, pos_num))