import io
import xml.etree.ElementTree

from tangent.math.mathdocument import MathDocument
from tangent.utility.control import Control


def find_formula_ids(tsv_results, control_filename):
    control = Control(control_filename)
    document_finder = MathDocument(control)

    for query_offset in tsv_results:
        print("Processing Query: " + str(query_offset))
        total_locs = len(tsv_results[query_offset]["results"])
        for index, result in enumerate(tsv_results[query_offset]["results"]):
            doc, loc = result
            mathml = document_finder.find_mathml(doc, loc)

            elem_content = io.StringIO(mathml) # treat the string as if a file
            root = xml.etree.ElementTree.parse(elem_content).getroot()

            if "id" in root.attrib:
                math_id = root.attrib["id"]
            else:
                print("ERROR: No formula id found for Query " + str(query_offset) +
                      ", doc = " + str(doc) + ", loc = " + str(loc))
                math_id = "math.error"

            #print(str((query_offset, doc, loc, math_id)))
            tsv_results[query_offset]["math_ids"].append(math_id)
            
            if index > 0 and (index + 1) % 25 == 0:
                print("... done " + str(index + 1) + " of " + str(total_locs))
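For context, a minimal sketch of the input find_formula_ids expects: a dict keyed by query offset, where each entry carries the retrieved (doc, loc) pairs and an empty "math_ids" list that the function fills in. Document IDs, locations, and the control file path below are placeholders.

tsv_results = {
    0: {                                      # query offset
        "results": [(23145, 0), (31242, 4)],  # (doc_id, formula location) pairs
        "math_ids": [],                       # populated by find_formula_ids
    },
}
find_formula_ids(tsv_results, "tangent.cntl")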
Example #2
    def get(self, doc_id, location, expression):
        if doc_id not in self.cached_locations:
            self.cached_locations[doc_id] = {}

        if location in self.cached_locations[doc_id]:
            return self.cached_locations[doc_id][location]
        else:
            # location not cached; check whether this expression was already retrieved at another location

            if expression in self.cached_expressions:
                #expression has been retrieved before but at different location...
                prev_doc_id, prev_location = self.cached_expressions[expression]

                return self.cached_locations[prev_doc_id][prev_location]
            else:

                control = Control(self.control_filename) # control file name (after indexing)
                document_finder = MathDocument(control)

                mathml = document_finder.find_mathml(doc_id, location)
                mathml = MathExtractor.isolate_pmml(mathml)
                if isinstance(mathml, bytes):
                    mathml = mathml.decode('UTF-8')

                # save on cache...
                self.cached_locations[doc_id][location] = mathml
                self.cached_expressions[expression] = (doc_id, location)

                return mathml
Example #3
    def get(self, doc_id, location, expression, force_update=False):
        if doc_id not in self.cached_locations:
            self.cached_locations[doc_id] = {}

        if location in self.cached_locations[doc_id] and not force_update:
            return self.cached_locations[doc_id][location]
        else:
            # location not cached; check whether this expression was already retrieved at another location

            if expression in self.cached_expressions and not force_update:
                #expression has been retrieved before but at different location...
                prev_doc_id, prev_location = self.cached_expressions[
                    expression]

                return self.cached_locations[prev_doc_id][prev_location]
            else:

                control = Control(self.control_filename)  # control file name (after indexing)
                document_finder = MathDocument(control)

                mathml = document_finder.find_mathml(doc_id, location)
                mathml = MathExtractor.isolate_pmml(mathml)
                if isinstance(mathml, bytes):
                    mathml = mathml.decode('UTF-8')

                # save on cache...
                self.cached_locations[doc_id][location] = mathml
                self.cached_expressions[expression] = (doc_id, location)

                return mathml
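The two get() variants above appear to belong to the MathMLCache class used by the rerank scripts later in this listing; those scripts build the cache from the control filename and persist it with pickle. A minimal calling sketch under that assumption (control_filename, doc_id, location and expression are placeholders):

import os
import pickle

mathml_cache_file = control_filename + ".retrieval_2.cache"
if os.path.exists(mathml_cache_file):
    with open(mathml_cache_file, "rb") as cache_file:
        mathml_cache = pickle.load(cache_file)
else:
    mathml_cache = MathMLCache(control_filename)

# Presentation MathML for the formula at (doc_id, location);
# MathDocument is only consulted on a cache miss.
mathml = mathml_cache.get(doc_id, location, expression)

with open(mathml_cache_file, "wb") as cache_file:
    pickle.dump(mathml_cache, cache_file, pickle.HIGHEST_PROTOCOL)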
Example #4
def math_indexer_task(pargs) -> (str, list):
    """
    creates index tuples for the expressions in this subcollection
    :param pargs:
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    docs = MathDocument(cntl)

    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come
    for (doc_id, filename) in enumerate(mappings,start=chunkid*chunk_size):
##        print('parsing %s, id:%s ' % (filename, doc_id),flush=True)
        try:
            # get all the symbol trees found in file
            for tree in read_file(filename, doc_id, missing_tags=combined_stats.missing_tags,
                               problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                # pairs = tree.get_pairs(window) do not store pairs -- will be created in C++ module
                seen_docs.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document "+filename+": "+reason, file=sys.stderr)
            combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
            combined_stats.problem_files[reason].add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid,fileid), flush=True)
    return fileid, combined_stats
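A rough sketch of how math_indexer_task can be driven, one chunk at a time. make_math_index is a hypothetical factory; the real caller passes an index object exposing add(), a Control instance, and a chunk id, as the tuple unpacking above shows.

from tangent.utility.control import Control

cntl = Control("tangent.cntl")       # control file produced at indexing time (placeholder path)
math_index = make_math_index(cntl)   # hypothetical: any object with an add(trees) method
num_chunks = 4                       # placeholder chunk count

for chunkid in range(num_chunks):
    fileid, stats = math_indexer_task((math_index, cntl, chunkid))
    print("chunk %d -> file %s, %d expressions" %
          (chunkid, fileid, stats.num_expressions))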
Example #5
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type filename: string
    :param filename: file to be parsed

    :rtype: list(SymbolTree)
    :return: list of SymbolTree objects found in the file
    """
    #s = time.time()
    (ext, content) = MathDocument.read_doc_file(filename)
    if ext == '.tex':
        t = MathExtractor.parse_from_tex(content, file_id)
        #print("file %s took %s"%(file_id,time.time()-s))
        return [t]
    elif ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t = MathExtractor.parse_from_xml(content,
                                         file_id,
                                         missing_tags=missing_tags,
                                         problem_files=problem_files)
        #print("file %s took %s per expr"%(file_id,(time.time()-s)/len(t)))
        return t
    else:
        problem_files["unknown_filetype"] = problem_files.get(
            "unknown_filetype", set())
        problem_files["unknown_filetype"].add(filename)
        print('Unknown filetype %s for %s' % (ext, filename))
        return []
Example #6
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type filename: string
    :param filename: file to be parsed

    :rtype: list(SymbolTree)
    :return: list of SymbolTree objects found in the file
    """
    #s = time.time()
    (ext,content) = MathDocument.read_doc_file(filename)
    if ext == '.tex':
        t = MathExtractor.parse_from_tex(content, file_id)
        #print("file %s took %s"%(file_id,time.time()-s))
        return [t]
    elif ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t = MathExtractor.parse_from_xml(content, file_id, missing_tags=missing_tags, problem_files=problem_files)
        #print("file %s took %s per expr"%(file_id,(time.time()-s)/len(t)))
        return t
    else:
        problem_files["unknown_filetype"] = problem_files.get("unknown_filetype", set())
        problem_files["unknown_filetype"].add(filename)
        print('Unknown filetype %s for %s' % (ext, filename))
        return []
Example #7
def math_indexer_task(pargs) -> (str, list):
    """
    creates index tuples for the expressions in this subcollection
    :param pargs:
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    docs = MathDocument(cntl)

    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come
    for (doc_id, filename) in enumerate(mappings, start=chunkid * chunk_size):
        ##        print('parsing %s, id:%s ' % (filename, doc_id),flush=True)
        try:
            # get all the symbol trees found in file
            for tree in read_file(filename,
                                  doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                combined_stats.global_expressions += len(tree.position)
                # pairs = tree.get_pairs(window) do not store pairs -- will be created in C++ module
                seen_docs.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason,
                  file=sys.stderr)
            combined_stats.problem_files[
                reason] = combined_stats.problem_files.get(reason, set())
            combined_stats.problem_files[reason].add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
Example #8
def find_formula_ids(tsv_results, control_filename, mathids_cache):
    control = Control(control_filename)
    document_finder = MathDocument(control)

    for query_offset in tsv_results:
        print("Processing Query: " + str(query_offset))
        total_locs = len(tsv_results[query_offset]["results"])
        for index, result in enumerate(tsv_results[query_offset]["results"]):
            doc, loc = result

            math_id = mathids_cache.get_mathid(document_finder, doc, loc)

            #print(str((query_offset, doc, loc, math_id)))
            tsv_results[query_offset]["math_ids"].append(math_id)

            if index > 0 and (index + 1) % 25 == 0:
                print("... done " + str(index + 1) + " of " + str(total_locs),
                      end="\r")
Example #9
def main():
    if len(sys.argv) < 5:
        print("Usage")
        print(
            "\tpython3 rerank_results.py control input_results metric output_results"
        )
        print("")
        print("Where:")
        print("\tcontrol:\tPath to tangent control file")
        print("\tinput_results:\tPath to file with results to re-rank")
        print("\tmetric:\t\tSimilarity metric to use [-1 to 11]")
        print(
            "\toutput_results:\tPath to file where re-ranked results will be stored"
        )
        print("")
        print("Optional:")
        print("\t-w\twindow\t\t: Window for pair generation")
        print("\t-h\thtml_prefix\t: Prefix for HTML output (requires dot)")
        print("\t-c\tcondition\t: Current test condition")
        print("\t-s\tstats\t\t: File to store stats")
        print("\t-t\ttimes\t\t: File to accumulate time stats")
        print("\t-k\tmax_results\t: K number of results to rerank as maximum")
        return

    control_filename = sys.argv[1]
    input_filename = sys.argv[2]

    try:
        metric = int(sys.argv[3])
        if metric < -1 or metric > 11:
            print("Invalid similarity metric function")
            return
    except ValueError:
        print("Invalid similarity metric function")
        return

    output_filename = sys.argv[4]

    optional_params = optional_parameters(sys.argv[5:])

    #load control file
    control = Control(control_filename)  # control file name (after indexing)
    math_doc = MathDocument(control)

    if "w" in optional_params:
        try:
            window = int(optional_params["w"])
            if window <= 0:
                print("Invalid window")
                return
        except ValueError:
            print("Invalid value for window")
            return
    else:
        window = int(control.read("window"))

    if "h" in optional_params:
        html_prefix = optional_params["h"]
        if not os.path.isdir(html_prefix):
            os.makedirs(html_prefix)

    else:
        html_prefix = None

    if "c" in optional_params:
        condition = optional_params["c"]
        print("testing condition: " + condition)
    else:
        condition = "undefined"

    if "s" in optional_params:
        stats_file = optional_params["s"]
    else:
        stats_file = None

    if "k" in optional_params:
        try:
            max_k = int(optional_params["k"])
        except ValueError:
            print("Invalid max_results parameter")
            return
    else:
        max_k = 0

    if "t" in optional_params:
        times_file = optional_params["t"]
    else:
        times_file = None

    in_file = open(input_filename, 'r', newline='', encoding='utf-8')
    reader = csv.reader(in_file,
                        delimiter='\t',
                        lineterminator='\n',
                        quoting=csv.QUOTE_NONE,
                        escapechar="\\")
    lines = [row for row in reader]
    in_file.close()

    mathml_cache_file = control_filename + ".retrieval_2.cache"
    if not os.path.exists(mathml_cache_file):
        mathml_cache = MathMLCache(control_filename)
    else:
        cache_file = open(mathml_cache_file, "rb")
        mathml_cache = pickle.load(cache_file)
        cache_file.close()

    current_query = None
    current_name = None
    current_tuple_retrieval_time = 'undefined'
    all_queries = []

    #read all results to re-rank
    for idx, line in enumerate(lines):
        #parts = line.strip().split("\t")
        parts = line

        if len(parts) == 2:
            if parts[0][0] == "Q":
                current_name = parts[1]
                current_query = None

            elif parts[0][0] == "E":
                if current_name is None:
                    print("invalid expression at " + str(idx) +
                          ": query name expected first")
                else:
                    query_expression = parts[1]

                    #query_offset = len(all_queries)
                    query_offset = int(current_name.split("-")[-1]) - 1

                    if html_prefix is not None:
                        mathml = mathml_cache.get(-1, query_offset,
                                                  query_expression, True)

                        # create empty directories for this query ...
                        if not os.path.isdir(html_prefix + "/" + current_name):
                            os.makedirs(html_prefix + "/" + current_name)

                        if not os.path.isdir(html_prefix + "/" + current_name +
                                             "/images"):
                            os.makedirs(html_prefix + "/" + current_name +
                                        "/images")
                    else:
                        mathml = None

                    current_query = Query(current_name, query_expression,
                                          mathml, current_tuple_retrieval_time,
                                          max_k)
                    current_name = None
                    all_queries.append(current_query)

                    print("Query: " + current_query.name + ": " +
                          current_query.expression)
                    #print(mathml)
                    #current_query.tree.save_as_dot("expre_" + str(idx) + ".gv")

            elif parts[0][0] == "C":
                if current_query is None:
                    print("invalid constraint at " + str(idx) +
                          ": query expression expected first")
                else:
                    # create a constraint tree
                    current_query.set_constraints(parts[1])

        # RZ: Record tuple-based retrieval time and other metrics.
        if len(parts) == 3 and parts[0][0] == "I" and current_query is not None:
            if parts[1] == "qt":
                current_query.initRetrievalTime = float(parts[2])
            elif parts[1] == "post":
                current_query.postings = int(parts[2])
            elif parts[1] == "expr":
                current_query.matchedFormulae = int(parts[2])
            elif parts[1] == "doc":
                current_query.matchedDocs = int(parts[2])

        if len(parts) == 5:
            if parts[0][0] == "R":
                doc_id = int(parts[1])
                location = int(parts[2])
                doc_name = math_doc.find_doc_file(doc_id)

                expression = parts[3]
                score = float(parts[4])

                if html_prefix is not None:
                    mathml = mathml_cache.get(doc_id, location, expression)
                else:
                    mathml = None

                if current_query is None:
                    print("Error: result listed before a query, line " +
                          str(idx))
                else:
                    current_query.add_result(doc_id, doc_name, location,
                                             expression, score, mathml)

    cache_file = open(mathml_cache_file, "wb")
    pickle.dump(mathml_cache, cache_file, pickle.HIGHEST_PROTOCOL)
    cache_file.close()

    # now, re-rank...
    print("Results loaded, reranking ...")

    # compute similarity first...

    start_time = time.time()
    for q_idx, query in enumerate(all_queries):
        #print("Evaluating: " + query.name + " - " + query.expression)

        query_start_time = time.time() * 1000  # RZ: ms
        for res_idx, exp_result in enumerate(query.results):
            result = query.results[exp_result]

            #print("Candidate: " + result.expression)

            scores = [0.0]
            if metric == -1:
                # bypass mode, generate HTML for original core ranking
                scores = [result.original_score]
                matched_c = {}
            elif metric == 0:
                # same as original based on f-measure of matched pairs..
                pairs_query = query.tree.root.get_pairs("", window)
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v00(
                    pairs_query, pairs_candidate)
            elif metric == 1:
                # based on testing of alignments....
                scores, matched_q, matched_c = similarity_v01(
                    query.tree, result.tree)
            elif metric == 2:
                # Same as 0 but limiting to matching total symbols first...
                pairs_query = query.tree.root.get_pairs("", window)
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v02(
                    pairs_query, pairs_candidate)
            elif metric == 3:
                # modified version of 2 which performs unification....
                pairs_query = query.tree.root.get_pairs("", window)
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c, unified_c = similarity_v03(
                    pairs_query, pairs_candidate)
                result.set_unified_elements(unified_c)
            elif metric == 4:
                # modified version of 1 which performs unification ...
                sim_res = similarity_v04(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)
            elif metric == 5:
                # modified version of 4 which allows multiple sub matches
                sim_res = similarity_v05(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
            elif metric == 6:
                # modified version of 4 which allows subtree matches for wildcards (partial support)...
                sim_res = similarity_v06(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)
            elif metric == 7:
                # modified version of 4 which allows subtree matches for wildcards (partial support)...
                sim_res = similarity_v07(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)
            elif metric == 8:
                # modified version of 4 which allows subtree matches for wildcards (partial support)...
                sim_res = similarity_v08(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)
            elif metric == 9:
                # modified version of 4 which allows subtree matches for wildcards (partial support)...
                sim_res = similarity_v09(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)
            elif metric == 10:
                # modified version of 4 which allows subtree matches for wildcards (partial support)...
                sim_res = similarity_v10(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)
            elif metric == 11:
                # matching of metric 06 with scores from metric 04 (MSS)
                sim_res = similarity_v11(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c, unified = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)

            result.set_matched_elements(matched_c)

            result.new_scores = scores

        query_end_time = time.time() * 1000  # RZ: ms

        # re-rank based on new score(s)
        query.sort_results()
        query.sort_documents()
        query.elapsed_time = query_end_time - query_start_time

    end_time = time.time()
    elapsed = end_time - start_time
    print("Elapsed Time Ranking: " + str(elapsed) + "s")

    #now, store the re-ranked results...
    out_file = open(output_filename, 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(out_file,
                            delimiter='\t',
                            lineterminator='\n',
                            quoting=csv.QUOTE_NONE,
                            escapechar="\\")
    for query in all_queries:
        csv_writer.writerow([])
        query.output_query(csv_writer)
        query.output_sorted_results(csv_writer)

        if html_prefix is not None:
            print("Saving " + query.name + " to HTML file.....")
            query.save_html(html_prefix + "/" + query.name)
    out_file.close()

    #if stats file is requested ...
    if stats_file is not None:
        out_file = open(stats_file, "w")
        out_file.write(Query.stats_header("\t"))
        for query in all_queries:
            query.output_stats(out_file, "\t", condition)
        out_file.close()

    # if times file is requested ...
    if times_file is not None:
        sorted_queries = sorted([(query.name.strip(), query)
                                 for query in all_queries])

        if os.path.exists(times_file):
            out_file = open(times_file, "a")
        else:
            out_file = open(times_file, "w")
            header = "condition," + ",".join(
                [name for (name, query) in sorted_queries])
            out_file.write(header + "\n")

        line = condition

        for name, query in sorted_queries:
            line += "," + str(query.elapsed_time)

        out_file.write(line + "\n")

        out_file.close()

    print("Finished successfully")
Example #10
    def output_query(self, out_file, cntl, topk, query_time_ms):
        out_file.write("\n")
        out_file.write("QUERY\t" + self.name + "\t" + str(query_time_ms) +
                       "\n")

        if len(self.sorted_docs) == 0:
            # no results? nothing can be output
            return
        """
        for mquery in self.mqueries:
            out_file.write("E\t" + mquery.expression + "\n")
        if self.tquery:
            for keyword in self.tquery.keywords:
                out_file.write("P\t" + keyword + "\n")
        """
        d = MathDocument(cntl)

        min_score = self.sorted_docs[-1].final_score
        if len(self.sorted_docs) < topk:
            print("Warning: Query produced fewer than " + str(topk) +
                  " documents. Results will be repeated",
                  flush=True)

        # force output topk results
        for idx in range(topk):
            doc = self.sorted_docs[idx % len(self.sorted_docs)]

            positions = self.get_math_pos_with_score(doc)
            try:
                exprids = list(
                    map(
                        lambda pos:
                        (d.find_mathml_id(doc.doc_id, pos[0]), pos[1]),
                        positions))
            except IOError:  # cannot read ids
                exprids = positions
            #out_file.write("R\t" + str(doc.doc_name) + "\t" + str(doc.final_score) + "\t(at: " + str(exprids) + str(self.get_text_pos(doc))+ ")\n")
            row_elements = [str(idx + 1)]

            if idx < len(self.sorted_docs):
                # use original score
                doc_score = doc.final_score
            else:
                doc_score = min_score

            if isinstance(doc_score, list):
                row_elements.append(str(doc_score[0]))
            else:
                row_elements.append(str(doc_score))

            row_elements.append(doc.doc_name)

            # add formulas *M*
            for exprid, mscore in exprids:
                row_elements += ["*M*", str(exprid), str(mscore[0])]

            # add Keywords *W*
            if self.tquery:
                for keyword in self.tquery.keywords:
                    row_elements += ["*W*", keyword, str(doc.tscore[1])]

            out_file.write("\t".join(row_elements) + "\n")
Example #11
__author__ = 'FWTompa'

if __name__ == '__main__':

    if sys.stdout.encoding != 'utf8':
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if sys.stderr.encoding != 'utf8':
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    if len(argv) != 4 or argv[1] == "help":
        print("Use: python docids2doclist.py <cntl> <doc#s> <filelist>")
        print(
            "        where doc#s is a file in which each line is a set of docids"
        )
        print("        such as {23145, 31242, 125}")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    md = MathDocument(cntl)
    doclist = []
    with open(argv[2], 'r', encoding='utf-8') as fin:
        while True:
            t = fin.readline()
            if t == "":
                break
            doclist.extend(t.strip("{} \n").split(", "))
    with open(argv[3], 'w', encoding='utf-8') as fout:
        for val in doclist:
            fout.write(md.find_doc_file(int(val)) + "\n")
    print("Created list of %d file names" % len(doclist))
Example #12
def main():
    if len(sys.argv) < 5:
        print("Usage")
        print("\tpython3 rerank_results.py control input_results metric output_results")
        print("")
        print("Where:")
        print("\tcontrol:\tPath to tangent control file")
        print("\tinput_results:\tPath to file with results to re-rank")
        print("\tmetric:\t\tSimilarity metric to use [0-5]")
        print("\toutput_results:\tPath to file where re-ranked results will be stored")
        print("")
        print("Optional:")
        print("\t-w\twindow\t\t: Window for pair generation")
        print("\t-h\thtml_prefix\t: Prefix for HTML output (requires dot)")
        print("\t-c\tcondition\t: Current test condition")
        print("\t-s\tstats\t\t: File to store stats")
        print("\t-t\ttimes\t\t: File to accumulate time stats")
        return

    control_filename = sys.argv[1]
    input_filename = sys.argv[2]

    try:
        metric = int(sys.argv[3])
        if metric < 0 or metric > 5:
            print("Invalid similarity metric function")
            return
    except ValueError:
        print("Invalid similarity metric function")
        return

    output_filename = sys.argv[4]

    optional_params = optional_parameters(sys.argv[5:])

    #load control file
    control = Control(control_filename) # control file name (after indexing)
    math_doc = MathDocument(control)

    if "w" in optional_params:
        try:
            window = int(optional_params["w"])
            if window <= 0:
                print("Invalid window")
                return
        except ValueError:
            print("Invalid value for window")
            return
    else:
        window = int(control.read("window"))

    if "h" in optional_params:
        html_prefix = optional_params["h"]
        if not os.path.isdir(html_prefix):
            os.makedirs(html_prefix)
        if not os.path.isdir(html_prefix + "/images"):
            os.makedirs(html_prefix + "/images")
    else:
        html_prefix = None

    if "c" in optional_params:
        condition = optional_params["c"]
        print("testing condition: " + condition)
    else:
        condition = "undefined"

    if "s" in optional_params:
        stats_file = optional_params["s"]
    else:
        stats_file = None

    if "t" in optional_params:
        times_file = optional_params["t"]
    else:
        times_file = None

    in_file = open(input_filename, 'r', encoding="utf-8")
    lines = in_file.readlines()
    in_file.close()

    mathml_cache_file = control_filename + ".retrieval_2.cache"
    if not os.path.exists(mathml_cache_file):
        mathml_cache = MathMLCache(control_filename)
    else:
        cache_file = open(mathml_cache_file, "rb")
        mathml_cache = pickle.load(cache_file)
        cache_file.close()

    current_query = None
    current_name = None
    current_tuple_retrieval_time = 'undefined'
    all_queries = []

    #read all results to re-rank
    for idx, line in enumerate(lines):
        parts = line.strip().split("\t")

        if len(parts) == 2:
            if parts[0][0] == "Q":
                current_name = parts[1]
                current_query = None
            elif parts[0][0] == "E":
                if current_name is None:
                    print("invalid expression at " + str(idx) + ": query name expected first")
                else:
                    query_expression = parts[1]

                    if html_prefix is not None:
                        mathml = mathml_cache.get(-1, len(all_queries), query_expression)
                    else:
                        mathml = None

                    current_query = Query(current_name, query_expression, mathml, current_tuple_retrieval_time)
                    current_name = None
                    all_queries.append(current_query)

                    print("Query: " + current_query.name + ": " + current_query.expression, flush=True)
                    #print(mathml)
                    #current_query.tree.save_as_dot("expre_" + str(idx) + ".gv")

            elif parts[0][0] == "C":
                if current_query is None:
                    print("invalid constraint at " + str(idx) + ": query expression expected first")
                else:
                    # create a constraint tree
                    current_query.set_constraints(parts[1])

        # RZ: Record tuple-based retrieval time and other metrics.
        if len(parts) == 3 and parts[0][0] == "I" and current_query is not None:
            if parts[1] == "qt":
                current_query.initRetrievalTime = float(parts[2])
            elif parts[1] == "post":
                current_query.postings = int(parts[2])
            elif parts[1] == "expr":
                current_query.matchedFormulae = int(parts[2])
            elif parts[1] == "doc":
                current_query.matchedDocs = int(parts[2])

        if len(parts) == 5:
            if parts[0][0] == "R":
                doc_id = int(parts[1])
                location = int(parts[2])
                doc_name = math_doc.find_doc_file(doc_id)

                expression = parts[3]
                score = float(parts[4])

                if html_prefix is not None:
                    mathml = mathml_cache.get(doc_id, location, expression)
                else:
                    mathml = None

                if current_query is None:
                    print("Error: result listed before a query, line " + str(idx))
                else:
                    current_query.add_result(doc_id, doc_name, location, expression, score, mathml)

    cache_file = open(mathml_cache_file, "wb")
    pickle.dump(mathml_cache, cache_file, pickle.HIGHEST_PROTOCOL)
    cache_file.close()

    # now, re-rank...
    # compute similarity first...

    start_time = time.time()
    for q_idx, query in enumerate(all_queries):
        pairs_query = query.tree.root.get_pairs("", window)
        #print("Evaluating: " + query.expression)

        query_start_time = time.time() * 1000 # RZ: ms
        for res_idx, exp_result in enumerate(query.results):
            result = query.results[exp_result]

            #print("Candidate: " + result.expression)

            scores = [0.0]
            if metric == 0:
                # same as original based on f-measure of matched pairs...
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v00(pairs_query, pairs_candidate)
            elif metric == 1:
                # based on testing of alignments....
                scores, matched_q, matched_c = similarity_v01(query.tree, result.tree)
            elif metric == 2:
                # Same as 0 but limiting to matching total symbols first...
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v02(pairs_query, pairs_candidate)
            elif metric == 3:
                # modified version of 2 which performs unification....
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c, unified_c = similarity_v03(pairs_query, pairs_candidate)
                result.set_unified_elements(unified_c)
            elif metric == 4:
                # modified version of 1 which performs unification ...
                scores, matched_q, matched_c, unified_c = similarity_v04(query.tree, result.tree, query.constraints)
                result.set_unified_elements(unified_c)
            elif metric == 5:
                # modified version of 4 which allows multiple sub matches
                scores, matched_q, matched_c, unified_c = similarity_v05(query.tree, result.tree, query.constraints)
                result.set_unified_elements(unified_c)

            result.set_matched_elements(matched_c)

            result.new_scores = scores

        query_end_time = time.time() * 1000 # RZ: ms

        # re-rank based on new score(s)
        query.sort_results()
        query.sort_documents()
        query.elapsed_time = query_end_time - query_start_time 

    end_time = time.time() 
    elapsed = end_time - start_time
    print("Elapsed Time Ranking: " + str(elapsed) + "s")

    #now, store the re-ranked results...
    out_file = open(output_filename, "w")
    for query in all_queries:
        out_file.write("\n")
        query.output_query(out_file)
        query.output_sorted_results(out_file)

        if html_prefix is not None:
            print("Saving " + query.name + " to HTML file.....")
            query.save_html(html_prefix)
    out_file.close()

    #if stats file is requested ...
    if stats_file is not None:
        out_file = open(stats_file, "w")
        out_file.write(Query.stats_header("\t"))
        for query in all_queries:
            query.output_stats(out_file,"\t", condition)
        out_file.close()

    # if times file is requested ...
    if times_file is not None:
        sorted_queries = sorted([(query.name.strip(), query) for query in all_queries])

        if os.path.exists(times_file):
            out_file = open(times_file, "a")
        else:
            out_file = open(times_file, "w")
            header = "condition," + ",".join([name for (name, query) in sorted_queries])
            out_file.write(header + "\n")

        line = condition

        for name, query in sorted_queries:
            line += "," + str(query.elapsed_time)

        out_file.write(line + "\n")

        out_file.close()

    print("Finished successfully")
Example #13
from tangent.math.mathdocument import MathDocument 
from tangent.utility.control import Control
from sys import argv

cntl = Control('./tangent.cntl')
d = MathDocument(cntl)
print(d.find_doc_file(int(argv[1])))  # doc_num and pos_num
print(d.find_mathml(int(argv[1]), int(argv[2])))  # doc_num and pos_num
Example #14
        print("     text_results\\t<file with results from text search engine>")
        print("     combined_results\\t<file to store combined results>")
        print("     combine_math\\t{'rerank' | 'average'} (mechanism for combining math results)")
        print("     mweight\\t0..100 (percentage of weight on formula matches)")
        print("and may optionally include:")
        print("     run\\t<arbitrary name for query run>")
        print("as well as other pairs.")
        print("")
        print("Optional additional command line parameters:")
        print("\t-w\twindow\t\t: Window for pair generation")
        exit()


    #load control file
    control = Control(sys.argv[1]) # control file name (after indexing)
    math_doc = MathDocument(control)

    minput_filename = control.read("math_results")
    tinput_filename = control.read("text_results")
    combiner = control.read("combine_math")
    mweight = control.read("mweight",num=True,default=70)
    output_filename = control.read("combined_results")
    
    optional_params = optional_parameters(sys.argv[2:])


    window = control.read("window",num=True,default=1)
    if "w" in optional_params:
        try:
            w = int(optional_params["w"])
            if w <= 0:
Example #15
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'

if __name__ == '__main__':

    if sys.stdout.encoding != 'utf8':
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if sys.stderr.encoding != 'utf8':
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    if len(argv) != 4 or argv[1] == "help":
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print("        where (doc# < 0) => use queryfile")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    d = MathDocument(cntl)
    docno = int(argv[2])
    exprno = int(argv[3])
    print("doc " + argv[2] + ": " +
          d.find_doc_file(docno))  #print document file name
    print(d.find_mathml(docno, exprno))  # doc_num and pos_num
Example #16
    def get(self, fileid):
        """
        ingest result tuples for topk responses to queries

        :param fileid: process id used to distinguish files
        :type  fileid: string
        :return: query responses
        :rtype:  dict mapping query_name -> CompQuery()

        Q	queryID
        E       search-expr
        R	docID   position	expression	score
        R	docID   position	expression	score
        ...
        Q	queryID
        ...
        X

        """

        if (self.runmode == "now"):
            reader = self.reader
        else:
            filename = "%s_r_%s.tsv" % (self.db, fileid)
            file_path = os.path.join(self.directory, filename)
            file = open(file_path, mode='r', encoding='utf-8', newline='')
            reader = csv.reader(file,
                                delimiter='\t',
                                lineterminator='\n',
                                quoting=csv.QUOTE_NONE,
                                escapechar="\\")
        print("Reading from math engine")
        doc_list = MathDocument(self.cntl)
        all_queries = {}
        current_name = None
        current_expr = None
        for line in reader:
            if line:
                if line[0] == "Q":
                    current_name = line[1]
                    try:
                        current_query = all_queries[current_name]
                    except KeyError:
                        current_query = CompQuery(current_name)
                        all_queries[current_name] = current_query
                    current_expr = None
                elif line[0] == "E":
                    if current_name is None:
                        print(
                            "Invalid expression: Q tuple with query name expected first: "
                            + str(line),
                            flush=True)
                    else:
                        query_expression = line[1]
                        current_expr = Query(current_name, query_expression)
                        current_query.add_expr(current_expr)
                elif line[0] == "C":
                    print("Constraint ignored: " + str(line))

                elif line[0] == "I":
                    if current_name is None or current_expr is None:
                        print(
                            "Invalid information: Q tuple with query name and E tuple with expression expected first: "
                            + str(line))
                    elif line[1] == "qt":
                        current_expr.initRetrievalTime = float(line[2])
                    elif line[1] == "post":
                        current_expr.postings = int(line[2])
                    elif line[1] == "expr":
                        current_expr.matchedFormulae = int(line[2])
                    elif line[1] == "doc":
                        current_expr.matchedDocs = int(line[2])

                elif line[0] == "R":
                    if current_name is None or current_expr is None:
                        print(
                            "Invalid result item: Q tuple with query name and E tuple with expression expected first: "
                            + str(line))
                    else:
                        doc_id = int(line[1])
                        doc_name = doc_list.find_doc_file(doc_id)
                        if not doc_name:
                            doc_name = "NotADoc"
                        location = int(line[2])
                        expression = line[3]
                        score = float(line[4])
                        current_expr.add_result(doc_id, doc_name, location,
                                                expression, score)

                elif line[0] == "X":
                    break
                else:
                    print("Ignoring invalid tuple: " + str(line))
        print("Read " + str(len(all_queries)) + " queries")
        return all_queries
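For reference, the tuple stream described in the docstring above looks roughly like the following; every ID, expression, and score here is a placeholder for illustration only.

sample_result_stream = (
    "Q\tquery-1\n"
    "E\t<search-expr>\n"
    "I\tqt\t3.2\n"                       # per-expression metrics (qt, post, expr, doc)
    "R\t23145\t0\t<expression>\t12.5\n"  # docID, position, expression, score
    "R\t31242\t4\t<expression>\t9.0\n"
    "X\n"
)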
Example #17
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'



if __name__ == '__main__':

    if sys.stdout.encoding != 'utf8':
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if sys.stderr.encoding != 'utf8':
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    if len(argv) != 4 or argv[1] == "help":
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print("        where (doc# < 0) => use queryfile")
        sys.exit()

    cntl = Control(argv[1]) # control file name (after indexing)
    d = MathDocument(cntl)
    print(d.find_mathml(int(argv[2]),int(argv[3])))  # doc_num and pos_num