Esempio n. 1
0
def read_file_behrooz(filename,
                      file_id,
                      semantic,
                      missing_tags=None,
                      problem_files=None):
    """
    Load a document and hand its content to the Behrooz XML parser.

    :type filename: string
    :param filename: file to be parsed

    :param file_id: accepted for signature compatibility; not used here
        (the parser is always called with a content id of 1)

    :param semantic: forwarded to the parser as its ``operator`` argument

    :param missing_tags: forwarded unchanged to the parser

    :param problem_files: forwarded unchanged to the parser

    :return: whatever ``MathExtractor.Behrooz_parse_from_xml`` returns for
        the file content, parsed with ``window=3``
    """
    # Only the content is needed; the extension is not inspected here.
    _ext, document_text = MathDocument.read_doc_file(filename)
    return MathExtractor.Behrooz_parse_from_xml(
        document_text,
        1,
        window=3,
        operator=semantic,
        missing_tags=missing_tags,
        problem_files=problem_files,
    )
Esempio n. 2
0
 def get_collection(self):
     """
     Extract the pair tuples of every formula found in the collection.

     Every sub-directory of ``self.collection_file_path`` is expected to
     hold an ``Articles`` folder; each file in that folder is read and
     parsed with ``MathExtractor.parse_from_xml`` (operator trees when
     ``self.read_slt`` is false).

     Files that fail to read or parse are skipped and their name is
     printed, so one broken article does not abort the whole run.

     :rtype: dict
     :return: mapping of ``"<ascii file name>:<formula key>"`` to the
         result of ``get_pairs(window=2, eob=True)`` for that formula
     """
     dictionary_formula_tuples = {}
     root = self.collection_file_path
     for directory in os.listdir(root):
         directory_path = os.path.join(root, directory)
         if not os.path.isdir(directory_path):
             continue
         articles_path = os.path.join(directory_path, "Articles")
         for filename in os.listdir(articles_path):
             file_path = os.path.join(articles_path, filename)
             file_name = os.path.splitext(filename)[0]
             try:
                 (ext, content) = MathDocument.read_doc_file(file_path)
                 formulas = MathExtractor.parse_from_xml(
                     content,
                     1,
                     operator=(not self.read_slt),
                     missing_tags=None,
                     problem_files=None)
                 # Fold the name to plain ASCII. Decoding the bytes (rather
                 # than slicing their repr) keeps backslashes and quotes
                 # in the name intact.
                 file_name = unicodedata.normalize('NFKD', file_name).encode(
                     'ascii', 'ignore').decode('ascii')
                 for key in formulas:
                     tuples = formulas[key].get_pairs(window=2, eob=True)
                     dictionary_formula_tuples[file_name + ":" +
                                               str(key)] = tuples
             except Exception:
                 # Best effort: report the offending file and keep going.
                 # KeyboardInterrupt / SystemExit are deliberately not caught.
                 print(file_name)
     return dictionary_formula_tuples
Esempio n. 3
0
 def get_query(self):
     """
     Extract the pair tuples for the query files ``1.html`` .. ``20.html``.

     Each query file is looked up under ``self.queries_directory_path`` and
     parsed with ``MathExtractor.parse_from_xml`` (operator trees when
     ``self.read_slt`` is false). A query that cannot be read or parsed is
     skipped and its number is printed.

     :rtype: dict
     :return: mapping of query number to the result of
         ``get_pairs(window=2, eob=True)``
     """
     dictionary_query_tuples = {}
     for j in range(1, 21):
         temp_address = self.queries_directory_path + '/' + str(j) + '.html'
         try:
             (ext, content) = MathDocument.read_doc_file(temp_address)
             formulas = MathExtractor.parse_from_xml(
                 content,
                 1,
                 operator=(not self.read_slt),
                 missing_tags=None,
                 problem_files=None)
             for key in formulas:
                 # NOTE(review): a query with several formulas keeps only
                 # the last one, since every key writes to the same slot.
                 # Presumably each query file holds one formula - confirm.
                 dictionary_query_tuples[j] = formulas[key].get_pairs(
                     window=2, eob=True)
         except Exception:
             # Best effort: report the query number and continue.
             # KeyboardInterrupt / SystemExit are deliberately not caught.
             print(j)
     return dictionary_query_tuples
Esempio n. 4
0
def read_file(filename,
              file_id,
              semantic,
              missing_tags=None,
              problem_files=None):
    """
    Read a file and parse it according to its extension.

    :type filename: string
    :param filename: file to be parsed

    :param file_id: identifier passed to the parser for this file

    :param semantic: when true, parse in operator tree mode; ``.tex``
        input is rejected in that mode

    :param missing_tags: forwarded unchanged to the XML parser

    :type problem_files: dict or None
    :param problem_files: optional mapping of problem category to a set of
        file names; rejected files are recorded under
        ``"invalid_filetype"`` or ``"unknown_filetype"``. When ``None``,
        nothing is recorded.

    :rtype: tuple(list, int)
    :return: the parsed expressions and the error count reported by the
        XML parser (``0`` for TeX input and for rejected files)
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex' and not semantic:
        t = MathExtractor.parse_from_tex(content, file_id)
        return [t], 0

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t, n_err = MathExtractor.parse_from_xml(content,
                                                file_id,
                                                operator=semantic,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)
        # Existing behavior: every parsed expression is echoed to stdout.
        for item in t:
            print(item)
        return t, n_err

    # Anything reaching this point cannot be parsed. A '.tex' file here
    # implies semantic mode, because the non-semantic case returned above.
    if ext == '.tex':
        category = "invalid_filetype"
        message = 'invalid file format %s for %s in operator tree mode' % (
            ext, filename)
    else:
        category = "unknown_filetype"
        message = 'Unknown filetype %s for %s' % (ext, filename)

    # problem_files defaults to None; only record when a dict was supplied.
    if problem_files is not None:
        problem_files.setdefault(category, set()).add(filename)
    print(message)
    return [], 0
Esempio n. 5
0
def check_value_exists(
        root='/home/bm3302/Downloads/NTCIR12_MathIR_WikiCorpus_v2.1.0/NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles/wpmath00000',
        value="9.80665"):
    """
    Print the path of every corpus article whose content contains a value.

    The directories ``<root>01/Articles`` .. ``<root>16/Articles`` are
    scanned. Files that cannot be read or searched are reported with a
    dashed line followed by the file name, and the scan continues.

    :type root: string
    :param root: path prefix to which the two-digit directory number is
        appended; defaults to the original hard-coded corpus location

    :type value: string
    :param value: text to look for in each file's content
    """
    for j in range(1, 17):
        # Directory numbers are zero-padded to two digits (01 .. 16).
        tempAddress = root + str(j).zfill(2) + '/Articles'
        for filename in os.listdir(tempAddress):
            filePath = tempAddress + '/' + filename
            try:
                (ext, content) = MathDocument.read_doc_file(filePath)
                if value in content:
                    print(filePath)
            except Exception:
                # Best effort: flag the unreadable file and carry on.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print('-------------------------------------------' + filename)
Esempio n. 6
0
def ConvertWikipediaToSLTTuplesNewVersion(filePathForresults,
                                          filename,
                                          dirId,
                                          lst,
                                          missing_tags=None,
                                          problem_files=None):
    """
    Write the pair tuples of each formula in a file to its own text file.

    The file is parsed with ``MathExtractor.parse_from_xml`` using
    ``operator=False``. For every formula key the tuples from
    ``get_pairs(window=1, eob=True)`` are written one per line to
    ``<filePathForresults>/<dirId>/<file stem>:<key>.txt``.

    Any failure while reading, parsing or writing is swallowed and the
    input file name is printed instead.

    :type filePathForresults: string
    :param filePathForresults: root directory for the output files

    :type filename: string
    :param filename: '/'-separated path of the file to be parsed

    :param dirId: name of the sub-directory, under the results root, that
        receives the output

    :param lst: not used by this function

    :param missing_tags: forwarded unchanged to the parser

    :param problem_files: forwarded unchanged to the parser
    """
    try:
        # Last '/'-separated component of the path, without its extension.
        file_name = os.path.splitext(filename.split('/')[-1])[0]
        (ext, content) = MathDocument.read_doc_file(filename)
        formulas = MathExtractor.parse_from_xml(content,
                                                1,
                                                operator=False,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)

        for key in formulas:
            tuples = formulas[key].get_pairs(window=1, eob=True)
            if not tuples:
                # NOTE(review): this stops at the first formula without
                # tuples, so later formulas in the file are not written.
                # Existing behavior is kept; confirm whether `continue`
                # was intended.
                return
            out_path = (filePathForresults + "/" + str(dirId) + "/" +
                        file_name + ":" + str(key) + ".txt")
            # The context manager closes the handle even if a write fails.
            with open(out_path, "w") as f:
                for t in tuples:
                    f.write(t + "\n")
    except Exception:
        # Best effort: report the file and move on.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print(filename)
Esempio n. 7
0
def behrooz_queryPreparation(filename,
                             resultFile,
                             file_id,
                             missing_tags=None,
                             problem_files=None):
    """
    Write the pair tuples of a query file's formulas to ``resultFile``.

    The file is parsed with ``MathExtractor.parse_from_xml`` using
    ``operator=False``; tuples come from ``get_pairs(window=1, eob=True)``
    and are written one per line.

    :type filename: string
    :param filename: query file to be parsed

    :type resultFile: string
    :param resultFile: path of the output file

    :param file_id: not used by this function

    :param missing_tags: forwarded unchanged to the parser

    :param problem_files: forwarded unchanged to the parser
    """
    (ext, content) = MathDocument.read_doc_file(filename)
    formulas = MathExtractor.parse_from_xml(content,
                                            1,
                                            operator=False,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)
    for key in formulas:
        tuples = formulas[key].get_pairs(window=1, eob=True)
        if not tuples:
            # Existing behavior: stop at the first formula without tuples.
            return
        # NOTE(review): the file is truncated for every formula, so with
        # several formulas only the last one's tuples remain. Presumably a
        # query holds a single formula - confirm.
        # The context manager closes the handle even if a write fails.
        with open(resultFile, "w") as f:
            for t in tuples:
                f.write(t + "\n")
Esempio n. 8
0
    def output_query(self, out_file, cntl, topk, query_time_ms):
        """
        Write this query's header and exactly ``topk`` result rows.

        The header is a blank line followed by
        ``QUERY<TAB>name<TAB>query_time_ms``. If there are no sorted
        documents, only the header is written. Otherwise each row is a
        tab-separated line: rank, score, document name, one
        ``*M* id score`` triple per matched formula and, when a text query
        is present, one ``*W* keyword score`` triple per keyword.

        When fewer than ``topk`` documents exist, a warning is printed and
        the documents are cycled; repeated rows carry the lowest score.

        :param out_file: writable text stream receiving the output
        :param cntl: passed to ``MathDocument`` to resolve MathML ids
        :param topk: number of result rows to emit
        :param query_time_ms: query time reported in the header
        """
        out_file.write("\n")
        header = "QUERY\t" + self.name + "\t" + str(query_time_ms) + "\n"
        out_file.write(header)

        ranked = self.sorted_docs
        total = len(ranked)
        if total == 0:
            # no results? nothing can be output
            return

        d = MathDocument(cntl)

        # Score of the last ranked document, reused for repeated rows.
        lowest_score = ranked[-1].final_score
        if total < topk:
            print("Warning: Query produced less than " + str(topk) +
                  " documents. Results will be repeated",
                  flush=True)

        # force output topk results
        for rank in range(1, topk + 1):
            position = rank - 1
            doc = ranked[position % total]

            math_positions = self.get_math_pos_with_score(doc)
            try:
                exprids = [(d.find_mathml_id(doc.doc_id, pos[0]), pos[1])
                           for pos in math_positions]
            except IOError:  # cannot read ids
                exprids = math_positions

            # Original score on the first pass; lowest score once the
            # documents start repeating.
            score = doc.final_score if position < total else lowest_score
            score_text = str(score[0]) if isinstance(score, list) else str(score)

            fields = [str(rank), score_text, doc.doc_name]

            # add formulas *M*
            for exprid, mscore in exprids:
                fields.extend(["*M*", str(exprid), str(mscore[0])])

            # add Keywords *W*
            if self.tquery:
                for keyword in self.tquery.keywords:
                    fields.extend(["*W*", keyword, str(doc.tscore[1])])

            out_file.write("\t".join(fields) + "\n")