Ejemplo n.º 1
0
def read_file_behrooz(filename,
                      file_id,
                      semantic,
                      missing_tags=None,
                      problem_files=None):
    """
    Load a document and run the Behrooz XML parser over its content.

    :type filename: string
    :param filename: path of the file to be read and parsed
    :param file_id: accepted for signature compatibility; not used here
        (the parser is always called with content id ``1``)
    :param semantic: forwarded to the parser as its ``operator`` argument
    :param missing_tags: forwarded unchanged to the parser
    :param problem_files: forwarded unchanged to the parser

    :return: whatever ``MathExtractor.Behrooz_parse_from_xml`` returns
        for the file content (parsed with ``window=3``)
    """
    # The extension reported by the reader is not needed by this variant.
    _, document_text = MathDocument.read_doc_file(filename)
    parser_options = {
        'window': 3,
        'operator': semantic,
        'missing_tags': missing_tags,
        'problem_files': problem_files,
    }
    return MathExtractor.Behrooz_parse_from_xml(document_text, 1,
                                                **parser_options)
Ejemplo n.º 2
0
 def get_collection(self, ):
     """
     Build a mapping from formula identifiers to their tuple lists for
     every article in the collection.

     Walks each sub-directory of ``self.collection_file_path``, reads every
     file in its ``Articles`` folder, parses it with
     ``MathExtractor.parse_from_xml`` (``operator`` is the negation of
     ``self.read_slt``) and extracts pairs with ``window=2, eob=True``.

     Files that fail to read or parse are skipped on a best-effort basis;
     the name of each failing file is printed.

     :rtype: dict
     :return: ``{"<ascii file stem>:<formula key>": tuples}``
     """
     dictionary_formula_tuples = {}
     root = self.collection_file_path
     for directory in os.listdir(root):
         directory_path = os.path.join(root, directory)
         if not os.path.isdir(directory_path):
             continue
         articles_path = os.path.join(directory_path, "Articles")
         for filename in os.listdir(articles_path):
             file_path = os.path.join(articles_path, filename)
             # os.listdir yields bare entry names, so only the extension
             # has to be stripped.
             file_name = os.path.splitext(filename)[0]
             try:
                 (ext, content) = MathDocument.read_doc_file(file_path)
                 formulas = MathExtractor.parse_from_xml(
                     content, 1, operator=(not self.read_slt),
                     missing_tags=None, problem_files=None)
                 # Reduce the stem to plain ASCII. Decoding the bytes
                 # (instead of slicing their repr) keeps quotes and
                 # backslashes in the name unescaped.
                 ascii_name = unicodedata.normalize('NFKD', file_name) \
                     .encode('ascii', 'ignore').decode('ascii')
                 for key in formulas:
                     tuples = formulas[key].get_pairs(window=2, eob=True)
                     dictionary_formula_tuples[ascii_name + ":" + str(key)] = tuples
             except Exception:
                 # Best effort: report the file and carry on, but let
                 # KeyboardInterrupt / SystemExit propagate.
                 print(file_name)
     return dictionary_formula_tuples
Ejemplo n.º 3
0
 def get_query(self,):
     """
     Build a mapping from query number to the tuples of its formula.

     Reads ``<self.queries_directory_path>/<j>.html`` for ``j`` in 1..20,
     parses it with ``MathExtractor.parse_from_xml`` (``operator`` is the
     negation of ``self.read_slt``) and extracts pairs with
     ``window=2, eob=True``.

     If a query file yields several formulas, the tuples of the last one
     iterated are the ones kept for that query number. Queries that fail
     to read or parse are skipped and their number is printed.

     :rtype: dict
     :return: ``{query number: tuples}``
     """
     dictionary_query_tuples = {}
     for j in range(1, 21):
         temp_address = self.queries_directory_path + '/' + str(j) + '.html'
         try:
             (ext, content) = MathDocument.read_doc_file(temp_address)
             formulas = MathExtractor.parse_from_xml(
                 content, 1, operator=(not self.read_slt),
                 missing_tags=None, problem_files=None)
             for key in formulas:
                 dictionary_query_tuples[j] = formulas[key].get_pairs(
                     window=2, eob=True)
         except Exception:
             # Best effort: report the query and carry on, but let
             # KeyboardInterrupt / SystemExit propagate.
             print(j)
     return dictionary_query_tuples
Ejemplo n.º 4
0
def read_file(filename,
              file_id,
              semantic,
              missing_tags=None,
              problem_files=None):
    """
    Read a file and parse the math expressions it contains.

    :type filename: string
    :param filename: file to be parsed
    :param file_id: identifier passed on to the parser
    :param semantic: truthy to parse in operator-tree mode; forwarded as
        ``operator`` to the XML parser. ``.tex`` input is rejected in
        this mode.
    :param missing_tags: forwarded unchanged to the XML parser
    :type problem_files: dict or None
    :param problem_files: optional mapping of problem category to a set
        of file names; rejected files are recorded under
        ``"invalid_filetype"`` or ``"unknown_filetype"``. When ``None``
        nothing is recorded.

    :rtype: tuple
    :return: ``(trees, n_err)`` -- the parse result and the error count
        reported by the XML parser (``0`` for TeX input and for
        rejected files, for which ``trees`` is an empty list)
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex' and not semantic:
        t = MathExtractor.parse_from_tex(content, file_id)
        return [t], 0

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        t, n_err = MathExtractor.parse_from_xml(content,
                                                file_id,
                                                operator=semantic,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)
        for item in t:
            print(item)
        return t, n_err

    # Only unsupported inputs reach this point. A '.tex' file can get here
    # only when ``semantic`` is truthy (the non-semantic case returned above).
    if ext == '.tex':
        category = "invalid_filetype"
        message = ('invalid file format %s for %s in operator tree mode' %
                   (ext, filename))
    else:
        category = "unknown_filetype"
        message = 'Unknown filetype %s for %s' % (ext, filename)

    # ``problem_files`` is optional: record the file only when the caller
    # supplied a mapping, rather than failing on the ``None`` default.
    if problem_files is not None:
        problem_files.setdefault(category, set()).add(filename)
    print(message)
    return [], 0
Ejemplo n.º 5
0
def check_value_exists(
        root='/home/bm3302/Downloads/NTCIR12_MathIR_WikiCorpus_v2.1.0/NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles/wpmath00000',
        value="9.80665"):
    """
    Print the path of every corpus article whose content contains ``value``.

    Scans the sixteen directories ``<root>01/Articles`` ..
    ``<root>16/Articles``. Files that cannot be read or searched are
    reported with a dashed marker line and skipped.

    :type root: string
    :param root: path prefix to which the two-digit directory number is
        appended; defaults to the original hard-coded corpus location
    :type value: string
    :param value: text to look for in each document's content
    """
    for j in range(1, 17):
        # Directory numbers are zero-padded to two digits (01 .. 16).
        articles_dir = root + str(j).zfill(2) + '/Articles'
        for filename in os.listdir(articles_dir):
            try:
                file_path = articles_dir + '/' + filename
                (ext, content) = MathDocument.read_doc_file(file_path)
                if value in content:
                    print(file_path)
            except Exception:
                # Best effort: flag the file and carry on, but let
                # KeyboardInterrupt / SystemExit propagate.
                print('-------------------------------------------' + filename)
Ejemplo n.º 6
0
def ConvertWikipediaToSLTTuplesNewVersion(filePathForresults,
                                          filename,
                                          dirId,
                                          lst,
                                          missing_tags=None,
                                          problem_files=None):
    """
    Parse one article and write the tuples of each formula to its own file.

    The article is parsed with ``MathExtractor.parse_from_xml``
    (``operator=False``) and pairs are extracted with
    ``window=1, eob=True``. For every formula that yields tuples, a file
    ``<filePathForresults>/<dirId>/<file stem>:<formula key>.txt`` is
    written with one tuple per line. Formulas without tuples are skipped.

    Any error while reading, parsing or writing is handled on a
    best-effort basis: the article path is printed and the function
    returns.

    :type filePathForresults: string
    :param filePathForresults: root directory for the output files
    :type filename: string
    :param filename: '/'-separated path of the article to convert
    :param dirId: name of the sub-directory of ``filePathForresults`` to
        write into (converted with ``str``)
    :param lst: accepted for signature compatibility; not used here
    :param missing_tags: forwarded unchanged to the parser
    :param problem_files: forwarded unchanged to the parser
    """
    try:
        file_name = os.path.splitext(filename.split('/')[-1])[0]
        (ext, content) = MathDocument.read_doc_file(filename)
        formulas = MathExtractor.parse_from_xml(content,
                                                1,
                                                operator=False,
                                                missing_tags=missing_tags,
                                                problem_files=problem_files)
        for key in formulas:
            tuples = formulas[key].get_pairs(window=1, eob=True)
            if not tuples:
                # Each formula has its own output file, so an empty one
                # must not stop the remaining formulas from being written.
                continue
            output_path = (filePathForresults + "/" + str(dirId) + "/" +
                           file_name + ":" + str(key) + ".txt")
            # The context manager closes the file even if a write fails.
            with open(output_path, "w+") as f:
                for t in tuples:
                    f.write(t + "\n")
    except Exception:
        # Best effort: report the article and carry on, but let
        # KeyboardInterrupt / SystemExit propagate.
        print(filename)
Ejemplo n.º 7
0
def behrooz_queryPreparation(filename,
                             resultFile,
                             file_id,
                             missing_tags=None,
                             problem_files=None):
    """
    Parse a query file and write its formula tuples to ``resultFile``.

    The query is parsed with ``MathExtractor.parse_from_xml``
    (``operator=False``) and pairs are extracted with
    ``window=1, eob=True``, one tuple per output line.

    ``resultFile`` is reopened in ``"w+"`` mode for every formula, so if
    the query contains several formulas only the last one written
    remains. Processing stops at the first formula that yields no tuples.

    :type filename: string
    :param filename: path of the query file to parse
    :type resultFile: string
    :param resultFile: path of the file the tuples are written to
    :param file_id: accepted for signature compatibility; not used here
    :param missing_tags: forwarded unchanged to the parser
    :param problem_files: forwarded unchanged to the parser
    """
    (ext, content) = MathDocument.read_doc_file(filename)
    formulas = MathExtractor.parse_from_xml(content,
                                            1,
                                            operator=False,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)
    for key in formulas:
        tuples = formulas[key].get_pairs(window=1, eob=True)
        if not tuples:
            return
        # The context manager closes the file even if a write fails.
        with open(resultFile, "w+") as f:
            for t in tuples:
                f.write(t + "\n")