def setUpClass(cls):
    '''
    Build the class-level fixtures used by the tests.

    Sets on the class:
        ns        -- namespace mapping passed to every findall() call
        xml1      -- DefinitionsXML for 1501.06563.xml
        xml2      -- DefinitionsXML for enumerate_forms.xml
        def_text  -- result of xml2.get_def_text()
        xml_lst1  -- elements of xml1.exml matching './/ltx:p'
        html1     -- DefinitionsXML for 1501.06563.html
        html2     -- DefinitionsXML for 1501.06563_shortened.html
        html_lst1 -- elements of html1.exml matching './/p'
    '''
    cls.ns = {'ltx': ''}

    # XML fixtures.
    cls.xml1 = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')
    enumerate_forms = px.DefinitionsXML(
        'tests/latexmled_files/enumerate_forms.xml')
    cls.xml2 = enumerate_forms
    cls.def_text = enumerate_forms.get_def_text()
    cls.xml_lst1 = cls.xml1.exml.findall('.//ltx:p', namespaces=cls.ns)

    # HTML fixtures.
    cls.html1 = px.DefinitionsXML('tests/latexmled_files/1501.06563.html')
    cls.html2 = px.DefinitionsXML(
        'tests/latexmled_files/1501.06563_shortened.html')
    cls.html_lst1 = cls.html1.exml.findall('.//p', namespaces=cls.ns)
def test_exact_tokenize1(self):
    '''
    The lower-cased first entry of get_def_text() for math.0402243.xml
    must tokenize to exactly the same token list as the expected sentence.
    '''
    expected = '''une orbifolde pure est un espace analytique complexe normal _inline_math_ n’ayant que des singularités\nquotient.'''
    document = px.DefinitionsXML('tests/latexmled_files/math.0402243.xml')
    actual = document.get_def_text()[0].lower()
    # Compare token lists rather than raw strings so whitespace
    # differences do not matter.
    self.assertEqual(nltk.word_tokenize(expected),
                     nltk.word_tokenize(actual))
def test_DefinitionXML_sampling(self):
    '''
    Sampling minimal_example_with_defs.xml with sample_size=4 must give
    two 'real' entries and one 'nondef' entry, and the 'nondef' entry
    must contain the sentence 'This is an example document.'.
    '''
    dd = px.DefinitionsXML(
        'tests/latexmled_files/minimal_example_with_defs.xml')
    sample_dict = dd.get_def_sample_text_with(sample_size=4)
    self.assertEqual(len(sample_dict['real']), 2)
    self.assertEqual(len(sample_dict['nondef']), 1)
    # assertIn reports both operands on failure, whereas
    # assertTrue(x in y) would only say "False is not true".
    self.assertIn('This is an example document.',
                  sample_dict['nondef'][0])
def parse_clf_chunk(file_obj, clf, bio, vzer, tokr):
    '''
    Run the classifier and chunker on file_obj.

    file_obj: file object handed to parsing_xml.DefinitionsXML
    clf, bio, vzer, tokr: pickled classifiers and tokenizer, forwarded
        unchanged to Definiendum

    Returns the `root` attribute of the Definiendum that was built.
    '''
    parsed_doc = parsing_xml.DefinitionsXML(file_obj)
    return Definiendum(parsed_doc, clf, bio, vzer, tokr).root
def test_exact_tokenize3(self):
    '''
    The lower-cased entry at index 3 of get_def_text() for
    math.0407523.xml must tokenize to exactly the expected token list.
    '''
    expected_tokens = [
        'a', 'coherent', 'system', '_inline_math_', 'is', 'injective',
        'if', 'the', 'evaluation', 'morphism', '_inline_math_', 'is',
        'injective', 'as', 'a', 'morphism', 'of', 'sheaves', '.',
        'moreover', '_inline_math_', 'is', 'torsion-free', 'if', 'it',
        'is', 'injective', 'and', 'the', 'quotient', 'sheaf',
        '_inline_math_', 'is', 'torsion-free', '.',
    ]
    document = px.DefinitionsXML('tests/latexmled_files/math.0407523.xml')
    lowered_text = document.get_def_text()[3].lower()
    actual_tokens = nltk.word_tokenize(lowered_text)
    self.assertEqual(expected_tokens, actual_tokens)
def parse_clf_chunk(file_obj, clf, bio, vzer, tokr, max_stale_tries=15):
    '''
    Runs the classifier and chunker on the file_obj, retrying on OSError.

    file_obj: file object
    clf, bio, vzer, tokr: pickled classifiers and tokenizer
    max_stale_tries: maximum number of attempts made when an OSError
        (e.g. "Stale file handle") is raised. At least one attempt is
        always made, even if this value is smaller than 1.

    Returns the `root` attribute of the Definiendum built from file_obj.

    Raises OSError when the last allowed attempt still fails. The
    previous implementation fell through to `return ddum.root` with
    `ddum` unbound, which hid the real error behind UnboundLocalError.
    '''
    retried = 0
    while True:
        retried += 1
        try:
            DD = px.DefinitionsXML(file_obj)
            ddum = Definiendum(DD, clf, bio, vzer, tokr)
        except OSError as ee:
            if retried >= max_stale_tries:
                # Out of attempts: surface the real I/O error instead of
                # sleeping once more and failing on an unbound name.
                raise
            # Wait a randomized delay so concurrent workers do not all
            # retry at the same moment.
            wait_delay = randint(5, 15)
            logging.warning("%s waiting for %s retry: %s",
                            ee, wait_delay, retried)
            time.sleep(wait_delay)
        else:
            return ddum.root
def test_contain_words1(self):
    '''
    The set of tokens in the lower-cased first entry of get_def_text()
    for math.0412433.xml must equal the expected vocabulary exactly.
    '''
    expected_vocab = {
        '.', ';', ',',
        'kirwan', 'let', 'we', 'be', 'codimension', 'components',
        'divisorial', 'having', 'if', 'in', 'irreducible', 'is', 'locus',
        'mild', 'of', 'one', 'other', 'part', 'resolution', 'say', 'shall',
        'that', 'the', 'union', 'unstable', 'words',
        '_inline_math_',
    }
    document = px.DefinitionsXML('tests/latexmled_files/math.0412433.xml')
    lowered_text = document.get_def_text()[0].lower()
    found_vocab = set(nltk.word_tokenize(lowered_text))
    self.assertSetEqual(expected_vocab, found_vocab)
# Append sampled definition / non-definition texts for every queried article
# to the two output files named in args.file_names.
# NOTE(review): the nesting below was reconstructed from a
# whitespace-collapsed listing -- confirm against the original file.
print( ' Querying ', end='\r')
qq = query()
# Both files are opened in append mode, so earlier contents are kept.
with open(args.file_names[0], 'a') as real_f, open(args.file_names[1], 'a') as nondefs_f:
    for l in qq:
        # The first field of each query row is the key into art_dict.
        nonlocal_path = art_dict.get(l[0])
        if nonlocal_path:
            # Drop the leading '/mnt/' and re-root the path under loc_path.
            prepath = re.sub('^/mnt/', '', nonlocal_path)
            print( 'file: %s ' % prepath, end='\r')
            local_path = os.path.join(loc_path, prepath)
            try:
                xml = px.DefinitionsXML(local_path)
                tdict = xml.get_def_sample_text_with()
                # One sample per line: 'real' entries go to the first
                # file, 'nondef' entries to the second.
                for s in tdict['real']:
                    real_f.write(s + '\n')
                for s in tdict['nondef']:
                    nondefs_f.write(s + '\n')
            except ValueError:
                # Report the failing file and continue with the next row.
                print('error parsing file %s' % local_path)
        else:
            # The key has no entry (or a falsy one) in art_dict.
            print( 'Did not found: %s ' % l[0], end='\r')
root = etree.Element("definition") root.attrib['index'] = repr(ind) statement = etree.SubElement(root, 'stmnt') statement.text = px.recutext_xml(defi) for d in get_definiendum(defi, ns): dfndum = etree.SubElement(root, 'dfndum') dfndum.text = d return root # + root = etree.Element('root') for filenm in glob.glob('data/stacks-clean/perfect.tex.xml'): try: px_file = px.DefinitionsXML(filenm) branch = px_file.create_xml_branch() root.append(branch) except ValueError as e: print('%s is empty!' % filenm) #print(etree.tostring(root, pretty_print=True).decode('utf8')) # - with open('data/short_starts_withp_graph.xml', 'w+') as stack_file: stack_file.write(etree.tostring(root, pretty_print=True).decode('utf8')) lazrd = px.DefinitionsXML('tests/latexmled_files/1501.06563.html') #print(etree.tostring(lazrd.create_xml_branch(),pretty_print=True).decode('utf8')) #print(lazrd.get_def_sample_text_with(30)['real'][2]) d1 = lazrd.find_definitions()[2]
for k,s in enumerate(sm): print('{:15} {:>10} {:>10}'.format(s[0], y_true_tmp[k], predicted[k])) return y_true, y_pred # Prepare and print metrics for the normal metrics OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True) y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker) print(metrics.classification_report(y_true, predicted)) # - # An example of a user fed definition chunked = chunker.parse(pos_tag(word_tokenize(Def[0]))) D =list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0] ' '.join([d[0] for d in D]) art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml') p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')] p_vec = count_vect.transform(p_lst) preds = clf.predict(p_vec) for k,p in enumerate(p_lst): print(k,preds[k],p[:100]) print('------') chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63])))) for tok in chunk: print('{:15} {:>10} '.format(tok[0], tok[2])) with open('../PickleJar/chunker.pickle', 'wb') as chunker_f: pickle.dump(chunker, chunker_f)
def test_contain_words2(self):
    '''
    The tokens of the lower-cased first entry of get_def_text() for
    math.0402243.xml must include every one of the required words.
    '''
    required_words = {'quotient', 'singularités', 'orbifolde'}
    document = px.DefinitionsXML('tests/latexmled_files/math.0402243.xml')
    lowered_text = document.get_def_text()[0].lower()
    found_words = set(nltk.word_tokenize(lowered_text))
    # For sets, `<=` is the subset test.
    self.assertTrue(required_words <= found_words)
def test_exact_tokenize2(self):
    '''
    The lower-cased first entry of get_def_text() for math.0412433.xml
    must tokenize to exactly the same token list as the expected text.
    '''
    expected = '''let _inline_math_ \n be\nthe divisorial part of the unstable locus of _inline_math_ \n ; in other words,\n\n _inline_math_ is the union of the irreducible components\nof \n _inline_math_ having codimension one in _inline_math_ \n . we shall say\nthat the kirwan resolution _inline_math_ \n is mild if _inline_math_ \n .'''
    document = px.DefinitionsXML('tests/latexmled_files/math.0412433.xml')
    actual = document.get_def_text()[0].lower()
    # Compare token lists rather than raw strings so whitespace
    # differences do not matter.
    self.assertEqual(nltk.word_tokenize(expected),
                     nltk.word_tokenize(actual))
# Process directory `d`: for every tar archive in tar_lst build a 'tarfile'
# XML element with one 'article' child per .xml member, each holding the
# sampled 'definition' and 'nondef' texts.
# NOTE(review): the nesting below was reconstructed from a
# whitespace-collapsed listing -- confirm against the original file.
print('I am rpi%s and dealing with dir %s \n'%(rank, d))
out_path = os.path.join('/tmp/', d)
try:
    os.mkdir(out_path)
except FileExistsError as ee:
    # An already existing output directory is reused, not treated as an error.
    print(ee, ' continuiung using this directory')
#print(tar_lst)
#root = etree.Element('root', name=d)
for tarpath in tar_lst: # tarpath: 9201_001.tar.gz
    #print(os.path.join(mnt_path, d, T))
    tfile_elm = etree.Element('tarfile', name=tarpath)
    # presumably tar_iter yields (member name, file object) pairs for
    # members ending in '.xml' -- verify against peep.tar_iter
    for fname,T in peep.tar_iter(os.path.join(mnt_path, d, tarpath), '.xml'):
        print(fname)
        try:
            DD = px.DefinitionsXML(T)
            def_dict = DD.get_def_sample_text_with()
        except ValueError as ee:
            # Fall back to empty samples so the article element is still
            # created, just without children.
            print("\n Probably empty article: %s \n"%fname, ee)
            def_dict = {'real': [], 'nondef': []}
        art_elm = etree.SubElement(tfile_elm, 'article', name=fname)
        for defin in def_dict['real']:
            defi_elm = etree.SubElement(art_elm, 'definition')
            defi_elm.text = defin
        for defin in def_dict['nondef']:
            defi_elm = etree.SubElement(art_elm, 'nondef')
            defi_elm.text = defin
    #print(etree.tostring(tfile_elm, pretty_print=True).decode('utf-8'))
    # Output name: archive basename up to its first '.', plus '.xml.gz'.
    gz_filename = os.path.basename(tarpath).split('.')[0] + '.xml.gz'
    #logging.debug('The name of the gz_filename is: %s'%gz_filename)
if args.query: art_dict = create_dict() qq = query() change_path = lambda p: re.sub(r'^/mnt/', '/home/luis/media_home/', p) file_lst = [ change_path(art_dict[s[0]]) for s in qq if s[0] in art_dict ] else: file_lst = args.file_names for k, xml_path in enumerate(file_lst): havent_done = root.find('.//article[@name = "%s"]' % xml_path) is None if havent_done: print('Processing file: %s' % os.path.basename(xml_path), end='\r') try: px = parsing_xml.DefinitionsXML(xml_path) ddum = Definiendum(px, clf, bio, vzer, tokr) root.append(ddum.root) if k % 25 == 0 and args.output: with open(args.output, 'w') as out_f: out_f.write( etree.tostring(root, pretty_print=True).decode('utf8')) except (TypeError, etree.ParseError): print('file %s could not be parsed by parsing_xml' % os.path.basename(xml_path)) except ValueError as e: print('In the file %s found the problem' % os.path.basename(xml_path)) print(e) else: