def test_same_xml_and_html(self):
    """Check that XML and HTML text extraction agree on sample entries.

    For each selected index, the text px.recutext_xml produces from
    self.xml_lst1 must equal the text px.recutext_html produces from the
    entry at the same index of self.html_lst1.
    """
    # Indices are checked in this order; the first mismatch fails the test.
    for idx in (34, 44, 4):
        from_xml = px.recutext_xml(self.xml_lst1[idx])
        from_html = px.recutext_html(self.html_lst1[idx])
        self.assertEqual(from_xml, from_html)
def sample_article(f, ns, para_per_article=10, min_words=15):
    '''
    Randomly sample paragraph texts from one article.

    f: source handed to etree.parse (must be parsable as XML)
    ns: namespace mapping used to resolve the `latexml` prefix; it is
        also passed to px.check_sanity
    para_per_article: number of paragraphs drawn with random.sample
    min_words: a sampled paragraph is kept only when it has at least
        this many whitespace-separated words

    Returns a list of paragraph strings. The list is empty when the
    article cannot be parsed or when a ValueError is raised while
    sampling (e.g. fewer than para_per_article paragraphs were found).
    '''
    try:
        tree = etree.parse(f, etree.XMLParser(remove_comments=True))
        candidates = tree.findall('.//latexml:para', ns)
        chosen = random.sample(candidates, para_per_article)
    except etree.ParseError:
        print('article %s could no be parsed' % f)
        return []
    except ValueError as ve:
        print('article %s has few paragraphs: %s' % (f, ve))
        return []

    texts = []
    for para in chosen:
        # Paragraphs failing the sanity check are reported and skipped.
        if not px.check_sanity(para, ns):
            print('article %s has messed up para' % f)
            continue
        text = px.recutext_xml(para)
        if len(text.split()) >= min_words:
            texts.append(text)
    return texts
Esempio n. 3
0
def para_tags(f, ns, min_words=0):
    '''
    Collect the text of every paragraph found in one article.

    f: source handed to ET.parse (must be parsable as XML)
    ns: namespace mapping used to resolve the `latexml` prefix; it is
        also passed to px.check_sanity
    min_words: a paragraph is kept only when it has at least this many
        whitespace-separated words

    Returns a list of paragraph strings, in the order findall yields
    them. The list is empty when parsing raises ET.ParseError or a
    ValueError is raised while reading the article.
    '''
    paragraphs = []
    try:
        paragraphs = ET.parse(f).findall('.//latexml:para', ns)
    except ET.ParseError:
        print('article %s could no be parsed' % f)
    except ValueError:
        print('article %s has few paragraphs' % f)

    kept = []
    for para in paragraphs:
        if px.check_sanity(para, ns):
            text = px.recutext_xml(para)
            if len(text.split()) < min_words:
                # Too short: drop silently.
                continue
            kept.append(text)
        else:
            print('article %s has messed up para' % f)
    return kept
Esempio n. 4
0
def create_definition_branch(ind, defi):
    """Build a <definition> element describing one definition.

    ind: value whose repr() is stored in the element's 'index' attribute
    defi: definition node; its text (via px.recutext_xml) becomes the
        <stmnt> child

    One <dfndum> child is appended for every item yielded by
    get_definiendum(defi, ns); `ns` is read from module scope.

    Returns the new <definition> element.
    """
    branch = etree.Element("definition")
    branch.set('index', repr(ind))

    stmnt_node = etree.SubElement(branch, 'stmnt')
    stmnt_node.text = px.recutext_xml(defi)

    for term in get_definiendum(defi, ns):
        term_node = etree.SubElement(branch, 'dfndum')
        term_node.text = term
    return branch
                print('{:15} {:>10}  {:>10}'.format(s[0], y_true_tmp[k], predicted[k]))
    return y_true, y_pred

# Prepare and print the metrics.
# First call: a single run with print_output=True on the value 119
# (presumably an index into test_samples -- confirm against
# prepare_for_metrics). Its result is stored in OO and not used below.
OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True)
# Second call: run over every index of test_samples and print the
# classification report of the true vs. predicted labels.
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))
# -

# An example of a user fed definition:
# tokenize and POS-tag Def[0], then parse it with the chunker.
chunked = chunker.parse(pos_tag(word_tokenize(Def[0])))
# Keep the first child of the parse that is an nltk Tree; the [0] raises
# IndexError when the parse contains no such child.
D =list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0]
# Bare expression: the joined first items of D's elements (presumably the
# words of the chunk) are only displayed when run interactively.
' '.join([d[0] for d in D])

# Load one article, extract the text of each 'para' tag, vectorize the
# texts and predict a label for every paragraph.
art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')
p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')] 
p_vec = count_vect.transform(p_lst)
preds = clf.predict(p_vec)

# Show index, prediction and the first 100 characters of each paragraph.
for k,p in enumerate(p_lst):
    print(k,preds[k],p[:100])
    print('------')

# Chunk paragraph number 63 and print items 0 and 2 of every CoNLL triple
# (presumably the token and its chunk tag -- confirm against tree2conlltags).
chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63]))))
for tok in chunk:
    print('{:15} {:>10} '.format(tok[0], tok[2]))

# Persist the chunker with pickle.
with open('../PickleJar/chunker.pickle', 'wb') as chunker_f:
    pickle.dump(chunker, chunker_f)

with open('data/vectorizer.pickle', 'wb') as token_f:
 def test_recutext_xml(self):
     """Check px.recutext_xml output for two entries of self.xml_lst1.

     Each (index, expected) pair is checked in order; the first mismatch
     fails the test.
     """
     expected_by_index = (
         (19, 'For the remaining properties we state we shall assume that _inline_math_ or _inline_math_.'),
         (32, '''Let _inline_math_ be a set of elements of _inline_math_. Recall that an _inline_math_-invariant CAD of _inline_math_ _citation_ is a partitioning of _inline_math_ into connected subsets called cells compatible with the zeros of the elements of _inline_math_. The output of a CAD algorithm applied to _inline_math_ is a description of an _inline_math_-invariant CAD _inline_math_ of _inline_math_. That is, _inline_math_ is a decomposition of _inline_math_ determined by the roots of the elements of _inline_math_ over the cells of some cylindrical algebraic decomposition _inline_math_ of _inline_math_; each element of _inline_math_ is sign-invariant throughout every cell of _inline_math_.'''),
     )
     for idx, expected in expected_by_index:
         self.assertEqual(expected, px.recutext_xml(self.xml_lst1[idx]))