コード例 #1
0
def write_json_worker(args):
    """Tokenize the text and math tokens of a single topic.

    Args:
        args: A ``(topic_number, input_tokens)`` pair. Every element of
            ``input_tokens`` must be a ``Text`` or a ``Math`` instance.

    Returns:
        A ``(failure_report, topic_number, output_tokens)`` tuple.
        ``output_tokens`` holds one list of token strings per successfully
        processed input token. ``failure_report`` joins one line per
        formula that could not be processed and is empty when there were
        no failures.
    """
    topic_number, input_tokens = args
    tokenized = []
    failures = []
    formula_counter = 0
    for item in input_tokens:
        assert isinstance(item, (Text, Math))
        if not isinstance(item, Text):
            # Formulae are numbered from one; failed formulae keep their
            # number, so the report points at the right input formula.
            formula_counter += 1
            try:
                cmml = MathExtractor.isolate_cmml(item.math)
                math_strings = [str(Math(part)) for part in tokenize(cmml)]
                tokenized.append(math_strings)
            except Exception as error:
                # A broken formula is reported and skipped instead of
                # aborting the whole topic.
                report_line = '- Processing formula #{} failed: {}'.format(
                    formula_counter,
                    repr(error),
                )
                failures.append(report_line)
            continue
        words = simple_preprocess(item.text)
        tokenized.append([str(Text(word)) for word in words])
    failure_report = '\n'.join(failures)
    return (failure_report, topic_number, tokenized)
コード例 #2
0
def write_json_worker(args):
    """Tokenize the paragraphs of a single document stored in a ZIP archive.

    Args:
        args: A ``(zip_filename, filename, input_paragraphs)`` triple.
            Every paragraph is an iterable of ``Text`` and ``Math``
            instances.

    Returns:
        A ``(failure_report, zip_filename, filename, output_paragraphs)``
        tuple. ``output_paragraphs`` holds one flat list of token strings
        per input paragraph. ``failure_report`` joins one line per problem
        encountered and is empty when there were none.
    """
    zip_filename, filename, input_paragraphs = args
    failures = []
    if not input_paragraphs:
        failures.append(
            '- Either the input document is empty, or it failed to parse')
    paragraphs = []
    # Paragraphs and formulae are both numbered from one in the report.
    for paragraph_number, paragraph in enumerate(input_paragraphs, 1):
        paragraph_tokens = []
        formula_number = 0
        for item in paragraph:
            assert isinstance(item, (Text, Math))
            if not isinstance(item, Text):
                formula_number += 1
                try:
                    cmml = MathExtractor.isolate_cmml(item.math)
                    # Build the full list first, so that a formula that
                    # fails midway contributes no tokens at all.
                    math_strings = [
                        str(Math(part)) for part in tokenize(cmml)
                    ]
                    paragraph_tokens.extend(math_strings)
                except Exception as error:
                    report_line = (
                        '- Processing paragraph #{}, formula #{} failed: {}'.
                        format(paragraph_number, formula_number, repr(error)))
                    failures.append(report_line)
                continue
            paragraph_tokens.append(str(item))
        paragraphs.append(paragraph_tokens)
    failure_report = '\n'.join(failures)
    return (failure_report, zip_filename, filename, paragraphs)
コード例 #3
0
def write_json_worker(args):
    """Tokenize the identified formulae of a single file.

    Args:
        args: A ``(filename, input_formulae)`` pair, where
            ``input_formulae`` yields ``(formula_id, formula)`` pairs and
            every formula exposes its markup as ``formula.math``.

    Returns:
        A ``(failure_report, filename, output_formulae)`` tuple.
        ``output_formulae`` holds a ``(formula_id, tokens)`` pair for every
        formula that was processed successfully. ``failure_report`` joins
        one line per failed formula and is empty when there were none.
    """
    filename, input_formulae = args
    tokenized = []
    failures = []
    for formula_id, formula in input_formulae:
        try:
            cmml = MathExtractor.isolate_cmml(formula.math)
            math_strings = [str(Math(part)) for part in tokenize(cmml)]
            tokenized.append((formula_id, math_strings))
        except Exception as error:
            # A failed formula is reported and left out of the output.
            report_line = '- Processing formula {} failed: {}'.format(
                formula_id,
                repr(error),
            )
            failures.append(report_line)
    failure_report = '\n'.join(failures)
    return (failure_report, filename, tokenized)
def _extract_formula_mathml(xml_document, formula_number):
    """Pull the Content and Presentation MathML of one formula out of a document.

    The formula is located through the paragraph whose text is
    ``Formula #<formula_number>:``; only the first ``<math>`` element found
    in that paragraph is used.

    Args:
        xml_document: The parsed converter output to search.
        formula_number: The value that was used to label the formula.

    Returns:
        A ``(cmml_tokens, cmml_failure, pmml_tokens, pmml_failure)`` tuple.
        On success the tokens are the serialized MathML and the failure is
        ``None``; on failure the tokens are ``''`` and the failure is the
        exception describing what went wrong.

    Errors raised while locating or serializing the ``<math>`` element
    itself are deliberately not caught here, so that the caller can retry.
    """
    math_elements = xml_document.xpath(
        '//xhtml:div[@class = "ltx_para" and xhtml:p[@class = "ltx_p" and normalize-space(text()) = "Formula #{}:"]]//mathml:math'
        .format(formula_number),
        namespaces=XML_NAMESPACES)
    if not math_elements:
        not_found = 'Formula not found in LaTeXML output'
        return ('', ValueError(not_found), '', ValueError(not_found))
    math_tokens = tree_to_unicode(math_elements[0])
    try:
        cmml_math_element = unicode_to_tree(
            MathExtractor.isolate_cmml(math_tokens))
        pmml_math_element = unicode_to_tree(
            MathExtractor.isolate_pmml(math_tokens))
        if cmml_math_element.xpath('//mathml:cerror',
                                   namespaces=XML_NAMESPACES):
            cmml_math_tokens = ''
            cmml_failure = ValueError(
                'LaTeXML output contains <cerror> elements')
        else:
            # Unwrap <semantics> so that only its children remain.
            etree.strip_tags(
                cmml_math_element,
                '{{{}}}semantics'.format(XML_NAMESPACES['mathml']))
            cmml_math_tokens = tree_to_unicode(cmml_math_element)
            cmml_failure = None
        if pmml_math_element.xpath('//mathml:cerror',
                                   namespaces=XML_NAMESPACES):
            pmml_math_tokens = ''
            pmml_failure = ValueError(
                'LaTeXML output contains <cerror> elements')
        else:
            pmml_math_tokens = tree_to_unicode(pmml_math_element)
            pmml_failure = None
    except Exception as e:
        # Any problem with this one formula fails both representations,
        # but must not affect the other formulae of the batch.
        return ('', e, '', e)
    return (cmml_math_tokens, cmml_failure, pmml_math_tokens, pmml_failure)


def write_tsv_worker(latex_rows):
    """Convert a batch of LaTeX formula rows to Content and Presentation MathML rows.

    All formulae are joined into one document, each labelled
    ``Formula #<first column>:``, and passed through ``latexml`` and
    ``mathmlcan`` (presumably external tools, given that
    ``subprocess.SubprocessError`` is handled -- confirm against their
    definitions). When the output cannot be processed as XML, the batch is
    halved and each half is converted separately, which narrows the failure
    down to individual formulae.

    Args:
        latex_rows: A sequence of rows (lists). The first column of a row
            is used to label the formula and the last column is the LaTeX
            source of the formula.

    Returns:
        A ``(cmml_rows, pmml_rows)`` pair of lists with one
        ``(failure, row)`` entry per input row, in input order. ``row`` is
        the input row with its last column replaced by the serialized
        MathML (``''`` on failure) and ``failure`` is ``None`` on success
        or the exception describing the problem.
    """
    if not latex_rows:
        # Nothing to convert; do not run the converter on an empty document.
        return ([], [])
    latex_input = '\n\n'.join(
        'Formula #{}:\n\\[{}\\]'.format(latex_row[0], latex_row[-1])
        for latex_row in latex_rows)
    cmml_rows = []
    pmml_rows = []
    try:
        xml_output = mathmlcan(latexml(latex_input))
        try:
            xml_document = unicode_to_tree(xml_output)
            for latex_row in latex_rows:
                (cmml_math_tokens, cmml_failure, pmml_math_tokens,
                 pmml_failure) = _extract_formula_mathml(
                     xml_document, latex_row[0])
                cmml_row = latex_row[:-1] + [cmml_math_tokens]
                pmml_row = latex_row[:-1] + [pmml_math_tokens]
                cmml_rows.append((cmml_failure, cmml_row))
                pmml_rows.append((pmml_failure, pmml_row))
        except etree.Error as e:  # LaTeXML conversion failed, try halving latex_rows
            # The error may have surfaced after some rows were already
            # appended; drop them, otherwise the retry would emit them twice.
            del cmml_rows[:]
            del pmml_rows[:]
            if len(latex_rows) > 1:
                middle = len(latex_rows) // 2
                for latex_rows_half in (latex_rows[:middle],
                                        latex_rows[middle:]):
                    cmml_rows_half, pmml_rows_half = write_tsv_worker(
                        latex_rows_half)
                    cmml_rows.extend(cmml_rows_half)
                    pmml_rows.extend(pmml_rows_half)
            else:
                latex_row = latex_rows[0]
                # Not every lxml error carries a ``msg`` attribute.
                message = getattr(e, 'msg', None) or str(e)
                cmml_rows.append((ValueError(message), latex_row[:-1] + ['']))
                pmml_rows.append((ValueError(message), latex_row[:-1] + ['']))
    except subprocess.SubprocessError as e:
        # The converter itself failed, so every row of the batch fails.
        for latex_row in latex_rows:
            cmml_rows.append((e, latex_row[:-1] + ['']))
            pmml_rows.append((e, latex_row[:-1] + ['']))
    return (cmml_rows, pmml_rows)