def write_json_worker(args):
    """Tokenize the text and math tokens of a single topic.

    ``args`` is a ``(topic_number, input_tokens)`` pair, where every input
    token is a ``Text`` (read through its ``text`` attribute) or a ``Math``
    (read through its ``math`` attribute).

    Each input token contributes one list of strings to the result.  A math
    token whose processing raises is skipped and reported instead.

    Returns a ``(failures, topic_number, output_tokens)`` triple, where
    ``failures`` is the newline-joined failure messages (empty string when
    everything succeeded).
    """
    topic_number, input_tokens = args
    token_lists = []
    failures = []
    formula_count = 0  # 1-based position among the math tokens only

    for item in input_tokens:
        assert isinstance(item, (Text, Math))

        if isinstance(item, Text):
            words = simple_preprocess(item.text)
            token_lists.append([str(Text(word)) for word in words])
            continue

        formula_count += 1
        try:
            cmml = MathExtractor.isolate_cmml(item.math)
            token_lists.append([str(Math(node)) for node in tokenize(cmml)])
        except Exception as error:
            # Best effort: record the failure and carry on with the rest.
            message = '- Processing formula #{} failed: {}'.format(
                formula_count,
                repr(error),
            )
            failures.append(message)

    return ('\n'.join(failures), topic_number, token_lists)
def write_json_worker(args):
    """Tokenize the paragraphs of a single document.

    ``args`` is a ``(zip_filename, filename, input_paragraphs)`` triple.
    Each paragraph is an iterable of ``Text`` / ``Math`` tokens.  Text tokens
    are stringified as they are; math tokens are expanded into the strings
    of their tokenized form (read through the ``math`` attribute).

    A falsy ``input_paragraphs`` is reported as a failure.  A math token
    whose processing raises contributes nothing to its paragraph and is
    reported with its 1-based paragraph and formula numbers.

    Returns ``(failures, zip_filename, filename, output_paragraphs)``, where
    ``failures`` is the newline-joined failure messages.
    """
    zip_filename, filename, input_paragraphs = args
    failures = []
    if not input_paragraphs:
        failures.append(
            '- Either the input document is empty, or it failed to parse')

    paragraphs = []
    for paragraph_no, paragraph in enumerate(input_paragraphs, start=1):
        tokens = []
        formula_no = 0  # restarts from zero in every paragraph
        for item in paragraph:
            assert isinstance(item, (Text, Math))

            if isinstance(item, Text):
                tokens.append(str(item))
                continue

            formula_no += 1
            try:
                cmml = MathExtractor.isolate_cmml(item.math)
                # Build the full list first so a failure midway through a
                # formula leaves the paragraph untouched.
                math_tokens = [str(Math(node)) for node in tokenize(cmml)]
                tokens.extend(math_tokens)
            except Exception as error:
                message = '- Processing paragraph #{}, formula #{} failed: {}'.format(
                    paragraph_no,
                    formula_no,
                    repr(error),
                )
                failures.append(message)
        paragraphs.append(tokens)

    return ('\n'.join(failures), zip_filename, filename, paragraphs)
def write_json_worker(args):
    """Tokenize the identified formulae of a single file.

    ``args`` is a ``(filename, input_formulae)`` pair, where
    ``input_formulae`` yields ``(formula_id, formula)`` pairs and each
    formula is read through its ``math`` attribute.

    A formula whose processing raises is left out of the output and
    reported instead.

    Returns ``(failures, filename, output_formulae)``, where ``failures`` is
    the newline-joined failure messages and ``output_formulae`` is a list of
    ``(formula_id, tokens)`` pairs.
    """
    filename, input_formulae = args
    tokenized = []
    failures = []

    for formula_id, formula in input_formulae:
        try:
            cmml = MathExtractor.isolate_cmml(formula.math)
            tokens = [str(Math(node)) for node in tokenize(cmml)]
            tokenized.append((formula_id, tokens))
        except Exception as error:
            # Best effort: record the failure and carry on with the rest.
            message = '- Processing formula {} failed: {}'.format(
                formula_id,
                repr(error),
            )
            failures.append(message)

    return ('\n'.join(failures), filename, tokenized)
def write_tsv_worker(latex_rows):
    """Convert a batch of LaTeX formulae to Content and Presentation MathML.

    All formulae are written into one LaTeX input, each preceded by a
    ``Formula #<id>:`` label, and converted with ``mathmlcan(latexml(...))``.
    Every formula is then looked up again in the XML output by its label.

    ``latex_rows`` is a sequence of rows (lists).  ``row[0]`` is the formula
    identifier used in the label and ``row[-1]`` is the LaTeX source.

    Returns ``(cmml_rows, pmml_rows)``.  Both lists hold one
    ``(failure, row)`` pair per input row, in input order, where ``row`` is
    the input row with its last item replaced by the MathML string (``''``
    on failure) and ``failure`` is ``None`` or the exception describing why
    the conversion of that formula failed.

    If the XML output cannot be processed (``etree.Error``), the batch is
    split in halves that are converted separately, down to single formulae.
    A ``subprocess.SubprocessError`` fails the whole batch.
    """
    # Nothing to convert; this also guarantees that the halving recursion
    # below is never entered with an empty batch.
    if len(latex_rows) == 0:
        return ([], [])

    # The backslashes are doubled so that the ``\[ ... \]`` delimiters are
    # real backslashes rather than invalid string escape sequences.
    latex_input = '\n\n'.join(
        'Formula #{}:\n\\[{}\\]'.format(latex_row[0], latex_row[-1])
        for latex_row in latex_rows)
    cmml_rows = []
    pmml_rows = []
    try:
        xml_output = mathmlcan(latexml(latex_input))
        try:
            xml_document = unicode_to_tree(xml_output)
            for latex_row in latex_rows:
                # Find the formula that follows this row's label paragraph.
                math_elements = xml_document.xpath(
                    '//xhtml:div[@class = "ltx_para" and xhtml:p[@class = "ltx_p" and normalize-space(text()) = "Formula #{}:"]]//mathml:math'
                    .format(latex_row[0]),
                    namespaces=XML_NAMESPACES)
                if len(math_elements) >= 1:
                    math_element = math_elements[0]
                    math_tokens = tree_to_unicode(math_element)
                    try:
                        cmml_math_element = unicode_to_tree(
                            MathExtractor.isolate_cmml(math_tokens))
                        pmml_math_element = unicode_to_tree(
                            MathExtractor.isolate_pmml(math_tokens))
                        if cmml_math_element.xpath(
                                '//mathml:cerror',
                                namespaces=XML_NAMESPACES):
                            cmml_math_tokens = ''
                            cmml_failure = ValueError(
                                'LaTeXML output contains <cerror> elements')
                        else:
                            # Unwrap <semantics> but keep its children.
                            etree.strip_tags(
                                cmml_math_element,
                                '{{{}}}semantics'.format(
                                    XML_NAMESPACES['mathml']))
                            cmml_math_tokens = tree_to_unicode(
                                cmml_math_element)
                            cmml_failure = None
                        if pmml_math_element.xpath(
                                '//mathml:cerror',
                                namespaces=XML_NAMESPACES):
                            pmml_math_tokens = ''
                            pmml_failure = ValueError(
                                'LaTeXML output contains <cerror> elements')
                        else:
                            pmml_math_tokens = tree_to_unicode(
                                pmml_math_element)
                            pmml_failure = None
                    except Exception as e:
                        # Best effort: a formula that cannot be split into
                        # CMML / PMML fails on its own, not the whole batch.
                        cmml_math_tokens = ''
                        pmml_math_tokens = ''
                        cmml_failure = e
                        pmml_failure = e
                else:
                    cmml_math_tokens = ''
                    pmml_math_tokens = ''
                    cmml_failure = ValueError(
                        'Formula not found in LaTeXML output')
                    pmml_failure = ValueError(
                        'Formula not found in LaTeXML output')
                cmml_row = latex_row[:-1] + [cmml_math_tokens]
                pmml_row = latex_row[:-1] + [pmml_math_tokens]
                cmml_rows.append((cmml_failure, cmml_row))
                pmml_rows.append((pmml_failure, pmml_row))
        except etree.Error as e:
            # LaTeXML conversion failed, try halving latex_rows.
            # Drop rows collected before the error first; otherwise the
            # retry below would emit them a second time.
            del cmml_rows[:]
            del pmml_rows[:]
            if len(latex_rows) > 1:
                middle = len(latex_rows) // 2
                cmml_rows_head, pmml_rows_head = write_tsv_worker(
                    latex_rows[:middle])
                cmml_rows_tail, pmml_rows_tail = write_tsv_worker(
                    latex_rows[middle:])
                cmml_rows.extend(cmml_rows_head + cmml_rows_tail)
                pmml_rows.extend(pmml_rows_head + pmml_rows_tail)
            else:
                latex_row = latex_rows[0]
                # Only syntax-error subclasses are guaranteed to carry
                # ``msg``; fall back to the plain message for the others.
                message = getattr(e, 'msg', None) or str(e)
                cmml_rows.append(
                    (ValueError(message), latex_row[:-1] + ['']))
                pmml_rows.append(
                    (ValueError(message), latex_row[:-1] + ['']))
    except subprocess.SubprocessError as e:
        # The converter itself failed: report the same error for every row.
        for latex_row in latex_rows:
            cmml_rows.append((e, latex_row[:-1] + ['']))
            pmml_rows.append((e, latex_row[:-1] + ['']))
    return (cmml_rows, pmml_rows)