Beispiel #1
0
def test_extract_table_with_column_1_headers():
    """Table whose rows start with a ``<th>`` cell.

    The first row is expected under ``head``; the row-leading ``<th>`` cells
    are expected as the first entry of each ``body`` row.
    """
    html_source = '''
    <table>
        <tr>
            <th>Column Heading 1</th>
            <th>Column Heading 2</th>
            <th>Column Heading 3</th>
        </tr>
        <tr>
            <th>Row 1 Column 1</th>
            <td>Row 1 Column 2</td>
            <td>Row 1 Column 3</td>
        </tr>
        <tr>
            <th>Row 2 Column 1</th>
            <td>Row 2 Column 2</td>
            <td>Row 2 Column 3</td>
        </tr>
    </table>
    '''
    structured_content, text_list = [], []
    extractor = TableExtractor()

    # Feed every start/end parse event to the extractor.
    buffer = BytesIO(fix_content(html_source).encode('utf-8'))
    parse_events = etree.iterparse(buffer, events=('start', 'end'), html=True)
    for event, element in parse_events:
        extractor.extract(element, event, structured_content, text_list)

    # Raw string on purpose: the expected separator is a literal backslash-t.
    expected_header_row = r'Column Heading 1\tColumn Heading 2\tColumn Heading 3'

    assert len(text_list) == 3
    table = structured_content[0]
    assert len(table['body'][0]) == 3
    assert text_list[0] == expected_header_row
    assert table['type'] == 'table'
    assert table['head'][0][0] == 'Column Heading 1'
    assert table['body'][0][0] == 'Row 1 Column 1'
Beispiel #2
0
def test_extract_text_and_table_combo():
    """Run a text extractor (tables excluded) and a table extractor together.

    Both extractors receive the same parse events and write into the same
    output lists; the assertions pin where text and table entries land.
    """
    html_source = '''
    <h1>My Heading</h1>
    <div>
    First line
    <p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p>
    Last line
    </div>
    <table>
        <tr>
            <th>Column Heading 1</th>
            <th>Column Heading 2</th>
            <th>Column Heading 3</th>
        </tr>
        <tr>
            <th>Row 1 Column 1</th>
            <td>Row 1 Column 2</td>
            <td>Row 1 Column 3</td>
        </tr>
        <tr>
            <th>Row 2 Column 1</th>
            <td>Row 2 Column 2</td>
            <td>Row 2 Column 3</td>
        </tr>
    </table>
    Trailing line
    '''
    structured_content, text_list = [], []
    text_extractor = TextExtractor(excluded_tags=['table'])
    table_extractor = TableExtractor()

    buffer = BytesIO(fix_content(html_source).encode('utf-8'))
    parse_events = etree.iterparse(buffer, events=('start', 'end'), html=True)
    for event, element in parse_events:
        # Text extractor first, then table extractor, for every event.
        text_extractor.extract(element, event, structured_content, text_list)
        table_extractor.extract(element, event, structured_content, text_list)

    # Flat text output
    assert len(text_list) == 8
    assert text_list[2] == 'My colored text line'
    assert text_list[5] == r'Row 1 Column 1\tRow 1 Column 2\tRow 1 Column 3'
    assert text_list[7] == 'Trailing line'

    # Structured output: last text item of the <div>, then the table
    last_text = structured_content[4]
    assert last_text['type'] == 'text'
    assert last_text['text'] == 'Last line'
    table = structured_content[5]
    assert table['type'] == 'table'
    assert table['body'][0][0] == 'Row 1 Column 1'
Beispiel #3
0
def test_extract_table_with_multiple_header_rows_using_head_tag():
    """Table with two rows inside ``<thead>`` and one row inside ``<tbody>``.

    Both ``<thead>`` rows are expected under ``head`` even though their cells
    are ``<td>``; the single ``<tbody>`` row is expected under ``body``.
    """
    # noinspection SpellCheckingInspection
    html_source = '''
    <table>
        <thead>
            <tr>
                <td>Column Heading 1</td>
                <td>Column Heading 2</td>
                <td>Column Heading 3</td>
            </tr>
            <tr>
                <td>Row 1 Column 1</td>
                <td>Row 1 Column 2</td>
                <td>Row 1 Column 3</td>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Row 2 Column 1</td>
                <td>Row 2 Column 2</td>
                <td>Row 2 Column 3</td>
            </tr>
        </tbody>
    </table>
    '''
    structured_content, text_list = [], []
    extractor = TableExtractor()

    buffer = BytesIO(fix_content(html_source).encode('utf-8'))
    parse_events = etree.iterparse(buffer, events=('start', 'end'), html=True)
    for event, element in parse_events:
        extractor.extract(element, event, structured_content, text_list)

    # Raw string on purpose: the expected separator is a literal backslash-t.
    expected_header_row = r'Column Heading 1\tColumn Heading 2\tColumn Heading 3'

    assert len(text_list) == 3
    table = structured_content[0]
    assert len(table['head']) == 2
    assert len(table['body']) == 1
    assert text_list[0] == expected_header_row
    assert table['type'] == 'table'
    assert table['head'][1][0] == 'Row 1 Column 1'
    assert table['body'][0][0] == 'Row 2 Column 1'
Beispiel #4
0
def test_extract_anchor_from_basic_table():
    """Table containing one ``<a>`` inside a body cell.

    The anchor is expected as a separate ``link`` item ahead of the table,
    and the cell text is expected to carry the anchor text in ``[[...]]``.
    """
    html_source = '''
    <table>
        <tr>
            <th>Column Heading 1</th>
            <th>Column Heading 2</th>
            <th>Column Heading 3</th>
        </tr>
        <tr>
            <td>Row 1 Column 1</td>
            <td>Row 1 <a href="link-url">Column 2</a></td>
            <td>Row 1 Column 3</td>
        </tr>
        <tr>
            <td>Row 2 Column 1</td>
            <td>Row 2 Column 2</td>
            <td>Row 2 Column 3</td>
        </tr>
    </table>
    '''
    structured_content, text_list = [], []
    extractor = TableExtractor()

    buffer = BytesIO(fix_content(html_source).encode('utf-8'))
    parse_events = etree.iterparse(buffer, events=('start', 'end'), html=True)
    for event, element in parse_events:
        extractor.extract(element, event, structured_content, text_list)

    # Raw string on purpose: the expected separator is a literal backslash-t.
    expected_header_row = r'Column Heading 1\tColumn Heading 2\tColumn Heading 3'

    assert len(text_list) == 3
    table = structured_content[1]
    assert len(table['body'][0]) == 3
    assert text_list[0] == expected_header_row

    # The link item comes first ...
    link = structured_content[0]
    assert link['type'] == 'link'
    assert link['text'] == 'Column 2'
    assert link['url'] == 'link-url'

    # ... followed by the table that references it.
    assert table['type'] == 'table'
    assert table['head'][0][0] == 'Column Heading 1'
    assert table['body'][0][1] == 'Row 1 [[Column 2]]'
Beispiel #5
0
def test_extract_table_with_embedded_tags():
    """Table whose cells contain inline and block markup.

    Cells hold ``<strong>``, ``<a>``, ``<ul>/<li>`` and ``<div>`` children;
    the assertions pin the flattened text of each affected cell.
    """
    html_source = '''
    <table>
        <tr>
            <th>Column Heading 1</th>
            <th>Column Heading 2</th>
            <th>Column Heading 3</th>
        </tr>
        <tr>
            <td><strong>Row 1</strong> <a href="#">Column</a> 1</td>
            <td><ul><li>Row 1</li> <li>Column 2</li></ul></td>
            <td>Row 1 Column 3</td>
        </tr>
        <tr>
            <td><div>Row 2</div> Column 1</td>
            <td>Row 2 Column 2</td>
            <td>Row 2 Column 3</td>
        </tr>
    </table>
    '''
    structured_content, text_list = [], []
    extractor = TableExtractor()

    buffer = BytesIO(fix_content(html_source).encode('utf-8'))
    parse_events = etree.iterparse(buffer, events=('start', 'end'), html=True)
    for event, element in parse_events:
        extractor.extract(element, event, structured_content, text_list)

    assert len(text_list) == 3
    # The table is the second structured item in this scenario.
    table = structured_content[1]
    assert len(table['body'][0]) == 3
    assert text_list[1] == r'Row 1 Column 1\tRow 1 Column 2\tRow 1 Column 3'
    assert text_list[2] == r'Row 2 Column 1\tRow 2 Column 2\tRow 2 Column 3'
    assert table['type'] == 'table'
    assert table['body'][0][0] == 'Row 1 [[Column]] 1'
    assert table['body'][0][1] == 'Row 1 Column 2'
    assert table['body'][1][0] == 'Row 2 Column 1'
Beispiel #6
0
    def process_doc(self, text: str, a: Dict[str, Any]) -> None:
        """
        Extract text and structured content from ``text`` into ``a['data']``.

        ``text`` is parsed as HTML and every parse event is passed to the
        list, table, text and heading extractors. Tables that report exactly
        one field are treated as layout tables: the children of their cells
        are re-extracted and spliced in place of the table entry.

        ``a['data']['text']`` is a single string when exactly one text entry
        was produced, otherwise the list of entries. The key
        ``'structured_content'`` is only set when that list is non-empty.

        :param text: document content, parsed as HTML
        :param a: accumulator; its ``'data'`` entry is overwritten
        """
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        extractors = [
            ListExtractor(excluded_tags=['table']),
            TableExtractor(),
            TextExtractor(excluded_tags=[
                'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
            ]),
            HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
        ]
        stream: IO[AnyStr] = BytesIO(fix_content(text).encode('utf-8'))
        for ev, elem in self.element_iterator(stream, html=True):
            process_html_element(elem, ev, extractors, structured_content,
                                 text_list, self.__nlp)

        # re-extract content in single column tables used for layout purposes only
        html = None  # memoize: the document is parsed into a tree at most once
        # One entry per text/heading/list/table item; the sums below are used
        # as offsets into text_list.
        k = []
        # NOTE(review): `i` and `k` index the list as it was when this loop
        # started, while the slices below are taken from the rebuilt lists,
        # and item types other than text/heading/list/table add no entry to
        # `k`. Verify the offsets for documents with more than one layout
        # table.
        for i, c in enumerate(structured_content):
            typ = c['type']
            if typ in ['text', 'heading']:
                k.append(1)
            elif typ == 'list':
                k.append(len(c.get('items', [])))
            elif typ == 'table':
                k.append(len(c.get('head', [])) + len(c.get('body', [])))
                if len(c.get('fields', [])) == 1:
                    # Compare against None explicitly: truth-testing an lxml
                    # element is deprecated and is False for an element
                    # without children, which would defeat the memoization.
                    if html is None:
                        # reset stream to reiterate
                        stream.seek(0)

                        # read stream into str and parse as html
                        html = lxml.html.fromstring(stream.read())

                    # find single column layout table
                    contents = html.xpath(
                        ('/descendant::table[{0}]/tbody/tr/td/*|' +
                         '/descendant::table[{0}]/tr/td/*').format(c['index']))
                    root = etree.Element('div')
                    root.extend(contents)
                    sc = []
                    tl = []
                    for evt, ele in etree.iterwalk(root,
                                                   events=('start', 'end')):
                        process_html_element(ele, evt, extractors, sc, tl,
                                             self.__nlp)

                    # The table's reference items sit directly before it and
                    # are dropped together with the table entry.
                    j = len(c.get('references', []))
                    structured_content = flatten([
                        structured_content[:(i - j)], sc,
                        structured_content[(i + 1):]
                    ])
                    text_list = flatten([
                        text_list[:sum(k[:(i - j)])], tl,
                        text_list[sum(k[:(i + 1)]):]
                    ])

        data = {}
        if len(text_list) == 1:
            data['text'] = text_list[0]
        else:
            data['text'] = text_list

        if structured_content:
            data['structured_content'] = structured_content

        a['data'] = data
Beispiel #7
0
def extract_text(c: Dict[str, Any],
                 # logger: Logger,
                 a: Dict[str, Any],
                 excluded_tags: List[str],
                 output_handler: Callable,
                 f: TextIO) -> str:
    """
    Parse the XML in ``f``, collect metadata and content into ``a`` and
    pass the result to ``output_handler``.

    :param c: configuration; ``c['job']['write_root_dir']`` is read as the
        output directory
    :param a: accumulator; its ``'data'``, ``'is_data'`` and ``'metadata'``
        entries are reset here, and entries are appended to
        ``a['files_processed']`` and ``a['files_output']`` (both must
        already exist)
    :param excluded_tags: XML tags whose parse events are skipped
    :param output_handler: called as ``output_handler(output_path, content)``
    :param f: open XML file; ``f.name`` is recorded as the processed path
    :return: path of the output file
    """
    # logger.debug('process file: {}'.format(f.name))
    a.update({
        'data': {},
        'is_data': False,
        'metadata': {'doc_type': None, 'record_id': None}
    })
    it = etree.iterparse(f, events=('start', 'end'))
    # Lazily drop every event whose element tag is excluded.
    stream = ((event, el) for event, el in it if el.tag not in excluded_tags)
    for event, el in stream:
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')

        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            a['metadata']['title'] = el.text

        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = el.text

        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = el.text

        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = el.text

        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = el.text

        # Each *_MILLIS element is stored twice: as the integer value and as
        # the result of get_iso_datetime_from_millis.
        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = el.text

        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(millis)

        # The element named after the document type brackets the data
        # section: is_data is True after its start event, False after its end.
        elif el.tag == a['metadata']['doc_type']:
            a['is_data'] = (event == 'start')

        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in a para, body and html tags
            structured_content = []
            text_list = []
            list_extractor = ListExtractor(excluded_tags=['table'])
            table_extractor = TableExtractor()
            text_extractor = TextExtractor(excluded_tags=['ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'])
            heading_extractor = HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            # NOTE: this rebinds the name `stream`; the outer for loop already
            # holds its own iterator, so the XML iteration is not affected.
            stream = BytesIO(fix_content(el.text).encode('utf-8'))

            for ev, elem in etree.iterparse(stream, events=('start', 'end'), html=True):
                heading_extractor.extract(elem, ev, structured_content, text_list)
                text_extractor.extract(elem, ev, structured_content, text_list)
                list_extractor.extract(elem, ev, structured_content, text_list)
                table_extractor.extract(elem, ev, structured_content, text_list)

            # 'text' is a single string when exactly one entry was extracted,
            # otherwise the list of entries.
            data = {}
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list

            if structured_content:
                data['structured_content'] = structured_content

            # Content is stored under the lower-cased tag of its XML element.
            a['data'][el.tag.lower()] = data

    now = datetime.utcnow().isoformat()
    a['files_processed'].append({
        'path': f.name,
        'time': now
    })
    write_root_dir = c['job']['write_root_dir']
    # NOTE(review): `self` is not defined in this function, so the next line
    # raises NameError as written. Presumably left over from a method version
    # of this code - confirm where the name should come from.
    output_filename = '{}_{}.json'.format(convert_name_to_underscore(self.name), a['metadata']['record_id'])
    output_path = os.path.join(write_root_dir, output_filename)
    a['files_output'].append({
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    content = {'metadata': a['metadata'], 'data': a['data']}
    output_handler(output_path, content)
    return output_path
Beispiel #8
0
    def process_xml_element(self, el: etree.ElementBase, event: str,
                            a: Dict[str, Any]) -> None:
        """
        Update the accumulator ``a`` in place from one XML parse event.

        End events of the known metadata tags fill ``a['metadata']``. The
        element whose tag equals the recorded document type switches
        ``a['is_data']`` on (start event) and off (any other event). While
        ``is_data`` is set, the text of every ending element is stored under
        ``a['data'][<lower-cased tag>]``: as a ``json`` item when the text
        parses as JSON, otherwise as the output of the HTML extractors.

        :param el: XML element
        :param event: event type [start, end]
        :param a: accumulator with ``'metadata'``, ``'is_data'`` and
            ``'data'`` entries
        """
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')

        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            a['metadata']['title'] = clean_text(el.text)

        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = clean_text(el.text)

        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = clean_text(el.text)

        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = clean_text(el.text)

        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = clean_text(el.text)

        # Each *_MILLIS element is stored twice: as the integer value and as
        # the result of get_iso_datetime_from_millis.
        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(
                millis)

        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = clean_text(el.text)

        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(
                millis)

        # The element named after the document type brackets the data section.
        elif el.tag == a['metadata']['doc_type']:
            a['is_data'] = (event == 'start')

        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in a para, body and html tags
            structured_content = []
            text_list = []

            try:
                # Text that parses as JSON is kept as a single json item and
                # skips HTML extraction entirely.
                maybe_json = json.loads(el.text)
                structured_content.append({'type': 'json', 'json': maybe_json})
            except (JSONDecodeError, ValueError):
                extractors = [
                    ListExtractor(excluded_tags=['table']),
                    TableExtractor(),
                    TextExtractor(excluded_tags=[
                        'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
                    ]),
                    HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
                ]
                stream: IO[AnyStr] = BytesIO(
                    fix_content(el.text).encode('utf-8'))
                for ev, elem in self.element_iterator(stream, html=True):
                    process_html_element(elem, ev, extractors,
                                         structured_content, text_list)

                # re-extract content in single column tables used for layout purposes only
                html = None  # memoize: the text is parsed into a tree at most once
                # One entry per text/heading/list/table item; the sums below
                # are used as offsets into text_list.
                k = []
                # NOTE(review): `i` and `k` index the list as it was when
                # this loop started, while the slices below are taken from
                # the rebuilt lists, and item types other than
                # text/heading/list/table add no entry to `k`. Verify the
                # offsets for content with more than one layout table.
                for i, c in enumerate(structured_content):
                    typ = c['type']
                    if typ in ['text', 'heading']:
                        k.append(1)
                    elif typ == 'list':
                        k.append(len(c.get('items', [])))
                    elif typ == 'table':
                        k.append(
                            len(c.get('head', [])) + len(c.get('body', [])))
                        if len(c.get('fields', [])) == 1:
                            # Compare against None explicitly: truth-testing
                            # an lxml element is deprecated and is False for
                            # an element without children, which would
                            # defeat the memoization.
                            if html is None:
                                # reset stream to reiterate
                                stream.seek(0)

                                # read stream into str and parse as html
                                html = lxml.html.fromstring(stream.read())

                            # find single column layout table
                            contents = html.xpath(
                                ('/descendant::table[{0}]/tbody/tr/td/*|' +
                                 '/descendant::table[{0}]/tr/td/*').format(
                                     c['index']))
                            root = etree.Element('div')
                            root.extend(contents)
                            sc = []
                            tl = []
                            for evt, ele in etree.iterwalk(root,
                                                           events=('start',
                                                                   'end')):
                                process_html_element(ele, evt, extractors, sc,
                                                     tl)

                            # The table's reference items sit directly before
                            # it and are dropped together with the table entry.
                            j = len(c.get('references', []))
                            structured_content = flatten([
                                structured_content[:(i - j)], sc,
                                structured_content[(i + 1):]
                            ])
                            text_list = flatten([
                                text_list[:sum(k[:(i - j)])], tl,
                                text_list[sum(k[:(i + 1)]):]
                            ])

            # 'text' is a single string when exactly one entry was extracted,
            # otherwise the list of entries.
            data = {}
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list

            if structured_content:
                data['structured_content'] = structured_content

            a['data'][el.tag.lower()] = data
0
def process_xml_element(
    el: etree.ElementBase,
    event: str,
    accumulator: Dict[str, Any],
    excluded_html_tags: List[str],
) -> Dict[str, Any]:
    """
    Fold one XML parse event into a copy of the accumulated state.

    The given accumulator is left untouched; a deep copy is updated and
    returned, so the caller has to pass the result on to the next call.
    Stateful, so cannot be parallelized.

    :param el: XML element
    :param event: event type [start, end]
    :param accumulator: state accumulated from the previous events
    :param excluded_html_tags: tags handed to ``element_iterator`` when the
        element text is walked as HTML
    :return: updated copy of the accumulator
    """
    # End-event tags whose cleaned text is copied to one metadata key.
    text_fields = {
        'MASTERIDENTIFER': 'title',
        'TYPE': 'doc_type',
        'DOCUMENTID': 'doc_id',
        'VERSION': 'version',
        'AUTHOR': 'author',
        'RESOURCEPATH': 'doc_location_path',
    }
    # End-event tags stored as (integer millis key, converted time key).
    millis_fields = {
        'ENDTIMESTAMP_MILLIS': ('end_timestamp_millis', 'end_time'),
        'STARTTIMESTAMP_MILLIS': ('start_timestamp_millis', 'start_time'),
        'CREATETIMESTAMP_MILLIS': ('create_timestamp_millis', 'create_time'),
        'LASTMODIFIEDTIMESTAMP_MILLIS': ('last_modified_timestamp_millis',
                                         'last_modified_time'),
        'PUBLISHEDTIMESTAMP_MILLIS': ('published_timestamp_millis',
                                      'published_time'),
    }

    state = deepcopy(accumulator)
    metadata = state['metadata']
    tag = el.tag
    is_end = event == 'end'

    if is_end and tag == 'CONTENT':
        metadata['record_id'] = el.get('RECORDID')
        return state

    if is_end and tag in text_fields:
        metadata[text_fields[tag]] = clean_text(el.text)
        return state

    if is_end and tag in millis_fields:
        millis_key, time_key = millis_fields[tag]
        millis = int(clean_text(el.text))
        metadata[millis_key] = millis
        metadata[time_key] = get_iso_datetime_from_millis(millis)
        return state

    # The element named after the document type brackets the data section.
    if tag == metadata['doc_type']:
        state['is_data'] = (event == 'start')
        return state

    if not (state['is_data'] and is_end and el.text):
        return state

    # treat all text as html
    # lxml will automatically wrap plain text in a para, body and html tags
    extractors = [
        ListExtractor(excluded_tags=['table']),
        TableExtractor(),
        TextExtractor(excluded_tags=[
            'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
        ]),
        HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
    ]
    html_stream = BytesIO(fix_content(el.text).encode('utf-8'))
    structured_content = []
    text_list = []
    html_events = element_iterator(html_stream, excluded_html_tags, html=True)
    for html_event, html_el in html_events:
        structured, text = process_html_element(html_el, html_event,
                                                extractors)
        structured_content.extend(structured)
        text_list.extend(text)

    # A single text entry is stored bare, anything else as the list itself.
    data = {'text': text_list[0] if len(text_list) == 1 else text_list}
    if structured_content:
        data['structured_content'] = structured_content

    state['data'][el.tag.lower()] = data
    return state