Example #1
# bare_extraction comes from trafilatura; the remaining helpers
# (is_supported_encoding, UnknownEncodingError, read_potentially_gzipped_path,
# TrafilaturaError, format_trafilatura_result) are project-level utilities
# defined elsewhere in the codebase this excerpt comes from.
from trafilatura.core import bare_extraction


def worker(payload):
    row, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' % encoding), row, None

    # Reading file
    if content is None:
        try:
            raw_html = read_potentially_gzipped_path(path, encoding=encoding)
        except (FileNotFoundError, UnicodeDecodeError) as e:
            return e, row, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
        # TODO: discuss deduplication
        # TODO: fallback options
        result = bare_extraction(raw_html)
    except Exception as e:
        return TrafilaturaError(reason=e), row, None

    if result is None:
        return None, row, None

    return None, row, format_trafilatura_result(result)
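
A minimal sketch of one way to drive this worker with a process pool, assuming payload tuples laid out as (row, _, path, encoding, content, _), exactly as the unpacking above expects; the pool size and the sample payloads below are hypothetical:

from multiprocessing import Pool

# Hypothetical payloads: row index, unused slot, path, encoding, inline content, unused slot.
payloads = [
    (0, None, 'page-0001.html.gz', 'utf-8', None, None),
    (1, None, None, 'utf-8', '<html><body><p>Inline content.</p></body></html>', None),
]

if __name__ == '__main__':
    with Pool(2) as pool:
        # worker returns (error, row, formatted_result) tuples
        for error, row, data in pool.imap_unordered(worker, payloads):
            if error is not None:
                print('row %d failed: %s' % (row, error))
            elif data is None:
                print('row %d produced no extractable content' % row)
            else:
                print('row %d extracted successfully' % row)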
Example #2
# utils, METADATA_LIST, ZERO_CONFIG, extract and bare_extraction are assumed to
# be provided by the enclosing test module's imports (trafilatura test suite).
def test_txttocsv():
    mymeta = dict.fromkeys(METADATA_LIST)
    assert utils.txttocsv('', '',
                          mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'
    mymeta['title'] = 'Test title'
    mymeta['url'] = 'https://example.org'
    mymeta['hostname'] = 'example.org'
    mymeta['id'] = '1'
    mymeta['license'] = 'CC BY-SA'
    assert utils.txttocsv(
        'Test text', 'Test comment', mymeta
    ) == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv',
                   config=ZERO_CONFIG) is not None
    assert extract(mystring,
                   output_format='csv',
                   include_comments=False,
                   config=ZERO_CONFIG).endswith('\tNone\n')
    # test json
    result = extract(mystring, output_format='json', config=ZERO_CONFIG)
    assert result.endswith('}') and '"fingerprint":' in result
    # bare extraction for python
    result = bare_extraction(mystring, config=ZERO_CONFIG)
    assert isinstance(result, dict) and len(result) == 14
Example #3
def test_txttocsv():
    mymeta = dict.fromkeys(METADATA_LIST)
    assert utils.txttocsv('', '',
                          mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\n'
    mymeta['title'] = 'Test title'
    mymeta['url'] = 'https://example.org'
    mymeta['hostname'] = 'example.org'
    assert utils.txttocsv(
        'Test text', 'Test comment', mymeta
    ) == 'https://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, csv_output=True) is not None
    assert extract(mystring, csv_output=True,
                   include_comments=False).endswith('\t\n')
    # test json
    assert extract(mystring, json_output=True).endswith('}')
    # bare extraction for python
    result = bare_extraction(mystring)
    assert isinstance(result, dict) and len(result) == 13
Example #4
def test_txttocsv():
    mymeta = Document()
    assert utils.txttocsv('', '', mymeta) == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'
    mymeta.title = 'Test title'
    mymeta.url = 'https://example.org'
    mymeta.hostname = 'example.org'
    mymeta.id = '1'
    mymeta.license = 'CC BY-SA'
    assert utils.txttocsv('Test text', 'Test comment', mymeta) == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'
    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv', config=ZERO_CONFIG) is not None
    assert extract(mystring, output_format='csv', include_comments=False, config=ZERO_CONFIG).endswith('\tNone\n')
    # test json
    result = extract(mystring, output_format='json', config=ZERO_CONFIG)
    assert result.endswith('}') and '"fingerprint":' in result
    assert extract(mystring, output_format='json', include_comments=False, config=ZERO_CONFIG).endswith('}')
    # bare extraction for python
    result = bare_extraction(mystring, config=ZERO_CONFIG, as_dict=True)
    assert isinstance(result, dict) and len(result) == 17
Example #5
import codecs
import gzip

# bare_extraction comes from trafilatura; is_supported_encoding,
# UnknownEncodingError and format_trafilatura_result are project-level helpers
# defined elsewhere in the codebase this excerpt comes from.
from trafilatura.core import bare_extraction


def worker(payload):
    row, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' %
                                    encoding), row, None

    # Reading file
    if content is None:
        try:
            if path.endswith('.gz'):
                with open(path, 'rb') as f:
                    raw_html_bytes = gzip.decompress(f.read())

                raw_html = raw_html_bytes.decode(encoding, errors='replace')
            else:
                with codecs.open(path,
                                 'r',
                                 encoding=encoding,
                                 errors='replace') as f:
                    raw_html = f.read()
        except UnicodeDecodeError as e:
            return e, row, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
        # TODO: discuss deduplication
        # TODO: fallback options
        result = bare_extraction(raw_html)
    except BaseException as e:
        return e, row, None

    if result is None:
        return None, row, None

    return None, row, format_trafilatura_result(result)
Example #6
import fileinput
from trafilatura.core import bare_extraction
from pprint import pprint

with fileinput.input() as f:
    html = ''.join(f)

    # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
    raw_content = bare_extraction(html)

    pprint(raw_content)
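
Because fileinput.input() iterates over the files named on the command line and falls back to standard input when none are given, this last snippet can be run either as python extract_page.py saved_page.html or as cat saved_page.html | python extract_page.py (both file names here are hypothetical).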