def worker(payload):
    """Extract the main content of one HTML document with trafilatura.

    `payload` is unpacked into exactly six items; only the first (the row),
    third (the path), fourth (the encoding) and fifth (the already loaded
    content, or None) are used.

    Returns an `(error, row, result)` triple: `error` is None on success,
    and `result` is None when an error occurred or when trafilatura
    extracted nothing.
    """
    row, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        error = UnknownEncodingError('Unknown encoding: "%s"' % encoding)
        return error, row, None

    # Use the provided content, reading from disk only when there is none
    html = content
    if html is None:
        try:
            html = read_potentially_gzipped_path(path, encoding=encoding)
        except (FileNotFoundError, UnicodeDecodeError) as error:
            return error, row, None

    # Extraction proper
    # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
    # TODO: discuss deduplication
    # TODO: fallback options
    try:
        extracted = bare_extraction(html)
    except Exception as error:
        return TrafilaturaError(reason=error), row, None

    # A None extraction is reported as "no error, no result"
    formatted = None if extracted is None else format_trafilatura_result(extracted)
    return None, row, formatted
def test_txttocsv():
    """Exercise `utils.txttocsv` and the csv, json and bare extraction outputs."""
    metadata = dict.fromkeys(METADATA_LIST)

    # Nothing filled in yet
    blank_row = utils.txttocsv('', '', metadata)
    assert blank_row == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'

    # Fill in a subset of the metadata fields
    metadata.update({
        'title': 'Test title',
        'url': 'https://example.org',
        'hostname': 'example.org',
        'id': '1',
        'license': 'CC BY-SA',
    })
    filled_row = utils.txttocsv('Test text', 'Test comment', metadata)
    assert filled_row == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'

    html = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'

    # test csv
    csv_output = extract(html, output_format='csv', config=ZERO_CONFIG)
    assert csv_output is not None
    csv_without_comments = extract(
        html,
        output_format='csv',
        include_comments=False,
        config=ZERO_CONFIG,
    )
    assert csv_without_comments.endswith('\tNone\n')

    # test json
    json_output = extract(html, output_format='json', config=ZERO_CONFIG)
    assert json_output.endswith('}')
    assert '"fingerprint":' in json_output

    # bare extraction for python
    bare = bare_extraction(html, config=ZERO_CONFIG)
    assert isinstance(bare, dict)
    assert len(bare) == 14
def test_txttocsv():
    """Exercise `utils.txttocsv` and the csv, json and bare extraction outputs."""
    metadata = dict.fromkeys(METADATA_LIST)

    # Nothing filled in yet
    blank_row = utils.txttocsv('', '', metadata)
    assert blank_row == 'None\tNone\tNone\tNone\tNone\t\t\n'

    # Fill in a subset of the metadata fields
    metadata.update({
        'title': 'Test title',
        'url': 'https://example.org',
        'hostname': 'example.org',
    })
    filled_row = utils.txttocsv('Test text', 'Test comment', metadata)
    assert filled_row == 'https://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\n'

    html = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'

    # test csv
    csv_output = extract(html, csv_output=True)
    assert csv_output is not None
    csv_without_comments = extract(html, csv_output=True, include_comments=False)
    assert csv_without_comments.endswith('\t\n')

    # test json
    json_output = extract(html, json_output=True)
    assert json_output.endswith('}')

    # bare extraction for python
    bare = bare_extraction(html)
    assert isinstance(bare, dict)
    assert len(bare) == 13
def test_txttocsv():
    """Exercise `utils.txttocsv` and the csv, json and bare extraction outputs."""
    metadata = Document()

    # Nothing filled in yet
    blank_row = utils.txttocsv('', '', metadata)
    assert blank_row == 'None\tNone\tNone\tNone\tNone\t\t\tNone\n'

    # Fill in a subset of the metadata attributes
    filled_fields = (
        ('title', 'Test title'),
        ('url', 'https://example.org'),
        ('hostname', 'example.org'),
        ('id', '1'),
        ('license', 'CC BY-SA'),
    )
    for attribute, value in filled_fields:
        setattr(metadata, attribute, value)

    filled_row = utils.txttocsv('Test text', 'Test comment', metadata)
    assert filled_row == '1\thttps://example.org\tNone\texample.org\tTest title\tNone\tTest text\tTest comment\tCC BY-SA\n'

    html = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'

    # test csv
    csv_output = extract(html, output_format='csv', config=ZERO_CONFIG)
    assert csv_output is not None
    csv_without_comments = extract(
        html,
        output_format='csv',
        include_comments=False,
        config=ZERO_CONFIG,
    )
    assert csv_without_comments.endswith('\tNone\n')

    # test json
    json_output = extract(html, output_format='json', config=ZERO_CONFIG)
    assert json_output.endswith('}')
    assert '"fingerprint":' in json_output
    json_without_comments = extract(
        html,
        output_format='json',
        include_comments=False,
        config=ZERO_CONFIG,
    )
    assert json_without_comments.endswith('}')

    # bare extraction for python
    bare = bare_extraction(html, config=ZERO_CONFIG, as_dict=True)
    assert isinstance(bare, dict)
    assert len(bare) == 17
def worker(payload):
    """Extract the main content of one HTML document with trafilatura.

    `payload` is unpacked into exactly six items; only the first (the row),
    third (the path), fourth (the encoding) and fifth (the already loaded
    content, or None) are used. When no content is given, the file at `path`
    is read, going through gzip decompression if the path ends with `.gz`.

    Returns an `(error, row, result)` triple: `error` is None on success,
    and `result` is None when an error occurred or when trafilatura
    extracted nothing.
    """
    row, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' % encoding), row, None

    # Reading file
    if content is None:
        try:
            if path.endswith('.gz'):
                with open(path, 'rb') as f:
                    raw_html_bytes = gzip.decompress(f.read())

                raw_html = raw_html_bytes.decode(encoding, errors='replace')
            else:
                with codecs.open(path, 'r', encoding=encoding, errors='replace') as f:
                    raw_html = f.read()
        except (FileNotFoundError, UnicodeDecodeError) as e:
            # A missing file is reported for this row instead of crashing
            # the whole worker
            return e, row, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
        # TODO: discuss deduplication
        # TODO: fallback options
        result = bare_extraction(raw_html)
    except Exception as e:
        # Deliberately not BaseException: KeyboardInterrupt and SystemExit
        # must propagate rather than be returned as a row-level error
        return e, row, None

    if result is None:
        return None, row, None

    return None, row, format_trafilatura_result(result)
"""Pretty-print trafilatura's bare extraction of the HTML given as input."""
import fileinput
from pprint import pprint

from trafilatura.core import bare_extraction

# fileinput iterates over the lines of the files named on the command line,
# or over stdin when none is given
with fileinput.input() as stream:
    lines = list(stream)

html = ''.join(lines)

# https://trafilatura.readthedocs.io/en/latest/corefunctions.html
raw_content = bare_extraction(html)

pprint(raw_content)