Example #1
def guess_response_encoding(response, is_xml=False, use_chardet=False):
    """
    Function taking an urllib3 response object and attempting to guess its
    encoding.
    """
    content_type_header = response.getheader('content-type')

    suboptimal_charset = None

    if content_type_header is not None:
        parsed_header = cgi.parse_header(content_type_header)

        if len(parsed_header) > 1:
            charset = parsed_header[1].get('charset')

            if charset is not None:
                if is_supported_encoding(charset):
                    return charset.lower()
                else:
                    suboptimal_charset = charset

    data = response.data
    chunk = data[:CONTENT_CHUNK_SIZE]

    # Data is empty
    if not chunk.strip():
        return None

    # TODO: use re.search to go faster!
    if is_xml:
        matches = re.findall(CHARSET_RE, chunk)

        if len(matches) == 0:
            matches = re.findall(PRAGMA_RE, chunk)

        if len(matches) == 0:
            matches = re.findall(XML_RE, chunk)

        # NOTE: here we are returning the last one, but we could also use
        # frequency at the expense of performance
        if len(matches) != 0:
            charset = matches[-1].lower().decode()

            if is_supported_encoding(charset):
                return charset
            else:
                suboptimal_charset = charset

    if use_chardet:
        chardet_result = chardet.detect(data)

        # Could not detect anything
        if (not chardet_result or chardet_result.get('confidence') is None):
            return None

        if chardet_result['confidence'] >= CHARDET_CONFIDENCE_THRESHOLD:
            return chardet_result['encoding'].lower()

    return suboptimal_charset
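For context, a minimal usage sketch showing how this helper might be called against a live urllib3 response; the URL is purely illustrative and the sketch assumes urllib3 1.x, since the function relies on the older `getheader()` accessor.

import urllib3

# Hypothetical usage sketch (urllib3 1.x): fetch a page and let the helper
# guess its encoding. The URL below is only an illustration.
http = urllib3.PoolManager()
response = http.request('GET', 'https://example.com')

encoding = guess_response_encoding(response, is_xml=False, use_chardet=True)
print(encoding or 'could not determine a usable encoding')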
Example #2
def worker(payload):
    row, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' % encoding), row, None

    # Reading file
    if content is None:
        try:
            raw_html = read_potentially_gzipped_path(path, encoding=encoding)
        except (FileNotFoundError, UnicodeDecodeError) as e:
            return e, row, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
        # TODO: discuss deduplication
        # TODO: fallback options
        result = bare_extraction(raw_html)
    except Exception as e:
        return TrafilaturaError(reason=e), row, None

    if result is None:
        return None, row, None

    return None, row, format_trafilatura_result(result)
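Example #2 depends on a `read_potentially_gzipped_path` helper whose body is not shown; Examples #3 and #4 below inline what appears to be the same logic, so a rough equivalent can be sketched from them. The exact signature and defaults are assumptions.

import codecs
import gzip

def read_potentially_gzipped_path(path, encoding='utf-8'):
    # Hypothetical reconstruction based on the inlined logic in Examples #3
    # and #4: gzip-compressed files are decompressed first, and everything
    # is decoded with errors='replace'.
    if path.endswith('.gz'):
        with open(path, 'rb') as f:
            raw_bytes = gzip.decompress(f.read())
        return raw_bytes.decode(encoding, errors='replace')

    with codecs.open(path, 'r', encoding=encoding, errors='replace') as f:
        return f.read()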
Example #3
def worker(payload):
    line, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' %
                                    encoding), line, None

    # Reading file
    if content is None:
        try:
            if path.endswith('.gz'):
                with open(path, 'rb') as f:
                    raw_html_bytes = gzip.decompress(f.read())

                raw_html = raw_html_bytes.decode(encoding, errors='replace')
            else:
                with codecs.open(path,
                                 'r',
                                 encoding=encoding,
                                 errors='replace') as f:
                    raw_html = f.read()
        except UnicodeDecodeError as e:
            return e, line, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            content = extract_content(raw_html)
    except BaseException as e:
        return e, line, None

    return None, line, content
Example #4
def worker(payload):
    row, _, path, encoding, content, _ = payload

    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' %
                                    encoding), row, None

    # Reading file
    if content is None:
        try:
            if path.endswith('.gz'):
                with open(path, 'rb') as f:
                    raw_html_bytes = gzip.decompress(f.read())

                raw_html = raw_html_bytes.decode(encoding, errors='replace')
            else:
                with codecs.open(path,
                                 'r',
                                 encoding=encoding,
                                 errors='replace') as f:
                    raw_html = f.read()
        except UnicodeDecodeError as e:
            return e, row, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
        # TODO: discuss deduplication
        # TODO: fallback options
        result = bare_extraction(raw_html)
    except BaseException as e:
        return e, row, None

    if result is None:
        return None, row, None

    return None, row, format_trafilatura_result(result)
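The `(error, row, result)` tuple convention returned by these workers lends itself to pool-based processing. A minimal driver sketch, assuming a standard multiprocessing pool and an iterable of payload tuples shaped like the ones unpacked above; none of this is part of the original code.

from multiprocessing import Pool

def process_payloads(payloads):
    # Illustrative driver: run worker over the payloads and separate
    # failures from successful extractions.
    with Pool() as pool:
        for error, row, result in pool.imap_unordered(worker, payloads):
            if error is not None:
                print('error on row %s: %s' % (row, error))
                continue
            if result is None:
                continue  # nothing could be extracted
            yield row, result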
Example #5
def test_is_supported_encoding(self):
    for value, result in SUPPORT_TESTS:
        assert is_supported_encoding(value) == result
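All of the examples above hinge on `is_supported_encoding`, whose implementation is not shown here. A minimal sketch, assuming "supported" simply means the name resolves in Python's codec registry, together with hypothetical test fixture values for `SUPPORT_TESTS`:

import codecs

# Hypothetical fixture: (input value, expected result) pairs.
SUPPORT_TESTS = [
    ('utf-8', True),
    ('ISO-8859-1', True),
    ('definitely-not-an-encoding', False),
]

def is_supported_encoding(name):
    # Assumption: an encoding counts as supported when Python's codec
    # registry can resolve it; the real library may instead check its own
    # whitelist of encodings.
    try:
        codecs.lookup(name)
    except (LookupError, TypeError):
        return False
    return True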