Example #1
0
    def detect_encoding(self):
        """
        Guess the encoding of ``self.data``, caching the answer.

        The first successful detection is stored on ``self._detect_encoding``
        and handed back on every later call without re-running the detector.

        Returns: whatever ``charlockholmes.detect`` produces for ``self.data``
                 (expected to be a dict with 'encoding', 'confidence' and
                 'type' entries), or None when ``self.data`` is empty/falsy.
                 Nothing is cached in the None case.
        """
        if not hasattr(self, '_detect_encoding'):
            if not self.data:
                # Nothing to inspect; leave the cache unset.
                return None
            self._detect_encoding = charlockholmes.detect(self.data)
        return self._detect_encoding
Example #2
0
    def detect_encoding(self):
        """
        Guess the encoding of ``self.data``, caching the answer.

        ``self.data`` is encoded to UTF-8 (dropping characters that cannot be
        encoded) before it is handed to the detector. The first successful
        detection is stored on ``self._detect_encoding`` and reused afterwards.

        Returns: whatever ``charlockholmes.detect`` produces for the encoded
                 data (expected to be a dict with 'encoding', 'confidence' and
                 'type' entries), or None when ``self.data`` is empty/falsy.
                 Nothing is cached in the None case.
        """
        if not hasattr(self, '_detect_encoding'):
            if not self.data:
                # Nothing to inspect; leave the cache unset.
                return None
            payload = self.data.encode('utf-8', 'ignore')
            self._detect_encoding = charlockholmes.detect(payload)
        return self._detect_encoding
Example #3
0
def get_and_extract(url, timeout=5):
    """Fetch *url*, decode its body and return ``extract(body)``.

    This is a generator that yields the fetch future and delivers its result
    through ``gen.Return``, i.e. it is meant to be driven as a Tornado
    coroutine (NOTE(review): no coroutine decorator is visible on this
    definition -- confirm it is applied where the function is defined/used).

    Args:
        url: Address to fetch.
        timeout: Passed to the HTTP client as ``connect_timeout``.

    Returns (via ``gen.Return``):
        The result of ``extract()`` on the decoded body, or ``{}`` when the
        fetch fails.
    """
    http_client = AsyncHTTPClient()
    try:
        resp = yield http_client.fetch(url, connect_timeout=timeout)
    except Exception:
        # Only swallow real errors: a bare ``except`` here would also trap
        # GeneratorExit / KeyboardInterrupt thrown in at the ``yield``.
        logging.exception('fetch: %s', url)
        raise gen.Return({})

    # The header may be absent; the regexp needs a string, not None.
    content_type = resp.headers.get('Content-Type') or ''
    segs = charset_regexp.findall(content_type)
    if segs:
        charset = segs[0]
    else:
        # Tolerate a falsy detection result or a missing/None 'encoding'
        # entry; fall back to UTF-8, as the LookupError branch below does.
        guess = detect(resp.body) or {}
        charset = guess.get('encoding') or 'utf8'

    try:
        body = resp.body.decode(charset, errors='ignore')
    except LookupError:
        # Unknown charset name (declared or detected): decode as UTF-8.
        body = resp.body.decode('utf8', errors='ignore')

    raise gen.Return(extract(body))
Example #4
0
        "file/test.sh", {
            'confidence': 21,
            'type': 'text',
            'language': 'es',
            'encoding': 'ISO-8859-1'
        }
    ],
    "elf": ["file/test", {
        'confidence': 100,
        'type': 'binary'
    }],
    "bz2": ["file/test.tar.bz2", {
        'confidence': 100,
        'type': 'binary'
    }],
    "gz": ["file/test.tar.gz", {
        'confidence': 100,
        'type': 'binary'
    }],
}

# Run the detector over every fixture and report whether its output matches
# the expected result recorded next to the fixture path in TEST_FILES.
for test in TEST_FILES:
    file_path, file_result = TEST_FILES[test]
    # Read raw bytes and close the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(file_path, "rb") as fixture:
        content = fixture.read()
    test_result = detect(content)
    status = "OK" if test_result == file_result else "ERROR"
    print(file_path + ": " + status)
Example #5
0
        "file/test.c",
        {'confidence': 50, 'type': 'text', 'language': 'en', 'encoding': 'ISO-8859-1'}
    ],
    "sh": [
        "file/test.sh",
        {'confidence': 21, 'type': 'text', 'language': 'es', 'encoding': 'ISO-8859-1'}
    ],
    "elf": [
        "file/test",
        {'confidence': 100, 'type': 'binary'}
    ],
    "bz2": [
        "file/test.tar.bz2",
        {'confidence': 100, 'type': 'binary'}
    ],
    "gz": [
        "file/test.tar.gz",
        {'confidence': 100, 'type': 'binary'}
    ],
}

# Run the detector over every fixture and report whether its output matches
# the expected result recorded next to the fixture path in TEST_FILES.
for test in TEST_FILES:
    file_path, file_result = TEST_FILES[test]
    # Several fixtures are binary (ELF, bz2, gz): read raw bytes so no
    # newline translation or text decoding alters the content, and close the
    # handle deterministically.
    with open(file_path, "rb") as fixture:
        content = fixture.read()
    test_result = detect(content)
    status = "OK" if test_result == file_result else "ERROR"
    # Single-argument call form prints the same text on Python 2 and 3.
    print(file_path + ": " + status)