Ejemplo n.º 1
0
def download(uri):
    """
    Downloads the given `uri` in a temporary directory and returns that
    directory path.
    """
    response = requests.get(uri)

    if response.status_code != 200:
        raise requests.RequestException

    content_disposition = response.headers.get("content-disposition", "")
    _, params = cgi.parse_header(content_disposition)
    filename = params.get("filename")

    if not filename:
        # Using `response.url` in place of provided `Scan.uri` since the former
        # will be more accurate in case of HTTP redirect.
        filename = os.path.basename(urlparse(response.url).path)

    download_directory = tempfile.mkdtemp()
    download_file = Path(download_directory, filename)

    file_content = response.content
    with open(download_file, "wb") as f:
        f.write(file_content)

    checksums = multi_checksums(download_file, ("md5", "sha1"))

    return Download(
        directory=download_directory,
        filename=filename,
        size=len(file_content),
        sha1=checksums["sha1"],
        md5=checksums["md5"],
    )
Ejemplo n.º 2
0
def get_file_info(location, **kwargs):
    """
    Return a mapping of file information collected for the file at `location`.
    """
    result = OrderedDict()

    # TODO: move date and size these to the inventory collection step???
    result['date'] = get_last_modified_date(location) or None
    result['size'] = getsize(location) or 0

    sha1, md5, sha256 = multi_checksums(location, ('sha1', 'md5', 'sha256')).values()
    result['sha1'] = sha1
    result['md5'] = md5
    result['sha256'] = sha256

    collector = get_type(location)
    result['mime_type'] = collector.mimetype_file or None
    result['file_type'] = collector.filetype_file or None
    result['programming_language'] = collector.programming_language or None
    result['is_binary'] = bool(collector.is_binary)
    result['is_text'] = bool(collector.is_text)
    result['is_archive'] = bool(collector.is_archive)
    result['is_media'] = bool(collector.is_media)
    result['is_source'] = bool(collector.is_source)
    result['is_script'] = bool(collector.is_script)
    return result
Ejemplo n.º 3
0
 def test_multi_checksums_custom(self):
     test_file = self.get_test_loc('hash/dir1/a.png')
     result = multi_checksums(test_file, ('sha512',))
     expected = dict([
         ('sha512', u'5be9e01cd20ff288fd3c3fc46be5c2747eaa2c526197125330947a95cdb418222176b182a4680f0e435ba8f114363c45a67b30eed9a9222407e63ccbde46d3b4'),
     ])
     assert result == expected
Ejemplo n.º 4
0
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
Ejemplo n.º 5
0
 def test_multi_checksums(self):
     test_file = self.get_test_loc('hash/dir1/a.png')
     expected = dict([
         ('md5', u'4760fb467f1ebf3b0aeace4a3926f1a4'),
         ('sha1', u'34ac5465d48a9b04fc275f09bc2230660df8f4f7'),
         ('sha256', u'1b598db6fee8f1ec7bb919c0adf68956f3d20af8c9934a9cf2db52e1347efd35'),
     ])
     result = multi_checksums(test_file, 'md5 sha1 sha256'.split())
     assert result == expected
Ejemplo n.º 6
0
 def test_multi_checksums_shattered1(self):
     test_file = self.get_test_loc('hash/sha1-collision/shattered-1.pdf')
     expected = dict([
         ('md5', 'ee4aa52b139d925f8d8884402b0a750c'),
         ('sha1', '38762cf7f55934b34d179ae6a4c80cadccbb7f0a'),
         ('sha256', '2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0'),
         ('sha512', '3c19b2cbcf72f7f5b252ea31677b8f2323d6119e49bcc0fb55931d00132385f1e749bb24cbd68c04ac826ae8421802825d3587fe185abf709669bb9693f6b416'),
         ('sha1_git', 'ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0'),
     ])
     result = multi_checksums(test_file)
     assert result == expected
Ejemplo n.º 7
0
def get_file_infos(location):
    """
    Return a mapping of file information collected from the file or
    directory at `location`.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype

    if on_linux:
        location = path_to_bytes(location)
    else:
        location = path_to_unicode(location)

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    name = fileutils.file_name(location)
    if is_file:
        base_name, extension = fileutils.splitext(location)
    else:
        base_name = name
        extension = ''

    if on_linux:
        infos['name'] = path_to_unicode(name)
        infos['base_name'] = path_to_unicode(base_name)
        infos['extension'] = path_to_unicode(extension)
    else:
        infos['name'] = name
        infos['base_name'] = base_name
        infos['extension'] = extension

    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    return infos
Ejemplo n.º 8
0
 def test_multi_checksums_shattered2(self):
     test_file = self.get_test_loc('hash/sha1-collision/shattered-2.pdf')
     expected = dict([
         ('md5', '5bd9d8cabc46041579a311230539b8d1'),
         ('sha1', '38762cf7f55934b34d179ae6a4c80cadccbb7f0a'),
         ('sha256', 'd4488775d29bdef7993367d541064dbdda50d383f89f0aa13a6ff2e0894ba5ff'),
         ('sha512', 'f39a04842e4b28e04558496beb7cb84654ded9c00b2f873c3ef64f9dfdbc760cd0273b816858ba5b203c0dd71af8b65d6a0c1032e00e48ace0b4705eedcc1bab'),
         # Note: this is not the same as the sha1_git for shattered-1.pdf ;)
         ('sha1_git', 'b621eeccd5c7edac9b7dcba35a8d5afd075e24f2'),
     ])
     result = multi_checksums(test_file)
     assert result == expected
Ejemplo n.º 9
0
def get_file_infos(location, as_list=True):
    """
    Return a list of dictionaries of informations collected from the file or
    directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from scancode import utils
    from typecode import contenttype

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)

    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    infos['date'] = is_file and filetype.get_last_modified_date(
        location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, (
        'sha1',
        'md5',
    )))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    if as_list:
        return [infos]
    else:
        return infos