def get_file_infos(location):
    """
    Return a list containing a single ordered mapping of information
    collected from the file or directory at ``location``.

    Keys that only apply to files (hashes, mime/file type, is_* flags)
    are None for directories; ``files_count`` only applies to directories
    and is None for files.
    """
    # Local imports keep this function importable without pulling in the
    # full type-detection stack at module load time.
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    T = contenttype.get_type(location)
    is_file = T.is_file
    is_dir = T.is_dir
    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    # FIX: use explicit conditional expressions instead of the
    # `cond and value or default` idiom: with the old idiom a
    # falsy-but-valid value (False for an is_* flag, 0 for files_count,
    # an empty mime type) was silently replaced by the default.
    infos['extension'] = fileutils.file_extension(location) if is_file else ''
    infos['date'] = filetype.get_last_modified_date(location) if is_file else None
    infos['size'] = T.size
    infos['sha1'] = sha1(location) if is_file else None
    infos['md5'] = md5(location) if is_file else None
    infos['files_count'] = filetype.get_file_count(location) if is_dir else None
    infos['mime_type'] = T.mimetype_file if is_file else None
    infos['file_type'] = T.filetype_file if is_file else None
    infos['programming_language'] = T.programming_language if is_file else None
    infos['is_binary'] = T.is_binary if is_file else None
    infos['is_text'] = T.is_text if is_file else None
    infos['is_archive'] = T.is_archive if is_file else None
    infos['is_media'] = T.is_media if is_file else None
    infos['is_source'] = T.is_source if is_file else None
    infos['is_script'] = T.is_script if is_file else None
    return [infos]
def get_file_infos(location):
    """
    Return a list with one ordered mapping of details about the file or
    directory at ``location``.

    File-only fields (hashes, mime/file/language type, is_* flags) are
    None for directories; the directory-only ``files_count`` is None for
    files.
    """
    # Deferred imports: avoid loading the type-detection machinery until
    # this function is actually called.
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    T = contenttype.get_type(location)
    is_file = T.is_file
    is_dir = T.is_dir
    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    # FIX: replaced the `cond and value or default` pattern with proper
    # ternaries; the old pattern turned any falsy value (False flags,
    # zero counts, empty strings) into the default, losing information.
    infos['extension'] = fileutils.file_extension(location) if is_file else ''
    infos['date'] = filetype.get_last_modified_date(location) if is_file else None
    infos['size'] = T.size
    infos['sha1'] = sha1(location) if is_file else None
    infos['md5'] = md5(location) if is_file else None
    infos['files_count'] = filetype.get_file_count(location) if is_dir else None
    infos['mime_type'] = T.mimetype_file if is_file else None
    infos['file_type'] = T.filetype_file if is_file else None
    infos['programming_language'] = T.programming_language if is_file else None
    infos['is_binary'] = T.is_binary if is_file else None
    infos['is_text'] = T.is_text if is_file else None
    infos['is_archive'] = T.is_archive if is_file else None
    infos['is_media'] = T.is_media if is_file else None
    infos['is_source'] = T.is_source if is_file else None
    infos['is_script'] = T.is_script if is_file else None
    return [infos]
def test_sha1_checksum(self):
    """SHA1 of a known PNG fixture matches the recorded checksum."""
    expected = u'34ac5465d48a9b04fc275f09bc2230660df8f4f7'
    result = sha1(self.get_test_loc('hash/dir1/a.png'))
    assert result == expected
def test_sha1_checksum_on_dos_text(self):
    """SHA1 of a DOS-line-ending text fixture matches the recorded checksum."""
    expected = u'a71718fb198630ae8ba32926015d8555a03cb06c'
    result = sha1(self.get_test_loc('hash/dir2/dos.txt'))
    assert result == expected
def test_sha1_checksum_on_text2(self):
    """SHA1 of a plain text fixture matches the recorded checksum."""
    expected = u'3ca69e8d6c234a469d16ac28a4a658c92267c423'
    result = sha1(self.get_test_loc('hash/dir2/a.txt'))
    assert result == expected
def test_build_index(self):
    """
    Exercise license index caching end to end: initial build, reuse when
    nothing changed, rebuild driven by check_consistency, ignored files,
    and recovery when the checksum or cache file is deleted.

    Note: this is a rather complex test because caching involves some globals.
    """
    cache_dir = self.get_temp_dir('index_cache')
    lock_file, checksum_file, cache_file = get_license_cache_paths(cache_dir=cache_dir)

    tree_base_dir = self.get_temp_dir('src_dir')
    licenses_data_dir = self.get_test_loc('cache/data/licenses', copy=True)
    rules_data_dir = self.get_test_loc('cache/data/rules', copy=True)

    # now add some file in the mock source tree
    new_file = os.path.join(tree_base_dir, 'some.py')
    # FIX: open in text mode: writing a str to a file opened with 'wb'
    # raises TypeError on Python 3.
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    timeout = 10

    assert not os.path.exists(checksum_file)
    assert not os.path.exists(cache_file)
    assert not os.path.exists(lock_file)

    # when a new index is built, new index files are created
    check_consistency = True
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert os.path.exists(checksum_file)
    assert os.path.exists(cache_file)
    assert not os.path.exists(lock_file)

    # when nothing changed a new index files is not created
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    assert idx_checksum_before == hash.sha1(cache_file)

    # now add some file in the source tree
    new_file = os.path.join(tree_base_dir, 'some file')
    # FIX: text mode here too (was 'wb' with a str payload)
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    # when check_consistency is False, the index is not rebuild when
    # new files are added
    check_consistency = False
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    assert idx_checksum_before == hash.sha1(cache_file)

    # when check_consistency is True, the index is rebuilt when new
    # files are added
    check_consistency = True
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before != open(checksum_file).read()

    # now add some ignored file in the source tree
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    new_file = os.path.join(tree_base_dir, 'some file.pyc')
    # FIX: text mode here too (was 'wb' with a str payload)
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    # when check_consistency is True, the index is not rebuilt when new
    # files are added that are ignored
    check_consistency = True
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    assert idx_checksum_before == hash.sha1(cache_file)

    # if the treechecksum file dies, the index is rebuilt
    fileutils.delete(checksum_file)
    idx_checksum_before = hash.sha1(cache_file)
    check_consistency = False
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()

    # if the index cache file dies the index is rebuilt
    fileutils.delete(cache_file)
    check_consistency = False
    idx1 = cache.get_cached_index(cache_dir, check_consistency, timeout,
                                  tree_base_dir, licenses_data_dir, rules_data_dir)

    # load index, forced from file
    idx2 = cache.load_index(cache_file)
    assert idx1.to_dict(True) == idx2.to_dict(True)

    # reset global caches
    cache._LICENSE_SYMBOLS_BY_SPDX_KEY = {}
    cache._LICENSES_BY_KEY_INDEX = None
    cache._UNKNOWN_SPDX_SYMBOL = None
    cache._LICENSES_BY_KEY = None
def test_LicenseCache_load_or_build_from_empty(self):
    """
    LicenseCache.load_or_build creates cache files when none exist,
    rebuilds the index when force=True, and loads the existing index
    when force=False.
    """
    # recreate internal paths for testing
    licensedcode_cache_dir = self.get_temp_dir('index_cache')
    scancode_cache_dir = self.get_temp_dir('index_metafiles')
    idx_cache_dir = os.path.join(licensedcode_cache_dir, cache.LICENSE_INDEX_DIR)
    fileutils.create_dir(idx_cache_dir)
    cache_file = os.path.join(idx_cache_dir, cache.LICENSE_INDEX_FILENAME)
    lock_file = os.path.join(scancode_cache_dir, cache.LICENSE_LOCKFILE_NAME)

    licenses_data_dir = self.get_test_loc('cache/data/licenses', copy=True)
    rules_data_dir = self.get_test_loc('cache/data/rules', copy=True)

    assert not os.path.exists(cache_file)
    assert not os.path.exists(lock_file)

    def load_or_build(force):
        # Every argument except `force` is invariant across calls.
        return cache.LicenseCache.load_or_build(
            licensedcode_cache_dir=licensedcode_cache_dir,
            scancode_cache_dir=scancode_cache_dir,
            force=force,
            timeout=10,
            licenses_data_dir=licenses_data_dir,
            rules_data_dir=rules_data_dir,
        )

    # when a new cache is built, new cache files are created
    load_or_build(force=False)
    assert os.path.exists(cache_file)

    fileutils.delete(cache_file)

    # force=True builds an index too if none exists
    load_or_build(force=True)
    assert os.path.exists(cache_file)

    # force=True rebuilds an index
    checksum_before = hash.sha1(cache_file)
    load_or_build(force=True)
    assert hash.sha1(cache_file) != checksum_before

    # force=False loads an index
    checksum_before = hash.sha1(cache_file)
    load_or_build(force=False)
    assert hash.sha1(cache_file) == checksum_before
def test_get_or_build_index_through_cache(self):
    """
    Exercise get_or_build_index_through_cache: initial build, reuse when
    nothing changed, rebuild driven by check_consistency, ignored files,
    and recovery when the checksum or cache file is deleted.

    Note: this is a rather complex test because caching involves some globals.
    """
    license_index_cache_dir = self.get_temp_dir('index_cache')
    _index_lock_file = os.path.join(license_index_cache_dir, 'lockfile')
    _tree_checksum_file = os.path.join(license_index_cache_dir, 'tree_checksums')
    _index_cache_file = os.path.join(license_index_cache_dir, 'index_cache')

    _tree_base_dir = self.get_temp_dir('src_dir')
    _licenses_dir = self.get_test_loc('cache/data', copy=True)
    _licenses_data_dir = os.path.join(_licenses_dir, 'licenses')
    _rules_data_dir = os.path.join(_licenses_dir, 'rules')
    _timeout = 10

    assert not os.path.exists(_tree_checksum_file)
    assert not os.path.exists(_index_cache_file)
    assert not os.path.exists(_index_lock_file)

    check_consistency = True
    return_index = False

    # when a new index is built, new index files are created
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert os.path.exists(_tree_checksum_file)
    assert os.path.exists(_index_cache_file)
    assert not os.path.exists(_index_lock_file)

    # when nothing changed a new index files is not created
    tree_before = open(_tree_checksum_file).read()
    idx_checksum_before = hash.sha1(_index_cache_file)
    idx_date_before = date.get_file_mtime(_index_cache_file)
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_checksum_before == hash.sha1(_index_cache_file)
    assert idx_date_before == date.get_file_mtime(_index_cache_file)

    # now add some file in the source tree
    new_file = os.path.join(_tree_base_dir, 'some file')
    # FIX: open in text mode: writing a str to a file opened with 'wb'
    # raises TypeError on Python 3.
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    # when check_consistency is False, the index is not rebuild when
    # new files are added
    check_consistency = False
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_checksum_before == hash.sha1(_index_cache_file)
    assert idx_date_before == date.get_file_mtime(_index_cache_file)

    # when check_consistency is True, the index is rebuilt when new
    # files are added
    check_consistency = True
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert tree_before != open(_tree_checksum_file).read()
    assert idx_date_before != date.get_file_mtime(_index_cache_file)

    # now add some ignored file in the source tree
    tree_before = open(_tree_checksum_file).read()
    idx_checksum_before = hash.sha1(_index_cache_file)
    idx_date_before = date.get_file_mtime(_index_cache_file)
    new_file = os.path.join(_tree_base_dir, 'some file.pyc')
    # FIX: text mode here too (was 'wb' with a str payload)
    with open(new_file, 'w') as nf:
        nf.write('somthing')
    check_consistency = True
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_checksum_before == hash.sha1(_index_cache_file)
    assert idx_date_before == date.get_file_mtime(_index_cache_file)

    # if the treechecksum file dies the index is rebuilt
    fileutils.delete(_tree_checksum_file)
    idx_checksum_before = hash.sha1(_index_cache_file)
    check_consistency = False
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_date_before != date.get_file_mtime(_index_cache_file)

    # if the index cache file dies the index is rebuilt
    fileutils.delete(_index_cache_file)
    check_consistency = False
    cache.get_or_build_index_through_cache(
        check_consistency, return_index,
        _tree_base_dir, _tree_checksum_file, _index_lock_file,
        _index_cache_file, _licenses_data_dir, _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert os.path.exists(_index_cache_file)
def test_hash_10(self):
    """SHA1 of the DOS text fixture matches the recorded checksum."""
    expected = 'a71718fb198630ae8ba32926015d8555a03cb06c'
    result = sha1(self.get_test_loc('hash/dir2/dos.txt'))
    assert result == expected
def test_hash_7(self):
    """SHA1 of the plain text fixture matches the recorded checksum."""
    expected = '3ca69e8d6c234a469d16ac28a4a658c92267c423'
    result = sha1(self.get_test_loc('hash/dir2/a.txt'))
    assert result == expected
def test_hash_1(self):
    """SHA1 of the PNG fixture matches the recorded checksum."""
    expected = '34ac5465d48a9b04fc275f09bc2230660df8f4f7'
    result = sha1(self.get_test_loc('hash/dir1/a.png'))
    assert result == expected
def test_LicenseCache_load_or_build(self):
    """
    LicenseCache.load_or_build honors check_consistency: the cache is
    rebuilt when tracked source files change, left alone for ignored or
    unchecked changes, and rebuilt when its own files go missing.
    """
    # recreate internal paths for testing
    licensedcode_cache_dir = self.get_temp_dir('index_cache')
    scancode_cache_dir = self.get_temp_dir('index_metafiles')
    idx_cache_dir = os.path.join(licensedcode_cache_dir, cache.LICENSE_INDEX_DIR)
    fileutils.create_dir(idx_cache_dir)
    cache_file = os.path.join(idx_cache_dir, cache.LICENSE_INDEX_FILENAME)
    lock_file = os.path.join(scancode_cache_dir, cache.LICENSE_LOCKFILE_NAME)
    checksum_file = os.path.join(scancode_cache_dir, cache.LICENSE_CHECKSUM_FILE)

    tree_base_dir = self.get_temp_dir('src_dir')
    licenses_data_dir = self.get_test_loc('cache/data/licenses', copy=True)
    rules_data_dir = self.get_test_loc('cache/data/rules', copy=True)

    def add_tree_file(name):
        # drop a small file in the mock source tree
        with open(os.path.join(tree_base_dir, name), 'w') as out:
            out.write('somthing')

    def load_or_build(check_consistency):
        # Every argument except check_consistency is invariant across calls.
        return cache.LicenseCache.load_or_build(
            licensedcode_cache_dir=licensedcode_cache_dir,
            scancode_cache_dir=scancode_cache_dir,
            check_consistency=check_consistency,
            timeout=10,
            tree_base_dir=tree_base_dir,
            licenses_data_dir=licenses_data_dir,
            rules_data_dir=rules_data_dir,
        )

    # now add some file in the mock source tree
    add_tree_file('some.py')

    assert not os.path.exists(checksum_file)
    assert not os.path.exists(cache_file)
    assert not os.path.exists(lock_file)

    # when a new cache is built, new cache files are created
    load_or_build(check_consistency=True)
    assert os.path.exists(checksum_file)
    assert os.path.exists(cache_file)

    # when nothing changed a new index files is not created
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    load_or_build(check_consistency=True)
    assert open(checksum_file).read() == tree_before
    assert hash.sha1(cache_file) == idx_checksum_before

    # now add some file in the source tree
    add_tree_file('some file')

    # when check_consistency is False, the index is not rebuild when
    # new files are added
    load_or_build(check_consistency=False)
    assert open(checksum_file).read() == tree_before
    assert hash.sha1(cache_file) == idx_checksum_before

    # when check_consistency is True, the index is rebuilt when new
    # files are added
    load_or_build(check_consistency=True)
    assert open(checksum_file).read() != tree_before

    # now add some ignored file in the source tree
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    add_tree_file('some file.pyc')

    # when check_consistency is True, the index is not rebuilt when new
    # files are added that are ignored
    load_or_build(check_consistency=True)
    assert open(checksum_file).read() == tree_before
    assert hash.sha1(cache_file) == idx_checksum_before

    # if the treechecksum file dies, the index is not rebuilt if
    # check_consistency is False and no new checksum is created
    fileutils.delete(checksum_file)
    idx_checksum_before = hash.sha1(cache_file)
    load_or_build(check_consistency=False)
    assert not os.path.exists(checksum_file)

    # with the treechecksum file gone, the index is rebuilt if
    # check_consistency is True and a new checksum is created
    idx_checksum_before = hash.sha1(cache_file)
    load_or_build(check_consistency=True)
    assert open(checksum_file).read() == tree_before

    # if the index cache file dies the index is rebuilt
    fileutils.delete(cache_file)
    rebuilt = load_or_build(check_consistency=False)
    idx1 = rebuilt.index
    # load index, forced from file
    idx2 = cache.load_cache_file(cache_file).index
    assert set(idx2.dictionary.keys()) == set(idx1.dictionary.keys())