def build_index(licenses_db=None, licenses_data_dir=None, rules_data_dir=None):
    """
    Return a LicenseIndex built from the rules and licenses directories.

    Any of `licenses_db` (a mapping of license key -> License), `licenses_data_dir`
    and `rules_data_dir` may be provided to override the defaults from
    licensedcode.models.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import get_license_tokens
    from licensedcode.models import licenses_data_dir as default_licenses_dir
    from licensedcode.models import rules_data_dir as default_rules_dir
    from licensedcode.models import load_licenses
    from licensedcode.legalese import common_license_words

    # fall back to the standard data directories when none are provided
    if not licenses_data_dir:
        licenses_data_dir = default_licenses_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    # load the licenses DB only if the caller did not hand one in
    if not licenses_db:
        licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)

    rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

    return LicenseIndex(
        rules,
        _legalese=common_license_words,
        _spdx_tokens=set(get_all_spdx_key_tokens(licenses_db)),
        _license_tokens=set(get_license_tokens()),
    )
def test_all_spdx_tokens_exists_in_dictionary(self):
    """
    Check that every SPDX key token is present in the index dictionary.

    The original version indexed ``dic[token]`` one token at a time: a
    failure surfaced as a bare KeyError for only the first missing token,
    with no assertion message. Collect all missing tokens first so a
    failure reports the complete set at once.
    """
    idx = cache.get_index()
    dic = idx.dictionary
    licenses = cache.get_licenses_db()
    tokens = set(models.get_all_spdx_key_tokens(licenses))
    missing = sorted(token for token in tokens if token not in dic)
    assert not missing, 'SPDX key tokens missing from index dictionary: %r' % (missing,)
def test_all_spdx_tokens_exists_in_dictionary(self):
    """
    Check that every SPDX key token is present in the index dictionary.

    The original version wrapped the subset assertion in a bare ``except:``
    (which also swallows SystemExit/KeyboardInterrupt) and re-raised by
    indexing ``dic[token]``, reporting only the first missing token as a
    KeyError. Compute the difference directly and assert on it so a failure
    reports every missing token in one message.
    """
    idx = cache.get_index()
    licenses = cache.get_licenses_db()
    tokens = set(models.get_all_spdx_key_tokens(licenses))
    keys = set(idx.dictionary)
    missing = sorted(tokens - keys)
    assert not missing, 'SPDX key tokens missing from index dictionary: %r' % (missing,)
def build_index(licenses_db=None, licenses_data_dir=None, rules_data_dir=None):
    """
    Return a LicenseIndex built from the rules and licenses directories.

    `licenses_db`, `licenses_data_dir` and `rules_data_dir` may be provided
    to override the defaults from licensedcode.models.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as default_licenses_dir
    from licensedcode.models import rules_data_dir as default_rules_dir

    # fall back to the standard data directories when none are provided
    if not licenses_data_dir:
        licenses_data_dir = default_licenses_dir
    if not rules_data_dir:
        rules_data_dir = default_rules_dir

    # build the licenses DB only if the caller did not hand one in
    if licenses_db:
        licenses = licenses_db
    else:
        licenses = build_licenses_db(licenses_data_dir=licenses_data_dir)

    spdx_tokens = set(get_all_spdx_key_tokens(licenses))
    rules = get_rules(
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    return LicenseIndex(rules, _spdx_tokens=spdx_tokens)
def get_cached_index(cache_dir=scancode_cache_dir,
                     check_consistency=SCANCODE_DEV_MODE,
                     # used for testing only
                     timeout=LICENSE_INDEX_LOCK_TIMEOUT,
                     tree_base_dir=scancode_src_dir,
                     licenses_data_dir=None, rules_data_dir=None,):
    """
    Return a LicenseIndex: either load a cached index or build and cache the
    index.
    - If the cache does not exist, a new index is built and cached.
    - If `check_consistency` is True, the cache is checked for consistency
      and rebuilt if inconsistent or stale.
    - If `check_consistency` is False, the cache is NOT checked for consistency
      If the cache files exist but ARE stale, the cache WILL NOT be rebuilt
    """
    # imports are local to avoid circular imports at module load time
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as ldd
    from licensedcode.models import rules_data_dir as rdd

    licenses_data_dir = licenses_data_dir or ldd
    rules_data_dir = rules_data_dir or rdd

    lock_file, checksum_file, cache_file = get_license_cache_paths(cache_dir)

    has_cache = exists(cache_file)
    has_tree_checksum = exists(checksum_file)

    # bypass check if no consistency check is needed
    if has_cache and has_tree_checksum and not check_consistency:
        return load_index(cache_file)

    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with yg.lockfile.FileLock(lock_file, timeout=timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                # NOTE: checksum file is read/written in binary mode throughout
                with open(checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data
                    # load and return
                    return load_index(cache_file)

            # Here, the cache is not consistent with the latest code and
            # data: It is either stale or non-existing: we need to
            # rebuild the index and cache it
            rules = get_rules(
                licenses_data_dir=licenses_data_dir,
                rules_data_dir=rules_data_dir)
            license_db = get_licenses_db(licenses_data_dir=licenses_data_dir)
            spdx_tokens = set(get_all_spdx_key_tokens(license_db))
            idx = LicenseIndex(rules, _spdx_tokens=spdx_tokens)
            # serialize the index to the cache file while still holding the lock
            with open(cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())
            # save the new checksums tree
            # reuse the checksum computed above when available; otherwise
            # compute it fresh (the cache-miss path never computed one)
            with open(checksum_file, 'wb') as ctcs:
                ctcs.write(current_checksum
                           or tree_checksum(tree_base_dir=tree_base_dir))
            return idx

    except yg.lockfile.FileLockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise
def get_cached_index(
    licensedcode_cache_dir=licensedcode_cache_dir,
    scancode_cache_dir=scancode_cache_dir,
    check_consistency=SCANCODE_DEV_MODE,
    # used for testing only
    timeout=LICENSE_INDEX_LOCK_TIMEOUT,
    tree_base_dir=scancode_src_dir,
    licenses_data_dir=None,
    rules_data_dir=None,
):
    """
    Return a LicenseIndex: either load a cached index or build and cache the
    index.
    - If the cache does not exist, a new index is built and cached.
    - If `check_consistency` is True, the cache is checked for consistency
      and rebuilt if inconsistent or stale.
    - If `check_consistency` is False, the cache is NOT checked for consistency
      If the cache files exist but ARE stale, the cache WILL NOT be rebuilt
    """
    # imports are local to avoid circular imports at module load time
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules
    from licensedcode.models import get_all_spdx_key_tokens
    from licensedcode.models import licenses_data_dir as ldd
    from licensedcode.models import rules_data_dir as rdd
    from scancode import lockfile

    licenses_data_dir = licenses_data_dir or ldd
    rules_data_dir = rules_data_dir or rdd

    lock_file, checksum_file, cache_file = get_license_cache_paths(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
    )

    has_cache = has_cache_index_file(cache_file)
    # bypass check if no consistency check is needed
    if has_cache and not check_consistency:
        try:
            return load_index(cache_file)
        except Exception as e:
            # work around some rare Windows quirks: on failure to load, fall
            # through to the locked rebuild path below instead of crashing
            import traceback
            print(
                'Inconsistent License index cache: checking and rebuilding index.'
            )
            print(str(e))
            print(traceback.format_exc())

    has_tree_checksum = os.path.exists(checksum_file)
    # here, we have no cache or we want a validity check: lock, check
    # and build or rebuild as needed
    try:
        # acquire lock and wait until timeout to get a lock or die
        with lockfile.FileLock(lock_file).locked(timeout=timeout):
            current_checksum = None
            # is the current cache consistent or stale?
            if has_cache and has_tree_checksum:
                # if we have a saved cached index
                # load saved tree_checksum and compare with current tree_checksum
                # NOTE: checksum file is read/written in text mode throughout
                with open(checksum_file) as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
                if current_checksum == existing_checksum:
                    # The cache is consistent with the latest code and data
                    # load and return
                    return load_index(cache_file)

            # Here, the cache is not consistent with the latest code and
            # data: It is either stale or non-existing: we need to
            # rebuild the index and cache it

            # FIXME: caching a pickle of this would be 10x times faster
            license_db = get_licenses_db(licenses_data_dir=licenses_data_dir)
            rules = get_rules(licenses_data_dir=licenses_data_dir,
                              rules_data_dir=rules_data_dir)
            spdx_tokens = set(get_all_spdx_key_tokens(license_db))

            idx = LicenseIndex(rules, _spdx_tokens=spdx_tokens)

            # serialize the index to the cache file while still holding the lock
            with open(cache_file, 'wb') as ifc:
                idx.dump(ifc)

            # save the new tree checksum
            current_checksum = tree_checksum(tree_base_dir=tree_base_dir)
            with open(checksum_file, 'w') as ctcs:
                ctcs.write(current_checksum)

            return idx

    except lockfile.LockTimeout:
        # TODO: handle unable to lock in a nicer way
        raise