def test_all_spdx_tokens_exists_in_dictionary(self):
    """
    Every token used in the SPDX license keys must be present in the
    license index dictionary.
    """
    index = cache.get_index()
    dictionary = index.dictionary
    licenses_db = cache.get_licenses_db()
    # Indexing raises KeyError on the first missing token, failing the test.
    for spdx_token in models.get_all_spdx_key_tokens(licenses_db):
        dictionary[spdx_token]
def get_licenses(location, min_score=0, include_text=False, diag=False,
                 license_url_template=DEJACODE_LICENSE_URL):
    """
    Yield mappings of license data detected in the file at `location`.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means that all license matches will be returned. With any other value,
    matches that have a score below the minimum score will not be returned.

    If `include_text` is True, the matched text is included in the returned
    data.

    If `diag` is True, additional match details are returned with the
    matched_rule key of the returned mapping.
    """
    # Imported lazily: building the index/db is expensive and cached.
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db

    idx = get_index()
    licenses = get_licenses_db()

    for match in idx.match(location=location, min_score=min_score):
        if include_text:
            matched_text = match.matched_text(whole_lines=False)
        # One result mapping is yielded per license key of the matched rule.
        for license_key in match.rule.licenses:
            lic = licenses.get(license_key)
            result = OrderedDict()
            result['key'] = lic.key
            result['score'] = match.score()
            result['short_name'] = lic.short_name
            result['category'] = lic.category
            result['owner'] = lic.owner
            result['homepage_url'] = lic.homepage_url
            result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
            result['reference_url'] = license_url_template.format(lic.key)
            spdx_key = lic.spdx_license_key
            result['spdx_license_key'] = spdx_key
            if spdx_key:
                # SPDX URLs do not carry the "+" ("or later") suffix.
                spdx_key = lic.spdx_license_key.rstrip('+')
                spdx_url = SPDX_LICENSE_URL.format(spdx_key)
            else:
                spdx_url = ''
            result['spdx_url'] = spdx_url
            result['start_line'] = match.start_line
            result['end_line'] = match.end_line
            matched_rule = result['matched_rule'] = OrderedDict()
            matched_rule['identifier'] = match.rule.identifier
            matched_rule['license_choice'] = match.rule.license_choice
            matched_rule['licenses'] = match.rule.licenses
            # FIXME: for sanity these should always be included???
            if diag:
                matched_rule['matcher'] = match.matcher
                matched_rule['rule_length'] = match.rule.length
                matched_rule['matched_length'] = match.ilen()
                matched_rule['match_coverage'] = match.coverage()
                matched_rule['rule_relevance'] = match.rule.relevance
            # FIXME: for sanity this should always be included?????
            if include_text:
                result['matched_text'] = matched_text
            yield result
def process_codebase(self, codebase, licenses_reference, **kwargs):
    """
    Collect the license keys of all license expressions detected across the
    codebase resources and packages, then append the full reference data of
    each key to `codebase.attributes.licenses_reference`.
    """
    # Imported lazily: loading the license db is expensive and cached.
    from licensedcode.cache import get_licenses_db
    licensing = Licensing()
    license_keys = set()

    for resource in codebase.walk():
        licexps = getattr(resource, 'license_expressions', []) or []
        for expression in licexps:
            if expression:
                license_keys.update(licensing.license_keys(expression))

        # NOTE(review): packages are read from the codebase, not the current
        # resource, so the same packages are re-processed on every walk
        # iteration; the set dedupes the keys -- confirm this is intended.
        packages = getattr(codebase, 'packages', []) or []
        for package in packages:
            # FIXME: license_expression attribute name is changing soon
            expression = package.get('license_expression')
            if expression:
                license_keys.update(licensing.license_keys(expression))
        resource.save(codebase)

    db = get_licenses_db()
    for key in sorted(license_keys):
        # Full license details, including the license text itself.
        license_details = db[key].to_dict(
            include_ignorables=False,
            include_text=True,
        )
        codebase.attributes.licenses_reference.append(license_details)
def get_licenses(location, min_score=0, include_text=False, diag=False):
    """
    Yield dictionaries of license data detected in the file at `location`.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means that all license matches will be returned. With any other value,
    matches that have a score below the minimum score will not be returned.

    If `include_text` is True, the matched text is included in the returned
    data.

    If `diag` is True, additional match details are returned with the
    matched_rule key of the returned mapping.
    """
    # Imported lazily: building the index/db is expensive and cached.
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db
    from licensedcode.match import get_full_matched_text

    idx = get_index()
    licenses = get_licenses_db()

    for match in idx.match(location=location, min_score=min_score):
        if include_text:
            matched_text = u''.join(
                get_full_matched_text(match, location=location, idx=idx,
                                      whole_lines=False))
        # One result mapping is yielded per license key of the matched rule.
        for license_key in match.rule.licenses:
            lic = licenses.get(license_key)
            result = OrderedDict()
            result['key'] = lic.key
            result['score'] = match.score()
            result['short_name'] = lic.short_name
            result['category'] = lic.category
            result['owner'] = lic.owner
            result['homepage_url'] = lic.homepage_url
            result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
            result['dejacode_url'] = DEJACODE_LICENSE_URL.format(lic.key)
            spdx_key = lic.spdx_license_key
            result['spdx_license_key'] = spdx_key
            if spdx_key:
                # SPDX URLs do not carry the "+" ("or later") suffix.
                spdx_key = lic.spdx_license_key.rstrip('+')
                spdx_url = SPDX_LICENSE_URL.format(spdx_key)
            else:
                spdx_url = ''
            result['spdx_url'] = spdx_url
            result['start_line'] = match.start_line
            result['end_line'] = match.end_line
            matched_rule = result['matched_rule'] = OrderedDict()
            matched_rule['identifier'] = match.rule.identifier
            matched_rule['license_choice'] = match.rule.license_choice
            matched_rule['licenses'] = match.rule.licenses
            if diag:
                matched_rule['matcher'] = match.matcher
                matched_rule['rule_length'] = match.rule.length
                matched_rule['matched_length'] = match.ilen()
                matched_rule['match_coverage'] = match.coverage()
                matched_rule['rule_relevance'] = match.rule.relevance
            if include_text:
                result['matched_text'] = matched_text
            yield result
def _licenses_data_from_match(
        match, include_text=False, license_text_diagnostics=False,
        license_url_template=DEJACODE_LICENSE_URL):
    """
    Return a list of "licenses" scan data mappings built from a license
    `match`. Used directly only internally for testing.

    If `include_text` is True, the matched text is included in each mapping;
    with `license_text_diagnostics` also True, only the matched fragment is
    included with highlighting, otherwise whole matched lines are included.
    """
    from licensedcode import cache
    licenses = cache.get_licenses_db()

    matched_text = None
    if include_text:
        if license_text_diagnostics:
            matched_text = match.matched_text(whole_lines=False, highlight=True)
        else:
            matched_text = match.matched_text(whole_lines=True, highlight=False)

    detected_licenses = []
    # One result mapping per license key of the matched rule.
    for license_key in match.rule.license_keys():
        lic = licenses.get(license_key)
        result = OrderedDict()
        detected_licenses.append(result)
        result['key'] = lic.key
        result['score'] = match.score()
        result['name'] = lic.name
        result['short_name'] = lic.short_name
        result['category'] = lic.category
        result['is_exception'] = lic.is_exception
        result['owner'] = lic.owner
        result['homepage_url'] = lic.homepage_url
        result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
        result['reference_url'] = license_url_template.format(lic.key)
        spdx_key = lic.spdx_license_key
        result['spdx_license_key'] = spdx_key
        if spdx_key:
            # SPDX URLs do not carry the "+" ("or later") suffix.
            spdx_key = lic.spdx_license_key.rstrip('+')
            spdx_url = SPDX_LICENSE_URL.format(spdx_key)
        else:
            spdx_url = ''
        result['spdx_url'] = spdx_url
        result['start_line'] = match.start_line
        result['end_line'] = match.end_line
        matched_rule = result['matched_rule'] = OrderedDict()
        matched_rule['identifier'] = match.rule.identifier
        matched_rule['license_expression'] = match.rule.license_expression
        matched_rule['licenses'] = match.rule.license_keys()
        matched_rule['is_license_text'] = match.rule.is_license_text
        matched_rule['is_license_notice'] = match.rule.is_license_notice
        matched_rule['is_license_reference'] = match.rule.is_license_reference
        matched_rule['is_license_tag'] = match.rule.is_license_tag
        matched_rule['matcher'] = match.matcher
        matched_rule['rule_length'] = match.rule.length
        matched_rule['matched_length'] = match.len()
        matched_rule['match_coverage'] = match.coverage()
        matched_rule['rule_relevance'] = match.rule.relevance
        # FIXME: for sanity this should always be included?????
        if include_text:
            result['matched_text'] = matched_text
    return detected_licenses
def get_spdx_keys():
    """
    Return a frozenset of ScanEngine license keys for licenses that are
    listed in SPDX. The result is computed once and cached at module level.
    """
    global _spdx_keys
    if _spdx_keys:
        return _spdx_keys
    _spdx_keys = frozenset(models.get_all_spdx_keys(get_licenses_db()))
    return _spdx_keys
def test_validate_license_library(self):
    """The cached license db must validate with no errors and no warnings."""
    validation = models.License.validate(
        cache.get_licenses_db(),
        verbose=False,
    )
    errors, warnings, infos = validation
    assert errors == {}
    assert warnings == {}
    assert infos
def test_all_spdx_tokens_exists_in_dictionary(self):
    """
    Every token used in the SPDX license keys must be present in the
    license index dictionary.
    """
    idx = cache.get_index()
    dic = idx.dictionary
    licenses = cache.get_licenses_db()
    tokens = set(models.get_all_spdx_key_tokens(licenses))
    keys = set(dic)
    try:
        assert tokens.issubset(keys)
    # Narrowed from a bare `except:` which would also swallow
    # KeyboardInterrupt/SystemExit; the try body is a single assert.
    except AssertionError:
        # Re-trigger the failure as a KeyError naming the first missing
        # token, for a clearer diagnostic; sorted for determinism.
        for token in sorted(tokens):
            dic[token]
def __init__(self, src_dir, match_text=False, match_approx=False):
    """
    `src_dir` is where the License objects are dumped.

    Creates sibling '-update', '-new' and '-del' work directories next to
    `src_dir` if they do not exist yet.
    """
    src_dir = os.path.realpath(src_dir)
    self.src_dir = src_dir
    self.match_text = match_text
    self.match_approx = match_approx
    self.fetched = False
    if os.path.exists(src_dir):
        # fetch ONLY if the directory is empty: an existing src_dir is
        # treated as already fetched.
        self.fetched = True
    else:
        os.mkdir(src_dir)

    # rstrip('\\/') drops any trailing path separator before suffixing.
    self.update_dir = self.src_dir.rstrip('\\/') + '-update'
    if not os.path.exists(self.update_dir):
        os.mkdir(self.update_dir)

    self.new_dir = self.src_dir.rstrip('\\/') + '-new'
    if not os.path.exists(self.new_dir):
        os.mkdir(self.new_dir)

    self.del_dir = self.src_dir.rstrip('\\/') + '-del'
    if not os.path.exists(self.del_dir):
        os.mkdir(self.del_dir)

    # Index known licenses by key and by lowercased SPDX key.
    self.scancodes_by_key = get_licenses_db()
    self.scancodes_by_spdx_key = {
        l.spdx_license_key.lower(): l
        for l in self.scancodes_by_key.values()
        if l.spdx_license_key
    }

    composites_dir = os.path.join(licensedcode.data_dir, 'composites', 'licenses')
    self.composites_by_key = load_licenses(composites_dir, with_deprecated=True)
    self.composites_by_spdx_key = {
        l.spdx_license_key.lower(): l
        for l in self.composites_by_key.values()
        if l.spdx_license_key
    }

    foreign_dir = os.path.join(licensedcode.data_dir, 'non-english', 'licenses')
    self.non_english_by_key = load_licenses(foreign_dir, with_deprecated=True)
    self.non_english_by_spdx_key = {
        l.spdx_license_key.lower(): l
        for l in self.non_english_by_key.values()
        if l.spdx_license_key
    }
def get_rules(licenses_data_dir=licenses_data_dir, rules_data_dir=rules_data_dir):
    """
    Return an iterable of Rule objects loaded from license files found in
    `licenses_data_dir` and rule files found in `rules_data_dir`. Raise an
    Exception if a rule is inconsistent or incorrect.
    """
    from licensedcode.cache import get_licenses_db
    licenses = get_licenses_db(licenses_data_dir=licenses_data_dir)
    rules = list(load_rules(rules_data_dir=rules_data_dir))
    # Fail early on rules referencing unknown license keys.
    check_rules_integrity(rules, licenses)
    # License texts themselves are also used as detection rules.
    licenses_as_rules = build_rules_from_licenses(licenses)
    return chain(licenses_as_rules, rules)
def get_spdx_keys():
    """
    Return a frozenset of ScanCode license keys for licenses that are listed
    in SPDX. The result is computed once and cached at module level.
    """
    global _spdx_keys
    if not _spdx_keys:
        # A license is SPDX-listed if it has a primary or alternate SPDX key.
        spdx_listed = {
            lic.key
            for lic in get_licenses_db().values()
            if lic.spdx_license_key or lic.other_spdx_license_keys
        }
        _spdx_keys = frozenset(spdx_listed)
    return _spdx_keys
def get_rules(licenses_data_dir=licenses_data_dir, rules_data_dir=rules_data_dir):
    """
    Return a mapping of key->license and an iterable of license detection
    rules loaded from licenses and rules files. Raise a MissingLicenses
    exceptions if a rule references unknown license keys.
    """
    from licensedcode.cache import get_licenses_db
    licenses_db = get_licenses_db(licenses_data_dir=licenses_data_dir)
    loaded_rules = list(load_rules(rules_data_dir=rules_data_dir))
    # Validate before building: fail fast on unknown license keys.
    check_rules_integrity(loaded_rules, licenses_db)
    return chain(build_rules_from_licenses(licenses_db), loaded_rules)
def cli(path=(), update=True):
    """
    Update licenses and rules with ignorable copyrights, holders, authors
    URLs and emails.
    """
    # Gather every known License object and every Rule object.
    candidates = [*cache.get_licenses_db().values(), *models.load_rules()]
    if path:
        # Keep only entries whose text or data file matches the path suffix.
        candidates = [
            candidate for candidate in candidates
            if candidate.text_file.endswith(path)
            or candidate.data_file.endswith(path)
        ]
    refresh_ignorables(candidates)
def check_ignorable_clues(licensish, regen=REGEN_TEST_FIXTURES, verbose=False):
    """
    Validate that all expected ignorable clues declared in a `licensish`
    License or Rule object are properly detected in that rule text file.

    Optionally ``regen`` the ignorables to update the License or Rule .yml
    data file.
    """
    result = models.get_ignorables(text_file=licensish.text_file)
    if verbose:
        print()
        print('result')
        pprint(result)

    if regen:
        is_from_license = licensish.is_from_license
        if is_from_license:
            # NOTE(review): for a rule built from a license, the rule's
            # license_expression is presumably the plain license key used to
            # index the db -- confirm.
            db = cache.get_licenses_db()
            licish = db[licensish.license_expression]
        else:
            licish = licensish
        models.set_ignorables(licish, result, verbose=verbose)
        licish.dump()
        if is_from_license:
            # Rebuild the rule so the expectation reflects the updated license.
            licensish = models.build_rule_from_license(licish)

    expected = models.get_normalized_ignorables(licensish)
    if verbose:
        print('expected')
        pprint(expected)

    try:
        assert result == expected
    # Narrowed from a bare `except:` which would also swallow
    # KeyboardInterrupt/SystemExit; the try body is a single assert.
    except AssertionError:
        # On failure, we compare again to get additional failure details such as
        # a clickable text_file path.
        data_file = licensish.data_file
        if not data_file:
            data_file = licensish.text_file.replace('.LICENSE', '.yml')
        result['files'] = [
            f'file://{data_file}',
            f'file://{licensish.text_file}',
        ]
        # This assert will always fail and provide a more detailed failure trace
        assert saneyaml.dump(result) == saneyaml.dump(expected)
def __init__(self, src_dir, match_text=False, match_approx=False):
    """
    `src_dir` is where the License objects are dumped.

    Creates sibling '-update', '-new' and '-del' work directories next to
    `src_dir` if they do not exist yet.
    """
    src_dir = os.path.realpath(src_dir)
    self.src_dir = src_dir
    self.match_text = match_text
    self.match_approx = match_approx
    self.fetched = False
    if os.path.exists(src_dir):
        # fetch ONLY if the directory is empty: an existing src_dir is
        # treated as already fetched.
        self.fetched = True
    else:
        os.mkdir(src_dir)

    # rstrip('\\/') drops any trailing path separator before suffixing.
    self.update_dir = self.src_dir.rstrip('\\/') + '-update'
    if not os.path.exists(self.update_dir):
        os.mkdir(self.update_dir)

    self.new_dir = self.src_dir.rstrip('\\/') + '-new'
    if not os.path.exists(self.new_dir):
        os.mkdir(self.new_dir)

    self.del_dir = self.src_dir.rstrip('\\/') + '-del'
    if not os.path.exists(self.del_dir):
        os.mkdir(self.del_dir)

    # Index known licenses by key and by lowercased SPDX key.
    self.scancodes_by_key = get_licenses_db()
    self.scancodes_by_spdx_key = {l.spdx_license_key.lower(): l
        for l in self.scancodes_by_key.values()
        if l.spdx_license_key}

    composites_dir = os.path.join(licensedcode.data_dir, 'composites', 'licenses')
    self.composites_by_key = load_licenses(composites_dir, with_deprecated=True)
    self.composites_by_spdx_key = {l.spdx_license_key.lower(): l
        for l in self.composites_by_key.values()
        if l.spdx_license_key}

    foreign_dir = os.path.join(licensedcode.data_dir, 'non-english', 'licenses')
    self.non_english_by_key = load_licenses(foreign_dir, with_deprecated=True)
    self.non_english_by_spdx_key = {l.spdx_license_key.lower(): l
        for l in self.non_english_by_key.values()
        if l.spdx_license_key}
def generate_output(results, version, template):
    """
    Yield unicode strings from incrementally rendering `results` and
    `version` with the Jinja `template` object.

    `results` is an iterable of per-file scan data mappings keyed at least
    by 'path'.
    """
    # FIXME: This code is highly coupled with actual scans and may not
    # support adding new scans at all
    from licensedcode.cache import get_licenses_db

    converted = {}
    converted_infos = {}
    converted_packages = {}
    licenses = {}

    LICENSES = 'licenses'
    COPYRIGHTS = 'copyrights'
    PACKAGES = 'package_data'

    # Create a flattened data dict keyed by path
    for scanned_file in results:
        scanned_file = dict(scanned_file)
        path = scanned_file['path']
        # Fix: the original rebound the `results` parameter here, shadowing
        # the input iterable being looped over; use a distinct name.
        file_results = []
        if COPYRIGHTS in scanned_file:
            for entry in scanned_file[COPYRIGHTS]:
                file_results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'copyright',
                    'value': entry['copyright'],
                })
        if LICENSES in scanned_file:
            for entry in scanned_file[LICENSES]:
                # make copy
                entry = dict(entry)
                entry_key = entry['key']
                file_results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'license',
                    'value': entry_key,
                })
                # FIXME: we should NOT rely on license objects: only use what is in the JSON instead
                if entry_key not in licenses:
                    licenses[entry_key] = entry
                    # we were modifying the scan data in place ....
                    entry['object'] = get_licenses_db().get(entry_key)
        if file_results:
            converted[path] = sorted(file_results, key=itemgetter('start'))

        # TODO: this is klunky: we need to drop templates entirely or we
        # should rather just pass a the list of files from the scan
        # results and let the template handle this rather than
        # denormalizing the list here??
        converted_infos[path] = {}
        for name, value in scanned_file.items():
            if name in (LICENSES, PACKAGES, COPYRIGHTS):
                continue
            converted_infos[path][name] = value

        if PACKAGES in scanned_file:
            converted_packages[path] = scanned_file[PACKAGES]

    licenses = dict(sorted(licenses.items()))

    files = {
        'license_copyright': converted,
        'infos': converted_infos,
        'package_data': converted_packages
    }
    return template.generate(files=files, licenses=licenses, version=version)
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contain one or more blocks such as this:

    \b
    ----------------------------------------
    license_expression: lgpl-2.1
    relevance: 100
    is_license_notice: yes
    ---
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License
    version 2.1 as published by the Free Software Foundation;
    ----------------------------------------
    """
    rules_data = load_data(licenses_file)
    # Token tuples of all existing rules, used to detect duplicates.
    rules_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    # Validate all candidate rules before writing anything to disk.
    errors = validate_license_rules(rules_data, licensing)
    if errors:
        print('Invalid rules: exiting....')
        for error in errors:
            print(error)
            print()
        raise Exception('Invalid rules: exiting....')

    print()
    for rule in rules_data:
        is_negative = rule.data.get('is_negative')
        is_false_positive = rule.data.get('is_false_positive')
        existing = rule_exists(rule.text)
        if existing and not is_negative:
            print('Skipping existing non-negative rule:', existing,
                  'with text:\n', rule.text[:50].strip(), '...')
            continue

        if is_negative:
            base_name = 'not-a-license'
        else:
            license_expression = rule.data.get('license_expression')
            # Normalize the expression through a validating parse.
            license_expression = str(
                licensing.parse(license_expression, validate=True, simple=True))
            base_name = license_expression

        if is_false_positive:
            base_name = 'false-positive_' + base_name

        base_loc = find_rule_base_loc(base_name)

        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rule.raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rule.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in rules_tokens:
            # cleanup: a rule with identical tokens already exists, so drop
            # the files just written.
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', base_name)
        else:
            rules_tokens.add(rule_tokens)
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
def as_template(scanned_files, template):
    """
    Return an string built from a list of `scanned_files` results and the
    provided `template` identifier. The template defaults to the standard HTML
    template format or can point to the path of a custom template file.
    """
    # FIXME: This code is highly coupled with actual scans and may not
    # support adding new scans at all
    from licensedcode.cache import get_licenses_db

    # FIXME: factor out the html vs custom from this function: we should get a template path
    if template == 'html':
        template = get_template(get_template_dir('html'))
    else:
        # load a custom template
        tpath = fileutils.as_posixpath(os.path.abspath(os.path.expanduser(template)))
        assert os.path.isfile(tpath)
        tdir = fileutils.parent_directory(tpath)
        tfile = fileutils.file_name(tpath)
        template = get_template(tdir, tfile)

    converted = OrderedDict()
    converted_infos = OrderedDict()
    converted_packages = OrderedDict()
    licenses = {}

    LICENSES = 'licenses'
    COPYRIGHTS = 'copyrights'
    PACKAGES = 'packages'
    URLS = 'urls'
    EMAILS = 'emails'

    # Create a flattened data dict keyed by path
    for scanned_file in scanned_files:
        path = scanned_file['path']
        results = []
        if COPYRIGHTS in scanned_file:
            for entry in scanned_file[COPYRIGHTS]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'copyright',
                    # NOTE: we display one statement per line.
                    'value': '\n'.join(entry['statements']),
                })
        if LICENSES in scanned_file:
            for entry in scanned_file[LICENSES]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'license',
                    'value': entry['key'],
                })
                # FIXME: we should NOT rely on license objects: only use what is in the JSON instead
                if entry['key'] not in licenses:
                    licenses[entry['key']] = entry
                    entry['object'] = get_licenses_db().get(entry['key'])
        if results:
            converted[path] = sorted(results, key=itemgetter('start'))

        # TODO: this is klunky: we need to drop templates entirely or we
        # should rather just pass a the list of files from the scan
        # results and let the template handle this rather than
        # denormalizing the list here??
        converted_infos[path] = OrderedDict()
        for name, value in scanned_file.items():
            if name in (LICENSES, PACKAGES, COPYRIGHTS, EMAILS, URLS):
                continue
            converted_infos[path][name] = value

        if PACKAGES in scanned_file:
            converted_packages[path] = scanned_file[PACKAGES]

    licenses = OrderedDict(sorted(licenses.items()))

    files = {
        'license_copyright': converted,
        'infos': converted_infos,
        'packages': converted_packages
    }
    return template.generate(files=files, licenses=licenses)
# also verify that we are detecting exactly with the license rule itself test_method = make_license_test_function(license_key, license_obj.text_file, license_obj.data_file, test_name, detect_negative=True, trace_text=True) setattr(cls, test_name, test_method) class TestValidateLicenseTextDetection(unittest.TestCase): # Test functions are attached to this class at import time pass build_license_validation_tests(cache.get_licenses_db(), TestValidateLicenseTextDetection) def build_rule_validation_tests(rules, cls): """ Dynamically build an individual test method for each rule texts in a rules `data_set` then mapping attaching the test method to the `cls` test class. """ for rule in rules: if rule.negative: continue expected_identifier = rule.identifier test_name = ('test_validate_self_detection_of_rule_for_' + text.python_safe_name(expected_identifier)) test_method = make_license_test_function(
def as_template(scanned_files, version, template):
    """
    Return an string built from a list of `scanned_files` results and the
    provided `template` identifier. The template defaults to the standard HTML
    template format or can point to the path of a custom template file.
    """
    # FIXME: This code is highly coupled with actual scans and may not
    # support adding new scans at all
    from licensedcode.cache import get_licenses_db

    # FIXME: factor out the html vs custom from this function: we should get a template path
    if template == 'html':
        template = get_template(get_template_dir('html'))
    else:
        # load a custom template
        tpath = fileutils.as_posixpath(
            os.path.abspath(os.path.expanduser(template)))
        assert os.path.isfile(tpath)
        tdir = fileutils.parent_directory(tpath)
        tfile = fileutils.file_name(tpath)
        template = get_template(tdir, tfile)

    converted = OrderedDict()
    converted_infos = OrderedDict()
    converted_packages = OrderedDict()
    licenses = {}

    LICENSES = 'licenses'
    COPYRIGHTS = 'copyrights'
    PACKAGES = 'packages'
    URLS = 'urls'
    EMAILS = 'emails'

    # Create a flattened data dict keyed by path
    for scanned_file in scanned_files:
        path = scanned_file['path']
        results = []
        if COPYRIGHTS in scanned_file:
            for entry in scanned_file[COPYRIGHTS]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'copyright',
                    # NOTE: we display one statement per line.
                    'value': '\n'.join(entry['statements']),
                })
        if LICENSES in scanned_file:
            for entry in scanned_file[LICENSES]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'license',
                    'value': entry['key'],
                })
                # FIXME: we should NOT rely on license objects: only use what is in the JSON instead
                if entry['key'] not in licenses:
                    licenses[entry['key']] = entry
                    entry['object'] = get_licenses_db().get(entry['key'])
        if results:
            converted[path] = sorted(results, key=itemgetter('start'))

        # TODO: this is klunky: we need to drop templates entirely or we
        # should rather just pass a the list of files from the scan
        # results and let the template handle this rather than
        # denormalizing the list here??
        converted_infos[path] = OrderedDict()
        for name, value in scanned_file.items():
            if name in (LICENSES, PACKAGES, COPYRIGHTS, EMAILS, URLS):
                continue
            converted_infos[path][name] = value

        if PACKAGES in scanned_file:
            converted_packages[path] = scanned_file[PACKAGES]

    licenses = OrderedDict(sorted(licenses.items()))

    files = {
        'license_copyright': converted,
        'infos': converted_infos,
        'packages': converted_packages
    }
    return template.generate(files=files, licenses=licenses, version=version)
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contain one or more blocks such as this:

    \b
    ----------------------------------------
    license_expression: lgpl-2.1
    relevance: 100
    is_license_notice: yes
    ---
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License
    version 2.1 as published by the Free Software Foundation;
    ----------------------------------------
    """
    rules_data = load_data(licenses_file)
    # Mapping of token tuple -> base name for all existing rules, used to
    # detect duplicates.
    rule_by_tokens = all_rule_by_tokens()
    licenses_by_key = cache.get_licenses_db()

    # First pass: build lightweight BasicRule objects for validation.
    skinny_rules = []
    for rdata in rules_data:
        relevance = rdata.data.get("relevance")
        rdata.data["has_stored_relevance"] = bool(relevance)

        license_expression = rdata.data.get("license_expression")
        if license_expression:
            rdata.data["license_expression"] = license_expression.lower(
            ).strip()

        minimum_coverage = rdata.data.get("minimum_coverage")
        rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    for rule in skinny_rules:
        if rule.is_false_positive:
            base_name = "false-positive"
        elif rule.is_license_intro:
            base_name = "license-intro"
        else:
            base_name = rule.license_expression

        text = rule.text()
        existing_rule = rule_exists(text)
        # Braces are stripped so the later str.format call cannot choke on
        # them as placeholders.
        skinny_text = " ".join(text[:80].split()).replace("{", " ").replace(
            "}", " ")

        # NOTE(review): deliberate hybrid message -- the f-string parts are
        # filled now, while the plain "{existing_rule}" placeholder is filled
        # later via .format(**locals()) when a dupe is found.
        existing_msg = (f"Skipping rule for: {base_name!r}, "
                        "dupe of: {existing_rule} "
                        f"with text: {skinny_text!r}...")

        if existing_rule:
            print(existing_msg.format(**locals()))
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd["stored_text"] = rule.stored_text
        rd["has_stored_relevance"] = rule.has_stored_relevance
        rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)
        # force recomputing relevance to remove junk stored relevance for
        # long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + ".yml"
        rulerec.text_file = base_loc + ".RULE"

        rule_tokens = tuple(rulerec.tokens())

        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(existing_msg.format(**locals()))
            continue
        else:
            print(f"Adding new rule: {base_name}")
            print(" file://" + rulerec.data_file)
            print(" file://" + rulerec.text_file, )
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            # Dump again so the data file includes the refreshed ignorables.
            rulerec.dump()
            rule_by_tokens[rule_tokens] = base_name
def test_validate_licenses(self):
    """The cached license db must validate with no errors and no warnings."""
    validation = models.License.validate(cache.get_licenses_db())
    errors, warnings, infos = validation
    assert errors == {}
    assert warnings == {}
    assert infos
""" for license_key, license_obj in licenses_by_key.items(): if license_obj.text_file and os.path.exists(license_obj.text_file): test_name = ('test_validate_self_detection_of_text_for_' + text.python_safe_name(license_key)) # also verify that we are detecting exactly with the license rule itself test_method = make_license_test_function( license_key, license_obj.text_file, license_obj.data_file, test_name, detect_negative=True, trace_text=True) setattr(cls, test_name, test_method) class TestValidateLicenseTextDetection(unittest.TestCase): # Test functions are attached to this class at import time pass build_license_validation_tests(cache.get_licenses_db(), TestValidateLicenseTextDetection) def build_rule_validation_tests(rules, cls): """ Dynamically build an individual test method for each rule texts in a rules `data_set` then mapping attaching the test method to the `cls` test class. """ for rule in rules: if rule.negative: continue expected_identifier = rule.identifier test_name = ('test_validate_self_detection_of_rule_for_' + text.python_safe_name(expected_identifier)) test_method = make_license_test_function( rule.licenses, rule.text_file, rule.data_file, test_name, detect_negative=not rule.negative, trace_text=True )
def cli(licenses_file):
    """
    Create rules from a structured text file

    For instance:
    ----------------------------------------
    license_expression: lgpl-2.1
    relevance: 100
    is_license_notice: yes
    ---
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License
    version 2.1 as published by the Free Software Foundation;
    ----------------------------------------
    """
    rule_data = load_data(licenses_file)
    # Token tuples of rules added in this run, used to detect duplicates.
    rules_tokens = set()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    for data, text in rule_data:
        rdat = '\n'.join(data)
        rtxt = '\n'.join(text)
        existing = rule_exists(rtxt)
        if existing:
            print('Skipping existing rule:', existing, 'with text:\n',
                  rtxt[:50].strip(), '...')
            continue

        # validate YAML syntax
        parsed = saneyaml.load(rdat)

        if parsed.get('is_negative'):
            license_expression = 'not-a-license'
        else:
            # The first metadata line carries "license_expression: <expr>".
            _, _, license_expression = data[0].partition(': ')
            license_expression = license_expression.strip()
            if not license_expression:
                raise Exception('Missing license_expression for text:', rtxt)
            # Validating parse: raises on an unknown/invalid expression.
            licensing.parse(license_expression, validate=True, simple=True)

        base_loc = find_rule_base_loc(license_expression)

        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rdat)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rtxt)

        rule = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rule.tokens())
        if rule_tokens in rules_tokens:
            # cleanup: a rule with identical tokens was already added, so
            # drop the files just written.
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', license_expression)
        else:
            rules_tokens.add(rule_tokens)
            rule.dump()
            models.update_ignorables(rule, verbose=True)
            print('Rule added:', rule.identifier)
def get_licenses(location, min_score=0, include_text=False,
                 license_url_template=DEJACODE_LICENSE_URL, **kwargs):
    """
    Return a mapping of license detection data for the file at `location`.

    The mapping has two keys:
    - 'licenses': a list of mappings, one per detected license key of each match.
    - 'license_expressions': a list of detected license expression strings.

    `min_score` is a score threshold from 0 to 100: matches scoring below
    it are not returned. The default of 0 returns all matches.

    If `include_text` is True, each license mapping also carries the
    matched text under 'matched_text'.
    """
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db

    index = get_index()
    licenses_by_key = get_licenses_db()

    license_results = []
    expression_results = []

    for match in index.match(location=location, min_score=min_score, **kwargs):
        text_of_match = None
        if include_text:
            # TODO: handle whole lines with the case of very long lines
            text_of_match = match.matched_text(whole_lines=False)

        expression_results.append(match.rule.license_expression)

        # One result mapping per license key of the matched rule.
        for key in match.rule.license_keys():
            lic = licenses_by_key.get(key)
            entry = OrderedDict()
            license_results.append(entry)

            entry['key'] = lic.key
            entry['score'] = match.score()
            entry['name'] = lic.name
            entry['short_name'] = lic.short_name
            entry['category'] = lic.category
            entry['is_exception'] = lic.is_exception
            entry['owner'] = lic.owner
            entry['homepage_url'] = lic.homepage_url
            entry['text_url'] = lic.text_urls[0] if lic.text_urls else ''
            entry['reference_url'] = license_url_template.format(lic.key)

            spdx_key = lic.spdx_license_key
            entry['spdx_license_key'] = spdx_key
            if spdx_key:
                # SPDX page URLs do not carry the trailing "+" marker.
                spdx_url = SPDX_LICENSE_URL.format(spdx_key.rstrip('+'))
            else:
                spdx_url = ''
            entry['spdx_url'] = spdx_url

            entry['start_line'] = match.start_line
            entry['end_line'] = match.end_line

            matched_rule = entry['matched_rule'] = OrderedDict()
            matched_rule['identifier'] = match.rule.identifier
            matched_rule['license_expression'] = match.rule.license_expression
            matched_rule['licenses'] = match.rule.license_keys()
            matched_rule['is_license_text'] = match.rule.is_license_text
            matched_rule['is_license_notice'] = match.rule.is_license_notice
            matched_rule['is_license_reference'] = match.rule.is_license_reference
            matched_rule['is_license_tag'] = match.rule.is_license_tag
            matched_rule['matcher'] = match.matcher
            matched_rule['rule_length'] = match.rule.length
            matched_rule['matched_length'] = match.ilen()
            matched_rule['match_coverage'] = match.coverage()
            matched_rule['rule_relevance'] = match.rule.relevance
            # FIXME: for sanity this should always be included?????
            if include_text:
                entry['matched_text'] = text_of_match

    return OrderedDict([
        ('licenses', license_results),
        ('license_expressions', expression_results),
    ])
def test_validate_license_library(self):
    # Validate the entire license DB: no errors or warnings are tolerated,
    # while informational messages are expected to be present.
    errors, warnings, infos = models.License.validate(
        cache.get_licenses_db(), verbose=True)
    assert errors == {}
    assert warnings == {}
    assert infos
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # load_data presumably returns objects with .data (parsed metadata
    # mapping) and .text (rule text) attributes -- defined elsewhere in
    # this file, confirm against its docstring.
    rules_data = load_data(licenses_file)
    # Token tuples of ALL existing rules, used below to skip duplicates.
    rules_tokens = all_rule_tokens()

    licenses_by_key = cache.get_licenses_db()

    # First pass: build lightweight BasicRule objects so the whole batch
    # can be validated before any file is written.
    skinny_rules = []
    for rdata in rules_data:
        # Record whether relevance/minimum_coverage were explicitly stored,
        # so absent values can be recomputed later.
        relevance = rdata.data.get('relevance')
        rdata.data['has_stored_relevance'] = bool(relevance)

        minimum_coverage = rdata.data.get('minimum_coverage')
        rdata.data['has_stored_minimum_coverage'] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    # Second pass: write out each rule that is not a duplicate.
    print()
    for rule in skinny_rules:
        existing = rule_exists(rule.text())
        if existing:
            print('Skipping existing rule:', existing, 'with text:\n',
                  rule.text()[:50].strip(), '...')
            continue

        # Pick the base file name: special-cased for false positives and
        # license intros, otherwise derived from the license expression.
        if rule.is_false_positive:
            base_name = 'false-positive'
        elif rule.is_license_intro:
            base_name = 'license-intro'
        else:
            base_name = rule.license_expression

        base_loc = find_rule_base_loc(base_name)

        # Rebuild a full Rule from the BasicRule data, carrying over the
        # stored text and the has_stored_* flags set above.
        rd = rule.to_dict()
        rd['stored_text'] = rule.stored_text
        rd['has_stored_relevance'] = rule.has_stored_relevance
        rd['has_stored_minimum_coverage'] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.compute_relevance(_threshold=18.0)

        rulerec.data_file = base_loc + '.yml'
        rulerec.text_file = base_loc + '.RULE'

        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in rules_tokens:
            print('Skipping already added rule with text for:', base_name)
        else:
            print('Adding new rule:')
            print(' file://' + rulerec.data_file)
            print(' file://' + rulerec.text_file, )
            rules_tokens.add(rule_tokens)
            # Dump first so the files exist, then update ignorable clues
            # (which mutates the rule) and dump again to persist them.
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            rulerec.dump()
def _licenses_data_from_match(match, include_text=False,
                              license_text_diagnostics=False,
                              license_url_template=SCANCODE_LICENSEDB_URL):
    """
    Return a list of "licenses" scan data mappings built from a single
    license `match`, one mapping per license key of the matched rule.
    Used directly only internally for testing.

    If `include_text` is True, each mapping carries the matched text:
    whole lines without highlight by default, or the exact matched region
    with highlight when `license_text_diagnostics` is also True.
    """
    from licensedcode import cache
    licenses = cache.get_licenses_db()

    matched_text = None
    if include_text:
        # Diagnostics mode: exact matched region, highlighted.
        # Normal mode: whole lines, no highlight.
        whole = not license_text_diagnostics
        matched_text = match.matched_text(whole_lines=whole, highlight=not whole)

    SCANCODE_BASE_URL = 'https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses'
    SCANCODE_LICENSE_TEXT_URL = SCANCODE_BASE_URL + '/{}.LICENSE'
    SCANCODE_LICENSE_DATA_URL = SCANCODE_BASE_URL + '/{}.yml'

    detected_licenses = []
    for key in match.rule.license_keys():
        lic = licenses.get(key)
        entry = {}
        detected_licenses.append(entry)

        entry['key'] = lic.key
        entry['score'] = match.score()
        entry['name'] = lic.name
        entry['short_name'] = lic.short_name
        entry['category'] = lic.category
        entry['is_exception'] = lic.is_exception
        entry['is_unknown'] = lic.is_unknown
        entry['owner'] = lic.owner
        entry['homepage_url'] = lic.homepage_url
        entry['text_url'] = lic.text_urls[0] if lic.text_urls else ''
        entry['reference_url'] = license_url_template.format(lic.key)
        entry['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
        entry['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key)

        spdx_key = lic.spdx_license_key
        entry['spdx_license_key'] = spdx_key
        if not spdx_key:
            spdx_url = ''
        elif spdx_key.lower().startswith('licenseref-'):
            # LicenseRef- keys have no page at spdx.org: point at the
            # scancode license text instead.
            spdx_url = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
        else:
            # SPDX page URLs do not carry the trailing "+" marker.
            spdx_url = SPDX_LICENSE_URL.format(spdx_key.rstrip('+'))
        entry['spdx_url'] = spdx_url

        entry['start_line'] = match.start_line
        entry['end_line'] = match.end_line

        matched_rule = entry['matched_rule'] = {}
        matched_rule['identifier'] = match.rule.identifier
        matched_rule['license_expression'] = match.rule.license_expression
        matched_rule['licenses'] = match.rule.license_keys()
        matched_rule['referenced_filenames'] = match.rule.referenced_filenames
        matched_rule['is_license_text'] = match.rule.is_license_text
        matched_rule['is_license_notice'] = match.rule.is_license_notice
        matched_rule['is_license_reference'] = match.rule.is_license_reference
        matched_rule['is_license_tag'] = match.rule.is_license_tag
        matched_rule['is_license_intro'] = match.rule.is_license_intro
        matched_rule['has_unknown'] = match.rule.has_unknown
        matched_rule['matcher'] = match.matcher
        matched_rule['rule_length'] = match.rule.length
        matched_rule['matched_length'] = match.len()
        matched_rule['match_coverage'] = match.coverage()
        matched_rule['rule_relevance'] = match.rule.relevance
        # FIXME: for sanity this should always be included?????
        if include_text:
            entry['matched_text'] = matched_text

    return detected_licenses