def cleanse_licence_expression(licence_expression: str) -> str:
    """Return an equivalent licence expression using SPDX identifiers when possible.

    A licence expression is often free-form text combining several licences.
    Each symbol in the simplified expression that maps to a known open-source
    licence is replaced by that licence's SPDX identifier.
    """
    parsed = _parse_licence_expression(
        Licensing(), licence_expression).simplify()
    for symbol in parsed.symbols:
        known_licence = OPENSOURCE_LICENCES.get_licence(symbol.key)
        if known_licence:
            symbol.key = known_licence.identifier
    return simplify_licence_expression(str(parsed))
def combine_expressions(
    expressions,
    relation='AND',
    unique=True,
    licensing=Licensing(),
):
    """
    Return a combined license expression string with relation, given a sequence
    of license ``expressions`` strings or LicenseExpression objects.
    Return None if ``expressions`` is empty or None.

    NOTE: the shared default ``licensing`` instance is kept for backward
    compatibility; it is treated as stateless here.
    """
    # Replaces the fragile legacy `cond and value or None` idiom with an
    # explicit conditional: the old form would wrongly yield None if the
    # combined expression ever rendered as a falsy string.
    if not expressions:
        return None
    return str(le_combine_expressions(expressions, relation, unique, licensing))
def test_get_expression_without_lid(self):
    """Detection works on a line without an SPDX-License-Identifier prefix."""
    licensing = Licensing()
    symbols = get_spdx_symbols()
    unknown = get_unknown_spdx_symbol()
    text = ('EPL-2.0 OR Apache-2.0 OR '
            'GPL-2.0 WITH Classpath-exception-2.0 OR '
            'GPL-2.0')
    result = get_expression(text, licensing, symbols, unknown)
    assert result.render() == (
        'epl-2.0 OR apache-2.0 OR gpl-2.0 WITH classpath-exception-2.0 OR gpl-2.0')
    expected_keys = [
        'epl-2.0',
        'apache-2.0',
        'gpl-2.0',
        'classpath-exception-2.0',
        'gpl-2.0',
    ]
    assert licensing.license_keys(result, unique=False) == expected_keys
    decomposed = licensing.license_symbols(result, decompose=True)
    assert all(s.wrapped for s in decomposed)
def test_get_expression_complex(self):
    """A complex OR/WITH expression after an SPDX-License-Identifier prefix is detected."""
    licensing = Licensing()
    symbols = get_spdx_symbols()
    unknown = get_unknown_spdx_symbol()
    text = ('* SPDX-License-Identifier: EPL-2.0 OR aPache-2.0 OR '
            'GPL-2.0 WITH classpath-exception-2.0 OR GPL-2.0')
    result = get_expression(text, licensing, symbols, unknown)
    assert result.render() == (
        'epl-2.0 OR apache-2.0 OR gpl-2.0 WITH classpath-exception-2.0 OR gpl-2.0')
    expected_keys = [
        'epl-2.0',
        'apache-2.0',
        'gpl-2.0',
        'classpath-exception-2.0',
    ]
    assert licensing.license_keys(result, unique=True) == expected_keys
    decomposed = licensing.license_symbols(result, decompose=True)
    assert all(s.wrapped for s in decomposed)
def get_declared_license_keys_in_packages(codebase):
    """
    Return a list of declared license keys found in packages.

    A package manifest (such as a Maven POM file or an npm package.json file)
    contains structured declared license information that is normalized into a
    license_expression. The license keys are extracted from those normalized
    expressions.
    """
    licensing = Licensing()
    detected_keys = []
    all_packages = chain.from_iterable(
        getattr(res, 'packages', []) or []
        for res in codebase.walk(topdown=True)
    )
    for pkg in all_packages:
        declared = pkg.get('license_expression')
        if not declared:
            continue
        parsed = licensing.parse(
            declared, validate=False, strict=False, simple=True)
        detected_keys.extend(licensing.license_keys(parsed, unique=True))
    return detected_keys
class AlpineLicenseTest(object):
    """
    A license detection test used to verify that Alpine declared license
    detection works correctly. It consists of one YAML file with test data
    and expectations, and a package reference.
    """
    declared_license = attr.attrib()
    license_expression = attr.attrib()
    data_file = attr.attrib(default=None)

    # shared parser used to normalize the expected license expression
    licensing = Licensing()

    @classmethod
    def from_file(cls, data_file):
        """Return an AlpineLicenseTest loaded from the YAML ``data_file``."""
        with open(data_file) as df:
            data = saneyaml.load(df.read())
        data['data_file'] = data_file
        test = cls(**data)
        # store the expression in its canonical rendered form
        test.license_expression = cls.licensing.parse(
            test.license_expression).render()
        return test

    def to_dict(self):
        """Return a mapping of this test's data, without the data_file path."""
        data = attr.asdict(self)
        del data['data_file']
        return data

    def dump(self):
        """Write this test back to its YAML data file, creating parent dirs."""
        parent = fileutils.parent_directory(self.data_file)
        if not exists(parent):
            fileutils.create_dir(parent)
        with open(self.data_file, 'w') as df:
            df.write(saneyaml.dump(self.to_dict()))

    def get_test_method_name(self):
        """Return a Python-safe test method name derived from the data file name."""
        base_name = fileutils.file_base_name(self.data_file.lower())
        return text.python_safe_name(f'test_alpine_license_detection_{base_name}')

    @staticmethod
    def from_dir(test_dir):
        """
        Return an iterable of AlpineLicenseTest objects loaded from `test_dir`
        """
        yml_files = packages_test_utils.get_test_files(test_dir, '.yml')
        paths = (join(test_dir, f) for f in yml_files)
        return map(AlpineLicenseTest.from_file, paths)
def test_get_expression_complex_with_unknown_symbols_and_refs(self):
    """Unknown SPDX ids and LicenseRef values map to the unknown-spdx symbol."""
    licensing = Licensing()
    symbols = get_spdx_symbols()
    unknown = get_unknown_spdx_symbol()
    text = ('* SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 '
            'OR GPL-2.0 WITH Classpath-exception-2.0 '
            'OR LicenseRef-GPL-2.0 WITH Assembly-exception')
    result = get_expression(text, licensing, symbols, unknown)
    assert result.render() == (
        'epl-2.0 OR apache-2.0 OR gpl-2.0 WITH classpath-exception-2.0 '
        'OR unknown-spdx WITH unknown-spdx')
    expected_keys = [
        'epl-2.0',
        'apache-2.0',
        'gpl-2.0',
        'classpath-exception-2.0',
        'unknown-spdx',
        'unknown-spdx',
    ]
    assert licensing.license_keys(result, unique=False) == expected_keys
    decomposed = licensing.license_symbols(result, decompose=True)
    assert all(s.wrapped for s in decomposed)
def group_license_expressions(unique_license_expressions):
    """
    Return a tuple that contains two list of license expressions.
    The first list in the tuple contains unique license expressions with "AND",
    "OR", or "WITH" in it.
    The second list in the tuple contains unique license expressions without
    "AND", "OR", or "WITH".
    """
    joined_expressions = []
    single_expressions = []
    for license_expression in unique_license_expressions:
        if ('AND' in license_expression
                or 'OR' in license_expression
                or 'WITH' in license_expression):
            joined_expressions.append(license_expression)
        else:
            single_expressions.append(license_expression)

    licensing = Licensing()
    if len(joined_expressions) > 1:
        # Deduplicate by license-expression equivalence, keeping the first
        # representative of each equivalence class.
        # BUG FIX: the previous implementation only kept expressions that had
        # at least one equivalent duplicate later in the list, so joined
        # expressions that were already unique were silently dropped.
        unique_joined_expressions = []
        for expression in joined_expressions:
            has_equivalent = any(
                licensing.is_equivalent(expression, kept)
                for kept in unique_joined_expressions
            )
            if not has_equivalent:
                unique_joined_expressions.append(expression)
    else:
        unique_joined_expressions = joined_expressions

    return unique_joined_expressions, single_expressions
def to_dict(self, **kwargs):
    """Return an OrderedDict of this object, excluding the 'resources' field.

    Refreshes the consolidated license expression, holders and copyright
    from the core and other values before serializing.
    """
    def _keep_field(attr, value):
        # exclude the resources list from the serialized output
        return attr.name not in ('resources', )

    expressions = [
        e for e in (self.core_license_expression, self.other_license_expression)
        if e
    ]
    if expressions:
        combined = combine_expressions(expressions)
        if combined:
            self.consolidated_license_expression = str(
                Licensing().parse(combined).simplify())
    self.consolidated_holders = sorted(
        set(list(self.core_holders) + list(self.other_holders)))
    # TODO: Verify and test that we are generating detectable copyrights
    self.consolidated_copyright = 'Copyright (c) {}'.format(
        ', '.join(self.consolidated_holders))
    return attr.asdict(self, filter=_keep_field, dict_factory=OrderedDict)
def test_get_expression_works_for_legacy_deprecated_old_spdx_symbols(self):
    """Deprecated SPDX ids resolve to the matching 'license WITH exception' expressions."""
    expected_by_old_id = {
        'eCos-2.0': 'gpl-2.0-plus WITH ecos-exception-2.0',
        'GPL-2.0-with-autoconf-exception': 'gpl-2.0 WITH autoconf-exception-2.0',
        'GPL-2.0-with-bison-exception': 'gpl-2.0 WITH bison-exception-2.2',
        'GPL-2.0-with-classpath-exception': 'gpl-2.0 WITH classpath-exception-2.0',
        'GPL-2.0-with-font-exception': 'gpl-2.0 WITH font-exception-gpl',
        'GPL-2.0-with-GCC-exception': 'gpl-2.0 WITH gcc-linking-exception-2.0',
        'GPL-3.0-with-autoconf-exception': 'gpl-3.0 WITH autoconf-exception-3.0',
        'GPL-3.0-with-GCC-exception': 'gpl-3.0 WITH gcc-exception-3.1',
        'wxWindows': 'lgpl-2.0-plus WITH wxwindows-exception-3.1',
    }
    licensing = Licensing()
    symbols = get_spdx_symbols()
    unknown = get_unknown_spdx_symbol()
    for old_id, expected in expected_by_old_id.items():
        rendered = get_expression(old_id, licensing, symbols, unknown).render()
        assert rendered == expected
def get_normalized_expression(query_string):
    """
    Given a text `query_string` return a single detected license expression.
    `query_string` is typically the value of a license field as found in package
    manifests.
    Return None if the `query_string` is empty. Return "unknown" as a
    license expression if there is a `query_string` but nothing was detected.

    For example::
    >>> get_normalized_expression('mit')
    'mit'
    >>> get_normalized_expression('mit or asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> get_normalized_expression('mit or asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> get_normalized_expression('mit asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> assert get_normalized_expression('') is None
    >>> assert get_normalized_expression(None) is None
    """
    if not query_string or not query_string.strip():
        return

    if TRACE:
        logger_debug('get_normalized_expression: query_string: "{}"'.format(
            query_string))

    # imported lazily, presumably to avoid the cost/cycles of loading the
    # license index at module import time — TODO confirm
    from licensedcode.cache import get_index
    idx = get_index()
    licensing = Licensing()

    # we match twice in a cascade: as an expression, then as plain text if we
    # did not succeed.
    matches = None
    try:
        matched_as_expression = True
        matches = idx.match(query_string=query_string, as_expression=True)
        if matches_have_unknown(matches, licensing):
            # rematch also if we have unknowns
            matched_as_expression = False
            matches = idx.match(query_string=query_string, as_expression=False)
    except Exception:
        # expression matching failed: fall back to plain-text matching
        matched_as_expression = False
        matches = idx.match(query_string=query_string, as_expression=False)

    if not matches:
        # we have a query_string text but there was no match: return an unknown
        # key
        return 'unknown'

    if TRACE:
        logger_debug('get_normalized_expression: matches:', matches)

    # join the possible multiple detected license expression with an AND
    expression_objects = [m.rule.license_expression_object for m in matches]
    if len(expression_objects) == 1:
        combined_expression_object = expression_objects[0]
    else:
        combined_expression_object = licensing.AND(*expression_objects)

    if matched_as_expression:
        # then just return the expression(s)
        return str(combined_expression_object)

    # Otherwise, verify that we consumed 100% of the query string e.g. that we
    # have no unknown leftover.

    # 1. have all matches 100% coverage?
    all_matches_have_full_coverage = all(m.coverage() == 100 for m in matches)

    # TODO: have all matches a high enough score?

    # 2. are all declared license tokens consumed?
    query = matches[0].query
    # the query object should be the same for all matches. Is this always true??
    for mt in matches:
        if mt.query != query:
            # FIXME: the expception may be swallowed in callers!!!
            raise Exception(
                'Inconsistent package.declared_license: text with multiple "queries".'
                'Please report this issue to the scancode-toolkit team.\n'
                '{}'.format(query_string))

    query_len = len(query.tokens)
    matched_qspans = [m.qspan for m in matches]
    matched_qpositions = Span.union(*matched_qspans)
    len_all_matches = len(matched_qpositions)
    declared_license_is_fully_matched = query_len == len_all_matches

    if not all_matches_have_full_coverage or not declared_license_is_fully_matched:
        # We inject an 'unknown' symbol in the expression
        unknown = licensing.parse('unknown', simple=True)
        combined_expression_object = licensing.AND(
            combined_expression_object, unknown)

    return str(combined_expression_object)
class LicenseTest(object):
    """
    A license detection test is used to verify that license detection works
    correctly

    It consists of two files with the same base name: a .yml file with test
    data and a test file with any other extension that needs to be tested for
    detection

    The following data are loaded from the .yml file:
     - a test file to scan for licenses,
     - a list of expected licenses expressions to detect
     - optional notes.
     - a boolean flag expected_failure set to True if a test is expected to
       fail for now.

    If the list of license expressions is empty, then this test should not
    detect any license in the test file.
    """
    data_file = attr.attrib(default=None)
    test_file = attr.attrib(default=None)
    test_file_name = attr.attrib(default=None)
    license_expressions = attr.attrib(default=attr.Factory(list))
    notes = attr.attrib(default=None)
    expected_failure = attr.attrib(default=False)

    # shared parser used to validate and normalize expected expressions
    licensing = Licensing()

    def __attrs_post_init__(self, *args, **kwargs):
        # derive a repo-relative test file name for stable test naming
        if self.test_file:
            _, _, self.test_file_name = self.test_file.partition(
                os.path.join('licensedcode', 'data') + os.sep)

        data = {}
        if self.data_file:
            try:
                with io.open(self.data_file, encoding='utf-8') as df:
                    data = saneyaml.load(df.read()) or {}
            except Exception as e:
                raise Exception(f'Failed to read: file://{self.data_file}', e)

        self.license_expressions = data.pop('license_expressions', [])
        self.notes = data.pop('notes', None)
        # True if the test is expected to fail
        self.expected_failure = data.pop('expected_failure', False)

        # any leftover keys indicate a malformed data file
        if data:
            raise Exception(
                'Unknown data elements: ' + repr(data) +
                ' for: file://' + self.data_file)

        if self.license_expressions:
            for i, exp in enumerate(self.license_expressions[:]):
                try:
                    expression = self.licensing.parse(exp)
                # BUG FIX: was a bare "except:" which would also swallow
                # SystemExit and KeyboardInterrupt; only real errors should
                # be converted to a parse failure.
                except Exception:
                    raise Exception(
                        'Unable to parse License rule expression: '
                        f'{exp!r} for: file://{self.data_file}\n' +
                        traceback.format_exc())
                if expression is None:
                    raise Exception(
                        'Unable to parse License rule expression: '
                        f'{exp!r} for: file://{self.data_file}')
                # normalize the expected expression to its rendered form
                new_exp = expression.render()
                self.license_expressions[i] = new_exp
        else:
            if not self.notes:
                raise Exception(
                    'A license test without expected license_expressions should '
                    f'have explanatory notes: for: file://{self.data_file}')

    def to_dict(self):
        """Return a mapping of this test's expected data, omitting empty fields."""
        dct = {}
        if self.license_expressions:
            dct['license_expressions'] = self.license_expressions
        if self.expected_failure:
            dct['expected_failure'] = self.expected_failure
        if self.notes:
            dct['notes'] = self.notes
        return dct

    def dump(self):
        """
        Dump a representation of self to its YAML data file
        """
        as_yaml = saneyaml.dump(self.to_dict())
        with io.open(self.data_file, 'w', encoding='utf-8') as df:
            df.write(as_yaml)

    def get_content(self):
        """
        Return a byte strings of the test file content.
        """
        with open(self.test_file, 'rb') as df:
            d = df.read()
        return d

    def get_test_method_name(self, prefix='test_detection_'):
        """Return a Python-safe test method name built from ``prefix`` and the test file name."""
        test_file_name = self.test_file_name
        test_name = '{prefix}{test_file_name}'.format(**locals())
        test_name = text.python_safe_name(test_name)
        if not isinstance(test_name, str):
            test_name = test_name.decode('utf-8')
        return test_name

    @staticmethod
    def load_from(test_dir):
        """
        Return an iterable of LicenseTest objects loaded from `test_dir`
        """
        return [
            LicenseTest(data_file, test_file)
            for data_file, test_file in get_test_file_pairs(test_dir)
        ]
def get_origin_info_from_top_level_packages(top_level_packages, codebase):
    """
    Return a 3-tuple of (declared license expression string, declared holders,
    primary programming language) computed from a ``top_level_packages`` list
    of detected top-level package mappings and a ``codebase``.
    """
    if not top_level_packages:
        return '', '', ''

    license_expressions = []
    programming_languages = []
    copyrights = []
    parties = []
    for package_mapping in top_level_packages:
        package = models.Package.from_dict(package_mapping)
        # we are only interested in key packages
        if not is_key_package(package, codebase):
            continue
        if package.license_expression:
            license_expressions.append(package.license_expression)
        if package.primary_language:
            programming_languages.append(package.primary_language)
        if package.copyright:
            copyrights.append(package.copyright)
        parties.extend(package.parties or [])

    # Combine license expressions
    combined = combine_expressions(
        expressions=unique(license_expressions),
        relation='AND',
    )
    declared_license_expression = ''
    if combined:
        declared_license_expression = str(
            Licensing().parse(combined).simplify())

    # Get holders: prefer holders detected in copyrights, else party names
    holders = list(get_holders_from_copyright(copyrights))
    if holders:
        declared_holders = holders
    elif parties:
        declared_holders = [party.name for party in parties or []]
    else:
        declared_holders = []
    declared_holders = unique(declared_holders)

    # Programming language: only meaningful when unambiguous
    languages = unique(programming_languages)
    primary_language = languages[0] if len(languages) == 1 else ''

    return declared_license_expression, declared_holders, primary_language
def cli(licenses_file):
    """
    Create rules from a structured text file

    For instance:
    ----------------------------------------
    license_expression: lgpl-2.1
    relevance: 100
    is_license_notice: yes
    ---
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License
    version 2.1 as published by the Free Software Foundation;
    ----------------------------------------
    """
    rule_data = load_data(licenses_file)
    # set of token tuples of the rules written so far, to skip duplicates
    rules_tokens = set()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    for data, text in rule_data:
        rdat = '\n'.join(data)
        rtxt = '\n'.join(text)
        existing = rule_exists(rtxt)
        if existing:
            print('Skipping existing rule:', existing, 'with text:\n',
                  rtxt[:50].strip(), '...')
            continue

        # validate YAML syntax
        parsed = saneyaml.load(rdat)
        if parsed.get('is_negative'):
            license_expression = 'not-a-license'
        else:
            # the expression is taken from the first data line, after "key: "
            _, _, license_expression = data[0].partition(': ')
            license_expression = license_expression.strip()
            if not license_expression:
                raise Exception('Missing license_expression for text:', rtxt)
            # validate against the known licenses DB
            licensing.parse(license_expression, validate=True, simple=True)

        base_loc = find_rule_base_loc(license_expression)

        # write the YAML data and the rule text side by side
        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rdat)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rtxt)

        rule = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rule.tokens())
        if rule_tokens in rules_tokens:
            # cleanup: an identical rule was already added in this run
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:',
                  license_expression)
        else:
            rules_tokens.add(rule_tokens)
            rule.dump()
            models.update_ignorables(rule, verbose=True)
            print('Rule added:', rule.identifier)
def simplify_licence_expression(licence_expression: str) -> str:
    """Return the simplified form of ``licence_expression`` as a string."""
    parsed = _parse_licence_expression(Licensing(), licence_expression)
    return str(parsed.simplify())
from typing import Tuple, Set, List from anytree import PreOrderIter, LevelOrderIter, AnyNode from anytree.exporter import DictExporter from anytree.importer import DictImporter from license_expression import Licensing try: from license_sh_private.normalizer import normalize except ImportError: def normalize(license_expression): return license_expression, False licensing = Licensing() UNKNOWN = "Unknown" RED = "\033[1;31m" BLUE = "\033[1;34m" CYAN = "\033[1;36m" GREEN = "\033[0;32m" RESET = "\033[0;0m" BOLD = "\033[;1m" REVERSE = "\033[;7m" def get_npm_license_from_licenses_array(licenses_array): """ Extract licenses name from licenses array and join them with AND
class LicenseHandler:
    """
    Parse, translate, group and relicense license expressions.

    Symbols and their aliases are read from translation files and fed to a
    license_expression Licensing object used for parsing and simplification.
    """

    def __init__(self, translations_files, relicense_file, group_file):
        self.translations_files = translations_files
        self.relicense_file = relicense_file
        # lazily populated from relicense_file on first use
        self.relicense_map = None
        self.group_file = group_file
        symbols = self.read_symbols(self.translations_files)
        self.licensing = Licensing(symbols)

    def read_symbols(self, translations_files):
        """Return a list of LicenseSymbol built from the whitespace-separated
        ``translations_files``; also records the raw data on self.translations."""
        symbols_map = {}
        self.translations = []
        for translations_file in translations_files.split():
            translation_data = read_translations(translations_file)
            self.translations.append(translation_data)
            for lic_key in translation_data:
                aliases = symbols_map.setdefault(lic_key, set())
                for val in translation_data[lic_key]:
                    aliases.add(val)
        return [LicenseSymbol(key=key, aliases=tuple(value))
                for key, value in symbols_map.items()]

    def translate_and_relicense(self, license_expression):
        """Translate aliases to canonical symbols, then apply relicensing."""
        transl = self.translate(license_expression)
        if not transl:
            transl = license_expression
        rel = self.expand_relicense(transl)
        return rel if rel else transl

    def expand_relicense(self, license_expression):
        """Return the expression with relicensing applied when configured."""
        if self.relicense_file:
            self.relicense_map = read_relicense_file(self.relicense_file)
            expanded = relicense_license(
                self.relicense_map, license_expression)
            return expanded.strip()
        else:
            return license_expression.strip()

    def group(self, license_expression):
        # grouping is currently a no-op beyond whitespace stripping
        return license_expression.strip()

    def translate(self, license_expression):
        """Normalize &/| operators and translate aliases via the licensing symbols."""
        license_expression = license_expression.replace(
            "&", AND_STRING).replace("|", OR_STRING)
        return str(self.simplify(license_expression))

    def simplify(self, license_expression):
        """Return the parsed and simplified expression object."""
        parsed = self.licensing.parse(license_expression)
        return parsed.simplify()

    def license_expression_list_json(self, license_expression, relicense=True):
        """Return a JSON-serializable mapping of all derived expression forms."""
        managed = self.license_expression_list(license_expression, relicense)
        return {
            "license_expression": license_expression,
            "expanded": managed.expanded,
            "grouped": managed.grouped,
            "translated": managed.translated,
            "simplified": managed.simplified,
            "interim": managed.interim,
            "set_list": managed.set_list,
        }

    def license_expression_list(self, license_expression, relicense=True):
        """Return a ManagedLicenseExpression with every derived form filled in."""
        managed = ManagedLicenseExpression(license_expression)
        managed.translated = self.translate(license_expression)
        # We need str to skip verbose output
        managed.simplified = str(self.simplify(managed.translated))
        if relicense:
            managed.expanded = self.expand_relicense(managed.simplified)
        else:
            managed.expanded = managed.simplified
        managed.grouped = self.group(managed.expanded)
        managed.interim = self.interim_license_expression_list(
            managed.grouped, self.licensing)
        managed.set_list = self.interim_license_expression_set_list(
            managed.interim)
        return managed

    def interim_license_expression_list(self, license_expression, licensing):
        """
        Transforms a boolean symbolic expression

        Turns an expression like this:
            G AND (A OR B)
        into:
            AND [G, OR [A, B]]
        The latter is an interim format.
        """
        encoded = encode_license_expression(license_expression)
        tokenizer = licensing.get_advanced_tokenizer()
        tokenized = tokenizer.tokenize(encoded)
        current_license = None
        current_licenses = []
        current_op = None
        paren_expr = None
        paren_count = 0
        for token in tokenized:
            tok = token.string
            if tok == '(':
                if paren_expr is None:
                    # entering a parenthesized sub-expression: start collecting
                    paren_expr = ""
                else:
                    # nested paren inside the collected sub-expression
                    paren_expr = paren_expr + " " + tok
                    paren_count = paren_count + 1
            elif tok == ')':
                if paren_count == 0:
                    # closing the outermost paren: recurse on the collected text
                    current_license = self.interim_license_expression_list(
                        paren_expr, licensing)
                    paren_expr = None
                else:
                    paren_count = paren_count - 1
                    paren_expr = paren_expr + " " + tok
            elif tok == 'OR' or tok == 'AND':
                if paren_expr is not None:
                    paren_expr = paren_expr + " " + tok
                else:
                    if current_licenses is None:
                        raise FlictError(
                            ReturnCodes.RET_INTERNAL_ERROR,
                            "Internal failure. Failed creating interim license expression. current_licenses is None")
                    if current_op is None:
                        # first operator
                        current_op = tok
                        current_licenses.append(current_license)
                    elif current_op == tok:
                        # same operator
                        current_licenses.append(current_license)
                    else:
                        # different operator
                        raise FlictError(
                            ReturnCodes.RET_INTERNAL_ERROR,
                            "Internal failure. Failed creating interim license expression.")
            else:
                if paren_expr is not None:
                    paren_expr = paren_expr + " " + tok
                else:
                    current_license = tok
        # append the trailing operand collected after the last operator
        current_licenses.append(current_license)
        if current_op is None:
            # a single operand has no explicit operator; default to AND
            current_op = "AND"
        # renamed from "list" to avoid shadowing the builtin
        expression_list = LicenseExpressionList(current_op, current_licenses)
        return expression_list

    def _combinations(self, lel):
        """Return the number of distinct license sets ``lel`` expands to."""
        if not isinstance(lel, LicenseExpressionList):
            return 1
        if lel.op == "AND":
            prod = 1
            for item in lel.list:
                prod = prod * self._combinations(item)
            return prod
        elif lel.op == "OR":
            total = 0
            for item in lel.list:
                total = total + self._combinations(item)
            return total
        else:
            # BUG FIX: the error was previously constructed but never raised,
            # so unknown operators silently returned None.
            raise FlictError(
                ReturnCodes.RET_INTERNAL_ERROR,
                f"Internal failure. Failed identifying operator: {lel}")

    def interim_license_expression_set_list(self, interim_license_expression_list):
        """
        Transforms a boolean symbolic expression

        Turns an expression like this:
            AND [G, OR [A, B]]
        into:
            [ { G, A }, { G, B } ]
        The latter is an interim format.
        """
        expanded_list = []
        if not isinstance(interim_license_expression_list, LicenseExpressionList):
            # single license
            license_set = {
                decode_license_expression(interim_license_expression_list)}
            expanded_list.append(list(license_set))
            return expanded_list

        current_op = interim_license_expression_list.op
        for lep in interim_license_expression_list.list:
            if current_op is None:
                raise FlictError(
                    ReturnCodes.RET_INTERNAL_ERROR,
                    "Internal failure. No operator found")
            lep_list = self.interim_license_expression_set_list(lep)
            if current_op == "OR":
                expanded_list = self._manage_list_item_or(
                    expanded_list, lep_list)
            elif current_op == "AND":
                expanded_list = self._manage_list_item_and(
                    expanded_list, lep_list)
        return expanded_list

    def _manage_list_item_and(self, license_list, lep):
        """Return the cartesian AND-combination of ``license_list`` and ``lep``."""
        if isinstance(lep, LicenseExpressionList):
            raise FlictError(
                ReturnCodes.RET_INTERNAL_ERROR,
                f"Internal failure. Wrong type {lep} for: {lep}")
        # single license
        if len(license_list) == 0:
            return lep
        new_list = []
        for item in license_list:
            for lep_item in lep:
                new_item = list(set(item + lep_item))
                new_list.append(new_item)
        return new_list

    def _manage_list_item_or(self, license_list, lep):
        """Return ``license_list`` extended with the OR-alternatives in ``lep``."""
        if isinstance(lep, LicenseExpressionList):
            raise FlictError(
                ReturnCodes.RET_INTERNAL_ERROR,
                f"Internal failure. Wrong type {lep} for: {lep}")
        # single license
        if len(license_list) == 0:
            return lep
        new_list = license_list
        for lep_item in lep:
            new_list.append(lep_item)
        return new_list

    def relicensing_information(self):
        """Return the relicense map, loading it from file on first access."""
        if self.relicense_map is None:
            self.relicense_map = read_relicense_file(self.relicense_file)
        return self.relicense_map

    def translation_information(self):
        """Return the raw translation data loaded from the translation files."""
        return self.translations
def get_consolidated_packages(codebase):
    """
    Yield a ConsolidatedPackage for each detected package in the codebase

    Build manifests yield a ConsolidatedComponent of type 'build' instead.
    """
    for resource in codebase.walk(topdown=False):
        for package_data in resource.packages:
            package = get_package_instance(package_data)
            package_root = package.get_package_root(resource, codebase)
            # flag the root resource so downstream consumers can find it
            package_root.extra_data['package_root'] = True
            package_root.save(codebase)
            is_build_file = isinstance(package, BaseBuildManifestPackage)
            package_resources = list(
                package.get_package_resources(package_root, codebase))
            package_license_expression = package.license_expression
            package_copyright = package.copyright

            # extract holders from the package-declared copyright statement
            package_holders = []
            if package_copyright:
                numbered_lines = [(0, package_copyright)]
                for _, holder, _, _ in CopyrightDetector().detect(
                        numbered_lines,
                        copyrights=False,
                        holders=True,
                        authors=False,
                        include_years=False):
                    package_holders.append(holder)
            package_holders = process_holders(package_holders)

            # aggregate license expressions and holders found in the
            # package's own resources
            discovered_license_expressions = []
            discovered_holders = []
            for package_resource in package_resources:
                if not is_build_file:
                    # If a resource is part of a package Component, then it cannot be part of any other type of Component
                    package_resource.extra_data['in_package_component'] = True
                    package_resource.save(codebase)
                if package_resource.license_expressions:
                    package_resource_license_expression = combine_expressions(
                        package_resource.license_expressions)
                    if package_resource_license_expression:
                        discovered_license_expressions.append(
                            package_resource_license_expression)
                if package_resource.holders:
                    discovered_holders.extend(
                        h.get('value') for h in package_resource.holders)
            discovered_holders = process_holders(discovered_holders)

            # combine, then simplify the per-resource expressions into one
            combined_discovered_license_expression = combine_expressions(
                discovered_license_expressions)
            if combined_discovered_license_expression:
                simplified_discovered_license_expression = str(
                    Licensing().parse(
                        combined_discovered_license_expression).simplify())
            else:
                simplified_discovered_license_expression = None

            c = Consolidation(
                core_license_expression=package_license_expression,
                # Sort holders by holder key
                core_holders=[
                    h for h, _ in sorted(copyright_summary.cluster(
                        package_holders), key=lambda t: t[0].key)
                ],
                other_license_expression=simplified_discovered_license_expression,
                # Sort holders by holder key
                other_holders=[
                    h for h, _ in sorted(copyright_summary.cluster(
                        discovered_holders), key=lambda t: t[0].key)
                ],
                files_count=len([
                    package_resource
                    for package_resource in package_resources
                    if package_resource.is_file
                ]),
                resources=package_resources,
            )
            if is_build_file:
                c.identifier = package.name
                yield ConsolidatedComponent(type='build', consolidation=c)
            else:
                yield ConsolidatedPackage(package=package, consolidation=c)
def get_consolidated_packages(codebase):
    """
    Yield a ConsolidatedPackage for each detected package in the codebase

    Build manifests yield a ConsolidatedComponent of type 'build' instead.
    """
    for resource in codebase.walk(topdown=False):
        for package_data in resource.packages:
            package = get_package_instance(package_data)
            is_build_file = isinstance(package, BaseBuildManifestPackage)
            package_resources = list(
                package.get_package_resources(resource, codebase))
            package_license_expression = package.license_expression
            package_copyright = package.copyright

            # extract holders from the package-declared copyright statement
            package_holders = []
            if package_copyright:
                numbered_lines = [(0, package_copyright)]
                for _, holder, _, _ in CopyrightDetector().detect(
                        numbered_lines,
                        copyrights=False,
                        holders=True,
                        authors=False,
                        include_years=False):
                    package_holders.append(holder)

            # aggregate license expressions and holders discovered in the
            # package's own resources
            discovered_license_expressions = []
            discovered_holders = []
            for package_resource in package_resources:
                if not is_build_file:
                    # If a resource is part of a package Component, then it cannot be part of any other type of Component
                    package_resource.extra_data['in_package_component'] = True
                    package_resource.save(codebase)
                package_resource_license_expression = combine_expressions(
                    package_resource.license_expressions)
                package_resource_holders = package_resource.holders
                if not package_resource_license_expression and not package_resource_holders:
                    continue
                discovered_license_expressions.append(
                    package_resource_license_expression)
                discovered_holders.extend(
                    h.get('value') for h in package_resource_holders)

            # Remove NoneTypes from discovered licenses
            discovered_license_expressions = [
                lic for lic in discovered_license_expressions if lic]
            # Remove NoneTypes from discovered holders
            discovered_holders = [
                holder for holder in discovered_holders if holder]

            # combine, then simplify the per-resource expressions into one
            combined_discovered_license_expression = combine_expressions(
                discovered_license_expressions)
            if combined_discovered_license_expression:
                simplified_discovered_license_expression = str(
                    Licensing().parse(
                        combined_discovered_license_expression).simplify())
            else:
                simplified_discovered_license_expression = None

            c = Consolidation(
                core_license_expression=package_license_expression,
                core_holders=sorted(set(package_holders)),
                other_license_expression=simplified_discovered_license_expression,
                other_holders=sorted(set(discovered_holders)),
                files_count=sum(
                    1 for package_resource in package_resources
                    if package_resource.is_file),
                resources=package_resources,
            )
            if is_build_file:
                c.identifier = package.name
                yield ConsolidatedComponent(
                    type='build',
                    consolidation=c
                )
            else:
                yield ConsolidatedPackage(
                    package=package,
                    consolidation=c
                )
def compute_license_score(codebase):
    """
    Return a mapping of scoring elements and a license clarity score computed at
    the codebase level.

    The license clarity score is a value from 0-100 calculated by combining the
    weighted values determined for each of the scoring elements:

    Declared license:
    - When true, indicates that the software package licensing is documented at
      top-level or well-known locations in the software project, typically in a
      package manifest, NOTICE, LICENSE, COPYING or README file.
    - Scoring Weight = 40

    Identification precision:
    - Indicates how well the license statement(s) of the software identify known
      licenses that can be designated by precise keys (identifiers) as provided in
      a publicly available license list, such as the ScanCode LicenseDB, the SPDX
      license list, the OSI license list, or a URL pointing to a specific license
      text in a project or organization website.
    - Scoring Weight = 40

    License texts:
    - License texts are provided to support the declared license expression in
      files such as a package manifest, NOTICE, LICENSE, COPYING or README.
    - Scoring Weight = 10

    Declared copyright:
    - When true, indicates that the software package copyright is documented at
      top-level or well-known locations in the software project, typically in a
      package manifest, NOTICE, LICENSE, COPYING or README file.
    - Scoring Weight = 10

    Ambiguous compound licensing
    - When true, indicates that the software has a license declaration that
      makes it difficult to construct a reliable license expression, such as in
      the case of multiple licenses where the conjunctive versus disjunctive
      relationship is not well defined.
    - Scoring Weight = -10

    Conflicting license categories
    - When true, indicates the declared license expression of the software is in
      the permissive category, but that other potentially conflicting categories,
      such as copyleft and proprietary, have been detected in lower level code.
    - Scoring Weight = -20
    """
    scoring_elements = ScoringElements()
    # Declared data comes from key files only (top-level manifests, LICENSE,
    # README, ...); other_licenses covers the whole codebase for the
    # conflicting-categories check below.
    declared_licenses = get_field_values_from_codebase_resources(
        codebase=codebase,
        field_name='licenses',
        key_files_only=True,
    )
    declared_license_expressions = get_field_values_from_codebase_resources(
        codebase=codebase,
        field_name='license_expressions',
        key_files_only=True,
    )
    unique_declared_license_expressions = unique(declared_license_expressions)
    declared_license_categories = get_license_categories(declared_licenses)
    copyrights = get_field_values_from_codebase_resources(
        codebase=codebase,
        field_name='copyrights',
        key_files_only=True,
    )
    other_licenses = get_field_values_from_codebase_resources(
        codebase=codebase,
        field_name='licenses',
        key_files_only=False,
    )

    # Positive scoring elements, per the weights documented above.
    scoring_elements.declared_license = bool(declared_licenses)
    if scoring_elements.declared_license:
        scoring_elements.score += 40

    scoring_elements.precise_license_detection = check_declared_licenses(
        declared_licenses)
    if scoring_elements.precise_license_detection:
        scoring_elements.score += 40

    scoring_elements.has_license_text = check_for_license_texts(
        declared_licenses)
    if scoring_elements.has_license_text:
        scoring_elements.score += 10

    scoring_elements.declared_copyrights = bool(copyrights)
    if scoring_elements.declared_copyrights:
        scoring_elements.score += 10

    # Penalty: only applies when the declared license is permissive but
    # conflicting categories (e.g. copyleft) appear elsewhere in the codebase.
    is_permissively_licensed = check_declared_license_categories(
        declared_license_categories)
    if is_permissively_licensed:
        scoring_elements.conflicting_license_categories = check_for_conflicting_licenses(
            other_licenses)
        if scoring_elements.conflicting_license_categories and scoring_elements.score > 0:
            scoring_elements.score -= 20

    declared_license_expression = get_primary_license(
        unique_declared_license_expressions)
    if not declared_license_expression:
        # If we cannot get a single primary license, then we combine and
        # simplify the license expressions from key files
        combined_declared_license_expression = combine_expressions(
            unique_declared_license_expressions)
        if combined_declared_license_expression:
            declared_license_expression = str(Licensing().parse(
                combined_declared_license_expression).simplify())
        # Having to combine expressions means the compound licensing is
        # ambiguous: apply the -10 penalty (never taking the score negative).
        scoring_elements.ambiguous_compound_licensing = True
        if scoring_elements.score > 0:
            scoring_elements.score -= 10

    return scoring_elements, declared_license_expression or ''
def _is_expression_or(licence_expression: str) -> bool: licensing_util = Licensing() return isinstance( _parse_licence_expression(licensing_util, licence_expression), OR)
def determine_licence_compound(main_licence: str, additional_licences: List[str]) -> str: """Determines the overall licence based on main licence and additional licences.""" overall_licence = f"({main_licence}) AND ({') AND ('.join(additional_licences)})" return str( _parse_licence_expression(Licensing(), overall_licence).simplify())
def check_expected_parse_copyright_file(
    test_loc,
    expected_loc,
    regen=False,
    simplified=False,
    _licensing=Licensing(),
):
    '''
    Check copyright parsing of `test_loc` location against an expected JSON
    file at `expected_loc` location. Regen the expected file if `regen` is
    True.

    When `simplified` is True, parse with de-duplication, Debian-packaging
    skipping, license simplification and unique copyrights; otherwise use the
    raw, unfiltered parse.
    '''
    # `simplified` toggles all four parse options as a group.
    if simplified:
        filter_duplicates = True
        skip_debian_packaging = True
        simplify_licenses = True
        unique_copyrights = True
    else:
        filter_duplicates = False
        skip_debian_packaging = False
        simplify_licenses = False
        unique_copyrights = False

    try:
        dc = debian_copyright.parse_copyright_file(
            location=test_loc,
            check_consistency=False,
        )
        declared_license = dc.get_declared_license(
            filter_duplicates=filter_duplicates,
            skip_debian_packaging=skip_debian_packaging,
        )
        license_expression = dc.get_license_expression(
            skip_debian_packaging=skip_debian_packaging,
            simplify_licenses=simplify_licenses,
        )
        # Keys are used below to flag any unknown-license-reference detection.
        license_expression_keys = set(_licensing.license_keys(license_expression))

        copyrght = dc.get_copyright(
            skip_debian_packaging=skip_debian_packaging,
            unique_copyrights=unique_copyrights,
        ).strip()

        primary_license = dc.primary_license
        match_details = list(map(get_match_details, dc.license_matches))

        results = {
            'primary_license': primary_license,
            'declared_license': declared_license,
            'license_expression': license_expression,
            'copyright': copyrght,
            'matches': match_details,
        }

        if regen:
            # Regenerate the expected file from the current results.
            expected = results
            with open(expected_loc, 'w') as res:
                res.write(saneyaml.dump(results))
        else:
            with open(expected_loc) as ex:
                expected = saneyaml.load(ex.read())
    except Exception as e:
        # Re-raise with file:// links to both files for easy inspection.
        import traceback
        files = [
            'file://' + test_loc,
            'file://' + expected_loc,
        ]
        raise Exception(repr(e), traceback.format_exc(), files) from e

    if (
        not regen
        and (saneyaml.dump(results) != saneyaml.dump(expected)
        or 'unknown-license-reference' in license_expression_keys)
    ):
        # On mismatch (or any unknown-license-reference detection), build a
        # results dump that includes the file locations so the assertion diff
        # is actionable.
        res = {
            'test_loc': f'file://{test_loc}',
            'expected_loc': f'file://{expected_loc}',
        }
        res.update(results)
        results = saneyaml.dump(res)
        # Make the unknown-license-reference failure self-explanatory in the
        # diff output.
        results = results.replace(
            'unknown-license-reference',
            'unknown-license-reference should not be detected',
        )
        assert results == saneyaml.dump(expected)
class LicenseHandler:
    """
    Translates, relicenses and expands license expressions.

    Built from whitespace-separated translation files (license key -> aliases),
    an optional relicense file and a group file. Expressions are parsed with a
    `Licensing` instance seeded with the translation symbols, then expanded
    into an interim list form and finally into a list of license sets (one set
    per OR-alternative).
    """

    def __init__(self, translations_files, relicense_file, group_file):
        self.translations_files = translations_files
        self.relicense_file = relicense_file
        # Lazily populated by expand_relicense().
        self.relicense_map = None
        self.group_file = group_file
        symbols = self.read_symbols(self.translations_files)
        self.licensing = Licensing(symbols)
        #print("symbols: " + str(symbols))

    def read_symbols(self, translations_files):
        """Read translation files and return a list of LicenseSymbol objects,
        one per license key with all its aliases merged across files."""
        symbols = []
        symbols_map = {}
        # `translations_files` is a whitespace-separated string of paths.
        for translations_file in translations_files.split():
            #print("reading translation file: " + str(translations_file))
            translation_data = read_translations(translations_file)
            for lic_key in translation_data:
                #print("lic_key: \"" + str(lic_key) + "\"")
                #print("  lic_alias: " + str(translation_data[lic_key]))
                if lic_key not in symbols_map:
                    symbols_map[lic_key] = set()
                # Merge aliases from every file for the same key.
                for val in translation_data[lic_key]:
                    symbols_map[lic_key].add(val)
                #lic_aliases = tuple(translation_data[lic_key])
                #symbols.append(LicenseSymbol(key=key, aliases=lic_aliases))
        for key, value in symbols_map.items():
            #print("Adding to symbols: " + key)
            #print(" - " + str(value))
            symbols.append(LicenseSymbol(key=key, aliases=tuple(value)))
        # debugging
        #print("Symbols")
        #for sym in symbols:
        #    print(" sym: " + (str(sym.key)))
        #    print("   aliases :  " + (str(sym.aliases)))
        #    l = Licensing([sym])
        #    print("   licensing: " + (str(l)))
        #print("symbols: " + str(symbols))
        return symbols

    def translate_and_relicense(self, license_expression):
        """Normalize &/| operators, translate to canonical keys, then apply
        the relicense map. Falls back to the previous form when a step
        returns nothing."""
        license_expression = license_expression.replace("&", " and ").replace("|", " or ")
        transl = self.translate(license_expression)
        if transl == None or transl == "":
            transl = license_expression
        #print("translate_and_relicenseself: " + license_expression + " ==> " + transl)
        rel = self.expand_relicense(transl)
        if rel == None:
            rel = transl
        #print("translate_and_relicenseself: " + rel)
        return rel

    def expand_relicense(self, license_expression):
        """Apply the relicense file (if configured) to the expression;
        otherwise return the stripped expression unchanged."""
        if self.relicense_file != None and self.relicense_file != "":
            # NOTE(review): the relicense file is re-read on every call —
            # presumably cheap/intended, but could be cached; confirm.
            self.relicense_map = read_relicense_file(self.relicense_file)
            expanded = relicense_license(self.relicense_map, license_expression)
            return expanded.strip()
        else:
            return license_expression.strip()

    def group(self, license_expression):
        # Grouping is currently a no-op beyond stripping whitespace.
        return license_expression.strip()

    def translate(self, license_expression):
        """Return the expression parsed through the alias-aware Licensing,
        rendered back to a string (canonical keys)."""
        return str(self.simplify(license_expression))

    def simplify(self, license_expression):
        # Despite its name, this currently only parses — simplification is
        # deliberately disabled (see commented lines).
        parsed = self.licensing.parse(license_expression)
        #parsed = self.licensing._parse_and_simplify(license_expression)
        #print("simplified: " + str(parsed.simplify()))
        #return parsed.simplify()
        return parsed

    def license_expression_list_json(self, license_expression, relicense=True):
        """Return a JSON-compatible dict view of license_expression_list()."""
        license = self.license_expression_list(license_expression, relicense)
        output = {}
        output["license_expression"] = license_expression
        output["expanded"] = license.expanded
        output["grouped"] = license.grouped
        output["translated"] = license.translated
        output["simplified"] = license.simplified
        output["interim"] = license.interim
        output["set_list"] = license.set_list
        return output

    def license_expression_list(self, license_expression, relicense=True):
        """Run the full pipeline (translate -> expand -> group -> simplify ->
        interim list -> set list) and return a ManagedLicenseExpression
        carrying each intermediate stage."""
        license = ManagedLicenseExpression(license_expression)
        license.translated = self.translate(license_expression)
        if relicense:
            license.expanded = self.expand_relicense(license.translated)
        else:
            license.expanded = license.translated
        license.grouped = self.group(license.expanded)
        # We need str to skip verbose output
        license.simplified = str(self.simplify(license.grouped))
        license.interim = self.interim_license_expression_list(license.simplified, self.licensing)
        license.set_list = self.interim_license_expression_set_list(license.interim)
        return license

    def interim_license_expression_list(self, license_expression, licensing):
        """
        Turns an expression like this:
            G AND (A OR B)
        into:
            AND [G, OR [A, B]]
        The latter is an interim format.

        Parenthesized sub-expressions are collected as text and parsed
        recursively. Mixed operators at the same level are not supported
        (exits with code 12).
        """
        #print("")
        #print("parse(" + str(license_expression) + ")")
        tokenizer = licensing.get_advanced_tokenizer()
        tokenized = tokenizer.tokenize(str(license_expression))
        current_license=None
        current_licenses=[]
        current_op=None
        # paren_expr accumulates the raw text inside parentheses;
        # paren_count tracks nesting beyond the first open paren.
        paren_expr = None
        paren_count=0
        for token in tokenized:
            tok = token.string
            if tok == '(':
                #print("(")
                if paren_expr == None:
                    paren_expr = ""
                else:
                    # Nested paren: keep the token and deepen the nesting.
                    paren_expr = paren_expr + " " + tok
                    paren_count = paren_count + 1
            elif tok == ')':
                #print("about to parse: \"" + paren_expr + "\" count: " + str(paren_count))
                if paren_count == 0:
                    # Matching close of the outermost paren: recurse on the
                    # collected text.
                    current_license = self.interim_license_expression_list(paren_expr, licensing)
                    #print("got: \"" + str(current_license) + "\"")
                    paren_expr=None
                else:
                    paren_count = paren_count - 1
                    paren_expr = paren_expr + " " + tok
            elif tok == 'OR' or tok == 'AND':
                if paren_expr != None:
                    #print("TEMP " + tok)
                    paren_expr = paren_expr + " " + tok
                else:
                    #print("OPERATOR " + tok + " (" + str(current_op) + ")")
                    if current_licenses == None:
                        print("ERROR......")
                        print("ERROR......")
                        print("ERROR......")
                        exit(24)
                    if current_op == None:
                        # first operator
                        current_op = tok
                        #print("=cop: " + tok + " " + current_license)
                        current_licenses.append(current_license)
                    elif current_op == tok:
                        # same operator
                        #print("-cop: " + tok + " " + current_license)
                        current_licenses.append(current_license)
                    else:
                        # different operator
                        print("-------------------------------------------- Store me: " + current_op + "  " + str(current_licenses))
                        exit(12)
            else:
                #print("tok: \"" + tok + "\"")
                if paren_expr != None:
                    #print("TEMP " + tok)
                    paren_expr = paren_expr + " " + tok
                else:
                    #print("license: " + tok)
                    current_license = tok
        # Append the trailing operand; a single license yields op "AND".
        current_licenses.append(current_license)
        if current_op == None:
            current_op = "AND"
        # NOTE(review): `list` shadows the builtin here; harmless locally but
        # worth renaming.
        list = LicenseExpressionList(current_op, current_licenses)
        #print("DONE: " + str(license_expression) + " => " + str(list))
        return list

    def _combinations(self, lel):
        """Count the number of license-set alternatives encoded by the interim
        list: products across AND, sums across OR; a leaf counts as 1."""
        #print("lel     : " + str(lel))
        if not isinstance(lel, LicenseExpressionList):
            return 1
        if lel.op == "AND":
            prod = 1
            for l in lel.list:
                # NOTE(review): bare `_combinations` (not self._combinations)
                # — this only works if a module-level _combinations exists;
                # looks like a latent NameError otherwise. Confirm.
                prod = prod * _combinations(l)
            return prod
        elif lel.op == "OR":
            sum = 0
            for l in lel.list:
                # NOTE(review): same bare-name call as above.
                sum = sum + _combinations(l)
            return sum
        else:
            print("ERROR: NO OP")
            exit(11)

    def interim_license_expression_set_list(self, interim_license_expression_list):
        """
        Turns an expression like this:
            AND [G, OR [A, B]]
        into:
            [ { G, A }, { G, B } ]
        The latter is an interim format.

        Recurses over the interim list, distributing AND across OR
        alternatives via the _manage_list_item_* helpers.
        """
        expanded_list=[]
        #print("Count: " + str(_combinations(interim_license_expression_list)))
        if not isinstance(interim_license_expression_list, LicenseExpressionList):
            # single license
            license_set= { interim_license_expression_list }
            expanded_list.append(list(license_set))
            license_verbose_debug("LEAF, returning " + str(expanded_list))
            license_verbose_debug("cop:         " + interim_license_expression_list )
            #print("managed____ \"" + str(expanded_list) + "\" <---- MIDDLE")
            return expanded_list

        current_op = interim_license_expression_list.op;
        license_verbose_debug("cop:    " + current_op )
        for lep in interim_license_expression_list.list:
            license_verbose_debug(" ------ lep ----- " + str(lep))
            if current_op == None:
                print("ERROR: NO OP")
                exit(11)
            elif current_op == "OR":
                lep_list = self.interim_license_expression_set_list(lep)
                expanded_list = self._manage_list_item_or(expanded_list, lep_list)
            elif current_op == "AND":
                lep_list = self.interim_license_expression_set_list(lep)
                expanded_list = self._manage_list_item_and(expanded_list, lep_list)
        #print("managed____ \"" + str(expanded_list) + "\" <---- FINAL")
        return expanded_list

    def _manage_list_item_and(self, license_list, lep):
        """AND-combine an already-expanded set list with `lep`: take the
        cartesian product of the accumulated sets with lep's sets."""
        license_verbose_debug(" * Andy " )
        if isinstance(lep, LicenseExpressionList):
            # NOTE(review): this branch is unimplemented — it exits before the
            # code after exit(77), which is dead.
            print(" -------------======  Andy ====-----------------" )
            exit(77)
            # TODO : implement below (for lep)
            print("AND Count 0: " + str(_combinations(lep)))
            for inner_lep in lep.list:
                print("¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ AND Count A: " + str(_combinations(inner_lep)))
                set_list = self.interim_license_expression_set_list(inner_lep)
            return None
        else:
            # single license
            if len(license_list) == 0:
                # First contribution: adopt lep's set list as-is.
                license_verbose_debug("wooops: " + str(license_list))
                license_verbose_debug("wooops: " + str(lep))
                license_list = lep
            else:
                license_verbose_debug("daisy: " + str(license_list))
                license_verbose_debug("daisy: " + str(lep))
                license_verbose_debug(" -------------======  Andy ====----------------- SINGLE: " + str(license_list) )
                # Cartesian product: merge each accumulated set with each of
                # lep's sets (deduplicated via set()).
                new_list=[]
                for item in license_list:
                    license_verbose_debug("  item: " + str(item) + " <--- " + str(lep) )
                    for lep_item in lep:
                        license_verbose_debug("  item: " + str(item) + " <--- " + str(lep_item) )
                        new_item =list(set(item + lep_item))
                        license_verbose_debug("  item: " + str(item) )
                        new_list.append(new_item)
                license_verbose_debug("  list: " + str(new_list) )
                license_list = new_list
        return license_list

    def _manage_list_item_or(self, license_list, lep):
        """OR-combine: append lep's alternatives to the accumulated list."""
        license_verbose_debug(" * Orleans: " + (str(lep)))
        if isinstance(lep, LicenseExpressionList):
            # NOTE(review): unimplemented — exits before the code after
            # exit(77), which is dead.
            # TODO : implement below (for lep)
            license_verbose_debug(" -------------======  ORLEANS ====----------------- : " + str(lep.license_list) )
            exit(77)
            for inner_lep in lep.license_list:
                print("  ====----------------- : " + str(inner_lep) )
                print("OR Count A: " + str(_combinations(inner_lep)))
                set_list = self.interim_license_expression_set_list(inner_lep)
                print("OR Count B: " + str(len(set_list)))
                license_list.append(inner_lep)
        else:
            # single license
            license_verbose_debug("HERE I AM .... \"" + str(lep) + "\"")
            if len(license_list) == 0:
                # First alternative set: adopt lep directly.
                new_list=lep
                license_verbose_debug("topsss: " + str(license_list) + " size: " + str(len(license_list)))
                license_verbose_debug("topsss: " + str(lep) + " size: " + str(len(lep)))
                license_verbose_debug("topsss: " + str(new_list) + " size: " + str(len(new_list)))
            else:
                new_list = license_list
                license_verbose_debug("dapsss: " + str(license_list))
                license_verbose_debug("dappss: " + str(lep))
                for lep_item in lep:
                    license_verbose_debug("  item: " + str(license_list) + " <--- " + str(lep_item) )
                    new_list.append(lep_item)
        return new_list
def parse_structured_copyright_file(
    copyright_file,
    skip_debian_packaging=True,
    simplify_licenses=True,
    unique=True,
):
    """
    Return a tuple of (declared license, detected license_expression,
    copyrights) strings computed from the `copyright_file` location. For each
    copyright file paragraph we treat the "name" as a license declaration.
    The text is used for detection and cross-reference with the declaration.

    If `skip_debian_packaging` is True, the Debian packaging license --if
    detected-- is skipped.

    If `simplify_licenses` is True the license expressions are simplified.

    If `unique` is True, repeated copyrights, detected or declared licenses
    are ignored, and only unique detections are returned.
    """
    if not copyright_file:
        return None, None, None

    deco = DebianCopyright.from_file(copyright_file)

    declared_licenses = []
    detected_licenses = []
    copyrights = []

    deco = fix_copyright(deco)

    licensing = Licensing()
    for paragraph in deco.paragraphs:

        if skip_debian_packaging and is_debian_packaging(paragraph):
            # Skipping packaging license and copyrights since they are not
            # relevant to the effective package license
            continue

        if isinstance(paragraph, (CopyrightHeaderParagraph, CopyrightFilesParagraph)):
            pcs = paragraph.copyright.statements or []
            for p in pcs:
                p = p.dumps()
                # avoid repeats
                if unique:
                    if p not in copyrights:
                        copyrights.append(p)
                else:
                    copyrights.append(p)

        if isinstance(paragraph, CatchAllParagraph):
            # Unstructured paragraph: detect a license from its full text.
            text = paragraph.dumps()
            if text:
                detected = get_normalized_expression(text, try_as_expression=False)
                if not detected:
                    detected = 'unknown'
                detected_licenses.append(detected)
        else:
            plicense = paragraph.license
            if not plicense:
                continue

            # The paragraph "name" is the declared license; detection
            # cross-references it.
            declared, detected = detect_declared_license(plicense.name)
            # avoid repeats
            if unique:
                if declared and declared not in declared_licenses:
                    declared_licenses.append(declared)
                if detected and detected not in detected_licenses:
                    detected_licenses.append(detected)
            else:
                declared_licenses.append(declared)
                detected_licenses.append(detected)

            # also detect in text
            text = paragraph.license.text
            if text:
                detected = get_normalized_expression(text, try_as_expression=False)
                if not detected:
                    detected = 'unknown'
                # avoid repeats
                if unique:
                    if detected not in detected_licenses:
                        detected_licenses.append(detected)
                else:
                    detected_licenses.append(detected)

    declared_license = '\n'.join(declared_licenses)

    if detected_licenses:
        # AND-combine all detections into one expression, optionally
        # simplified.
        detected_licenses = [licensing.parse(dl, simple=True) for dl in detected_licenses]
        if len(detected_licenses) > 1:
            detected_license = licensing.AND(*detected_licenses)
        else:
            detected_license = detected_licenses[0]
        if simplify_licenses:
            detected_license = detected_license.simplify()
        detected_license = str(detected_license)
    else:
        detected_license = 'unknown'

    copyrights = '\n'.join(copyrights)
    return declared_license, detected_license, copyrights
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contain one or more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    rules_data = load_data(licenses_file)
    rules_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    # Validate everything up front; abort before writing any rule file.
    print()
    errors = validate_license_rules(rules_data, licensing)
    if errors:
        print('Invalid rules: exiting....')
        for error in errors:
            print(error)
        print()
        raise Exception('Invalid rules: exiting....')

    print()
    for rule in rules_data:
        is_negative = rule.data.get('is_negative')
        is_false_positive = rule.data.get('is_false_positive')
        existing = rule_exists(rule.text)
        if existing and not is_negative:
            print('Skipping existing non-negative rule:', existing, 'with text:\n', rule.text[:50].strip(), '...')
            continue

        if is_negative:
            base_name = 'not-a-license'
        else:
            # Canonicalize the expression (validated against the license DB)
            # and use it as the rule's base file name.
            license_expression = rule.data.get('license_expression')
            license_expression = str(
                licensing.parse(license_expression, validate=True, simple=True))
            base_name = license_expression
            if is_false_positive:
                base_name = 'false-positive_' + base_name

        base_loc = find_rule_base_loc(base_name)

        # Write the YAML metadata and the rule text side by side.
        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rule.raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rule.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in rules_tokens:
            # Token-identical rule already exists: undo the writes.
            # cleanup
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', base_name)
        else:
            rules_tokens.add(rule_tokens)
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
def _report(self, args, scancode_report, _files):
    """
    Build and return the manifest report dict from the included/excluded
    files, the original scancode report and the CLI arguments.

    Collects copyrights, license keys and SPDX ids from the included files,
    AND-combines the licenses into one expression, and assembles the
    'files', 'project', 'conclusion' and 'meta' report sections.
    """
    copyrights = set()
    licenses = set()
    spdx = set()
    files = _files['included']
    for f in files:
        # Directories carry no license/copyright info of their own.
        if self._isdir(f):
            continue
        manifest_map = f['scancode_manifestor']
        #print("collecting info " + str(f))
        self.logger.verbose("collecting info " + str(f['name']) + " " + str(manifest_map['license_key']))
        for c in manifest_map['copyright']:
            copyrights.add(c)
        assert 'scancode_manifestor' in f
        #print("---------------- CHECKING------")
        #print(json.dumps(f['scancode_manifestor'], indent=4))
        #print("Adding license for file : " + str(f['path']) + ": ", end="")
        # A curated license (manual override) takes precedence over the
        # originally detected license key.
        if 'curated_license' in f['scancode_manifestor']:
            lic_key = f['scancode_manifestor']['curated_license']
            #print(" curated ", end="")
        elif 'license_key' in f['scancode_manifestor']:
            lic_key = f['scancode_manifestor']['license_key']
            #print(" originial ", end="")
        else:
            # If we get here, it means the transormation has failed.
            # Better go out with a bang
            print("000000000000000000000000000000000000000000000000000000")
            assert False
        #print(lic_key)
        licenses.add(lic_key)
        spdx.add(manifest_map['license_spdx'])

    # AND-join all collected license keys into a single expression string.
    lic_expr = None
    #print("licenses: " + str(licenses))
    for lic in licenses:
        if lic_expr == None:
            lic_expr = ""
        else:
            lic_expr += " and "
        lic_expr += str(lic)

    # Same, for the SPDX identifiers.
    spdx_expr = None
    for lic in spdx:
        if spdx_expr == None:
            spdx_expr = ""
        else:
            spdx_expr += " and "
        spdx_expr += str(lic)

    c_list = list(copyrights)
    c_list.sort()

    #print("\nlicenses: " + str(lic_expr))
    licensing = Licensing()
    # NOTE(review): _parse_and_simplify is a private license-expression API —
    # confirm it is stable across versions.
    parsed = licensing._parse_and_simplify(lic_expr)
    json_compat_lic = str(parsed).replace("AND", " & ")
    #print("\nlicenses: " + str(parsed))
    #exit(0)

    report = {}

    #
    # Files
    #
    report['files'] = {}
    report['files']['included'] = _files['included']
    report['files']['excluded'] = _files['excluded']
    report['files']['included_files_count'] = self._count_files(_files['included'])
    report['files']['excluded_files_count'] = self._count_files(_files['excluded'])
    report['files']['original_files_count'] = self._scancode_report_files_count(scancode_report)

    #
    # Project
    #
    project = {}
    project['name'] = args['project_name']
    project['sub_package'] = args['sub_package_name']
    project['version'] = args['project_version']
    project['url'] = args['project_url']
    project['source_url'] = args['project_source_url']
    project['issue_url'] = args['project_issue_url']
    project['download_url'] = args['project_download_url']
    report['project'] = project

    #
    # Conclusion
    #
    report['conclusion'] = {}
    report['conclusion']['copyright'] = c_list
    report['conclusion']['license_expression'] = json_compat_lic
    report['conclusion']['license_expression_original'] = lic_expr

    #
    # Meta information
    #
    report['meta'] = {}
    #report['meta']['curations'] = curations
    report['meta']['arguments'] = args  #.__dict__
    report['meta']['report_date'] = str(datetime.datetime.now())
    report['meta']['scancode_report_file'] = args['input_file']
    report['meta']['scancode_name'] = scancode_report['headers'][0]['tool_name']
    report['meta']['scancode_version'] = scancode_report['headers'][0]['tool_version']

    return report
class Rule(object):
    """
    A detection rule object is a text to use for detection and corresponding
    detected licenses and metadata.
    """
    # shared Licensing used to parse/compare license expressions for all rules
    licensing = Licensing()

    ###########
    # FIXME: !!! TWO RULES MAY DIFFER BECAUSE THEY ARE UPDATED BY INDEXING
    ###########

    # optional rule id int typically assigned at indexing time
    rid = attr.ib(default=None, repr=TRACE_REPR)

    # unique identifier
    identifier = attr.ib(default=None)

    # License expression string
    license_expression = attr.ib(default=None)

    # License expression object, created at build time
    license_expression_object = attr.ib(default=None, repr=False)

    # an indication of what this rule importance is (e.g. how important is its
    # text when detected as a licensing clue) as one of several flags:

    # for a license full text: this provides the highest level of confidence wrt
    # detection
    is_license_text = attr.ib(default=False, repr=False)

    # for a license notice: this provides a strong confidence wrt detection
    is_license_notice = attr.ib(default=False, repr=False)

    # reference for a mere short license reference such as its bare name or a URL
    # this provides a weak confidence wrt detection
    is_license_reference = attr.ib(default=False, repr=False)

    # tag for a structured licensing tag such as a package manifest metadata or
    # an SPDX license identifier or similar package manifest tag
    # this provides a strong confidence wrt detection
    is_license_tag = attr.ib(default=False, repr=False)

    # is this rule text a false positive when matched? it will filtered out at
    # the end if matched
    is_false_positive = attr.ib(default=False, repr=False)

    # is this rule text a negative rule? it will be removed from the matchable
    # text the start if matched
    is_negative = attr.ib(default=False, repr=False)

    # is this rule text only to be matched with a minimum coverage e.g. a
    # minimum proportion of tokens as a float between 0 and 100 where 100 means
    # all tokens must be matched and a smaller value means a smaller proportion
    # of matched tokens is acceptable. this is computed unless this is provided
    # here.
    minimum_coverage = attr.ib(default=0)
    # True when minimum_coverage came from the rule data file (vs computed)
    has_stored_minimum_coverage = attr.ib(default=False, repr=False)
    # same as minimum_coverage but divided/100
    _minimum_containment = attr.ib(default=0, repr=False)

    # Can this rule be matched if there are unknown words in its matched range?
    # The default is to allow known and unknown words. Unknown words are words
    # that do not exist in the text of any indexed license or license detection
    # rule.
    only_known_words = attr.ib(default=False)

    # what is the relevance of a match to this rule text? a float between 0 and
    # 100 where 100 means highly relevant and 0 means not relevant at all.
    # For instance a match to the "gpl" or the "cpol" words have a fairly low
    # relevance as they are a weak indication of an actual license and could be
    # a false positive. In some cases, this may even be used to discard obvious
    # false positive matches automatically.
    relevance = attr.ib(default=100)
    # True when relevance came from the rule data file (vs computed)
    has_stored_relevance = attr.ib(default=False, repr=False)

    # The rule contains a reference to some file name that contains the text
    referenced_filenames = attr.ib(default=attr.Factory(list), repr=False)

    # optional, free text
    notes = attr.ib(default=None, repr=False)

    # set to True if the rule is built from a .LICENSE full text
    is_license = attr.ib(default=False, repr=False)

    # lists of copyrights, emails and URLs that can be ignored when detected
    # in this license as they are part of the license or rule text itself
    ignorable_copyrights = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_holders = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_authors = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_urls = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_emails = attr.ib(default=attr.Factory(list), repr=False)

    ###########################################################################

    # path to the YAML data file for this rule
    data_file = attr.ib(default=None, repr=False)

    # path to the rule text file
    text_file = attr.ib(default=None, repr=False)

    # text of this rule for special cases where the rule is not backed by a file:
    # for SPDX license expression dynamic rules or testing
    stored_text = attr.ib(default=None, repr=False)

    # These attributes are computed upon text loading or setting the thresholds
    ###########################################################################

    # lengths in tokens
    length = attr.ib(default=0)
    min_matched_length = attr.ib(default=0, repr=TRACE_REPR)
    high_length = attr.ib(default=0, repr=TRACE_REPR)
    min_high_matched_length = attr.ib(default=0, repr=TRACE_REPR)

    # lengths in unique token.
    length_unique = attr.ib(default=0, repr=TRACE_REPR)
    min_matched_length_unique = attr.ib(default=0, repr=TRACE_REPR)
    high_length_unique = attr.ib(default=0, repr=TRACE_REPR)
    min_high_matched_length_unique = attr.ib(default=0, repr=TRACE_REPR)

    # True when this rule is shorter than the small-rule threshold
    is_small = attr.ib(default=False, repr=TRACE_REPR)

    # True once compute_thresholds() has run for this rule
    has_computed_thresholds = attr.ib(default=False, repr=False)

    def get_length(self, unique=False):
        """Return this rule length in tokens, unique tokens if `unique`."""
        return self.length_unique if unique else self.length

    def get_min_matched_length(self, unique=False):
        """Return the minimum matched length threshold, unique if `unique`."""
        return (self.min_matched_length_unique if unique
                else self.min_matched_length)

    def get_high_length(self, unique=False):
        """Return the high-token length, unique tokens if `unique`."""
        return self.high_length_unique if unique else self.high_length

    def get_min_high_matched_length(self, unique=False):
        """Return the minimum high-token matched length, unique if `unique`."""
        return (self.min_high_matched_length_unique if unique
                else self.min_high_matched_length)

    def __attrs_post_init__(self, *args, **kwargs):
        """
        Finish initialization: derive the identifier, load the YAML data file
        if any, track stored relevance/coverage and parse the license
        expression string into an expression object.
        """
        if not self.text_file:
            # for SPDX or tests only
            if not self.stored_text:
                raise Exception('Invalid rule without its corresponding text file: {}'.format(self))
            self.identifier = '_tst_' + str(len(self.stored_text))
        else:
            self.identifier = file_name(self.text_file)

        if self.data_file:
            try:
                self.load()
            except Exception as e:
                data_file = self.data_file
                trace = traceback.format_exc()
                message = 'While loading: file://{data_file}\n{trace}'.format(**locals())
                raise Exception(message)

        # a relevance of exactly 100 is the default and treated as not stored
        if self.relevance and self.relevance != 100:
            self.has_stored_relevance = True

        if self.minimum_coverage:
            self.has_stored_minimum_coverage = True

        if self.license_expression:
            try:
                expression = self.licensing.parse(self.license_expression)
            except:
                raise Exception(
                    'Unable to parse License rule expression: '
                    + repr(self.license_expression) + ' for: file://'
                    + self.data_file + '\n' + traceback.format_exc()
                )
            if expression is None:
                raise Exception(
                    'Unable to parse License rule expression: '
                    + repr(self.license_expression)
                    + ' for: file://' + self.data_file)

            # normalize the expression string to its rendered form
            self.license_expression = expression.render()
            self.license_expression_object = expression

    def tokens(self):
        """
        Return an iterable of token strings for this rule. Length, relevance and
        minimum_coverage may be recomputed as a side effect.
        """
        length = 0
        text = self.text()
        text = text.strip()

        # FIXME: this is weird:
        # We tag this rule as being a bare URL if it starts with a scheme and is
        # on one line: this is used to determine a matching approach

        # FIXME: this does not lower the text first??
        if text.startswith(('http://', 'https://', 'ftp://')) and '\n' not in text[:1000].lower():
            self.minimum_coverage = 100

        for token in query_tokenizer(self.text()):
            length += 1
            yield token

        self.length = length
        self.compute_relevance()

    def text(self):
        """
        Return the rule text loaded from its file.
        """
        if self.text_file and exists(self.text_file):
            # IMPORTANT: use the same process as query text loading for symmetry
            numbered_lines = numbered_text_lines(self.text_file, demarkup=False, plain_text=True)
            return ''.join(l for _, l in numbered_lines)

        # used for non-file backed rules
        elif self.stored_text:
            return self.stored_text

        else:
            raise Exception('Inconsistent rule text for: ' + self.identifier
                            + '\nfile://' + self.text_file)

    def license_keys(self, unique=True):
        """
        Return a list of license keys for this rule.
        """
        if not self.license_expression:
            return []
        return self.licensing.license_keys(self.license_expression_object, unique=unique)

    def same_licensing(self, other):
        """
        Return True if the other rule has the same licensing as this rule.
        """
        # NOTE: returns None (falsy) when either rule has no license expression
        if self.license_expression and other.license_expression:
            return self.licensing.is_equivalent(
                self.license_expression_object, other.license_expression_object)

    def licensing_contains(self, other):
        """
        Return True if this rule licensing contains the other rule licensing.
        """
        # NOTE: returns None (falsy) when either rule has no license expression
        if self.license_expression and other.license_expression:
            return self.licensing.contains(
                self.license_expression_object, other.license_expression_object)

    def compute_thresholds(self, small_rule=SMALL_RULE):
        """
        Compute and set thresholds either considering the occurrence of all
        tokens or the occurrence of unique tokens.
        """
        minimum_coverage, self.min_matched_length, self.min_high_matched_length = (
            compute_thresholds_occurences(
                self.minimum_coverage, self.length, self.high_length))
        # a minimum_coverage stored in the data file takes precedence
        if not self.has_stored_minimum_coverage:
            self.minimum_coverage = minimum_coverage
        self._minimum_containment = self.minimum_coverage / 100

        self.min_matched_length_unique, self.min_high_matched_length_unique = (
            compute_thresholds_unique(
                self.minimum_coverage, self.length,
                self.length_unique, self.high_length_unique))

        self.is_small = self.length < small_rule

    def to_dict(self):
        """
        Return an ordered mapping of self, excluding texts. Used for
        serialization. Empty values are not included.
        """
        data = OrderedDict()

        if self.license_expression:
            data['license_expression'] = self.license_expression

        flags = (
            'is_false_positive',
            'is_negative',
            'is_license_text',
            'is_license_notice',
            'is_license_reference',
            'is_license_tag',
            'only_known_words',
        )
        # only True flags are serialized
        for flag in flags:
            tag_value = getattr(self, flag, False)
            if tag_value:
                data[flag] = tag_value

        if self.has_stored_relevance and self.relevance:
            rl = self.relevance
            # serialize a whole-number float as an int for cleaner YAML
            if isinstance(rl, float) and int(rl) == rl:
                rl = int(rl)
            data['relevance'] = rl

        if self.has_stored_minimum_coverage and self.minimum_coverage > 0:
            cv = self.minimum_coverage
            # serialize a whole-number float as an int for cleaner YAML
            if isinstance(cv, float) and int(cv) == cv:
                cv = int(cv)
            data['minimum_coverage'] = cv

        if self.referenced_filenames:
            data['referenced_filenames'] = self.referenced_filenames

        if self.notes:
            data['notes'] = self.notes

        if self.ignorable_copyrights:
            data['ignorable_copyrights'] = self.ignorable_copyrights
        if self.ignorable_holders:
            data['ignorable_holders'] = self.ignorable_holders
        if self.ignorable_authors:
            data['ignorable_authors'] = self.ignorable_authors
        if self.ignorable_urls:
            data['ignorable_urls'] = self.ignorable_urls
        if self.ignorable_emails:
            data['ignorable_emails'] = self.ignorable_emails

        return data

    def dump(self):
        """
        Dump a representation of this rule as two files:
         - a .yml for the rule data in YAML (self.data_file)
         - a .RULE: the rule text as a UTF-8 file (self.text_file)
        Does nothing if this rule was created from a License (e.g. `is_license`
        is True)
        """
        if self.is_license:
            return

        def write(location, byte_string):
            # we write as binary because rules and licenses texts and data are UTF-8-encoded bytes
            with io.open(location, 'wb') as of:
                of.write(byte_string)

        if self.data_file:
            as_yaml = saneyaml.dump(self.to_dict(), indent=4, encoding='utf-8')
            write(self.data_file, as_yaml)
            write(self.text_file, self.text().encode('utf-8'))

    def load(self):
        """
        Load self from a .RULE YAML file stored in self.data_file.
        Does not load the rule text file.
        Unknown fields are ignored and not bound to the Rule object.
        """
        try:
            with io.open(self.data_file, encoding='utf-8') as f:
                data = saneyaml.load(f.read())
        except Exception as e:
            print('#############################')
            print('INVALID LICENSE RULE FILE:', 'file://' + self.data_file)
            print('#############################')
            print(e)
            print('#############################')
            # this is a rare case, but yes we abruptly stop.
            raise e

        # despite the docstring above, unknown attributes are rejected, not
        # ignored: this catches typos in hand-edited rule data files
        known_attributes = set(attr.fields_dict(self.__class__))
        data_file_attributes = set(data)
        unknown_attributes = data_file_attributes.difference(known_attributes)
        if unknown_attributes:
            unknown_attributes = ', '.join(sorted(unknown_attributes))
            msg = 'License rule {} data file has unknown attributes: {}'
            raise Exception(msg.format(self, unknown_attributes))

        self.license_expression = data.get('license_expression')
        self.is_negative = data.get('is_negative', False)
        self.is_false_positive = data.get('is_false_positive', False)
        # only negative and false-positive rules may omit a license expression
        if not self.license_expression and not (self.is_negative or self.is_false_positive):
            msg = 'License rule {} is missing a license_expression.'
            raise Exception(msg.format(self))

        relevance = float(data.get('relevance', 0))
        if relevance:
            if relevance <= 0 or relevance > 100:
                msg = ('License rule {} data file has an invalid relevance. '
                       'Should be above 0 and 100 or less: {}')
                raise Exception(msg.format(self, repr(relevance)))
            # Keep track if we have a stored relevance of not.
            self.relevance = relevance
            self.has_stored_relevance = True

        self.minimum_coverage = float(data.get('minimum_coverage', 0))
        self._minimum_containment = self.minimum_coverage / 100
        if not (0 <= self.minimum_coverage <= 100):
            msg = (
                'License rule {} data file has an invalid minimum_coverage. '
                'Should be between 0 and 100: {}')
            raise Exception(msg.format(self, self.minimum_coverage))

        self.is_license_text = data.get('is_license_text', False)
        self.is_license_notice = data.get('is_license_notice', False)
        self.is_license_tag = data.get('is_license_tag', False)
        self.is_license_reference = data.get('is_license_reference', False)
        self.only_known_words = data.get('only_known_words', False)

        self.referenced_filenames = data.get('referenced_filenames', []) or []
        if not isinstance(self.referenced_filenames, list):
            msg = (
                'License rule {} data file has an invalid referenced_filenames. '
                'Should be a list: {}')
            raise Exception(msg.format(self, self.referenced_filenames))

        # these are purely informational and not used at run time
        notes = data.get('notes')
        if notes:
            self.notes = notes.strip()

        # rules without a license must explain why they exist
        if not self.notes and (self.is_negative or self.is_false_positive):
            msg = 'Special License rule {} is missing explanatory notes.'
            raise Exception(msg.format(self))

        self.ignorable_copyrights = data.get('ignorable_copyrights', [])
        self.ignorable_holders = data.get('ignorable_holders', [])
        self.ignorable_authors = data.get('ignorable_authors', [])
        self.ignorable_urls = data.get('ignorable_urls', [])
        self.ignorable_emails = data.get('ignorable_emails', [])

        return self

    def compute_relevance(self):
        """
        Compute and set the `relevance` attribute for this rule. The relevance
        is a float between 0 and 100 where 100 means highly relevant and 0
        means not relevant at all.

        For instance a match to the "gpl" or the "cpol" words have a fairly low
        relevance as they are a weak indication of an actual license and could
        be a false positive and should therefore be assigned a low relevance.
        In contrast a match to most or all of the apache-2.0 license text is
        highly relevant. The Rule relevance is used as the basis to compute a
        match score.

        The relevance is either pre-defined in the rule YAML data file with the
        "relevance" attribute or computed base on the rule length here using
        this approach:

        - a false positive or a negative rule has a relevance of 100.
        - a rule of length equal to or larger than a threshold has a 100
          relevance
        - a rule of length smaller than a threshold has a relevance of
          100/threshold, rounded down. The current threshold is 18 words.
        """
        if isinstance(self, SpdxRule):
            self.relevance = 100
            return

        if self.has_stored_relevance:
            return

        # case for false positive
        if self.is_false_positive:
            self.relevance = 100
            return

        # case for negative rules with no license (and are not an FP)
        # they do not have licenses and their matches are never returned
        if self.is_negative:
            self.relevance = 100
            return

        threshold = 18.0
        relevance_of_one_word = round((1 / threshold) * 100, 2)
        length = self.length
        if length >= threshold:
            # general case
            self.relevance = 100
        else:
            computed = int(length * relevance_of_one_word)
            self.relevance = min([100, computed])

    @property
    def has_flags(self):
        """
        Return True if this Rule has at least one flag set.
        """
        return (self.is_license_text or self.is_license_notice
                or self.is_license_reference or self.is_license_tag)
from pathlib import Path from typing import BinaryIO, List, Optional from boolean.boolean import Expression, ParseError from debian.copyright import Copyright from license_expression import ExpressionError, Licensing from . import SpdxInfo from ._comment import _all_style_classes from ._licenses import ALL_NON_DEPRECATED_MAP GIT_EXE = shutil.which("git") HG_EXE = shutil.which("hg") _LOGGER = logging.getLogger(__name__) _LICENSING = Licensing() _END_PATTERN = "{}$".format( "".join( { "(?:{})*".format(re.escape(style.MULTI_LINE[2])) for style in _all_style_classes() if style.MULTI_LINE[2] } ) ) _IDENTIFIER_PATTERN = re.compile( r"SPDX" "-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE ) _COPYRIGHT_PATTERNS = [ re.compile(r"(SPDX" "-FileCopyrightText:[ \t]+.*?)" + _END_PATTERN),
def spdx_id_match(idx, query_run, text):
    """
    Return one LicenseMatch by matching the `text` as an SPDX license
    expression using the `query_run` positions and `idx` index for support.

    The match is built around a synthetic SpdxRule created on the fly from the
    parsed expression, since there is no pre-indexed rule for arbitrary SPDX
    expressions.
    """
    from licensedcode.cache import get_spdx_symbols
    from licensedcode.cache import get_unknown_spdx_symbol

    if TRACE:
        logger_debug('spdx_id_match: start:', 'text:', text, 'query_run:', query_run)

    licensing = Licensing()
    symbols_by_spdx = get_spdx_symbols()
    unknown_symbol = get_unknown_spdx_symbol()

    expression = get_expression(text, licensing, symbols_by_spdx, unknown_symbol)
    expression_str = expression.render()

    if TRACE:
        logger_debug('spdx_id_match: expression:', repr(expression_str))

    # how many known or unknown-spdx symbols occurence do we have?
    known_syms = 0
    unknown_syms = 0
    for sym in licensing.license_symbols(expression, unique=False, decompose=True):
        if sym == unknown_symbol:
            unknown_syms += 1
        else:
            known_syms += 1

    match_len = len(query_run)
    match_start = query_run.start
    matched_tokens = query_run.tokens

    # are we starting with SPDX-License-Identifier or not? if yes: fix start
    cleaned = clean_text(text).lower()
    # FIXME: dnl and rem may not be known tokens hence the pos will be wrong
    if cleaned.startswith(('list', 'dnl', 'rem',)):
        match_start += 1
        match_len -= 1
        # BUGFIX: the original `matched_tokens[1:]` was a bare expression with
        # no effect; the slice must be assigned back so the leading token is
        # actually dropped, keeping matched_tokens aligned with the shortened
        # match_len/ispan used below.
        matched_tokens = matched_tokens[1:]

    # build synthetic rule
    # TODO: ensure that all the SPDX license keys are known symbols
    rule = SpdxRule(
        license_expression=expression_str,
        # FIXME: for now we are putting the original query text as a
        # rule text: this is likely incorrect when it comes to properly
        # computing the known and unknowns and high and lows for this rule.
        # alternatively we could use the expression string, padded with
        # spdx-license-identifier: this may be wrong too, if the line was
        # not padded originally with this tag
        stored_text=text,
        length=match_len)

    # build match from parsed expression
    # collect match start and end: e.g. the whole text
    qspan = Span(range(match_start, query_run.end + 1))

    # we use the query side to build the ispans
    ispan = Span(range(0, match_len))

    len_junk = idx.len_junk
    # high positions are those whose token id is at or above the junk cutoff
    hispan = Span(p for p, t in enumerate(matched_tokens) if t >= len_junk)

    match = LicenseMatch(
        rule=rule, qspan=qspan, ispan=ispan, hispan=hispan,
        query_run_start=match_start,
        matcher=MATCH_SPDX_ID, query=query_run.query)

    if TRACE:
        logger_debug('spdx_id_match: match found:', match)

    return match