def cleanse_licence_expression(licence_expression: str) -> str:
    """Cleanses a licence expression by using SPDX identifiers when possible.

    A licence expression can be a combination of licences and is often free-form text.
    The idea is to return an equivalent expression that uses SPDX identifiers when possible.
    """
    simplified_expression = _parse_licence_expression(
        Licensing(), licence_expression).simplify()
    for s in simplified_expression.symbols:
        corresponding_licence = OPENSOURCE_LICENCES.get_licence(s.key)
        if corresponding_licence:
            s.key = corresponding_licence.identifier
    return simplify_licence_expression(str(simplified_expression))
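The normalization above relies on license_expression's parse()/simplify(); the project helpers _parse_licence_expression and OPENSOURCE_LICENCES are not shown here. A minimal sketch of the simplify step, assuming only the license_expression package:

from license_expression import Licensing

licensing = Licensing()
parsed = licensing.parse('MIT AND MIT OR Apache-2.0')
# simplify() applies boolean identities and drops duplicate symbols,
# e.g. yielding something like "Apache-2.0 OR MIT"
print(str(parsed.simplify()))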
Example n. 2
def combine_expressions(
        expressions,
        relation='AND',
        unique=True,
        licensing=Licensing(),
):
    """
    Return a combined license expression string with relation, given a sequence of
    license ``expressions`` strings or LicenseExpression objects.
    """
    return expressions and str(
        le_combine_expressions(expressions, relation, unique,
                               licensing)) or None
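A brief usage sketch of combine_expressions as defined above (le_combine_expressions is the combine helper imported from the license_expression package); the inputs and expected outputs are illustrative assumptions, not captured output:

detected = ['mit', 'apache-2.0', 'mit']
print(combine_expressions(detected))                 # expected: 'mit AND apache-2.0'
print(combine_expressions(detected, relation='OR'))  # expected: 'mit OR apache-2.0'
print(combine_expressions([]))                       # falsy input returns None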
    def test_get_expression_without_lid(self):
        licensing = Licensing()
        spdx_symbols = get_spdx_symbols()
        unknown_symbol = get_unknown_spdx_symbol()
        line_text = ('EPL-2.0 OR Apache-2.0 OR '
                     'GPL-2.0 WITH Classpath-exception-2.0 OR '
                     'GPL-2.0')
        expression = get_expression(line_text, licensing, spdx_symbols,
                                    unknown_symbol)

        expected = 'epl-2.0 OR apache-2.0 OR gpl-2.0 WITH classpath-exception-2.0 OR gpl-2.0'
        assert expected == expression.render()

        expected = [
            'epl-2.0', 'apache-2.0', 'gpl-2.0', 'classpath-exception-2.0',
            'gpl-2.0'
        ]
        assert expected == licensing.license_keys(expression, unique=False)

        assert all(
            s.wrapped
            for s in licensing.license_symbols(expression, decompose=True))
    def test_get_expression_complex(self):
        licensing = Licensing()
        spdx_symbols = get_spdx_symbols()
        unknown_symbol = get_unknown_spdx_symbol()
        line_text = ('* SPDX-License-Identifier: '
                     'EPL-2.0 OR aPache-2.0 OR '
                     'GPL-2.0 WITH classpath-exception-2.0 OR '
                     'GPL-2.0')
        expression = get_expression(line_text, licensing, spdx_symbols,
                                    unknown_symbol)

        expected = 'epl-2.0 OR apache-2.0 OR gpl-2.0 WITH classpath-exception-2.0 OR gpl-2.0'
        assert expression.render() == expected

        expected = [
            'epl-2.0', 'apache-2.0', 'gpl-2.0', 'classpath-exception-2.0'
        ]
        assert licensing.license_keys(expression, unique=True) == expected

        assert all(
            s.wrapped
            for s in licensing.license_symbols(expression, decompose=True))
Example n. 5
def get_declared_license_keys_in_packages(codebase):
    """
    Return a list of declared license keys found in packages.

    A package manifest (such as Maven POM file or an npm package.json file)
    contains structured declared license information. This is further normalized
    as a license_expression. We extract the list of licenses from the normalized
    license expressions.
    """
    packages = chain.from_iterable(
        getattr(res, 'packages', []) or []
        for res in codebase.walk(topdown=True))

    licensing = Licensing()
    detected_good_licenses = []
    for package in packages:
        expression = package.get('license_expression')
        if expression:
            exp = licensing.parse(
                expression, validate=False, strict=False, simple=True)
            keys = licensing.license_keys(exp, unique=True)
            detected_good_licenses.extend(keys)
    return detected_good_licenses
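A small sketch of the key-extraction step used above, with a hypothetical normalized license_expression value such as a package manifest might declare:

from license_expression import Licensing

licensing = Licensing()
exp = licensing.parse('mit AND (apache-2.0 OR gpl-2.0)',
                      validate=False, strict=False, simple=True)
# expected: ['mit', 'apache-2.0', 'gpl-2.0']
print(licensing.license_keys(exp, unique=True))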
@attr.s
class AlpineLicenseTest(object):
    """
    A license detection test is used to verify that Alpine declared license
    detection works correctly.

    It consists of one YAML file with test data and expectations and a package
    reference.
    """
    declared_license = attr.attrib()
    license_expression = attr.attrib()
    data_file = attr.attrib(default=None)

    licensing = Licensing()

    @classmethod
    def from_file(cls, data_file):
        with open(data_file) as df:
            data = saneyaml.load(df.read())
        data['data_file'] = data_file
        alptest = cls(**data)
        alptest.license_expression = cls.licensing.parse(
            alptest.license_expression).render()
        return alptest

    def to_dict(self):
        dct = attr.asdict(self)
        dct.pop('data_file')
        return dct

    def dump(self):
        parent = fileutils.parent_directory(self.data_file)
        if not exists(parent):
            fileutils.create_dir(parent)
        with open(self.data_file, 'w') as df:
            df.write(saneyaml.dump(self.to_dict()))

    def get_test_method_name(self):
        dfn = fileutils.file_base_name(self.data_file.lower())
        test_name = f'test_alpine_license_detection_{dfn}'
        return text.python_safe_name(test_name)

    @staticmethod
    def from_dir(test_dir):
        """
        Return an iterable of AlpineLicenseTest objects loaded from `test_dir`
        """
        test_files = packages_test_utils.get_test_files(test_dir, '.yml')
        test_files = (join(test_dir, f) for f in test_files)
        return map(AlpineLicenseTest.from_file, test_files)
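A hedged sketch of the YAML round-trip that from_file() and dump() perform, using a hypothetical minimal Alpine test payload:

import saneyaml

payload = {
    'declared_license': 'MIT BSD-2-Clause',           # hypothetical declared text
    'license_expression': 'mit AND bsd-simplified',   # hypothetical expected expression
}
text = saneyaml.dump(payload)        # what dump() writes to the .yml data file
data = saneyaml.load(text)           # what from_file() reads back
assert data['license_expression'] == 'mit AND bsd-simplified'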
    def test_get_expression_complex_with_unknown_symbols_and_refs(self):
        licensing = Licensing()
        spdx_symbols = get_spdx_symbols()
        unknown_symbol = get_unknown_spdx_symbol()
        line_text = ('* SPDX-License-Identifier: '
                     'EPL-2.0 OR Apache-2.0 '
                     'OR GPL-2.0  WITH Classpath-exception-2.0 '
                     'OR LicenseRef-GPL-2.0 WITH Assembly-exception')

        expression = get_expression(line_text, licensing, spdx_symbols,
                                    unknown_symbol)

        expected = 'epl-2.0 OR apache-2.0 OR gpl-2.0 WITH classpath-exception-2.0 OR unknown-spdx WITH unknown-spdx'
        assert expected == expression.render()

        expected = [
            'epl-2.0', 'apache-2.0', 'gpl-2.0', 'classpath-exception-2.0',
            'unknown-spdx', 'unknown-spdx'
        ]
        assert expected == licensing.license_keys(expression, unique=False)

        assert all(
            s.wrapped
            for s in licensing.license_symbols(expression, decompose=True))
Example n. 8
def group_license_expressions(unique_license_expressions):
    """
    Return a tuple that contains two lists of license expressions.

    The first list in the tuple contains unique license expressions with "AND",
    "OR", or "WITH" in them.

    The second list in the tuple contains unique license
    expressions without "AND", "OR", or "WITH".
    """
    joined_expressions = []
    single_expressions = []
    for license_expression in unique_license_expressions:
        if ('AND' in license_expression or 'OR' in license_expression
                or 'WITH' in license_expression):
            joined_expressions.append(license_expression)
        else:
            single_expressions.append(license_expression)

    licensing = Licensing()
    unique_joined_expressions = []
    seen_joined_expression = []
    len_joined_expressions = len(joined_expressions)
    if len_joined_expressions > 1:
        for i, j in enumerate(joined_expressions, start=1):
            if i > len_joined_expressions:
                break
            for j1 in joined_expressions[i:]:
                if licensing.is_equivalent(j, j1):
                    if j not in unique_joined_expressions and j not in seen_joined_expression:
                        unique_joined_expressions.append(j)
                        seen_joined_expression.append(j1)
    else:
        unique_joined_expressions = joined_expressions

    return unique_joined_expressions, single_expressions
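The deduplication above hinges on Licensing.is_equivalent(); a minimal sketch of what it considers equivalent:

from license_expression import Licensing

licensing = Licensing()
a = licensing.parse('mit AND apache-2.0')
b = licensing.parse('apache-2.0 AND mit')
# operand order does not matter for equivalence
assert licensing.is_equivalent(a, b)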
Example n. 9
    def to_dict(self, **kwargs):
        def dict_fields(attr, value):
            if attr.name in ('resources', ):
                return False
            return True

        license_expressions_to_combine = []
        if self.core_license_expression:
            license_expressions_to_combine.append(self.core_license_expression)
        if self.other_license_expression:
            license_expressions_to_combine.append(self.other_license_expression)
        if license_expressions_to_combine:
            combined_license_expression = combine_expressions(license_expressions_to_combine)
            if combined_license_expression:
                self.consolidated_license_expression = str(
                    Licensing().parse(combined_license_expression).simplify())
        self.consolidated_holders = sorted(set(list(self.core_holders) + list(self.other_holders)))
        # TODO: Verify and test that we are generating detectable copyrights
        self.consolidated_copyright = 'Copyright (c) {}'.format(', '.join(self.consolidated_holders))
        return attr.asdict(self, filter=dict_fields, dict_factory=OrderedDict)
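A sketch of the consolidation step in to_dict() above, assuming the combine_expressions helper from Example n. 2 and purely illustrative expressions:

from license_expression import Licensing

combined = combine_expressions(['apache-2.0', 'apache-2.0 AND mit'])
# simplify() drops the redundant apache-2.0 term, e.g. 'apache-2.0 AND mit'
print(str(Licensing().parse(combined).simplify()))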
Example n. 10
    def test_get_expression_works_for_legacy_deprecated_old_spdx_symbols(self):
        exp_by_old = {
            'eCos-2.0': 'gpl-2.0-plus WITH ecos-exception-2.0',
            'GPL-2.0-with-autoconf-exception': 'gpl-2.0 WITH autoconf-exception-2.0',
            'GPL-2.0-with-bison-exception': 'gpl-2.0 WITH bison-exception-2.2',
            'GPL-2.0-with-classpath-exception': 'gpl-2.0 WITH classpath-exception-2.0',
            'GPL-2.0-with-font-exception': 'gpl-2.0 WITH font-exception-gpl',
            'GPL-2.0-with-GCC-exception': 'gpl-2.0 WITH gcc-linking-exception-2.0',
            'GPL-3.0-with-autoconf-exception': 'gpl-3.0 WITH autoconf-exception-3.0',
            'GPL-3.0-with-GCC-exception': 'gpl-3.0 WITH gcc-exception-3.1',
            'wxWindows': 'lgpl-2.0-plus WITH wxwindows-exception-3.1',
        }
        licensing = Licensing()
        symbols_by_spdx = get_spdx_symbols()
        unknown_symbol = get_unknown_spdx_symbol()

        for test, expected in exp_by_old.items():
            result = get_expression(
                test, licensing, symbols_by_spdx, unknown_symbol).render()
            assert result == expected
Example n. 11
def get_normalized_expression(query_string):
    """
    Given a text `query_string` return a single detected license expression.
    `query_string` is typically the value of a license field as found in package
    manifests.
    Return None if the `query_string` is empty. Return "unknown" as a
    license expression if there is a `query_string` but nothing was detected.

    For example::
    >>> get_normalized_expression('mit')
    'mit'
    >>> get_normalized_expression('mit or asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> get_normalized_expression('mit asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> assert get_normalized_expression('') is None
    >>> assert get_normalized_expression(None) is None
    """
    if not query_string or not query_string.strip():
        return

    if TRACE:
        logger_debug('get_normalized_expression: query_string: "{}"'.format(
            query_string))

    from licensedcode.cache import get_index
    idx = get_index()
    licensing = Licensing()

    # we match twice in a cascade: as an expression, then as plain text if we
    # did not succeed.
    matches = None
    try:
        matched_as_expression = True
        matches = idx.match(query_string=query_string, as_expression=True)
        if matches_have_unknown(matches, licensing):
            # rematch also if we have unknowns
            matched_as_expression = False
            matches = idx.match(query_string=query_string, as_expression=False)

    except Exception:
        matched_as_expression = False
        matches = idx.match(query_string=query_string, as_expression=False)

    if not matches:
        # we have a query_string text but there was no match: return an unknown
        # key
        return 'unknown'

    if TRACE:
        logger_debug('get_normalized_expression: matches:', matches)

    # join the possibly multiple detected license expressions with an AND
    expression_objects = [m.rule.license_expression_object for m in matches]
    if len(expression_objects) == 1:
        combined_expression_object = expression_objects[0]
    else:
        combined_expression_object = licensing.AND(*expression_objects)

    if matched_as_expression:
        # then just return the expression(s)
        return str(combined_expression_object)

    # Otherwise, verify that we consumed 100% of the query string e.g. that we
    # have no unknown leftover.

    # 1. do all matches have 100% coverage?
    all_matches_have_full_coverage = all(m.coverage() == 100 for m in matches)

    # TODO: do all matches have a high enough score?

    # 2. are all declared license tokens consumed?
    query = matches[0].query
    # the query object should be the same for all matches. Is this always true??
    for mt in matches:
        if mt.query != query:
            # FIXME: the exception may be swallowed in callers!!!
            raise Exception(
                'Inconsistent package.declared_license: text with multiple "queries". '
                'Please report this issue to the scancode-toolkit team.\n'
                '{}'.format(query_string))

    query_len = len(query.tokens)
    matched_qspans = [m.qspan for m in matches]
    matched_qpositions = Span.union(*matched_qspans)
    len_all_matches = len(matched_qpositions)
    declared_license_is_fully_matched = query_len == len_all_matches

    if not all_matches_have_full_coverage or not declared_license_is_fully_matched:
        # We inject an 'unknown' symbol in the expression
        unknown = licensing.parse('unknown', simple=True)
        combined_expression_object = licensing.AND(combined_expression_object,
                                                   unknown)

    return str(combined_expression_object)
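A sketch of the final fallback step above: when coverage is incomplete, an 'unknown' symbol is AND-ed into the combined expression (values are illustrative):

from license_expression import Licensing

licensing = Licensing()
combined = licensing.parse('apache-2.0')
unknown = licensing.parse('unknown', simple=True)
print(str(licensing.AND(combined, unknown)))  # expected: 'apache-2.0 AND unknown'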
Example n. 12
@attr.s
class LicenseTest(object):
    """
    A license detection test is used to verify that license detection works
    correctly.

    It consists of two files with the same base name: a .yml file with test data
    and a test file with any other extension that needs to be tested for
    detection.

    The following data are loaded from the .yml file:
     - a test file to scan for licenses,
     - a list of expected license expressions to detect,
     - optional notes,
     - a boolean flag expected_failure set to True if a test is expected to fail
       for now.

    If the list of license expressions is empty, then this test should not
    detect any license in the test file.
    """
    data_file = attr.attrib(default=None)
    test_file = attr.attrib(default=None)
    test_file_name = attr.attrib(default=None)
    license_expressions = attr.attrib(default=attr.Factory(list))
    notes = attr.attrib(default=None)
    expected_failure = attr.attrib(default=False)

    licensing = Licensing()

    def __attrs_post_init__(self, *args, **kwargs):
        if self.test_file:
            _, _, self.test_file_name = self.test_file.partition(
                os.path.join('licensedcode', 'data') + os.sep)

        data = {}
        if self.data_file:
            try:
                with io.open(self.data_file, encoding='utf-8') as df:
                    data = saneyaml.load(df.read()) or {}
            except Exception as e:
                raise Exception(f'Failed to read: file://{self.data_file}', e)

            self.license_expressions = data.pop('license_expressions', [])
            self.notes = data.pop('notes', None)
            # True if the test is expected to fail
            self.expected_failure = data.pop('expected_failure', False)

        if data:
            raise Exception('Unknown data elements: ' + repr(data) +
                            ' for: file://' + self.data_file)

        if self.license_expressions:
            for i, exp in enumerate(self.license_expressions[:]):
                try:
                    expression = self.licensing.parse(exp)
                except Exception:
                    raise Exception('Unable to parse License rule expression: '
                                    f'{exp!r} for: file://{self.data_file}\n' +
                                    traceback.format_exc())
                if expression is None:
                    raise Exception('Unable to parse License rule expression: '
                                    f'{exp!r} for: file://{self.data_file}')
                new_exp = expression.render()
                self.license_expressions[i] = new_exp

        else:
            if not self.notes:
                raise Exception(
                    'A license test without expected license_expressions should '
                    f'have explanatory notes for: file://{self.data_file}')

    def to_dict(self):
        dct = {}
        if self.license_expressions:
            dct['license_expressions'] = self.license_expressions
        if self.expected_failure:
            dct['expected_failure'] = self.expected_failure
        if self.notes:
            dct['notes'] = self.notes
        return dct

    def dump(self):
        """
        Dump a representation of self to its YAML data file
        """
        as_yaml = saneyaml.dump(self.to_dict())
        with io.open(self.data_file, 'w', encoding='utf-8') as df:
            df.write(as_yaml)

    def get_content(self):
        """
        Return the test file content as a byte string.
        """
        with open(self.test_file, 'rb') as df:
            d = df.read()
        return d

    def get_test_method_name(self, prefix='test_detection_'):
        test_file_name = self.test_file_name
        test_name = '{prefix}{test_file_name}'.format(**locals())
        test_name = text.python_safe_name(test_name)
        if not isinstance(test_name, str):
            test_name = test_name.decode('utf-8')
        return test_name

    @staticmethod
    def load_from(test_dir):
        """
        Return an iterable of LicenseTest objects loaded from `test_dir`
        """
        return [
            LicenseTest(data_file, test_file)
            for data_file, test_file in get_test_file_pairs(test_dir)
        ]
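A short sketch of the normalization done in __attrs_post_init__ above: each expected expression is parsed and stored back in its render()ed form:

from license_expression import Licensing

licensing = Licensing()
exp = licensing.parse('gpl-2.0 WITH classpath-exception-2.0')
print(exp.render())  # canonical string form written back to license_expressions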
Example n. 13
def get_origin_info_from_top_level_packages(top_level_packages, codebase):
    """
    Return a 3-tuple of (declared license expression, declared holders, primary
    programming language) computed from a ``top_level_packages`` list of
    detected top-level package mappings and a ``codebase``.
    """
    if not top_level_packages:
        return '', '', ''

    license_expressions = []
    programming_languages = []
    copyrights = []
    parties = []

    for package_mapping in top_level_packages:
        package = models.Package.from_dict(package_mapping)
        # we are only interested in key packages
        if not is_key_package(package, codebase):
            continue

        license_expression = package.license_expression
        if license_expression:
            license_expressions.append(license_expression)

        programming_language = package.primary_language
        if programming_language:
            programming_languages.append(programming_language)

        copyright_statement = package.copyright
        if copyright_statement:
            copyrights.append(copyright_statement)

        parties.extend(package.parties or [])

    # Combine license expressions
    unique_license_expressions = unique(license_expressions)
    combined_declared_license_expression = combine_expressions(
        expressions=unique_license_expressions,
        relation='AND',
    )

    declared_license_expression = ''
    if combined_declared_license_expression:
        declared_license_expression = str(
            Licensing().parse(combined_declared_license_expression).simplify())

    # Get holders
    holders = list(get_holders_from_copyright(copyrights))
    declared_holders = []
    if holders:
        declared_holders = holders
    elif parties:
        declared_holders = [party.name for party in parties or []]

    declared_holders = unique(declared_holders)

    # Programming language
    unique_programming_languages = unique(programming_languages)
    primary_language = ''
    if len(unique_programming_languages) == 1:
        primary_language = unique_programming_languages[0]

    return declared_license_expression, declared_holders, primary_language
Example n. 14
def cli(licenses_file):
    """
    Create rules from a structured text file

    For instance:
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """

    rule_data = load_data(licenses_file)
    rules_tokens = set()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    for data, text in rule_data:
        rdat = '\n'.join(data)
        rtxt = '\n'.join(text)
        existing = rule_exists(rtxt)
        if existing:
            print('Skipping existing rule:', existing, 'with text:\n', rtxt[:50].strip(), '...')
            continue

        # validate YAML syntax
        parsed = saneyaml.load(rdat)
        if parsed.get('is_negative'):
            license_expression = 'not-a-license'
        else:
            _, _, license_expression = data[0].partition(': ')
            license_expression = license_expression.strip()
            if not license_expression:
                raise Exception('Missing license_expression for text:', rtxt)
            licensing.parse(license_expression, validate=True, simple=True)

        base_loc = find_rule_base_loc(license_expression)

        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rdat)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rtxt)
        rule = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rule.tokens())
        if rule_tokens in rules_tokens:
            # cleanup
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', license_expression)
        else:
            rules_tokens.add(rule_tokens)
            rule.dump()
            models.update_ignorables(rule, verbose=True)
            print('Rule added:', rule.identifier)
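A hedged sketch of the validation done by licensing.parse(..., validate=True, simple=True) above: with a Licensing built from known license symbols, unknown keys are rejected (the symbol key here is illustrative):

from license_expression import Licensing, LicenseSymbol

licensing = Licensing([LicenseSymbol(key='lgpl-2.1')])
licensing.parse('lgpl-2.1', validate=True, simple=True)  # accepted
try:
    licensing.parse('no-such-license', validate=True, simple=True)
except Exception as e:
    print('rejected:', e)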
def simplify_licence_expression(licence_expression: str) -> str:
    """Simplifies a licence expression."""
    return str(
        _parse_licence_expression(Licensing(), licence_expression).simplify())
Example n. 16
from typing import Tuple, Set, List

from anytree import PreOrderIter, LevelOrderIter, AnyNode
from anytree.exporter import DictExporter
from anytree.importer import DictImporter
from license_expression import Licensing

try:
    from license_sh_private.normalizer import normalize
except ImportError:

    def normalize(license_expression):
        return license_expression, False


licensing = Licensing()
UNKNOWN = "Unknown"

RED = "\033[1;31m"
BLUE = "\033[1;34m"
CYAN = "\033[1;36m"
GREEN = "\033[0;32m"
RESET = "\033[0;0m"
BOLD = "\033[;1m"
REVERSE = "\033[;7m"


def get_npm_license_from_licenses_array(licenses_array):
    """
    Extract license names from a licenses array and join them with AND
Example n. 17
class LicenseHandler:

    def __init__(self, translations_files, relicense_file, group_file):
        self.translations_files = translations_files
        self.relicense_file = relicense_file
        self.relicense_map = None
        self.group_file = group_file
        symbols = self.read_symbols(self.translations_files)
        self.licensing = Licensing(symbols)

    def read_symbols(self, translations_files):
        symbols_map = {}
        self.translations = []
        for translations_file in translations_files.split():
            translation_data = read_translations(translations_file)
            self.translations.append(translation_data)
            for lic_key in translation_data:
                if lic_key not in symbols_map:
                    symbols_map[lic_key] = set()
                for val in translation_data[lic_key]:
                    symbols_map[lic_key].add(val)

        return [LicenseSymbol(key=key, aliases=tuple(value))
                for key, value in symbols_map.items()]

    def translate_and_relicense(self, license_expression):
        transl = self.translate(license_expression)
        if not transl:
            transl = license_expression
        rel = self.expand_relicense(transl)

        return rel if rel else transl

    def expand_relicense(self, license_expression):
        if self.relicense_file is not None and self.relicense_file:
            self.relicense_map = read_relicense_file(self.relicense_file)
            expanded = relicense_license(
                self.relicense_map, license_expression)
            return expanded.strip()
        else:
            return license_expression.strip()

    def group(self, license_expression):
        return license_expression.strip()

    def translate(self, license_expression):
        license_expression = license_expression.replace(
            "&", AND_STRING).replace("|", OR_STRING)
        return str(self.simplify(license_expression))

    def simplify(self, license_expression):
        parsed = self.licensing.parse(license_expression)
        return parsed.simplify()

    def license_expression_list_json(self, license_expression, relicense=True):
        license = self.license_expression_list(license_expression, relicense)
        return {
            "license_expression": license_expression,
            "expanded": license.expanded,
            "grouped": license.grouped,
            "translated": license.translated,
            "simplified": license.simplified,
            "interim": license.interim,
            "set_list": license.set_list
        }

    def license_expression_list(self, license_expression, relicense=True):

        license = ManagedLicenseExpression(license_expression)
        license.translated = self.translate(license_expression)

        # We need str to skip verbose output
        license.simplified = str(self.simplify(license.translated))

        if relicense:
            license.expanded = self.expand_relicense(license.simplified)
        else:
            license.expanded = license.simplified

        license.grouped = self.group(license.expanded)

        license.interim = self.interim_license_expression_list(
            license.grouped, self.licensing)

        license.set_list = self.interim_license_expression_set_list(
            license.interim)

        return license

    def interim_license_expression_list(self, license_expression, licensing):
        """
        Transforms a boolean symbolic expression.

        Turns an expression like this:
            G AND (A OR B)
        into:
            AND [G, OR [A, B]]
        The latter is an interim format.
        """
        encoded = encode_license_expression(license_expression)
        tokenizer = licensing.get_advanced_tokenizer()
        tokenized = tokenizer.tokenize(encoded)
        current_license = None
        current_licenses = []
        current_op = None
        paren_expr = None
        paren_count = 0
        for token in tokenized:
            tok = token.string
            if tok == '(':
                if paren_expr is None:
                    paren_expr = ""
                else:
                    paren_expr = paren_expr + " " + tok
                    paren_count = paren_count + 1
            elif tok == ')':
                if paren_count == 0:
                    current_license = self.interim_license_expression_list(
                        paren_expr, licensing)
                    paren_expr = None
                else:
                    paren_count = paren_count - 1
                    paren_expr = paren_expr + " " + tok
            elif tok == 'OR' or tok == 'AND':
                if paren_expr is not None:
                    paren_expr = paren_expr + " " + tok
                else:
                    if current_licenses is None:
                        raise FlictError(ReturnCodes.RET_INTERNAL_ERROR,
                                         "Internal failure. Failed creating interim license expression. current_licenses is None")
                    if current_op is None:
                        # first operator
                        current_op = tok
                        current_licenses.append(current_license)
                    elif current_op == tok:
                        # same operator
                        current_licenses.append(current_license)
                    else:
                        # different operator
                        raise FlictError(ReturnCodes.RET_INTERNAL_ERROR,
                                         "Internal failure. Failed creating interim license expression.")
            else:
                if paren_expr is not None:
                    paren_expr = paren_expr + " " + tok
                else:
                    current_license = tok

        current_licenses.append(current_license)
        if current_op is None:
            current_op = "AND"

        lel = LicenseExpressionList(current_op, current_licenses)
        return lel

    def _combinations(self, lel):
        if not isinstance(lel, LicenseExpressionList):
            return 1
        if lel.op == "AND":
            prod = 1
            for item in lel.list:
                prod = prod * self._combinations(item)
            return prod
        elif lel.op == "OR":
            sum = 0
            for item in lel.list:
                sum = sum + self._combinations(item)
            return sum
        else:
            raise FlictError(ReturnCodes.RET_INTERNAL_ERROR,
                             f"Internal failure. Failed identifying operator: {lel}")

    def interim_license_expression_set_list(self, interim_license_expression_list):
        """
        Transforms a boolean symbolic expression

        Turns an expression like this:
            AND [G, OR [A, B]]
        into:
            [
              { G, A },
              { G, B }
            ]
        The latter is an interim format.
        """
        expanded_list = []

        if not isinstance(interim_license_expression_list, LicenseExpressionList):
            # single license
            license_set = {decode_license_expression(interim_license_expression_list)}
            expanded_list.append(list(license_set))
            return expanded_list

        current_op = interim_license_expression_list.op
        for lep in interim_license_expression_list.list:
            if current_op is None:
                raise FlictError(ReturnCodes.RET_INTERNAL_ERROR,
                                 "Internal failure. No operator found")

            lep_list = self.interim_license_expression_set_list(lep)
            if current_op == "OR":
                expanded_list = self._manage_list_item_or(
                    expanded_list, lep_list)

            elif current_op == "AND":
                expanded_list = self._manage_list_item_and(
                    expanded_list, lep_list)
        return expanded_list

    def _manage_list_item_and(self, license_list, lep):
        if isinstance(lep, LicenseExpressionList):
            raise FlictError(ReturnCodes.RET_INTERNAL_ERROR,
                             f"Internal failure. Wrong type {lep} for: {lep}")

        # single license
        if len(license_list) == 0:
            return lep

        new_list = []
        for item in license_list:
            for lep_item in lep:
                new_item = list(set(item + lep_item))
                new_list.append(new_item)

        return new_list

    def _manage_list_item_or(self, license_list, lep):
        if isinstance(lep, LicenseExpressionList):
            raise FlictError(ReturnCodes.RET_INTERNAL_ERROR,
                             f"Internal failure. Wrong type {lep} for: {lep}")

        # single license
        if len(license_list) == 0:
            return lep

        new_list = license_list
        for lep_item in lep:
            new_list.append(lep_item)

        return new_list

    def relicensing_information(self):
        if self.relicense_map is None:
            self.relicense_map = read_relicense_file(self.relicense_file)
        return self.relicense_map

    def translation_information(self):
        return self.translations
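A minimal sketch of the symbol/alias mechanism that read_symbols() feeds into Licensing: aliases let translated spellings resolve to one canonical key (the key and aliases below are illustrative):

from license_expression import Licensing, LicenseSymbol

symbols = [LicenseSymbol(key='GPL-2.0-only', aliases=('GPLv2', 'GPL-2.0'))]
licensing = Licensing(symbols)
# 'GPLv2' resolves to the canonical key; unknown tokens such as 'MIT' pass through
print(licensing.parse('GPLv2 OR MIT').render())  # expected: 'GPL-2.0-only OR MIT'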
Example n. 18
def get_consolidated_packages(codebase):
    """
    Yield a ConsolidatedPackage for each detected package in the codebase
    """
    for resource in codebase.walk(topdown=False):
        for package_data in resource.packages:
            package = get_package_instance(package_data)
            package_root = package.get_package_root(resource, codebase)
            package_root.extra_data['package_root'] = True
            package_root.save(codebase)
            is_build_file = isinstance(package, BaseBuildManifestPackage)
            package_resources = list(
                package.get_package_resources(package_root, codebase))
            package_license_expression = package.license_expression
            package_copyright = package.copyright

            package_holders = []
            if package_copyright:
                numbered_lines = [(0, package_copyright)]
                for _, holder, _, _ in CopyrightDetector().detect(
                        numbered_lines,
                        copyrights=False,
                        holders=True,
                        authors=False,
                        include_years=False):
                    package_holders.append(holder)
            package_holders = process_holders(package_holders)

            discovered_license_expressions = []
            discovered_holders = []
            for package_resource in package_resources:
                if not is_build_file:
                    # If a resource is part of a package Component, then it cannot be part of any other type of Component
                    package_resource.extra_data['in_package_component'] = True
                    package_resource.save(codebase)
                if package_resource.license_expressions:
                    package_resource_license_expression = combine_expressions(
                        package_resource.license_expressions)
                    if package_resource_license_expression:
                        discovered_license_expressions.append(
                            package_resource_license_expression)
                if package_resource.holders:
                    discovered_holders.extend(
                        h.get('value') for h in package_resource.holders)
            discovered_holders = process_holders(discovered_holders)

            combined_discovered_license_expression = combine_expressions(
                discovered_license_expressions)
            if combined_discovered_license_expression:
                simplified_discovered_license_expression = str(
                    Licensing().parse(
                        combined_discovered_license_expression).simplify())
            else:
                simplified_discovered_license_expression = None

            c = Consolidation(
                core_license_expression=package_license_expression,
                # Sort holders by holder key
                core_holders=[
                    h for h, _ in sorted(copyright_summary.cluster(
                        package_holders),
                                         key=lambda t: t[0].key)
                ],
                other_license_expression=simplified_discovered_license_expression,
                # Sort holders by holder key
                other_holders=[
                    h for h, _ in sorted(copyright_summary.cluster(
                        discovered_holders),
                                         key=lambda t: t[0].key)
                ],
                files_count=len([
                    package_resource for package_resource in package_resources
                    if package_resource.is_file
                ]),
                resources=package_resources,
            )
            if is_build_file:
                c.identifier = package.name
                yield ConsolidatedComponent(type='build', consolidation=c)
            else:
                yield ConsolidatedPackage(package=package, consolidation=c)
Example n. 19
def get_consolidated_packages(codebase):
    """
    Yield a ConsolidatedPackage for each detected package in the codebase
    """
    for resource in codebase.walk(topdown=False):
        for package_data in resource.packages:
            package = get_package_instance(package_data)
            is_build_file = isinstance(package, BaseBuildManifestPackage)
            package_resources = list(package.get_package_resources(resource, codebase))
            package_license_expression = package.license_expression
            package_copyright = package.copyright

            package_holders = []
            if package_copyright:
                numbered_lines = [(0, package_copyright)]
                for _, holder, _, _ in CopyrightDetector().detect(numbered_lines,
                        copyrights=False, holders=True, authors=False, include_years=False):
                    package_holders.append(holder)

            discovered_license_expressions = []
            discovered_holders = []
            for package_resource in package_resources:
                if not is_build_file:
                    # If a resource is part of a package Component, then it cannot be part of any other type of Component
                    package_resource.extra_data['in_package_component'] = True
                    package_resource.save(codebase)

                package_resource_license_expression = combine_expressions(package_resource.license_expressions)
                package_resource_holders = package_resource.holders
                if not package_resource_license_expression and not package_resource_holders:
                    continue
                discovered_license_expressions.append(package_resource_license_expression)
                discovered_holders.extend(h.get('value') for h in package_resource_holders)

            # Remove NoneTypes from discovered licenses
            discovered_license_expressions = [lic for lic in discovered_license_expressions if lic]
            # Remove NoneTypes from discovered holders
            discovered_holders = [holder for holder in discovered_holders if holder]

            combined_discovered_license_expression = combine_expressions(discovered_license_expressions)
            if combined_discovered_license_expression:
                simplified_discovered_license_expression = str(Licensing().parse(combined_discovered_license_expression).simplify())
            else:
                simplified_discovered_license_expression = None

            c = Consolidation(
                core_license_expression=package_license_expression,
                core_holders=sorted(set(package_holders)),
                other_license_expression=simplified_discovered_license_expression,
                other_holders=sorted(set(discovered_holders)),
                files_count=sum(1 for package_resource in package_resources if package_resource.is_file),
                resources=package_resources,
            )
            if is_build_file:
                c.identifier = package.name
                yield ConsolidatedComponent(
                    type='build',
                    consolidation=c
                )
            else:
                yield ConsolidatedPackage(
                    package=package,
                    consolidation=c
                )
Example n. 20
def compute_license_score(codebase):
    """
    Return a mapping of scoring elements and a license clarity score computed at
    the codebase level.

    The license clarity score is a value from 0-100 calculated by combining the
    weighted values determined for each of the scoring elements:

    Declared license:
    - When true, indicates that the software package licensing is documented at
      top-level or well-known locations in the software project, typically in a
      package manifest, NOTICE, LICENSE, COPYING or README file.
    - Scoring Weight = 40

    Identification precision:
    - Indicates how well the license statement(s) of the software identify known
      licenses that can be designated by precise keys (identifiers) as provided in
      a publicly available license list, such as the ScanCode LicenseDB, the SPDX
      license list, the OSI license list, or a URL pointing to a specific license
      text in a project or organization website.
    - Scoring Weight = 40

    License texts:
    - License texts are provided to support the declared license expression in
      files such as a package manifest, NOTICE, LICENSE, COPYING or README.
    - Scoring Weight = 10

    Declared copyright:
    - When true, indicates that the software package copyright is documented at
      top-level or well-known locations in the software project, typically in a
      package manifest, NOTICE, LICENSE, COPYING or README file.
    - Scoring Weight = 10

    Ambiguous compound licensing:
    - When true, indicates that the software has a license declaration that
      makes it difficult to construct a reliable license expression, such as in
      the case of multiple licenses where the conjunctive versus disjunctive
      relationship is not well defined.
    - Scoring Weight = -10

    Conflicting license categories:
    - When true, indicates the declared license expression of the software is in
      the permissive category, but that other potentially conflicting categories,
      such as copyleft and proprietary, have been detected in lower level code.
    - Scoring Weight = -20
    """

    scoring_elements = ScoringElements()
    declared_licenses = get_field_values_from_codebase_resources(
        codebase=codebase,
        field_name='licenses',
        key_files_only=True,
    )
    declared_license_expressions = get_field_values_from_codebase_resources(
        codebase=codebase,
        field_name='license_expressions',
        key_files_only=True)

    unique_declared_license_expressions = unique(declared_license_expressions)
    declared_license_categories = get_license_categories(declared_licenses)

    copyrights = get_field_values_from_codebase_resources(
        codebase=codebase, field_name='copyrights', key_files_only=True)

    other_licenses = get_field_values_from_codebase_resources(
        codebase=codebase, field_name='licenses', key_files_only=False)

    scoring_elements.declared_license = bool(declared_licenses)
    if scoring_elements.declared_license:
        scoring_elements.score += 40

    scoring_elements.precise_license_detection = check_declared_licenses(
        declared_licenses)
    if scoring_elements.precise_license_detection:
        scoring_elements.score += 40

    scoring_elements.has_license_text = check_for_license_texts(
        declared_licenses)
    if scoring_elements.has_license_text:
        scoring_elements.score += 10

    scoring_elements.declared_copyrights = bool(copyrights)
    if scoring_elements.declared_copyrights:
        scoring_elements.score += 10

    is_permissively_licensed = check_declared_license_categories(
        declared_license_categories)
    if is_permissively_licensed:
        scoring_elements.conflicting_license_categories = check_for_conflicting_licenses(
            other_licenses)
        if scoring_elements.conflicting_license_categories and scoring_elements.score > 0:
            scoring_elements.score -= 20

    declared_license_expression = get_primary_license(
        unique_declared_license_expressions)

    if not declared_license_expression:
        # If we cannot get a single primary license, then we combine and simplify the license expressions from key files
        combined_declared_license_expression = combine_expressions(
            unique_declared_license_expressions)
        if combined_declared_license_expression:
            declared_license_expression = str(Licensing().parse(
                combined_declared_license_expression).simplify())
        scoring_elements.ambiguous_compound_licensing = True
        if scoring_elements.score > 0:
            scoring_elements.score -= 10

    return scoring_elements, declared_license_expression or ''
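Illustrative arithmetic only, using the weights documented above: a codebase with a declared license, precise detection, license texts and copyrights, but ambiguous compound licensing, would score 90:

score = 40 + 40 + 10 + 10  # declared + precise + texts + copyrights
score -= 10                # ambiguous compound licensing penalty
assert score == 90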
def _is_expression_or(licence_expression: str) -> bool:
    licensing_util = Licensing()
    return isinstance(
        _parse_licence_expression(licensing_util, licence_expression), OR)
def determine_licence_compound(main_licence: str,
                               additional_licences: List[str]) -> str:
    """Determines the overall licence based on main licence and additional licences."""
    overall_licence = f"({main_licence}) AND ({') AND ('.join(additional_licences)})"
    return str(
        _parse_licence_expression(Licensing(), overall_licence).simplify())
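A sketch of the compound construction in determine_licence_compound(), with illustrative licence names:

from license_expression import Licensing

main = 'Apache-2.0'
additional = ['MIT', 'BSD-2-Clause']
overall = f"({main}) AND ({') AND ('.join(additional)})"
print(overall)  # '(Apache-2.0) AND (MIT) AND (BSD-2-Clause)'
# simplify() normalizes the compound, e.g. 'Apache-2.0 AND BSD-2-Clause AND MIT'
print(str(Licensing().parse(overall).simplify()))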
def check_expected_parse_copyright_file(
    test_loc,
    expected_loc,
    regen=False,
    simplified=False,
    _licensing=Licensing(),
):
    '''
    Check copyright parsing of `test_loc` location against an expected JSON file
    at `expected_loc` location. Regen the expected file if `regen` is True.
    '''
    if simplified:
        filter_duplicates = True
        skip_debian_packaging = True
        simplify_licenses = True
        unique_copyrights = True
    else:

        filter_duplicates = False
        skip_debian_packaging = False
        simplify_licenses = False
        unique_copyrights = False
    try:
        dc = debian_copyright.parse_copyright_file(
            location=test_loc,
            check_consistency=False,
        )

        declared_license = dc.get_declared_license(
            filter_duplicates=filter_duplicates,
            skip_debian_packaging=skip_debian_packaging,
        )

        license_expression = dc.get_license_expression(
            skip_debian_packaging=skip_debian_packaging,
            simplify_licenses=simplify_licenses,
        )

        license_expression_keys = set(_licensing.license_keys(license_expression))

        copyrght = dc.get_copyright(
            skip_debian_packaging=skip_debian_packaging,
            unique_copyrights=unique_copyrights,
        ).strip()

        primary_license = dc.primary_license

        match_details = list(map(get_match_details, dc.license_matches))

        results = {
            'primary_license': primary_license,
            'declared_license': declared_license,
            'license_expression': license_expression,
            'copyright': copyrght,
            'matches': match_details,
        }

        if regen:
            expected = results
            with open(expected_loc, 'w') as res:
                res.write(saneyaml.dump(results))
        else:
            with open(expected_loc) as ex:
                expected = saneyaml.load(ex.read())
    except Exception as e:
        import traceback
        files = [
            'file://' + test_loc,
            'file://' + expected_loc,
        ]
        raise Exception(repr(e), traceback.format_exc(), files) from e

    if (
        not regen
        and (saneyaml.dump(results) != saneyaml.dump(expected)
        or 'unknown-license-reference' in license_expression_keys)
    ):
        res = {
            'test_loc': f'file://{test_loc}',
            'expected_loc': f'file://{expected_loc}',
        }
        res.update(results)
        results = saneyaml.dump(res)
        results = results.replace(
            'unknown-license-reference',
            'unknown-license-reference should not be detected',
        )
        assert results == saneyaml.dump(expected)
Example n. 24
class LicenseHandler:

    def __init__(self, translations_files, relicense_file, group_file):
        self.translations_files = translations_files
        self.relicense_file = relicense_file
        self.relicense_map = None
        self.group_file = group_file
        symbols = self.read_symbols(self.translations_files)
        self.licensing = Licensing(symbols)
        #print("symbols: " + str(symbols))
        
    def read_symbols(self, translations_files):
        symbols = []
        symbols_map = {}
        for translations_file in translations_files.split():
            #print("reading translation file: " + str(translations_file))
            translation_data = read_translations(translations_file)
            for lic_key in translation_data:
                #print("lic_key:  \"" + str(lic_key) + "\"")
                #print("  lic_alias:  " + str(translation_data[lic_key] ))
                if lic_key not in symbols_map:
                    symbols_map[lic_key] = set()
                for val in translation_data[lic_key]:
                    symbols_map[lic_key].add(val)
                
                #lic_aliases = tuple(translation_data[lic_key])
                #symbols.append(LicenseSymbol(key=key, aliases=lic_aliases))

        for key, value in symbols_map.items():
            #print("Adding to symbols: " + key)
            #print(" - " + str(value))
            symbols.append(LicenseSymbol(key=key, aliases=tuple(value)))

        # debugging
        #print("Symbols")
        #for sym in symbols:
            #print(" sym: " + (str(sym.key)))
            #print("    aliases :  " + (str(sym.aliases)))
            #l = Licensing([sym])
            #print("    licensing: " + (str(l)))
            
        
        #print("symbols: " + str(symbols))
        return symbols
        
    def translate_and_relicense(self, license_expression):
        license_expression = license_expression.replace("&", " and ").replace("|", " or ")
        
        transl = self.translate(license_expression)
        if transl == None or transl == "":
            transl = license_expression
        #print("translate_and_relicenseself: " + license_expression + " ==> " + transl)
        rel = self.expand_relicense(transl)
        if rel == None:
            rel = transl
        #print("translate_and_relicenseself: " + rel)
        return rel
        
    def expand_relicense(self, license_expression):
        if self.relicense_file != None and self.relicense_file != "":
            self.relicense_map = read_relicense_file(self.relicense_file)
            expanded = relicense_license(self.relicense_map, license_expression)
            return expanded.strip()
        else:
            return license_expression.strip()

    def group(self, license_expression):
        return license_expression.strip()

    def translate(self, license_expression):
        return str(self.simplify(license_expression))

    def simplify(self, license_expression):
        parsed = self.licensing.parse(license_expression)
        #parsed = self.licensing._parse_and_simplify(license_expression)
        #print("simplified: " + str(parsed.simplify()))
        #return parsed.simplify()
        return parsed

    def license_expression_list_json(self, license_expression, relicense=True):
        license = self.license_expression_list(license_expression, relicense)
        output = {}
        output["license_expression"] = license_expression
        output["expanded"] = license.expanded
        output["grouped"] = license.grouped
        output["translated"] = license.translated
        output["simplified"] = license.simplified
        output["interim"] = license.interim
        output["set_list"] = license.set_list
        return output

        
    def license_expression_list(self, license_expression, relicense=True):

        license = ManagedLicenseExpression(license_expression)
        license.translated = self.translate(license_expression)
        
        if relicense:
            license.expanded = self.expand_relicense(license.translated)
        else:
            license.expanded = license.translated

            
        license.grouped = self.group(license.expanded)

        # We need str to skip verbose output
        license.simplified = str(self.simplify(license.grouped))
        
        license.interim = self.interim_license_expression_list(license.simplified, self.licensing)
        
        license.set_list = self.interim_license_expression_set_list(license.interim)

        return license


    
    def interim_license_expression_list(self, license_expression, licensing):
        """
        Turns an expression like this:
            G AND (A OR B)
        into:
            AND [G, OR [A, B]]
        The latter is an interim format.
        """
        #print("")
        #print("parse(" + str(license_expression) + ")")
        tokenizer = licensing.get_advanced_tokenizer()
        tokenized = tokenizer.tokenize(str(license_expression))
        current_license=None
        current_licenses=[]
        current_op=None
        paren_expr = None
        paren_count=0
        for token in tokenized:
            tok = token.string
            if tok == '(':
                #print("(")
                if paren_expr == None:
                    paren_expr = ""
                else:
                    paren_expr = paren_expr + " " + tok
                    paren_count = paren_count + 1
            elif tok == ')':
                #print("about to parse: \"" + paren_expr + "\"  count: " + str(paren_count))
                if paren_count == 0:
                    current_license = self.interim_license_expression_list(paren_expr, licensing)
                    #print("got:            \"" + str(current_license) + "\"")
                    paren_expr=None
                else:
                    paren_count = paren_count - 1
                    paren_expr = paren_expr + " " + tok                
            elif tok == 'OR' or tok == 'AND':
                if paren_expr != None:
                    #print("TEMP " + tok)
                    paren_expr = paren_expr + " " + tok
                else:
                    #print("OPERATOR " + tok + " (" + str(current_op) + ")")
                    if current_licenses == None:
                        print("ERROR......")
                        print("ERROR......")
                        print("ERROR......")
                        exit(24)
                    if current_op == None:
                        # first operator
                        current_op = tok
                        #print("=cop: " + tok + "   " + current_license)
                        current_licenses.append(current_license)
                    elif current_op == tok:
                        # same operator
                        #print("-cop: " + tok + "   " + current_license)
                        current_licenses.append(current_license)
                    else:
                        # different operator
                        print("-------------------------------------------- Store me: " + current_op + " " + str(current_licenses))
                        exit(12)
            else:
                #print("tok: \"" + tok + "\"")
                if paren_expr != None:
                    #print("TEMP " + tok)
                    paren_expr = paren_expr + " " + tok
                else:
                    #print("license: " + tok)
                    current_license = tok

        current_licenses.append(current_license)
        if current_op == None:
            current_op = "AND"
        list = LicenseExpressionList(current_op, current_licenses)
        #print("DONE: " + str(license_expression) + " => " + str(list))
        return list

    def _combinations(self, lel):
        #print("lel : " + str(lel))
        if not isinstance(lel, LicenseExpressionList):
            return 1
        if lel.op == "AND":
            prod = 1
            for l in lel.list:
                prod = prod * self._combinations(l)
            return prod
        elif lel.op == "OR":
            sum = 0
            for l in lel.list:
                sum = sum + self._combinations(l)
            return sum
        else:
            print("ERROR: NO OP")
            exit(11)
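
    # A minimal worked example of the counting above (a hedged sketch; the
    # expression is hypothetical): "MIT AND (GPL-2.0-only OR BSD-3-Clause)"
    # parses to AND [MIT, OR [GPL-2.0-only, BSD-3-Clause]], so _combinations()
    # returns 1 * (1 + 1) = 2 possible license alternatives.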

    def interim_license_expression_set_list(self, interim_license_expression_list):
        """
        Turns an expression like this:
            AND [G, OR [A, B]]
        into:
            [ 
              { G, A },
              { G, B }
            ]
        The latter is an interim format.
        """    
        expanded_list=[]

        #print("Count: " + str(_combinations(interim_license_expression_list)))
        if not isinstance(interim_license_expression_list, LicenseExpressionList):
            # single license
            license_set = {interim_license_expression_list}
            expanded_list.append(list(license_set))
            license_verbose_debug("LEAF, returning " + str(expanded_list))
            license_verbose_debug("cop: " + interim_license_expression_list)
            #print("managed____ \""  + str(expanded_list) + "\"  <---- MIDDLE")
            return expanded_list

        current_op = interim_license_expression_list.op
        license_verbose_debug("cop: " + current_op)
        for lep in interim_license_expression_list.list:
            license_verbose_debug(" ------ lep ----- " + str(lep))
            if current_op is None:
                print("ERROR: NO OP")
                exit(11)

            elif current_op == "OR":
                lep_list = self.interim_license_expression_set_list(lep)
                expanded_list = self._manage_list_item_or(expanded_list, lep_list)

            elif current_op == "AND":
                lep_list = self.interim_license_expression_set_list(lep)
                expanded_list = self._manage_list_item_and(expanded_list, lep_list)
        #print("managed____ \""  + str(expanded_list) + "\"  <---- FINAL")
        return expanded_list

    def _manage_list_item_and(self, license_list, lep):
        license_verbose_debug(" * Andy" )
        if isinstance(lep, LicenseExpressionList):
            print(" -------------====== Andy ====-----------------" )
            exit(77)
            # TODO : implement below (for lep)
            print("AND Count 0: " + str(_combinations(lep)))
            for inner_lep in lep.list:
                print("¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤ AND Count A: " + str(_combinations(inner_lep)))
                set_list = self.interim_license_expression_set_list(inner_lep)
            return None
        else:
            # single license
            if len(license_list) == 0:
                license_verbose_debug("wooops: " + str(license_list))
                license_verbose_debug("wooops: " + str(lep))
                license_list = lep
            else:
                license_verbose_debug("daisy: " + str(license_list))
                license_verbose_debug("daisy: " + str(lep))
                license_verbose_debug(" -------------====== Andy ====----------------- SINGLE: " + str(license_list) )
                new_list = []
                for item in license_list:
                    license_verbose_debug("  item: " + str(item) + " <--- " + str(lep))
                    for lep_item in lep:
                        license_verbose_debug("    item: " + str(item) + " <--- " + str(lep_item))
                        new_item = list(set(item + lep_item))
                        license_verbose_debug("    item: " + str(item))
                        new_list.append(new_item)
                    license_verbose_debug("    list: " + str(new_list)  )
                license_list = new_list
            return license_list


    def _manage_list_item_or(self, license_list, lep):
        license_verbose_debug(" * Orleans: " + (str(lep)))
        if isinstance(lep, LicenseExpressionList):
            # TODO : implement below (for lep)
            license_verbose_debug(" -------------====== ORLEANS ====----------------- : " + str(lep.license_list) )
            exit(77)
            for inner_lep in lep.license_list:
                print("        ====----------------- : " + str(inner_lep) )
                print("OR Count A: " + str(_combinations(inner_lep)))
                set_list = self.interim_license_expression_set_list(inner_lep)
                print("OR Count B: " + str(len(set_list)))
                license_list.append(inner_lep)
        else:
            # single license
            license_verbose_debug("HERE I AM .... \"" + str(lep) + "\"")
            if len(license_list) == 0:
                new_list = lep
                license_verbose_debug("topsss: " + str(license_list) + " size: " + str(len(license_list)))
                license_verbose_debug("topsss: " + str(lep) + " size: " + str(len(lep)))
                license_verbose_debug("topsss: " + str(new_list) + " size: " + str(len(new_list)))
            else:
                new_list = license_list
                license_verbose_debug("dapsss: " + str(license_list))
                license_verbose_debug("dappss: " + str(lep))
                for lep_item in lep:
                    license_verbose_debug("    item: " + str(license_list) + " <--- "  + str(lep_item) )
                    new_list.append(lep_item)

        return new_list
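
    # Taken together, the helpers above expand an interim expression into the
    # set-list format. A hedged walk-through with hypothetical license names:
    # "(A OR B) AND C" parses to AND [OR [A, B], C]; OR contributes one
    # alternative per operand ([[A], [B]]) and AND takes the cartesian product,
    # merging each pair into one set, giving [[A, C], [B, C]].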
Esempio n. 25
0
def parse_structured_copyright_file(
    copyright_file,
    skip_debian_packaging=True,
    simplify_licenses=True,
    unique=True,
):
    """
    Return a tuple of (declared license, detected license_expression,
    copyrights) strings computed from the `copyright_file` location. For each
    copyright file paragraph we treat the "name" as a license declaration. The
    text is used for detection and cross-reference with the declaration.

    If `skip_debian_packaging` is True, the Debian packaging license --if
    detected-- is skipped.

    If `simplify_licenses` is True the license expressions are simplified.

    If `unique` is True, repeated copyrights and repeated detected or declared
    licenses are ignored, and only unique detections are returned.
    """
    if not copyright_file:
        return None, None, None

    deco = DebianCopyright.from_file(copyright_file)

    declared_licenses = []
    detected_licenses = []
    copyrights = []

    deco = fix_copyright(deco)

    licensing = Licensing()
    for paragraph in deco.paragraphs:

        if skip_debian_packaging and is_debian_packaging(paragraph):
            # Skipping packaging license and copyrights since they are not
            # relevant to the effective package license
            continue

        if isinstance(paragraph, (CopyrightHeaderParagraph, CopyrightFilesParagraph)):
            pcs = paragraph.copyright.statements or []
            for p in pcs:
                p = p.dumps()
                # avoid repeats
                if unique:
                    if p not in copyrights:
                        copyrights.append(p)
                else:
                    copyrights.append(p)

        if isinstance(paragraph, CatchAllParagraph):
            text = paragraph.dumps()
            if text:
                detected = get_normalized_expression(text, try_as_expression=False)
                if not detected:
                    detected = 'unknown'
                detected_licenses.append(detected)
        else:
            plicense = paragraph.license
            if not plicense:
                continue

            declared, detected = detect_declared_license(plicense.name)
            # avoid repeats
            if unique:
                if declared and declared not in declared_licenses:
                    declared_licenses.append(declared)
                if detected and detected not in detected_licenses:
                    detected_licenses.append(detected)
            else:
                declared_licenses.append(declared)
                detected_licenses.append(detected)

            # also detect in text
            text = paragraph.license.text
            if text:
                detected = get_normalized_expression(text, try_as_expression=False)
                if not detected:
                    detected = 'unknown'
                # avoid repeats
                if unique:
                    if detected not in detected_licenses:
                        detected_licenses.append(detected)
                else:
                    detected_licenses.append(detected)

    declared_license = '\n'.join(declared_licenses)

    if detected_licenses:
        detected_licenses = [licensing.parse(dl, simple=True) for dl in detected_licenses]

        if len(detected_licenses) > 1:
            detected_license = licensing.AND(*detected_licenses)
        else:
            detected_license = detected_licenses[0]

        if simplify_licenses:
            detected_license = detected_license.simplify()

        detected_license = str(detected_license)

    else:
        detected_license = 'unknown'

    copyrights = '\n'.join(copyrights)
    return declared_license, detected_license, copyrights
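
# A minimal usage sketch, assuming a Debian-format copyright file is available
# locally (the path below is hypothetical):
#
#   declared, detected, copyrights = parse_structured_copyright_file(
#       'debian/copyright', skip_debian_packaging=True, simplify_licenses=True)
#   # declared: newline-separated license declarations
#   # detected: a single simplified license expression string
#   # copyrights: newline-separated copyright statements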
Esempio n. 26
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example, a file would contain one or more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """

    rules_data = load_data(licenses_file)
    rules_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    errors = validate_license_rules(rules_data, licensing)
    if errors:
        print('Invalid rules: exiting....')
        for error in errors:
            print(error)
            print()

        raise Exception('Invalid rules: exiting....')

    print()
    for rule in rules_data:
        is_negative = rule.data.get('is_negative')
        is_false_positive = rule.data.get('is_false_positive')
        existing = rule_exists(rule.text)
        if existing and not is_negative:
            print('Skipping existing non-negative rule:', existing,
                  'with text:\n', rule.text[:50].strip(), '...')
            continue

        if is_negative:
            base_name = 'not-a-license'
        else:
            license_expression = rule.data.get('license_expression')
            license_expression = str(
                licensing.parse(license_expression, validate=True,
                                simple=True))
            base_name = license_expression
            if is_false_positive:
                base_name = 'false-positive_' + base_name

        base_loc = find_rule_base_loc(base_name)

        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rule.raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rule.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in rules_tokens:
            # cleanup
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', base_name)
        else:
            rules_tokens.add(rule_tokens)
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
    def _report(self, args, scancode_report, _files):
        copyrights = set()
        licenses = set()
        spdx = set()
        files = _files['included']

        for f in files:
            if self._isdir(f):
                continue
            manifest_map = f['scancode_manifestor']
            #print("collecting info " + str(f))
            self.logger.verbose("collecting info " + str(f['name']) + " " +
                                str(manifest_map['license_key']))
            for c in manifest_map['copyright']:
                copyrights.add(c)

            assert 'scancode_manifestor' in f
            #print("---------------- CHECKING------")
            #print(json.dumps(f['scancode_manifestor'], indent=4))
            #print("Adding license for file : " + str(f['path']) + ": ", end="")
            if 'curated_license' in f['scancode_manifestor']:
                lic_key = f['scancode_manifestor']['curated_license']
                #print(" curated ", end="")
            elif 'license_key' in f['scancode_manifestor']:
                lic_key = f['scancode_manifestor']['license_key']
                #print(" originial ", end="")
            else:
                # If we get here, it means the transormation has failed.
                # Better go out with a bang
                print("000000000000000000000000000000000000000000000000000000")
                assert False
            #print(lic_key)

            #lic_key = manifest_map['license_key']
            #if lic_key == None:
            #    if 'scancode_manifestor' in f['scancode_manifestor']:
            #        lic_key = f['scancode_manifestor']['curated_license']
            #        #print("wooops.... : " + str(f['path']) + " has " + f['scancode_manifestor']['curated_license'])
            #    else:
            #        lic_key = " none "

            #else:
            licenses.add(lic_key)
            spdx.add(manifest_map['license_spdx'])

        lic_expr = None
        #print("licenses: " + str(licenses))
        for lic in licenses:
            if lic_expr is None:
                lic_expr = ""
            else:
                lic_expr += " and "

            lic_expr += str(lic)

        spdx_expr = None
        for lic in spdx:
            if spdx_expr is None:
                spdx_expr = ""
            else:
                spdx_expr += " and "

            spdx_expr += str(lic)

        c_list = list(copyrights)
        c_list.sort()

        #print("\nlicenses: " + str(lic_expr))
        licensing = Licensing()
        parsed = licensing._parse_and_simplify(lic_expr)
        json_compat_lic = str(parsed).replace("AND", " & ")
        #print("\nlicenses: " + str(parsed))
        #exit(0)

        report = {}

        #
        # Files
        #
        report['files'] = {}
        report['files']['included'] = _files['included']
        report['files']['excluded'] = _files['excluded']
        report['files']['included_files_count'] = self._count_files(
            _files['included'])
        report['files']['excluded_files_count'] = self._count_files(
            _files['excluded'])
        report['files'][
            'original_files_count'] = self._scancode_report_files_count(
                scancode_report)

        #
        # Project
        #
        project = {}
        project['name'] = args['project_name']
        project['sub_package'] = args['sub_package_name']
        project['version'] = args['project_version']
        project['url'] = args['project_url']
        project['source_url'] = args['project_source_url']
        project['issue_url'] = args['project_issue_url']
        project['download_url'] = args['project_download_url']
        report['project'] = project

        #
        # Conclusion
        #
        report['conclusion'] = {}
        report['conclusion']['copyright'] = c_list
        report['conclusion']['license_expression'] = json_compat_lic
        report['conclusion']['license_expression_original'] = lic_expr

        #
        # Meta information
        #
        report['meta'] = {}
        #report['meta']['curations'] = curations
        report['meta']['arguments'] = args  #.__dict__
        report['meta']['report_date'] = str(datetime.datetime.now())
        report['meta']['scancode_report_file'] = args['input_file']
        report['meta']['scancode_name'] = scancode_report['headers'][0][
            'tool_name']
        report['meta']['scancode_version'] = scancode_report['headers'][0][
            'tool_version']
        #report['meta']['scancode_report'] = scancode_report

        return report
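
    # A hedged note on the expression handling above: the " and " string
    # concatenation plus the private Licensing._parse_and_simplify() helper
    # could also be expressed with the public license_expression API, e.g.
    # (a sketch only, license keys hypothetical):
    #
    #   licensing = Licensing()
    #   parsed = licensing.parse(" AND ".join(str(l) for l in licenses)).simplify()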
Esempio n. 28
0
class Rule(object):
    """
    A detection rule object is a text to use for detection and corresponding
    detected licenses and metadata.
    """
    licensing = Licensing()

    ###########
    # FIXME: !!! TWO RULES MAY DIFFER BECAUSE THEY ARE UPDATED BY INDEXING
    ###########

    # optional rule id int typically assigned at indexing time
    rid = attr.ib(default=None, repr=TRACE_REPR)

    # unique identifier
    identifier = attr.ib(default=None)

    # License expression string
    license_expression = attr.ib(default=None)

    # License expression object, created at build time
    license_expression_object = attr.ib(default=None, repr=False)

    # an indication of how important this rule is (e.g. how important its text
    # is when detected as a licensing clue) as one of several flags:

    # for a license full text: this provides the highest level of confidence wrt
    # detection
    is_license_text = attr.ib(default=False, repr=False)

    # for a license notice: this provides a strong confidence wrt detection
    is_license_notice = attr.ib(default=False, repr=False)

    # for a mere short license reference such as its bare name or a URL:
    # this provides a weak confidence wrt detection
    is_license_reference = attr.ib(default=False, repr=False)

    # tag for a structured licensing tag such as a package manifest metadata or
    # an SPDX license identifier or similar package manifest tag
    # this provides a strong confidence wrt detection
    is_license_tag = attr.ib(default=False, repr=False)

    # is this rule text a false positive when matched? it will be filtered out
    # at the end if matched
    is_false_positive = attr.ib(default=False, repr=False)

    # is this rule text a negative rule? it will be removed from the matchable
    # text at the start if matched
    is_negative = attr.ib(default=False, repr=False)

    # is this rule text only to be matched with a minimum coverage? e.g. a
    # minimum proportion of tokens as a float between 0 and 100 where 100 means
    # all tokens must be matched and a smaller value means a smaller proportion
    # of matched tokens is acceptable. this is computed unless it is provided
    # here.
    minimum_coverage = attr.ib(default=0)
    has_stored_minimum_coverage = attr.ib(default=False, repr=False)
    # same as minimum_coverage but divided by 100
    _minimum_containment = attr.ib(default=0, repr=False)

    # Can this rule be matched if there are unknown words in its matched range?
    # The default is to allow known and unknown words. Unknown words are words
    # that do not exist in the text of any indexed license or license detection
    # rule.
    only_known_words = attr.ib(default=False)

    # what is the relevance of a match to this rule text? a float between 0 and
    # 100 where 100 means highly relevant and 0 means not relevant at all.
    # For instance matches to the "gpl" or the "cpol" words have a fairly low
    # relevance as they are a weak indication of an actual license and could be
    # false positives. In some cases, this may even be used to discard obvious
    # false positive matches automatically.
    relevance = attr.ib(default=100)
    has_stored_relevance = attr.ib(default=False, repr=False)

    # The rule contains a reference to some file name that contains the text
    referenced_filenames = attr.ib(default=attr.Factory(list), repr=False)

    # optional, free text
    notes = attr.ib(default=None, repr=False)

    # set to True if the rule is built from a .LICENSE full text
    is_license = attr.ib(default=False, repr=False)

    # lists of copyrights, emails and URLs that can be ignored when detected
    # in this license as they are part of the license or rule text itself
    ignorable_copyrights = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_holders = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_authors = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_urls = attr.ib(default=attr.Factory(list), repr=False)
    ignorable_emails = attr.ib(default=attr.Factory(list), repr=False)

    ###########################################################################

    # path to the YAML data file for this rule
    data_file = attr.ib(default=None, repr=False)

    # path to the rule text file
    text_file = attr.ib(default=None, repr=False)

    # text of this rule for special cases where the rule is not backed by a file:
    # for SPDX license expression dynamic rules or testing
    stored_text = attr.ib(default=None, repr=False)

    # These attributes are computed upon text loading or setting the thresholds
    ###########################################################################

    # lengths in tokens
    length = attr.ib(default=0)
    min_matched_length = attr.ib(default=0, repr=TRACE_REPR)

    high_length = attr.ib(default=0, repr=TRACE_REPR)
    min_high_matched_length = attr.ib(default=0, repr=TRACE_REPR)

    # lengths in unique token.
    length_unique = attr.ib(default=0, repr=TRACE_REPR)
    min_matched_length_unique = attr.ib(default=0, repr=TRACE_REPR)

    high_length_unique = attr.ib(default=0, repr=TRACE_REPR)
    min_high_matched_length_unique = attr.ib(default=0, repr=TRACE_REPR)

    is_small = attr.ib(default=False, repr=TRACE_REPR)

    has_computed_thresholds = attr.ib(default=False, repr=False)

    def get_length(self, unique=False):
        return self.length_unique if unique else self.length

    def get_min_matched_length(self, unique=False):
        return (self.min_matched_length_unique if unique
                else self.min_matched_length)

    def get_high_length(self, unique=False):
        return self.high_length_unique if unique else self.high_length

    def get_min_high_matched_length(self, unique=False):
        return (self.min_high_matched_length_unique if unique
                else self.min_high_matched_length)

    def __attrs_post_init__(self, *args, **kwargs):
        if not self.text_file:
            # for SPDX or tests only
            if not self.stored_text:
                raise Exception('Invalid rule without its corresponding text file: {}'.format(self))
            self.identifier = '_tst_' + str(len(self.stored_text))
        else:
            self.identifier = file_name(self.text_file)

        if self.data_file:
            try:
                self.load()
            except Exception as e:
                data_file = self.data_file
                trace = traceback.format_exc()
                message = 'While loading: file://{data_file}\n{trace}'.format(**locals())
                raise Exception(message)

        if self.relevance and self.relevance != 100:
            self.has_stored_relevance = True

        if self.minimum_coverage:
            self.has_stored_minimum_coverage = True

        if self.license_expression:
            try:
                expression = self.licensing.parse(self.license_expression)
            except Exception:
                raise Exception(
                    'Unable to parse License rule expression: '
                    +repr(self.license_expression) + ' for: file://' + self.data_file +
                    '\n' + traceback.format_exc()
                )
            if expression is None:
                raise Exception(
                    'Unable to parse License rule expression: '
                    +repr(self.license_expression) + ' for: file://' + self.data_file)

            self.license_expression = expression.render()
            self.license_expression_object = expression

    def tokens(self):
        """
        Return an iterable of token strings for this rule. Length, relevance and
        minimum_coverage may be recomputed as a side effect.
        """
        length = 0
        text = self.text()
        text = text.strip()

        # FIXME: this is weird:

        # We tag this rule as being a bare URL if it starts with a scheme and is
        # on one line: this is used to determine a matching approach

        # FIXME: this does not lower the text first??
        if text.startswith(('http://', 'https://', 'ftp://')) and '\n' not in text[:1000].lower():
            self.minimum_coverage = 100

        for token in query_tokenizer(self.text()):
            length += 1
            yield token

        self.length = length
        self.compute_relevance()

    def text(self):
        """
        Return the rule text loaded from its file.
        """
        if self.text_file and exists(self.text_file):
            # IMPORTANT: use the same process as query text loading for symmetry
            numbered_lines = numbered_text_lines(self.text_file, demarkup=False, plain_text=True)
            return ''.join(l for _, l in numbered_lines)

        # used for non-file backed rules
        elif self.stored_text:
            return self.stored_text

        else:
            raise Exception('Inconsistent rule text for: ' +
                            self.identifier + '\nfile://' + self.text_file)

    def license_keys(self, unique=True):
        """
        Return a list of license keys for this rule.
        """
        if not self.license_expression:
            return []
        return self.licensing.license_keys(self.license_expression_object, unique=unique)

    def same_licensing(self, other):
        """
        Return True if the other rule has the same licensing as this rule.
        """
        if self.license_expression and other.license_expression:
            return self.licensing.is_equivalent(
                self.license_expression_object, other.license_expression_object)

    def licensing_contains(self, other):
        """
        Return True if this rule licensing contains the other rule licensing.
        """
        if self.license_expression and other.license_expression:
            return self.licensing.contains(
                self.license_expression_object, other.license_expression_object)

    def compute_thresholds(self, small_rule=SMALL_RULE):
        """
        Compute and set thresholds either considering the occurrences of all
        tokens or the occurrences of unique tokens.
        """
        minimum_coverage, self.min_matched_length, self.min_high_matched_length = (
            compute_thresholds_occurences(
                self.minimum_coverage,
                self.length,
                self.high_length))
        if not self.has_stored_minimum_coverage:
            self.minimum_coverage = minimum_coverage

        self._minimum_containment = self.minimum_coverage / 100

        self.min_matched_length_unique, self.min_high_matched_length_unique = (
            compute_thresholds_unique(
                self.minimum_coverage,
                self.length,
                self.length_unique,
                self.high_length_unique))

        self.is_small = self.length < small_rule

    def to_dict(self):
        """
        Return an ordered mapping of self, excluding texts. Used for
        serialization. Empty values are not included.
        """
        data = OrderedDict()
        if self.license_expression:
            data['license_expression'] = self.license_expression

        flags = (
            'is_false_positive',
            'is_negative',
            'is_license_text',
            'is_license_notice',
            'is_license_reference',
            'is_license_tag',
            'only_known_words',
        )

        for flag in flags:
            tag_value = getattr(self, flag, False)
            if tag_value:
                data[flag] = tag_value

        if self.has_stored_relevance and self.relevance:
            rl = self.relevance
            if isinstance(rl, float) and int(rl) == rl:
                rl = int(rl)
            data['relevance'] = rl

        if self.has_stored_minimum_coverage and self.minimum_coverage > 0:
            cv = self.minimum_coverage
            if isinstance(cv, float) and int(cv) == cv:
                cv = int(cv)
            data['minimum_coverage'] = cv

        if self.referenced_filenames:
            data['referenced_filenames'] = self.referenced_filenames

        if self.notes:
            data['notes'] = self.notes

        if self.ignorable_copyrights:
            data['ignorable_copyrights'] = self.ignorable_copyrights
        if self.ignorable_holders:
            data['ignorable_holders'] = self.ignorable_holders
        if self.ignorable_authors:
            data['ignorable_authors'] = self.ignorable_authors
        if self.ignorable_urls:
            data['ignorable_urls'] = self.ignorable_urls
        if self.ignorable_emails:
            data['ignorable_emails'] = self.ignorable_emails

        return data

    def dump(self):
        """
        Dump a representation of this rule as two files:
         - a .yml for the rule data in YAML (self.data_file)
         - a .RULE: the rule text as a UTF-8 file (self.text_file)
        Does nothing if this rule was created from a License (e.g.
        `is_license` is True)
        """
        if self.is_license:
            return

        def write(location, byte_string):
            # we write as binary because rules and licenses texts and data are UTF-8-encoded bytes
            with io.open(location, 'wb') as of:
                of.write(byte_string)

        if self.data_file:
            as_yaml = saneyaml.dump(self.to_dict(), indent=4, encoding='utf-8')
            write(self.data_file, as_yaml)
            write(self.text_file, self.text().encode('utf-8'))

    def load(self):
        """
        Load self from a .RULE YAML file stored in self.data_file.
        Does not load the rule text file.
        Unknown fields are ignored and not bound to the Rule object.
        """
        try:
            with io.open(self.data_file, encoding='utf-8') as f:
                data = saneyaml.load(f.read())
        except Exception as e:
            print('#############################')
            print('INVALID LICENSE RULE FILE:', 'file://' + self.data_file)
            print('#############################')
            print(e)
            print('#############################')
            # this is a rare case, but yes we abruptly stop.
            raise e
        known_attributes = set(attr.fields_dict(self.__class__))
        data_file_attributes = set(data)
        unknown_attributes = data_file_attributes.difference(known_attributes)
        if unknown_attributes:
            unknown_attributes = ', '.join(sorted(unknown_attributes))
            msg = 'License rule {} data file has unknown attributes: {}'
            raise Exception(msg.format(self, unknown_attributes))

        self.license_expression = data.get('license_expression')
        self.is_negative = data.get('is_negative', False)
        self.is_false_positive = data.get('is_false_positive', False)

        if not self.license_expression and not (self.is_negative or self.is_false_positive):
            msg = 'License rule {} is missing a license_expression.'
            raise Exception(msg.format(self))

        relevance = float(data.get('relevance', 0))
        if relevance:
            if relevance <= 0 or relevance > 100:
                msg = ('License rule {} data file has an invalid relevance. '
                       'Should be above 0 and at most 100: {}')
                raise Exception(msg.format(self, repr(relevance)))
            # Keep track of whether we have a stored relevance or not.
            self.relevance = relevance
            self.has_stored_relevance = True

        self.minimum_coverage = float(data.get('minimum_coverage', 0))
        self._minimum_containment = self.minimum_coverage / 100

        if not (0 <= self.minimum_coverage <= 100):
            msg = (
                'License rule {} data file has an invalid minimum_coverage. '
                'Should be between 0 and 100: {}')
            raise Exception(msg.format(self, self.minimum_coverage))

        self.is_license_text = data.get('is_license_text', False)
        self.is_license_notice = data.get('is_license_notice', False)
        self.is_license_tag = data.get('is_license_tag', False)
        self.is_license_reference = data.get('is_license_reference', False)
        self.only_known_words = data.get('only_known_words', False)
        self.referenced_filenames = data.get('referenced_filenames', []) or []
        if not isinstance(self.referenced_filenames, list):
            msg = (
                'License rule {} data file has an invalid referenced_filenames. '
                'Should be a list: {}')
            raise Exception(msg.format(self, self.referenced_filenames))

        # these are purely informational and not used at run time
        notes = data.get('notes')
        if notes:
            self.notes = notes.strip()

        if not self.notes and (self.is_negative or self.is_false_positive):
            msg = 'Special License rule {} is missing explanatory notes.'
            raise Exception(msg.format(self))

        self.ignorable_copyrights = data.get('ignorable_copyrights', [])
        self.ignorable_holders = data.get('ignorable_holders', [])
        self.ignorable_authors = data.get('ignorable_authors', [])
        self.ignorable_urls = data.get('ignorable_urls', [])
        self.ignorable_emails = data.get('ignorable_emails', [])

        return self

    def compute_relevance(self):
        """
        Compute and set the `relevance` attribute for this rule. The
        relevance is a float between 0 and 100 where 100 means highly
        relevant and 0 means not relevant at all.

        For instance a match to the "gpl" or the "cpol" words have a fairly low
        relevance as they are a weak indication of an actual license and could be a
        false positive and should therefore be assigned a low relevance. In contrast
        a match to most or all of the apache-2.0 license text is highly relevant. The
        Rule relevance is used as the basis to compute a match score.

        The relevance is either pre-defined in the rule YAML data file with the
        "relevance" attribute or computed based on the rule length here using
        this approach:

        - a false positive or a negative rule has a relevance of 100.
        - a rule of length equal to or larger than a threshold has a relevance
          of 100.
        - a rule of length smaller than a threshold has a relevance of
          length * 100 / threshold, rounded down.

        The current threshold is 18 words.
        """

        if isinstance(self, SpdxRule):
            self.relevance = 100
            return

        if self.has_stored_relevance:
            return

        # case for false positive
        if self.is_false_positive:
            self.relevance = 100
            return

        # case for negative rules with no license (and are not an FP)
        # they do not have licenses and their matches are never returned
        if self.is_negative:
            self.relevance = 100
            return

        threshold = 18.0
        relevance_of_one_word = round((1 / threshold) * 100, 2)
        length = self.length
        if length >= threshold:
            # general case
            self.relevance = 100
        else:
            computed = int(length * relevance_of_one_word)
            self.relevance = min([100, computed])
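
        # A worked example of the computation above (the lengths are
        # hypothetical): with threshold = 18.0, relevance_of_one_word is
        # round(100 / 18, 2) = 5.56, so a 9-token rule gets int(9 * 5.56) = 50
        # while any rule of 18 or more tokens gets the full relevance of 100.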

    @property
    def has_flags(self):
        """
        Return True if this Rule has at least one flag set.
        """
        return (self.is_license_text or self.is_license_notice
            or self.is_license_reference or self.is_license_tag)
Esempio n. 29
0
import logging
import re
import shutil

from pathlib import Path
from typing import BinaryIO, List, Optional

from boolean.boolean import Expression, ParseError
from debian.copyright import Copyright
from license_expression import ExpressionError, Licensing

from . import SpdxInfo
from ._comment import _all_style_classes
from ._licenses import ALL_NON_DEPRECATED_MAP

GIT_EXE = shutil.which("git")
HG_EXE = shutil.which("hg")

_LOGGER = logging.getLogger(__name__)
_LICENSING = Licensing()

_END_PATTERN = "{}$".format(
    "".join(
        {
            "(?:{})*".format(re.escape(style.MULTI_LINE[2]))
            for style in _all_style_classes()
            if style.MULTI_LINE[2]
        }
    )
)
_IDENTIFIER_PATTERN = re.compile(
    r"SPDX" "-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
)
_COPYRIGHT_PATTERNS = [
    re.compile(r"(SPDX" "-FileCopyrightText:[ \t]+.*?)" + _END_PATTERN),
Esempio n. 30
0
def spdx_id_match(idx, query_run, text):
    """
    Return one LicenseMatch by matching the `text` as an SPDX license expression
    using the `query_run` positions and `idx` index for support.
    """
    from licensedcode.cache import get_spdx_symbols
    from licensedcode.cache import get_unknown_spdx_symbol

    if TRACE:
        logger_debug('spdx_id_match: start:', 'text:', text, 'query_run:',
                     query_run)

    licensing = Licensing()
    symbols_by_spdx = get_spdx_symbols()
    unknown_symbol = get_unknown_spdx_symbol()

    expression = get_expression(text, licensing, symbols_by_spdx,
                                unknown_symbol)
    expression_str = expression.render()

    if TRACE:
        logger_debug('spdx_id_match: expression:', repr(expression_str))

    # how many known or unknown SPDX symbol occurrences do we have?
    known_syms = 0
    unknown_syms = 0
    for sym in licensing.license_symbols(expression,
                                         unique=False,
                                         decompose=True):
        if sym == unknown_symbol:
            unknown_syms += 1
        else:
            known_syms += 1

    match_len = len(query_run)
    match_start = query_run.start
    matched_tokens = query_run.tokens

    # are we starting with SPDX-License-Identifier or not? if yes: fix start
    cleaned = clean_text(text).lower()
    # FIXME: dnl and rem may not be known tokens hence the pos will be wrong
    if cleaned.startswith((
            'list',
            'dnl',
            'rem',
    )):
        match_start += 1
        match_len -= 1
        matched_tokens = matched_tokens[1:]

    # build synthetic rule
    # TODO: ensure that all the SPDX license keys are known symbols
    rule = SpdxRule(
        license_expression=expression_str,
        # FIXME: for now we are putting the original query text as a
        # rule text: this is likely incorrect when it comes to properly
        # computing the known and unknowns and high and lows for this rule.
        # alternatively we could use the expression string, padded with
        # spdx-license-identifier: this may be wrong too, if the line was
        # not padded originally with this tag
        stored_text=text,
        length=match_len)

    # build match from parsed expression
    # collect match start and end: e.g. the whole text
    qspan = Span(range(match_start, query_run.end + 1))

    # we use the query side to build the ispans
    ispan = Span(range(0, match_len))

    len_junk = idx.len_junk
    hispan = Span(p for p, t in enumerate(matched_tokens) if t >= len_junk)

    match = LicenseMatch(rule=rule,
                         qspan=qspan,
                         ispan=ispan,
                         hispan=hispan,
                         query_run_start=match_start,
                         matcher=MATCH_SPDX_ID,
                         query=query_run.query)

    if TRACE:
        logger_debug('spdx_id_match: match found:', match)
    return match
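
# A minimal, hedged sketch of the symbol decomposition used above, independent
# of the licensedcode index (the expression string is hypothetical):
if __name__ == '__main__':
    from license_expression import Licensing

    _licensing = Licensing()
    _expr = _licensing.parse('EPL-2.0 OR GPL-2.0 WITH Classpath-exception-2.0')
    # decompose=True splits the WITH clause into its license and exception symbols
    _symbols = _licensing.license_symbols(_expr, unique=False, decompose=True)
    print(len(_symbols))  # 3: EPL-2.0, GPL-2.0 and Classpath-exception-2.0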