def test_all_spdx_tokens_exists_in_dictionary(self):
    """Looking up every SPDX key token in the index dictionary must not raise."""
    index = cache.get_index()
    known_tokens = index.dictionary
    licenses_db = cache.get_licenses_db()
    # a KeyError here means an SPDX key token is missing from the dictionary
    for spdx_token in models.get_all_spdx_key_tokens(licenses_db):
        known_tokens[spdx_token]
Beispiel #2
0
def get_licenses(location, min_score=0, include_text=False, diag=False, license_url_template=DEJACODE_LICENSE_URL):
    """
    Yield mappings of license data detected in the file at `location`.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means that all license matches are returned. With any other value, matches
    that score below the threshold are not returned.

    If `include_text` is True, the matched text is included in the returned
    data.

    If `diag` is True, additional match details are returned with the
    matched_rule key of the returned mapping.
    """
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db

    idx = get_index()
    licenses = get_licenses_db()

    for match in idx.match(location=location, min_score=min_score):
        matched_text = None
        if include_text:
            matched_text = match.matched_text(whole_lines=False)

        for license_key in match.rule.licenses:
            lic = licenses.get(license_key)

            if lic.spdx_license_key:
                # strip any "+" suffix to build a valid SPDX page URL
                spdx_url = SPDX_LICENSE_URL.format(
                    lic.spdx_license_key.rstrip('+'))
            else:
                spdx_url = ''

            matched_rule = OrderedDict([
                ('identifier', match.rule.identifier),
                ('license_choice', match.rule.license_choice),
                ('licenses', match.rule.licenses),
            ])
            # FIXME: for sanity these should always be included???
            if diag:
                matched_rule['matcher'] = match.matcher
                matched_rule['rule_length'] = match.rule.length
                matched_rule['matched_length'] = match.ilen()
                matched_rule['match_coverage'] = match.coverage()
                matched_rule['rule_relevance'] = match.rule.relevance

            result = OrderedDict([
                ('key', lic.key),
                ('score', match.score()),
                ('short_name', lic.short_name),
                ('category', lic.category),
                ('owner', lic.owner),
                ('homepage_url', lic.homepage_url),
                ('text_url', lic.text_urls[0] if lic.text_urls else ''),
                ('reference_url', license_url_template.format(lic.key)),
                ('spdx_license_key', lic.spdx_license_key),
                ('spdx_url', spdx_url),
                ('start_line', match.start_line),
                ('end_line', match.end_line),
                ('matched_rule', matched_rule),
            ])
            # FIXME: for sanity this should always be included?????
            if include_text:
                result['matched_text'] = matched_text
            yield result
    def process_codebase(self, codebase, licenses_reference, **kwargs):
        """
        Collect all license keys referenced by resource license expressions
        and by package license expressions, then append each referenced
        license's details to the codebase-level `licenses_reference` attribute.
        """
        from licensedcode.cache import get_licenses_db
        licensing = Licensing()

        license_keys = set()

        # collect keys from the license expressions detected on each resource
        for resource in codebase.walk():
            licexps = getattr(resource, 'license_expressions', []) or []
            for expression in licexps:
                if expression:
                    license_keys.update(licensing.license_keys(expression))

        # also collect keys from package-level license expressions
        packages = getattr(codebase, 'packages', []) or []
        for package in packages:
            # FIXME: license_expression attribute name is changing soon
            expression = package.get('license_expression')
            if expression:
                license_keys.update(licensing.license_keys(expression))
                # NOTE: removed a stray `resource.save(codebase)` here: it
                # referenced the stale loop variable left over from the walk
                # loop above (a NameError on an empty codebase) and saved an
                # unrelated resource once per package expression.

        db = get_licenses_db()
        for key in sorted(license_keys):
            license_details = db[key].to_dict(
                include_ignorables=False,
                include_text=True,
            )
            codebase.attributes.licenses_reference.append(license_details)
Beispiel #4
0
def get_licenses(location, min_score=0, include_text=False, diag=False):
    """
    Yield dictionaries of license data detected in the file at `location`.

    `min_score` is a minimum score threshold from 0 to 100. The default of 0
    means that all license matches are returned. With any other value, matches
    that score below the threshold are not returned.

    If `diag` is True, additional match details are returned with the
    matched_rule key of the returned mapping.
    """
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db
    from licensedcode.match import get_full_matched_text

    idx = get_index()
    licenses = get_licenses_db()

    for match in idx.match(location=location, min_score=min_score):
        matched_text = None
        if include_text:
            # assemble the full matched text into a single string
            matched_text = u''.join(get_full_matched_text(
                match, location=location, idx=idx, whole_lines=False))

        for license_key in match.rule.licenses:
            lic = licenses.get(license_key)

            if lic.spdx_license_key:
                # strip any "+" suffix to build a valid SPDX page URL
                spdx_url = SPDX_LICENSE_URL.format(
                    lic.spdx_license_key.rstrip('+'))
            else:
                spdx_url = ''

            matched_rule = OrderedDict([
                ('identifier', match.rule.identifier),
                ('license_choice', match.rule.license_choice),
                ('licenses', match.rule.licenses),
            ])
            if diag:
                matched_rule['matcher'] = match.matcher
                matched_rule['rule_length'] = match.rule.length
                matched_rule['matched_length'] = match.ilen()
                matched_rule['match_coverage'] = match.coverage()
                matched_rule['rule_relevance'] = match.rule.relevance

            result = OrderedDict([
                ('key', lic.key),
                ('score', match.score()),
                ('short_name', lic.short_name),
                ('category', lic.category),
                ('owner', lic.owner),
                ('homepage_url', lic.homepage_url),
                ('text_url', lic.text_urls[0] if lic.text_urls else ''),
                ('dejacode_url', DEJACODE_LICENSE_URL.format(lic.key)),
                ('spdx_license_key', lic.spdx_license_key),
                ('spdx_url', spdx_url),
                ('start_line', match.start_line),
                ('end_line', match.end_line),
                ('matched_rule', matched_rule),
            ])
            if include_text:
                result['matched_text'] = matched_text
            yield result
Beispiel #5
0
def _licenses_data_from_match(
        match, include_text=False, license_text_diagnostics=False,
        license_url_template=DEJACODE_LICENSE_URL):
    """
    Return a list of "licenses" scan data mappings built from a license match.
    Used directly only internally for testing.
    """
    from licensedcode import cache
    licenses = cache.get_licenses_db()

    matched_text = None
    if include_text:
        # diagnostics mode highlights the exact matched region only
        if license_text_diagnostics:
            matched_text = match.matched_text(whole_lines=False, highlight=True)
        else:
            matched_text = match.matched_text(whole_lines=True, highlight=False)

    detected_licenses = []
    for license_key in match.rule.license_keys():
        lic = licenses.get(license_key)

        spdx_key = lic.spdx_license_key
        if spdx_key:
            # strip any "+" suffix to build a valid SPDX page URL
            spdx_url = SPDX_LICENSE_URL.format(spdx_key.rstrip('+'))
        else:
            spdx_url = ''

        matched_rule = OrderedDict([
            ('identifier', match.rule.identifier),
            ('license_expression', match.rule.license_expression),
            ('licenses', match.rule.license_keys()),
            ('is_license_text', match.rule.is_license_text),
            ('is_license_notice', match.rule.is_license_notice),
            ('is_license_reference', match.rule.is_license_reference),
            ('is_license_tag', match.rule.is_license_tag),
            ('matcher', match.matcher),
            ('rule_length', match.rule.length),
            ('matched_length', match.len()),
            ('match_coverage', match.coverage()),
            ('rule_relevance', match.rule.relevance),
        ])

        result = OrderedDict([
            ('key', lic.key),
            ('score', match.score()),
            ('name', lic.name),
            ('short_name', lic.short_name),
            ('category', lic.category),
            ('is_exception', lic.is_exception),
            ('owner', lic.owner),
            ('homepage_url', lic.homepage_url),
            ('text_url', lic.text_urls[0] if lic.text_urls else ''),
            ('reference_url', license_url_template.format(lic.key)),
            ('spdx_license_key', spdx_key),
            ('spdx_url', spdx_url),
            ('start_line', match.start_line),
            ('end_line', match.end_line),
            ('matched_rule', matched_rule),
        ])
        # FIXME: for sanity this should always be included?????
        if include_text:
            result['matched_text'] = matched_text
        detected_licenses.append(result)
    return detected_licenses
Beispiel #6
0
def get_spdx_keys():
    """
    Return a cached frozenset of license keys for licenses listed in SPDX.
    """
    global _spdx_keys
    # return the memoized set when it was already computed
    if _spdx_keys:
        return _spdx_keys
    _spdx_keys = frozenset(models.get_all_spdx_keys(get_licenses_db()))
    return _spdx_keys
 def test_validate_license_library(self):
     """The license DB must validate with no errors and no warnings."""
     licenses_db = cache.get_licenses_db()
     errors, warnings, infos = models.License.validate(
         licenses_db,
         verbose=False,
     )
     assert errors == {}
     assert warnings == {}
     assert infos
Beispiel #8
0
 def test_all_spdx_tokens_exists_in_dictionary(self):
     """Every token used by an SPDX key must exist in the index dictionary."""
     idx = cache.get_index()
     dic = idx.dictionary
     licenses = cache.get_licenses_db()
     tokens = set(models.get_all_spdx_key_tokens(licenses))
     keys = set(dic)
     try:
         assert tokens.issubset(keys)
     except AssertionError:
         # narrow except: a bare `except:` would also swallow
         # KeyboardInterrupt/SystemExit. On failure, look up each token so
         # the first missing one surfaces as a precise KeyError.
         for token in tokens:
             dic[token]
Beispiel #9
0
    def __init__(self, src_dir, match_text=False, match_approx=False):
        """
        `src_dir` is where the License objects are dumped.
        """
        src_dir = os.path.realpath(src_dir)
        self.src_dir = src_dir

        self.match_text = match_text
        self.match_approx = match_approx

        # consider already fetched when the source directory exists;
        # otherwise create it so a fetch can populate it
        self.fetched = os.path.exists(src_dir)
        if not self.fetched:
            os.mkdir(src_dir)

        def _sibling_dir(suffix):
            # create (when missing) and return a directory next to src_dir
            side = self.src_dir.rstrip('\\/') + suffix
            if not os.path.exists(side):
                os.mkdir(side)
            return side

        self.update_dir = _sibling_dir('-update')
        self.new_dir = _sibling_dir('-new')
        self.del_dir = _sibling_dir('-del')

        def _by_spdx_key(licenses_by_key):
            # map lowercased SPDX key -> License, skipping licenses without one
            return {
                lic.spdx_license_key.lower(): lic
                for lic in licenses_by_key.values()
                if lic.spdx_license_key
            }

        self.scancodes_by_key = get_licenses_db()
        self.scancodes_by_spdx_key = _by_spdx_key(self.scancodes_by_key)

        composites_dir = os.path.join(
            licensedcode.data_dir, 'composites', 'licenses')
        self.composites_by_key = load_licenses(
            composites_dir, with_deprecated=True)
        self.composites_by_spdx_key = _by_spdx_key(self.composites_by_key)

        foreign_dir = os.path.join(
            licensedcode.data_dir, 'non-english', 'licenses')
        self.non_english_by_key = load_licenses(
            foreign_dir, with_deprecated=True)
        self.non_english_by_spdx_key = _by_spdx_key(self.non_english_by_key)
def get_rules(licenses_data_dir=licenses_data_dir, rules_data_dir=rules_data_dir):
    """
    Return an iterable of Rule objects loaded from license files found in
    `licenses_data_dir` and rule files found in `rules_data_dir`. Raise an
    exception if a rule is inconsistent or incorrect.
    """
    from licensedcode.cache import get_licenses_db
    licenses = get_licenses_db(licenses_data_dir=licenses_data_dir)
    rules = list(load_rules(rules_data_dir=rules_data_dir))
    # fail early on rules that reference unknown or inconsistent licenses
    check_rules_integrity(rules, licenses)
    licenses_as_rules = build_rules_from_licenses(licenses)
    return chain(licenses_as_rules, rules)
Beispiel #11
0
def get_spdx_keys():
    """
    Return a cached frozenset of ScanCode license keys for licenses that are
    listed in SPDX (under a primary or an alternate SPDX key).
    """
    global _spdx_keys
    if not _spdx_keys:
        licenses = get_licenses_db()
        _spdx_keys = frozenset(
            lic.key
            for lic in licenses.values()
            if lic.spdx_license_key or lic.other_spdx_license_keys
        )
    return _spdx_keys
Beispiel #12
0
def get_rules(licenses_data_dir=licenses_data_dir, rules_data_dir=rules_data_dir):
    """
    Return an iterable of license detection Rule objects built from the
    licenses in `licenses_data_dir` followed by the rules loaded from
    `rules_data_dir`. Raise a MissingLicenses exception if a rule references
    unknown license keys.
    """
    from licensedcode.cache import get_licenses_db
    licenses = get_licenses_db(licenses_data_dir=licenses_data_dir)
    rules = list(load_rules(rules_data_dir=rules_data_dir))
    # fail early on rules that reference license keys not in the db
    check_rules_integrity(rules, licenses)

    licenses_as_rules = build_rules_from_licenses(licenses)
    return chain(licenses_as_rules, rules)
Beispiel #13
0
def get_rules(licenses_data_dir=licenses_data_dir, rules_data_dir=rules_data_dir):
    """
    Return an iterable of license detection rules: the rules built from the
    licenses in `licenses_data_dir` chained with the rules loaded from
    `rules_data_dir`. Raise a MissingLicenses exception if a rule references
    unknown license keys.
    """
    from licensedcode.cache import get_licenses_db

    licenses = get_licenses_db(licenses_data_dir=licenses_data_dir)
    rules = list(load_rules(rules_data_dir=rules_data_dir))
    check_rules_integrity(rules, licenses)
    return chain(build_rules_from_licenses(licenses), rules)
def cli(path=(), update=True):
    """
    Update licenses and rules with ignorable copyrights, holders, authors URLs
    and emails, optionally limited to files whose paths end with `path`.
    """
    licensish = list(cache.get_licenses_db().values())
    licensish.extend(models.load_rules())

    if path:
        licensish = [
            candidate for candidate in licensish
            if candidate.text_file.endswith(path)
            or candidate.data_file.endswith(path)
        ]
    refresh_ignorables(licensish)
def check_ignorable_clues(licensish, regen=REGEN_TEST_FIXTURES, verbose=False):
    """
    Validate that all expected ignorable clues declared in a `licensish` License
    or Rule object are properly detected in that rule text file. Optionally
    ``regen`` the ignorables to update the License or Rule .yml data file.
    """
    result = models.get_ignorables(text_file=licensish.text_file)

    if verbose:
        print()
        print('result')
        pprint(result)

    if regen:
        is_from_license = licensish.is_from_license
        if is_from_license:
            # rules built from a license must be regenerated on the License
            db = cache.get_licenses_db()
            licish = db[licensish.license_expression]
        else:
            licish = licensish
        models.set_ignorables(licish, result, verbose=verbose)
        licish.dump()
        if is_from_license:
            # rebuild the rule so the expectation below reflects the update
            licensish = models.build_rule_from_license(licish)

    expected = models.get_normalized_ignorables(licensish)

    if verbose:
        print('expected')
        pprint(expected)

    try:
        assert result == expected
    except AssertionError:
        # Narrowed from a bare `except:` which would also swallow
        # KeyboardInterrupt/SystemExit. On failure, we compare again to get
        # additional failure details such as a clickable text_file path.

        data_file = licensish.data_file
        if not data_file:
            data_file = licensish.text_file.replace('.LICENSE', '.yml')

        result['files'] = [
            f'file://{data_file}',
            f'file://{licensish.text_file}',
        ]

        # This assert will always fail and provide a more detailed failure trace
        assert saneyaml.dump(result) == saneyaml.dump(expected)
Beispiel #16
0
    def __init__(self, src_dir, match_text=False, match_approx=False):
        """
        `src_dir` is where the License objects are dumped.
        """
        src_dir = os.path.realpath(src_dir)
        self.src_dir = src_dir

        self.match_text = match_text
        self.match_approx = match_approx

        # already fetched when the source directory exists; otherwise create it
        if os.path.exists(src_dir):
            self.fetched = True
        else:
            self.fetched = False
            os.mkdir(src_dir)

        # create the side directories used to stage updates, additions and
        # deletions next to src_dir
        base = self.src_dir.rstrip('\\/')
        for attr, suffix in (
            ('update_dir', '-update'),
            ('new_dir', '-new'),
            ('del_dir', '-del'),
        ):
            side_dir = base + suffix
            if not os.path.exists(side_dir):
                os.mkdir(side_dir)
            setattr(self, attr, side_dir)

        self.scancodes_by_key = get_licenses_db()
        self.scancodes_by_spdx_key = {
            lic.spdx_license_key.lower(): lic
            for lic in self.scancodes_by_key.values()
            if lic.spdx_license_key
        }

        composites_dir = os.path.join(
            licensedcode.data_dir, 'composites', 'licenses')
        self.composites_by_key = load_licenses(
            composites_dir, with_deprecated=True)
        self.composites_by_spdx_key = {
            lic.spdx_license_key.lower(): lic
            for lic in self.composites_by_key.values()
            if lic.spdx_license_key
        }

        foreign_dir = os.path.join(
            licensedcode.data_dir, 'non-english', 'licenses')
        self.non_english_by_key = load_licenses(
            foreign_dir, with_deprecated=True)
        self.non_english_by_spdx_key = {
            lic.spdx_license_key.lower(): lic
            for lic in self.non_english_by_key.values()
            if lic.spdx_license_key
        }
Beispiel #17
0
def generate_output(results, version, template):
    """
    Yield unicode strings from incrementally rendering `results` and `version`
    with the Jinja `template` object.

    `results` is an iterable of scanned-file mappings.
    """
    # FIXME: This code is highly coupled with actual scans and may not
    # support adding new scans at all

    from licensedcode.cache import get_licenses_db

    converted = {}
    converted_infos = {}
    converted_packages = {}
    licenses = {}

    LICENSES = 'licenses'
    COPYRIGHTS = 'copyrights'
    PACKAGES = 'package_data'

    # Create a flattened data dict keyed by path
    for scanned_file in results:
        scanned_file = dict(scanned_file)
        path = scanned_file['path']
        # use a distinct name: the original code rebound the `results`
        # parameter here, shadowing the iterable being looped over
        entries = []
        if COPYRIGHTS in scanned_file:
            for entry in scanned_file[COPYRIGHTS]:
                entries.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'copyright',
                    'value': entry['copyright'],
                })
        if LICENSES in scanned_file:
            for entry in scanned_file[LICENSES]:
                # make copy
                entry = dict(entry)
                entry_key = entry['key']
                entries.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'license',
                    'value': entry_key,
                })

                # FIXME: we should NOT rely on license objects: only use what is in the JSON instead
                if entry_key not in licenses:
                    licenses[entry_key] = entry
                    # we were modifying the scan data in place ....
                    entry['object'] = get_licenses_db().get(entry_key)
        if entries:
            converted[path] = sorted(entries, key=itemgetter('start'))

        # TODO: this is klunky: we need to drop templates entirely or we
        # should rather just pass a the list of files from the scan
        # results and let the template handle this rather than
        # denormalizing the list here??
        converted_infos[path] = {}
        for name, value in scanned_file.items():
            if name in (LICENSES, PACKAGES, COPYRIGHTS):
                continue
            converted_infos[path][name] = value

        if PACKAGES in scanned_file:
            converted_packages[path] = scanned_file[PACKAGES]

    # sort the collected licenses once after the loop, not on every iteration
    licenses = dict(sorted(licenses.items()))

    files = {
        'license_copyright': converted,
        'infos': converted_infos,
        'package_data': converted_packages
    }

    return template.generate(files=files, licenses=licenses, version=version)
Beispiel #18
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """

    # parse the delimited metadata/text blocks from the input file
    rules_data = load_data(licenses_file)
    # token tuples of all existing rules, used below for duplicate detection
    rules_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    # validate every block before writing anything to disk
    errors = validate_license_rules(rules_data, licensing)
    if errors:
        print('Invalid rules: exiting....')
        for error in errors:
            print(error)
            print()

        raise Exception('Invalid rules: exiting....')

    print()
    for rule in rules_data:
        is_negative = rule.data.get('is_negative')
        is_false_positive = rule.data.get('is_false_positive')
        # skip rules whose exact text is already known, unless negative
        existing = rule_exists(rule.text)
        if existing and not is_negative:
            print('Skipping existing non-negative rule:', existing,
                  'with text:\n', rule.text[:50].strip(), '...')
            continue

        if is_negative:
            base_name = 'not-a-license'
        else:
            # normalize the expression (validated and simplified) and use it
            # as the base file name for the new rule files
            license_expression = rule.data.get('license_expression')
            license_expression = str(
                licensing.parse(license_expression, validate=True,
                                simple=True))
            base_name = license_expression
            if is_false_positive:
                base_name = 'false-positive_' + base_name

        # find a non-clashing location for the .yml/.RULE file pair
        base_loc = find_rule_base_loc(base_name)

        data_file = base_loc + '.yml'
        with io.open(data_file, 'w', encoding='utf-8') as o:
            o.write(rule.raw_data)

        text_file = base_loc + '.RULE'
        with io.open(text_file, 'w', encoding='utf-8') as o:
            o.write(rule.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        rule_tokens = tuple(rulerec.tokens())
        if rule_tokens in rules_tokens:
            # cleanup: an identical-token rule already exists, so remove the
            # files we just wrote
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', base_name)
        else:
            rules_tokens.add(rule_tokens)
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
def as_template(scanned_files, template):
    """
    Return a string built from a list of `scanned_files` results and
    the provided `template` identifier. The template defaults to the standard
    HTML template format or can point to the path of a custom template file.
    """
    # FIXME: This code is highly coupled with actual scans and may not
    # support adding new scans at all

    from licensedcode.cache import get_licenses_db

    # FIXME: factor out the html vs custom from this function: we should get a template path
    if template == 'html':
        template = get_template(get_template_dir('html'))
    else:
        # load a custom template
        tpath = fileutils.as_posixpath(os.path.abspath(os.path.expanduser(template)))
        assert os.path.isfile(tpath)
        tdir = fileutils.parent_directory(tpath)
        tfile = fileutils.file_name(tpath)
        template = get_template(tdir, tfile)

    converted = OrderedDict()
    converted_infos = OrderedDict()
    converted_packages = OrderedDict()
    licenses = {}

    LICENSES = 'licenses'
    COPYRIGHTS = 'copyrights'
    PACKAGES = 'packages'
    URLS = 'urls'
    EMAILS = 'emails'

    # Create a flattened data dict keyed by path
    for scanned_file in scanned_files:
        path = scanned_file['path']
        results = []
        if COPYRIGHTS in scanned_file:
            for entry in scanned_file[COPYRIGHTS]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'copyright',
                    # NOTE: we display one statement per line.
                    'value': '\n'.join(entry['statements']),
                })
        if LICENSES in scanned_file:
            for entry in scanned_file[LICENSES]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'license',
                    'value': entry['key'],
                })

                # FIXME: we should NOT rely on license objects: only use what is in the JSON instead
                if entry['key'] not in licenses:
                    licenses[entry['key']] = entry
                    entry['object'] = get_licenses_db().get(entry['key'])
        if results:
            converted[path] = sorted(results, key=itemgetter('start'))

        # TODO: this is klunky: we need to drop templates entirely or we
        # should rather just pass a the list of files from the scan
        # results and let the template handle this rather than
        # denormalizing the list here??
        converted_infos[path] = OrderedDict()
        for name, value in scanned_file.items():
            if name in (LICENSES, PACKAGES, COPYRIGHTS, EMAILS, URLS):
                continue
            converted_infos[path][name] = value

        if PACKAGES in scanned_file:
            converted_packages[path] = scanned_file[PACKAGES]

    # sort the collected licenses once after the loop: the original re-sorted
    # the whole mapping on every file iteration
    licenses = OrderedDict(sorted(licenses.items()))

    files = {
        'license_copyright': converted,
        'infos': converted_infos,
        'packages': converted_packages
    }

    return template.generate(files=files, licenses=licenses)
            # also verify that we are detecting exactly with the license rule itself
            test_method = make_license_test_function(license_key,
                                                     license_obj.text_file,
                                                     license_obj.data_file,
                                                     test_name,
                                                     detect_negative=True,
                                                     trace_text=True)
            setattr(cls, test_name, test_method)


class TestValidateLicenseTextDetection(unittest.TestCase):
    # Test functions are attached to this class at import time by the
    # build_license_validation_tests() call below: one method per license.
    pass


# Attach one generated self-detection test method per license in the db to
# the test class, at import time.
build_license_validation_tests(cache.get_licenses_db(),
                               TestValidateLicenseTextDetection)


def build_rule_validation_tests(rules, cls):
    """
    Dynamically build an individual test method for each rule texts in a rules
    `data_set` then mapping attaching the test method to the `cls` test class.
    """
    for rule in rules:
        if rule.negative:
            continue
        expected_identifier = rule.identifier
        test_name = ('test_validate_self_detection_of_rule_for_' +
                     text.python_safe_name(expected_identifier))
        test_method = make_license_test_function(
def as_template(scanned_files, version, template):
    """
    Return a string built from a list of `scanned_files` results, the scanner
    `version` and the provided `template` identifier. The template defaults to
    the standard HTML template format or can point to the path of a custom
    template file.
    """
    # FIXME: This code is highly coupled with actual scans and may not
    # support adding new scans at all

    from licensedcode.cache import get_licenses_db

    # FIXME: factor out the html vs custom from this function: we should get a template path
    if template == 'html':
        template = get_template(get_template_dir('html'))
    else:
        # load a custom template
        tpath = fileutils.as_posixpath(
            os.path.abspath(os.path.expanduser(template)))
        assert os.path.isfile(tpath)
        tdir = fileutils.parent_directory(tpath)
        tfile = fileutils.file_name(tpath)
        template = get_template(tdir, tfile)

    converted = OrderedDict()
    converted_infos = OrderedDict()
    converted_packages = OrderedDict()
    licenses = {}

    LICENSES = 'licenses'
    COPYRIGHTS = 'copyrights'
    PACKAGES = 'packages'
    URLS = 'urls'
    EMAILS = 'emails'

    # Create a flattened data dict keyed by path
    for scanned_file in scanned_files:
        path = scanned_file['path']
        results = []
        if COPYRIGHTS in scanned_file:
            for entry in scanned_file[COPYRIGHTS]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'copyright',
                    # NOTE: we display one statement per line.
                    'value': '\n'.join(entry['statements']),
                })
        if LICENSES in scanned_file:
            for entry in scanned_file[LICENSES]:
                results.append({
                    'start': entry['start_line'],
                    'end': entry['end_line'],
                    'what': 'license',
                    'value': entry['key'],
                })

                # FIXME: we should NOT rely on license objects: only use what is in the JSON instead
                if entry['key'] not in licenses:
                    licenses[entry['key']] = entry
                    entry['object'] = get_licenses_db().get(entry['key'])
        if results:
            converted[path] = sorted(results, key=itemgetter('start'))

        # TODO: this is klunky: we need to drop templates entirely or we
        # should rather just pass a the list of files from the scan
        # results and let the template handle this rather than
        # denormalizing the list here??
        converted_infos[path] = OrderedDict()
        for name, value in scanned_file.items():
            if name in (LICENSES, PACKAGES, COPYRIGHTS, EMAILS, URLS):
                continue
            converted_infos[path][name] = value

        if PACKAGES in scanned_file:
            converted_packages[path] = scanned_file[PACKAGES]

    # sort the collected licenses once after the loop: the original re-sorted
    # the whole mapping on every file iteration
    licenses = OrderedDict(sorted(licenses.items()))

    files = {
        'license_copyright': converted,
        'infos': converted_infos,
        'packages': converted_packages
    }

    return template.generate(files=files, licenses=licenses, version=version)
Beispiel #22
0
def cli(licenses_file):
    """
        Create rules from a text file with delimited blocks of metadata and texts.

        As an example a file would contains one of more blocks such as this:

    \b
            ----------------------------------------
            license_expression: lgpl-2.1
            relevance: 100
            is_license_notice: yes
            ---
            This program is free software; you can redistribute it and/or modify
            it under the terms of the GNU Lesser General Public License
            version 2.1 as published by the Free Software Foundation;
            ----------------------------------------
    """
    rules_data = load_data(licenses_file)
    # mapping of token tuple -> base name for all existing rules: used below
    # to detect duplicates by token content
    rule_by_tokens = all_rule_by_tokens()

    licenses_by_key = cache.get_licenses_db()
    skinny_rules = []

    # First pass: build lightweight BasicRule objects so the whole batch can
    # be validated before any rule file is written to disk.
    for rdata in rules_data:
        relevance = rdata.data.get("relevance")
        rdata.data["has_stored_relevance"] = bool(relevance)

        license_expression = rdata.data.get("license_expression")
        if license_expression:
            # normalize for consistent parsing and file naming
            rdata.data["license_expression"] = license_expression.lower(
            ).strip()

        minimum_coverage = rdata.data.get("minimum_coverage")
        rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    # Second pass: dump each rule to a .yml/.RULE file pair, skipping rules
    # that already exist either by exact text or by token content.
    for rule in skinny_rules:

        if rule.is_false_positive:
            base_name = "false-positive"
        elif rule.is_license_intro:
            base_name = "license-intro"
        else:
            base_name = rule.license_expression

        text = rule.text()

        existing_rule = rule_exists(text)
        # short, braces-free, whitespace-normalized preview of the rule text
        skinny_text = " ".join(text[:80].split()).replace("{", " ").replace(
            "}", " ")

        if existing_rule:
            # BUGFIX: this message was previously a half-f-string template
            # expanded later with str.format(**locals()) -- fragile (it only
            # worked because braces were stripped from skinny_text above).
            # Build the full message with a plain f-string at the use site.
            print(f"Skipping rule for: {base_name!r}, "
                  f"dupe of: {existing_rule} "
                  f"with text: {skinny_text!r}...")
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd["stored_text"] = rule.stored_text
        rd["has_stored_relevance"] = rule.has_stored_relevance
        rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + ".yml"
        rulerec.text_file = base_loc + ".RULE"

        rule_tokens = tuple(rulerec.tokens())

        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(f"Skipping rule for: {base_name!r}, "
                  f"dupe of: {existing_rule} "
                  f"with text: {skinny_text!r}...")
            continue

        print(f"Adding new rule: {base_name}")
        print("  file://" + rulerec.data_file)
        print("  file://" + rulerec.text_file)
        rulerec.dump()
        # update_ignorables mutates the rule, so dump again to persist it
        models.update_ignorables(rulerec, verbose=False)
        rulerec.dump()

        rule_by_tokens[rule_tokens] = base_name
 def test_validate_licenses(self):
     """The cached license db must validate with no errors and no warnings."""
     licenses = cache.get_licenses_db()
     errors, warnings, infos = models.License.validate(licenses)
     assert errors == {}
     assert warnings == {}
     assert infos
    """
    for license_key, license_obj in licenses_by_key.items():
        if license_obj.text_file and os.path.exists(license_obj.text_file):
            test_name = ('test_validate_self_detection_of_text_for_' + text.python_safe_name(license_key))
            # also verify that we are detecting exactly with the license rule itself
            test_method = make_license_test_function(
                license_key, license_obj.text_file, license_obj.data_file, test_name, detect_negative=True, trace_text=True)
            setattr(cls, test_name, test_method)


class TestValidateLicenseTextDetection(unittest.TestCase):
    """Empty test-case container: validation test methods are attached to
    this class dynamically (via setattr) at import time."""


# Attach one self-detection test per license in the cached db to the class
# above, so the test runner discovers them as regular test methods.
build_license_validation_tests(cache.get_licenses_db(), TestValidateLicenseTextDetection)


def build_rule_validation_tests(rules, cls):
    """
    Dynamically build an individual test method for each rule texts in a rules
    `data_set` then mapping attaching the test method to the `cls` test class.
    """
    for rule in rules:
        # negative rules are not expected to self-detect: skip them
        if rule.negative:
            continue
        expected_identifier = rule.identifier
        test_name = ('test_validate_self_detection_of_rule_for_' + text.python_safe_name(expected_identifier))
        # detect_negative is always True here since negative rules were skipped above
        test_method = make_license_test_function(
            rule.licenses, rule.text_file, rule.data_file, test_name, detect_negative=not rule.negative, trace_text=True
        )
        # NOTE(review): test_method is never attached to `cls` -- a
        # setattr(cls, test_name, test_method) call (cf. the license variant
        # above) appears to be missing or truncated; confirm against the
        # original module.
def cli(licenses_file):
    """
    Create rules from a structured text file

    For instance:
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """
    # token tuples of rules added during this run, to catch duplicates
    seen_tokens = set()
    blocks = load_data(licenses_file)

    by_key = cache.get_licenses_db()
    licensing = Licensing(by_key.values())

    print()
    for meta_lines, text_lines in blocks:
        meta = '\n'.join(meta_lines)
        body = '\n'.join(text_lines)

        # skip anything whose exact text already exists as a rule
        dupe = rule_exists(body)
        if dupe:
            print('Skipping existing rule:', dupe, 'with text:\n', body[:50].strip(), '...')
            continue

        # saneyaml.load doubles as a YAML syntax validation step
        parsed = saneyaml.load(meta)
        if parsed.get('is_negative'):
            license_expression = 'not-a-license'
        else:
            # the first metadata line is expected to be "license_expression: ..."
            license_expression = meta_lines[0].partition(': ')[2].strip()
            if not license_expression:
                raise Exception('Missing license_expression for text:', body)
            licensing.parse(license_expression, validate=True, simple=True)

        base_loc = find_rule_base_loc(license_expression)
        data_file = base_loc + '.yml'
        text_file = base_loc + '.RULE'

        with io.open(data_file, 'w', encoding='utf-8') as out:
            out.write(meta)
        with io.open(text_file, 'w', encoding='utf-8') as out:
            out.write(body)

        rule = models.Rule(data_file=data_file, text_file=text_file)
        tokens = tuple(rule.tokens())
        if tokens in seen_tokens:
            # duplicate by token content: undo the files we just wrote
            os.remove(text_file)
            os.remove(data_file)
            print('Skipping already added rule with text for:', license_expression)
        else:
            seen_tokens.add(tokens)
            rule.dump()
            models.update_ignorables(rule, verbose=True)
            print('Rule added:', rule.identifier)
Beispiel #26
0
def get_licenses(location,
                 min_score=0,
                 include_text=False,
                 license_url_template=DEJACODE_LICENSE_URL,
                 **kwargs):
    """
    Return a mapping of license data for licenses detected in the file at
    `location`.

    This mapping contains two keys:
     - 'licenses' with a value that is list of mappings of license information.
     - 'license_expressions' with a value that is list of license expression
       strings.

    `min_score` is a minimum score threshold from 0 to 100. The default is 0
    which means that all license matches are returned. Otherwise, matches with
    a score below `min_score` are not returned.

    if `include_text` is True, matched text is included in the returned
    `licenses` data.

    Extra `kwargs` are passed through to the index `match()` call.
    """
    from licensedcode.cache import get_index
    from licensedcode.cache import get_licenses_db

    idx = get_index()
    licenses = get_licenses_db()

    detected_licenses = []
    detected_expressions = []
    for match in idx.match(location=location, min_score=min_score, **kwargs):

        if include_text:
            # TODO: handle whole lines with the case of very long lines
            matched_text = match.matched_text(whole_lines=False)

        detected_expressions.append(match.rule.license_expression)

        # one result mapping is emitted per license key of the matched rule
        for license_key in match.rule.license_keys():
            lic = licenses.get(license_key)
            result = OrderedDict()
            detected_licenses.append(result)
            result['key'] = lic.key
            result['score'] = match.score()
            result['name'] = lic.name
            result['short_name'] = lic.short_name
            result['category'] = lic.category
            result['is_exception'] = lic.is_exception
            result['owner'] = lic.owner
            result['homepage_url'] = lic.homepage_url
            result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
            result['reference_url'] = license_url_template.format(lic.key)
            spdx_key = lic.spdx_license_key
            result['spdx_license_key'] = spdx_key
            if spdx_key:
                # strip any "or later" + suffix to form the canonical SPDX URL
                spdx_key = lic.spdx_license_key.rstrip('+')
                spdx_url = SPDX_LICENSE_URL.format(spdx_key)
            else:
                spdx_url = ''
            result['spdx_url'] = spdx_url
            result['start_line'] = match.start_line
            result['end_line'] = match.end_line
            matched_rule = result['matched_rule'] = OrderedDict()
            matched_rule['identifier'] = match.rule.identifier
            matched_rule['license_expression'] = match.rule.license_expression
            matched_rule['licenses'] = match.rule.license_keys()

            matched_rule['is_license_text'] = match.rule.is_license_text
            matched_rule['is_license_notice'] = match.rule.is_license_notice
            matched_rule[
                'is_license_reference'] = match.rule.is_license_reference
            matched_rule['is_license_tag'] = match.rule.is_license_tag

            matched_rule['matcher'] = match.matcher
            matched_rule['rule_length'] = match.rule.length
            matched_rule['matched_length'] = match.ilen()
            matched_rule['match_coverage'] = match.coverage()
            matched_rule['rule_relevance'] = match.rule.relevance

            # FIXME: for sanity this should always be included?????
            if include_text:
                result['matched_text'] = matched_text

    return OrderedDict([
        ('licenses', detected_licenses),
        ('license_expressions', detected_expressions),
    ])
 def test_validate_license_library(self):
     """Validate the full cached license db (verbose): no errors or warnings."""
     db = cache.get_licenses_db()
     errors, warnings, infos = models.License.validate(db, verbose=True)
     assert errors == {}
     assert warnings == {}
     assert infos
Beispiel #28
0
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """

    rules_data = load_data(licenses_file)
    # token tuples of all existing rules, used for duplicate detection below
    rules_tokens = all_rule_tokens()

    licenses_by_key = cache.get_licenses_db()
    skinny_rules = []

    # First pass: build lightweight BasicRule objects so the whole batch can
    # be validated before any rule file is written.
    for rdata in rules_data:
        relevance = rdata.data.get('relevance')
        rdata.data['has_stored_relevance'] = bool(relevance)

        minimum_coverage = rdata.data.get('minimum_coverage')
        rdata.data['has_stored_minimum_coverage'] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    # Second pass: write each rule out unless it already exists by exact
    # text or by token content.
    for rule in skinny_rules:
        existing = rule_exists(rule.text())
        if existing:
            print('Skipping existing rule:', existing, 'with text:\n',
                  rule.text()[:50].strip(), '...')
            continue

        # pick the base file name for the new .yml/.RULE file pair
        if rule.is_false_positive:
            base_name = 'false-positive'
        elif rule.is_license_intro:
            base_name = 'license-intro'
        else:
            base_name = rule.license_expression

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd['stored_text'] = rule.stored_text
        rd['has_stored_relevance'] = rule.has_stored_relevance
        rd['has_stored_minimum_coverage'] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.compute_relevance(_threshold=18.0)

        rulerec.data_file = base_loc + '.yml'
        rulerec.text_file = base_loc + '.RULE'

        rule_tokens = tuple(rulerec.tokens())

        if rule_tokens in rules_tokens:
            print('Skipping already added rule with text for:', base_name)
        else:
            print('Adding new rule:')
            print('  file://' + rulerec.data_file)
            print('  file://' + rulerec.text_file, )
            rules_tokens.add(rule_tokens)
            # dump, update ignorables (which mutates the rule), then dump
            # again so the updated data is persisted
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            rulerec.dump()
Beispiel #29
0
def _licenses_data_from_match(match,
                              include_text=False,
                              license_text_diagnostics=False,
                              license_url_template=SCANCODE_LICENSEDB_URL):
    """
    Return a list of "licenses" scan data mappings built from a single license
    match. Used directly only internally for testing.
    """
    from licensedcode import cache
    licenses = cache.get_licenses_db()

    matched_text = None
    if include_text:
        # diagnostics mode: highlighted, exact-match text; otherwise whole lines
        diag = bool(license_text_diagnostics)
        matched_text = match.matched_text(whole_lines=not diag, highlight=diag)

    base_url = 'https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses'
    text_url_tpl = base_url + '/{}.LICENSE'
    data_url_tpl = base_url + '/{}.yml'

    rule = match.rule
    results = []
    for license_key in rule.license_keys():
        lic = licenses.get(license_key)

        spdx_key = lic.spdx_license_key
        if not spdx_key:
            spdx_url = ''
        elif spdx_key.lower().startswith('licenseref-'):
            # LicenseRef- keys have no page on spdx.org: point to our own text
            spdx_url = text_url_tpl.format(lic.key)
        else:
            # drop any "or later" + suffix to form the canonical SPDX URL
            spdx_url = SPDX_LICENSE_URL.format(spdx_key.rstrip('+'))

        matched_rule = {
            'identifier': rule.identifier,
            'license_expression': rule.license_expression,
            'licenses': rule.license_keys(),
            'referenced_filenames': rule.referenced_filenames,
            'is_license_text': rule.is_license_text,
            'is_license_notice': rule.is_license_notice,
            'is_license_reference': rule.is_license_reference,
            'is_license_tag': rule.is_license_tag,
            'is_license_intro': rule.is_license_intro,
            'has_unknown': rule.has_unknown,
            'matcher': match.matcher,
            'rule_length': rule.length,
            'matched_length': match.len(),
            'match_coverage': match.coverage(),
            'rule_relevance': rule.relevance,
        }

        result = {
            'key': lic.key,
            'score': match.score(),
            'name': lic.name,
            'short_name': lic.short_name,
            'category': lic.category,
            'is_exception': lic.is_exception,
            'is_unknown': lic.is_unknown,
            'owner': lic.owner,
            'homepage_url': lic.homepage_url,
            'text_url': lic.text_urls[0] if lic.text_urls else '',
            'reference_url': license_url_template.format(lic.key),
            'scancode_text_url': text_url_tpl.format(lic.key),
            'scancode_data_url': data_url_tpl.format(lic.key),
            'spdx_license_key': spdx_key,
            'spdx_url': spdx_url,
            'start_line': match.start_line,
            'end_line': match.end_line,
            'matched_rule': matched_rule,
        }
        # FIXME: for sanity this should always be included?????
        if include_text:
            result['matched_text'] = matched_text
        results.append(result)

    return results
 def test_validate_licenses(self):
     """Sanity check: the cached license db validates with no errors or warnings."""
     errors, warnings, infos = models.License.validate(cache.get_licenses_db())
     assert {} == errors
     assert {} == warnings
     assert infos