Esempio n. 1
0
def cli(licenses_file):
    """
        Create rules from a text file with delimited blocks of metadata and texts.

        As an example a file would contain one or more blocks such as this:

    \b
            ----------------------------------------
            license_expression: lgpl-2.1
            relevance: 100
            is_license_notice: yes
            ---
            This program is free software; you can redistribute it and/or modify
            it under the terms of the GNU Lesser General Public License
            version 2.1 as published by the Free Software Foundation;
            ----------------------------------------
    """
    # NOTE(review): the docstring contains a backspace marker line, which looks
    # like click's "do not rewrap this paragraph" convention, so it is
    # presumably rendered as the command help text -- confirm before reflowing
    # it or adding developer-only sections to it.
    #
    # Flow:
    #   1. load the (metadata, text) blocks from ``licenses_file``;
    #   2. build a lightweight BasicRule per block and validate them together;
    #   3. for each rule not already known (by text, then by tokens), create a
    #      full Rule, assign its .yml/.RULE file locations and dump it.

    def skip_message(name, duplicate, excerpt):
        # Format the message in a single pass. The previous implementation
        # interpolated ``name`` and ``excerpt`` with f-strings and then ran
        # ``str.format(**locals())`` over the result, so any brace in the
        # interpolated values was re-parsed as a replacement field and could
        # raise KeyError/ValueError.
        return (f"Skipping rule for: {name!r}, "
                f"dupe of: {duplicate} "
                f"with text: {excerpt!r}...")

    rules_data = load_data(licenses_file)
    rule_by_tokens = all_rule_by_tokens()

    licenses_by_key = cache.get_licenses_db()
    skinny_rules = []

    for rdata in rules_data:
        # Record whether the block itself carried these values (truthiness
        # test, so a missing key, None or 0 all count as "not stored").
        relevance = rdata.data.get("relevance")
        rdata.data["has_stored_relevance"] = bool(relevance)

        # Normalize the expression case and surrounding whitespace.
        license_expression = rdata.data.get("license_expression")
        if license_expression:
            rdata.data["license_expression"] = license_expression.lower().strip()

        minimum_coverage = rdata.data.get("minimum_coverage")
        rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    # Validate every candidate up front, before anything is written.
    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    for rule in skinny_rules:

        if rule.is_false_positive:
            base_name = "false-positive"
        elif rule.is_license_intro:
            base_name = "license-intro"
        else:
            base_name = rule.license_expression

        text = rule.text()

        # Short, whitespace-collapsed excerpt used in the "skipping" messages.
        # Braces are still blanked out so the printed excerpt stays identical
        # to what this command printed before.
        skinny_text = (" ".join(text[:80].split())
                       .replace("{", " ")
                       .replace("}", " "))

        # First duplicate check: on the rule text.
        existing_rule = rule_exists(text)
        if existing_rule:
            print(skip_message(base_name, existing_rule, skinny_text))
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd["stored_text"] = rule.stored_text
        rd["has_stored_relevance"] = rule.has_stored_relevance
        rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + ".yml"
        rulerec.text_file = base_loc + ".RULE"

        rule_tokens = tuple(rulerec.tokens())

        # Second duplicate check: on the token sequence, which also catches
        # rules added earlier in this same run (see the registration below).
        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(skip_message(base_name, existing_rule, skinny_text))
            continue

        print(f"Adding new rule: {base_name}")
        print("  file://" + rulerec.data_file)
        print("  file://" + rulerec.text_file)
        # Dump, update the ignorables, then dump again to persist the update.
        # NOTE(review): presumably update_ignorables needs the files on disk
        # first -- confirm before collapsing this into a single dump.
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        rulerec.dump()

        rule_by_tokens[rule_tokens] = base_name
Esempio n. 2
0
def cli(licenses_file):
    """
    Build new rules from a text file of delimited metadata and text blocks.

    The input file holds one or more blocks shaped like this one:

\b
        ----------------------------------------
        license_expression: lgpl-2.1
        relevance: 100
        is_license_notice: yes
        ---
        This program is free software; you can redistribute it and/or modify
        it under the terms of the GNU Lesser General Public License
        version 2.1 as published by the Free Software Foundation;
        ----------------------------------------
    """

    parsed_blocks = load_data(licenses_file)
    known_tokens = all_rule_tokens()
    licenses_db = cache.get_licenses_db()

    # Turn each parsed block into a lightweight rule, flagging whether the
    # block itself carried a relevance / minimum coverage value.
    candidates = []
    for block in parsed_blocks:
        meta = block.data
        meta['has_stored_relevance'] = bool(meta.get('relevance'))
        meta['has_stored_minimum_coverage'] = bool(meta.get('minimum_coverage'))

        candidate = models.BasicRule(**meta)
        candidate.stored_text = block.text
        candidates.append(candidate)

    models.validate_rules(candidates, licenses_db, with_text=True)

    print()
    for candidate in candidates:
        # Skip anything whose text is already a known rule.
        duplicate = rule_exists(candidate.text())
        if duplicate:
            excerpt = candidate.text()[:50].strip()
            print('Skipping existing rule:', duplicate, 'with text:\n',
                  excerpt, '...')
            continue

        base_name = (
            'false-positive' if candidate.is_false_positive
            else 'license-intro' if candidate.is_license_intro
            else candidate.license_expression
        )
        location = find_rule_base_loc(base_name)

        fields = candidate.to_dict()
        fields.update(
            stored_text=candidate.stored_text,
            has_stored_relevance=candidate.has_stored_relevance,
            has_stored_minimum_coverage=candidate.has_stored_minimum_coverage,
        )
        new_rule = models.Rule(**fields)

        # Recompute the relevance so that junk stored relevance on long rules
        # is discarded.
        new_rule.compute_relevance(_threshold=18.0)

        new_rule.data_file = location + '.yml'
        new_rule.text_file = location + '.RULE'

        tokens_key = tuple(new_rule.tokens())
        if tokens_key in known_tokens:
            print('Skipping already added rule with text for:', base_name)
            continue

        print('Adding new rule:')
        print('  file://' + new_rule.data_file)
        print('  file://' + new_rule.text_file)
        # Remember the tokens so later blocks in this run are deduplicated too.
        known_tokens.add(tokens_key)
        new_rule.dump()
        models.update_ignorables(new_rule, verbose=False)
        new_rule.dump()