Example #1
0
def parse_structured_copyright_file(
    copyright_file,
    skip_debian_packaging=True,
    simplify_licenses=True,
    unique=True,
):
    """
    Return a tuple of (declared license, detected license_expression,
    copyrights) strings computed from the `copyright_file` location. For each
    copyright file paragraph we treat the "name" as a license declaration. The
    text is used for detection and cross-reference with the declaration.

    Return a (None, None, None) tuple if `copyright_file` is empty or None.

    The declared licenses and the copyrights are each joined with newlines.
    The detected licenses are combined with AND in a single license
    expression string, or are "unknown" if nothing was detected.

    If `skip_debian_packaging` is True, the Debian packaging license --if
    detected-- is skipped.

    If `simplify_licenses` is True the license expressions are simplified.

    If `unique` is True, repeated copyrights, detected or declared licenses are
    ignored, and only unique detections are returned.
    """
    if not copyright_file:
        return None, None, None

    def collect(values, value):
        # Append `value` to `values`, skipping repeats when `unique` is set.
        if not unique or value not in values:
            values.append(value)

    deco = fix_copyright(DebianCopyright.from_file(copyright_file))

    declared_licenses = []
    detected_licenses = []
    copyrights = []

    for paragraph in deco.paragraphs:

        if skip_debian_packaging and is_debian_packaging(paragraph):
            # Skipping packaging license and copyrights since they are not
            # relevant to the effective package license
            continue

        if isinstance(paragraph, (CopyrightHeaderParagraph, CopyrightFilesParagraph)):
            for statement in paragraph.copyright.statements or []:
                collect(copyrights, statement.dumps())

        if isinstance(paragraph, CatchAllParagraph):
            # These paragraphs are only used for detection on their whole
            # dumped text: no license "name" is read from them.
            text = paragraph.dumps()
            if text:
                detected = get_normalized_expression(text, try_as_expression=False)
                # honor `unique` here too, as for every other detection
                collect(detected_licenses, detected or 'unknown')
            continue

        plicense = paragraph.license
        if not plicense:
            continue

        declared, detected = detect_declared_license(plicense.name)
        # Skip empty values in all cases: they cannot be joined as strings or
        # parsed as a license expression further down.
        if declared:
            collect(declared_licenses, declared)
        if detected:
            collect(detected_licenses, detected)

        # also detect in text
        text = plicense.text
        if text:
            detected = get_normalized_expression(text, try_as_expression=False)
            collect(detected_licenses, detected or 'unknown')

    declared_license = '\n'.join(declared_licenses)

    if detected_licenses:
        licensing = Licensing()
        parsed = [licensing.parse(dl, simple=True) for dl in detected_licenses]

        # combine multiple detections with an AND
        if len(parsed) > 1:
            detected_license = licensing.AND(*parsed)
        else:
            detected_license = parsed[0]

        if simplify_licenses:
            detected_license = detected_license.simplify()

        detected_license = str(detected_license)

    else:
        detected_license = 'unknown'

    return declared_license, detected_license, '\n'.join(copyrights)
Example #2
0
def get_normalized_expression(query_string, try_as_expression=True):
    """
    Given a text `query_string` return a single detected license expression.
    `query_string` is typically the value of a license field as found in package
    manifests.

    If `try_as_expression` is True (the default), first try to match the
    `query_string` as a license expression, then match it again as plain text
    if that attempt fails or yields unknown licenses. If False, match it only
    as plain text.

    Return None if the `query_string` is empty or only whitespace. Return
    "unknown" as a license expression if there is a `query_string` but nothing
    was detected.

    Raise an Exception if the matches do not all share the same query object.

    For example::

    >>> get_normalized_expression('mit')
    'mit'
    >>> get_normalized_expression('mit or asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> get_normalized_expression('mit asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> assert get_normalized_expression('') is None
    >>> assert get_normalized_expression(None) is None
    """
    if not query_string or not query_string.strip():
        return

    if TRACE:
        logger_debug('get_normalized_expression: query_string: "{}"'.format(
            query_string))

    from licensedcode.cache import get_index
    idx = get_index()
    licensing = Licensing()

    # we match twice in a cascade: as an expression, then as plain text if we
    # did not succeed.
    matches = None
    if try_as_expression:
        try:
            matched_as_expression = True
            matches = idx.match(query_string=query_string, as_expression=True)
            if matches_have_unknown(matches, licensing):
                # rematch also if we have unknowns
                matched_as_expression = False
                matches = idx.match(query_string=query_string, as_expression=False)

        except Exception:
            # Deliberate best-effort: any failure of the expression matching
            # falls back to plain text matching.
            matched_as_expression = False
            matches = idx.match(query_string=query_string, as_expression=False)
    else:
        # the caller asked to skip the expression matching entirely
        matched_as_expression = False
        matches = idx.match(query_string=query_string, as_expression=False)

    if not matches:
        # we have a query_string text but there was no match: return an unknown
        # key
        return 'unknown'

    if TRACE:
        logger_debug('get_normalized_expression: matches:', matches)

    # join the possible multiple detected license expression with an AND
    expression_objects = [m.rule.license_expression_object for m in matches]
    if len(expression_objects) == 1:
        combined_expression_object = expression_objects[0]
    else:
        combined_expression_object = licensing.AND(*expression_objects)

    if matched_as_expression:
        # then just return the expression(s)
        return str(combined_expression_object)

    # Otherwise, verify that we consumed 100% of the query string e.g. that we
    # have no unknown leftover.

    # 1. have all matches 100% coverage?
    all_matches_have_full_coverage = all(m.coverage() == 100 for m in matches)

    # TODO: have all matches a high enough score?

    # 2. are all declared license tokens consumed?
    query = matches[0].query
    # the query object should be the same for all matches. Is this always true??
    for mt in matches:
        if mt.query != query:
            # FIXME: the exception may be swallowed in callers!!!
            raise Exception(
                'Inconsistent package.declared_license: text with multiple "queries". '
                'Please report this issue to the scancode-toolkit team.\n'
                '{}'.format(query_string))

    # compare the count of query tokens with the count of query positions
    # covered by the union of all the matches
    query_len = len(query.tokens)
    matched_qspans = [m.qspan for m in matches]
    matched_qpositions = Span.union(*matched_qspans)
    len_all_matches = len(matched_qpositions)
    declared_license_is_fully_matched = query_len == len_all_matches

    if not all_matches_have_full_coverage or not declared_license_is_fully_matched:
        # We inject an 'unknown' symbol in the expression
        unknown = licensing.parse('unknown', simple=True)
        combined_expression_object = licensing.AND(combined_expression_object,
                                                   unknown)

    return str(combined_expression_object)