def _add_item(items, value, unique):
    """
    Append `value` to the `items` list in place. If `unique` is True, a
    `value` already present in `items` is skipped.
    """
    if unique and value in items:
        return
    items.append(value)


def parse_structured_copyright_file(
    copyright_file,
    skip_debian_packaging=True,
    simplify_licenses=True,
    unique=True,
):
    """
    Return a tuple of (declared license, detected license_expression,
    copyrights) strings computed from the `copyright_file` location.
    Return a (None, None, None) tuple if `copyright_file` is empty.

    For each copyright file paragraph we treat the "name" as a license
    declaration. The text is used for detection and cross-reference with the
    declaration.

    The declared licenses and the copyrights are each joined with newlines.
    The detected license expressions are combined with AND; the detected
    license is "unknown" if nothing was detected.

    If `skip_debian_packaging` is True, the Debian packaging license --if
    detected-- is skipped.

    If `simplify_licenses` is True the license expressions are simplified.

    If `unique` is True, repeated copyrights, detected or declared licenses
    are ignored, and only unique detections are returned.
    """
    if not copyright_file:
        return None, None, None

    deco = DebianCopyright.from_file(copyright_file)
    deco = fix_copyright(deco)

    declared_licenses = []
    detected_licenses = []
    copyrights = []

    licensing = Licensing()

    for paragraph in deco.paragraphs:

        if skip_debian_packaging and is_debian_packaging(paragraph):
            # Skipping packaging license and copyrights since they are not
            # relevant to the effective package license
            continue

        if isinstance(paragraph, (CopyrightHeaderParagraph, CopyrightFilesParagraph)):
            for statement in paragraph.copyright.statements or []:
                _add_item(copyrights, statement.dumps(), unique)

        if isinstance(paragraph, CatchAllParagraph):
            # a catch-all paragraph has no structure: detect in its whole text
            text = paragraph.dumps()
            if text:
                detected = get_normalized_expression(text, try_as_expression=False)
                # honor `unique` here too, as for every other detection
                _add_item(detected_licenses, detected or 'unknown', unique)
            continue

        plicense = paragraph.license
        if not plicense:
            continue

        declared, detected = detect_declared_license(plicense.name)
        # Empty values are always skipped, whether `unique` or not: a None
        # would otherwise break the string join and the expression parsing
        # done below.
        if declared:
            _add_item(declared_licenses, declared, unique)
        if detected:
            _add_item(detected_licenses, detected, unique)

        # also detect in text
        text = plicense.text
        if text:
            detected = get_normalized_expression(text, try_as_expression=False)
            _add_item(detected_licenses, detected or 'unknown', unique)

    declared_license = '\n'.join(declared_licenses)

    if detected_licenses:
        expressions = [licensing.parse(dl, simple=True) for dl in detected_licenses]

        if len(expressions) > 1:
            detected_license = licensing.AND(*expressions)
        else:
            detected_license = expressions[0]

        if simplify_licenses:
            detected_license = detected_license.simplify()

        detected_license = str(detected_license)
    else:
        detected_license = 'unknown'

    copyrights = '\n'.join(copyrights)
    return declared_license, detected_license, copyrights
def get_normalized_expression(query_string, try_as_expression=True):
    """
    Given a text `query_string` return a single detected license expression.
    `query_string` is typically the value of a license field as found in
    package manifests.

    If `try_as_expression` is True (the default), the `query_string` is first
    matched as a license expression, and matched again as plain text if that
    fails or yields unknown licenses. If False, the `query_string` is matched
    only as plain text.

    Return None if the `query_string` is empty. Return "unknown" as a license
    expression if there is a `query_string` but nothing was detected.

    Raise an Exception if the matches do not all share the same query.

    For example::

    >>> get_normalized_expression('mit')
    'mit'
    >>> get_normalized_expression('mit or asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> get_normalized_expression('mit or asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> get_normalized_expression('mit asasa or Apache-2.0')
    'apache-2.0 AND unknown'
    >>> assert get_normalized_expression('') is None
    >>> assert get_normalized_expression(None) is None
    """
    if not query_string or not query_string.strip():
        return

    if TRACE:
        logger_debug('get_normalized_expression: query_string: "{}"'.format(
            query_string))

    from licensedcode.cache import get_index
    idx = get_index()
    licensing = Licensing()

    # we match twice in a cascade: as an expression, then as plain text if we
    # did not succeed.
    matches = None
    matched_as_expression = False
    if try_as_expression:
        try:
            matches = idx.match(query_string=query_string, as_expression=True)
            # rematch as plain text also if we have unknowns
            matched_as_expression = not matches_have_unknown(matches, licensing)
        except Exception:
            # best effort: any failure matching as an expression falls back
            # to the plain text match below
            matched_as_expression = False

    if not matched_as_expression:
        matches = idx.match(query_string=query_string, as_expression=False)

    if not matches:
        # we have a query_string text but there was no match: return an unknown
        # key
        return 'unknown'

    if TRACE:
        logger_debug('get_normalized_expression: matches:', matches)

    # join the possible multiple detected license expression with an AND
    expression_objects = [m.rule.license_expression_object for m in matches]
    if len(expression_objects) == 1:
        combined_expression_object = expression_objects[0]
    else:
        combined_expression_object = licensing.AND(*expression_objects)

    if matched_as_expression:
        # then just return the expression(s)
        return str(combined_expression_object)

    # Otherwise, verify that we consumed 100% of the query string e.g. that we
    # have no unknown leftover.

    # 1. have all matches 100% coverage?
    all_matches_have_full_coverage = all(m.coverage() == 100 for m in matches)

    # TODO: have all matches a high enough score?

    # 2. are all declared license tokens consumed?
    query = matches[0].query
    # the query object should be the same for all matches. Is this always true??
    for mt in matches:
        if mt.query != query:
            # FIXME: the exception may be swallowed in callers!!!
            raise Exception(
                'Inconsistent package.declared_license: text with multiple "queries".'
                'Please report this issue to the scancode-toolkit team.\n'
                '{}'.format(query_string))

    query_len = len(query.tokens)
    matched_qspans = [m.qspan for m in matches]
    matched_qpositions = Span.union(*matched_qspans)
    len_all_matches = len(matched_qpositions)
    declared_license_is_fully_matched = query_len == len_all_matches

    if not all_matches_have_full_coverage or not declared_license_is_fully_matched:
        # We inject an 'unknown' symbol in the expression
        unknown = licensing.parse('unknown', simple=True)
        combined_expression_object = licensing.AND(combined_expression_object, unknown)

    return str(combined_expression_object)