def language_summarizer(resource, children, keep_details=False):
    """
    Compute, store and return the programming language summary of `resource`.

    The summary combines the `programming_language` value of the resource
    itself with the summaries already stored on its direct `children`. It is
    meant to be a list of mappings such as
        {value: "programming_language", count: "count of occurrences"}
    sorted by decreasing count, as produced by `sorted_counter()` applied to
    the result of `summarize_languages()`.

    `keep_details` is forwarded as `as_attribute` both when reading the
    children summaries and when storing the resulting summary.
    """
    key = 'programming_language'

    own_language = getattr(resource, key, [])
    if own_language:
        collected = [own_language]
    elif resource.is_file:
        # a file without a detected language is still counted, as None
        collected = [None]
    else:
        collected = []

    # Expand each child summary entry back into `count` repeated values so
    # that everything can be counted again in a single pass.
    for child in children:
        entries = get_resource_summary(
            child, key=key, as_attribute=keep_details)
        for entry in entries or []:
            collected.extend([entry['value']] * entry['count'])

    summary = sorted_counter(summarize_languages(collected))
    set_resource_summary(
        resource, key=key, value=summary, as_attribute=keep_details)
    return summary
def license_summarizer(resource, children, keep_details=False):
    """
    Compute, store and return the license expressions summary of `resource`.

    The summary combines the `license_expressions` of the resource itself
    with the summaries already stored on its direct `children`. It is meant
    to be a list of mappings such as
        {value: "expression", count: "count of occurrences"}
    sorted by decreasing count, as produced by `sorted_counter()` applied to
    the result of `summarize_licenses()`.

    A file without any license expression contributes a single None value so
    that files with no detection are counted too.

    `keep_details` is forwarded as `as_attribute` both when reading the
    children summaries and when storing the resulting summary.
    """
    key = 'license_expressions'

    # Treat a missing attribute and an attribute set to None the same way as
    # an empty list: a non-file resource without license data must not fail
    # on `extend(None)`.
    own_expressions = getattr(resource, key, None) or []

    collected = []
    if own_expressions:
        collected.extend(own_expressions)
    elif resource.is_file:
        # also count files with no detection
        collected.append(None)

    # Collect direct children expression summaries
    for child in children:
        entries = get_resource_summary(
            child, key=key, as_attribute=keep_details)
        for entry in entries or []:
            # TODO: review this: this feels rather weird
            # Each entry is expanded back into `count` repeated values.
            collected.extend([entry['value']] * entry['count'])

    summary = sorted_counter(summarize_licenses(collected))
    set_resource_summary(
        resource, key=key, value=summary, as_attribute=keep_details)
    return summary
def build_summary(resource, children, attribute, summarizer, keep_details=False):
    """
    Compute, store and return the summary of `resource` for `attribute`,
    based on the resource own data and on the summaries of its `children`.

    - `attribute` is the name of the summarized attribute ('copyrights',
      'holders', etc.). The resource value for this attribute is expected to
      be an iterable of mappings with a "value" key.
    - `summarizer` is a callable invoked with the list of collected texts. Its
      return value must support `update()` with a mapping and is then passed
      to `sorted_counter()`.
    - `keep_details` is forwarded as `as_attribute` both when reading the
      children summaries and when storing the resulting summary.

    Files without any detection and children summary entries without a value
    are counted apart and added back to the summary under the None key.
    """
    undetected = 0

    detections = getattr(resource, attribute, [])
    if detections:
        # the resource own data is kept as plain values
        texts = [detection.get('value') for detection in detections]
    else:
        texts = []
        if resource.is_file:
            undetected += 1

    # Fold in the summaries already computed for the direct children.
    for child in children:
        entries = get_resource_summary(
            child, key=attribute, as_attribute=keep_details)
        for entry in entries or []:
            count = entry['count']
            value = entry['value']
            if not value:
                # an entry without a value stands for `count` non-detections
                undetected += count
                continue
            texts.append(Text(value, value, count))

    summary = summarizer(texts)

    if undetected:
        # add back the things without detection
        summary.update({None: undetected})

    summary = sorted_counter(summary)

    if TRACE:
        logger_debug('COPYRIGHT summarized:', summary)

    set_resource_summary(
        resource, key=attribute, value=summary, as_attribute=keep_details)
    return summary
def summarize_codebase_key_files(codebase, **kwargs):
    """
    Summarize the key files of `codebase` and store the result as
    `codebase.attributes.summary_of_key_files`.

    Key files are the top-level files that are flagged as a readme, a legal
    or a manifest file. The stored result is an OrderedDict mapping each
    summarized attribute name to the `sorted_counter()` of the
    `summarize_values()` computed for the values collected from these files.

    `kwargs` is accepted and ignored.
    """
    available = codebase.attributes.summary.keys()
    if TRACE:
        logger_debug('summarizable_attributes:', available)

    # TODO: we cannot summarize packages with "key files" for now
    supported = {
        'license_expressions',
        'copyrights',
        'holders',
        'authors',
        'programming_language',
        # 'packages',
    }

    # one list of collected values for each attribute that is summarized
    values_by_key = OrderedDict()
    for name in available:
        if name in supported:
            values_by_key[name] = []

    for res in codebase.walk(topdown=True):
        # only consider key files
        if not (res.is_file and res.is_top_level):
            continue
        if not (res.is_readme or res.is_legal or res.is_manifest):
            continue

        for key, values in values_by_key.items():
            # note: summaries are read as extra-data (as_attribute=False),
            # not as direct Resource attributes
            entries = get_resource_summary(
                res, key=key, as_attribute=False)
            for entry in entries or []:
                # expand each value/count mapping back into repeated values
                values.extend([entry['value']] * entry['count'])

    counters = [
        (key, summarize_values(values, key))
        for key, values in values_by_key.items()
    ]
    summary_of_key_files = OrderedDict(
        (key, sorted_counter(counter)) for key, counter in counters)

    codebase.attributes.summary_of_key_files = summary_of_key_files

    if TRACE:
        logger_debug('codebase summary_of_key_files:', summary_of_key_files)
def summarize_codebase_by_facet(codebase, **kwargs):
    """
    Summarize `codebase` by facet and extend
    `codebase.attributes.summary_by_facet` with the results.

    One mapping is appended for each facet of `summarycode.facet.FACETS`, with
    a "facet" key and a "summary" key. The summary maps each attribute name
    found in `codebase.attributes.summary` to the `sorted_counter()` of the
    `summarize_values()` computed for the values collected from the files
    that have this facet.

    `kwargs` is accepted and ignored.
    """
    from summarycode import facet as facet_module

    attribute_names = codebase.attributes.summary.keys()
    if TRACE:
        logger_debug(
            'summarize_codebase_by_facet for attributes:', attribute_names)

    # for each facet, one list of collected values for each attribute
    values_by_facet = {
        facet: {name: [] for name in attribute_names}
        for facet in facet_module.FACETS
    }

    for res in codebase.walk(topdown=True):
        if res.is_file:
            for facet in res.facets:
                # note: this lookup fails loudly with a KeyError if the facet
                # is not a known one
                for key, values in values_by_facet[facet].items():
                    # note: summaries are read as extra-data
                    # (as_attribute=False), not as direct Resource attributes
                    entries = get_resource_summary(
                        res, key=key, as_attribute=False)
                    for entry in entries or []:
                        # expand each value/count mapping back into discrete
                        # values, skipping entries without a value
                        value = entry.get('value')
                        if value:
                            values.extend([value] * entry['count'])

    summaries = []
    for facet, values_by_key in values_by_facet.items():
        summary = {
            key: sorted_counter(summarize_values(values, key))
            for key, values in values_by_key.items()
        }
        summaries.append({'facet': facet, 'summary': summary})

    codebase.attributes.summary_by_facet.extend(summaries)

    if TRACE:
        logger_debug('codebase summary_by_facet:', summaries)
def summarize_codebase_key_files(codebase, **kwargs):
    """
    Summarize the key files of `codebase` and store the result as
    `codebase.attributes.summary_of_key_files`.

    Key files are the top-level files that are flagged as a readme, a legal
    or a manifest file. Only the attributes of `codebase.attributes.summary`
    that are listed in SUMMARIZABLE_ATTRS are summarized. The stored result is
    a dict mapping each of these attribute names to the `sorted_counter()` of
    the `summarize_values()` computed for the values collected from the key
    files. Summary entries without a value are ignored.

    `kwargs` is accepted and ignored.
    """
    summarizables = codebase.attributes.summary.keys()
    if TRACE:
        logger_debug('summarizables:', summarizables)

    # TODO: we cannot summarize packages with "key files" for now
    # One list of collected values for each summarized attribute. The keys
    # are filtered once here: no further SUMMARIZABLE_ATTRS check is needed
    # when iterating this mapping below.
    values_by_key = {
        key: [] for key in summarizables if key in SUMMARIZABLE_ATTRS
    }

    # filter to get only key files
    key_files = (
        res for res in codebase.walk(topdown=True)
        if (res.is_file and res.is_top_level
            and (res.is_readme or res.is_legal or res.is_manifest))
    )

    for resource in key_files:
        for key, values in values_by_key.items():
            # note: summaries are read as extra-data (as_attribute=False),
            # not as direct Resource attributes
            res_summaries = get_resource_summary(
                resource, key=key, as_attribute=False) or []
            for summary in res_summaries:
                # expand each value/count mapping back into discrete values,
                # skipping entries without a value
                sum_value = summary.get('value')
                if sum_value:
                    values.extend([sum_value] * summary['count'])

    # compute all the counters first, then sort each of them
    summary_counters = [
        (key, summarize_values(values, key))
        for key, values in values_by_key.items()
    ]
    sorted_summaries = {
        key: sorted_counter(counter) for key, counter in summary_counters
    }

    codebase.attributes.summary_of_key_files = sorted_summaries

    if TRACE:
        logger_debug('codebase summary_of_key_files:', sorted_summaries)
def package_summarizer(resource, children, keep_details=False):
    """
    Compute, store and return the packages summary of `resource` as a list of
    package mappings.

    The list contains the packages of the resource itself, after they have
    been passed through `add_files()` together with the resource, followed by
    the packages summaries already stored on its direct `children`.

    A resource that has no `packages` attribute, or whose `packages` value is
    empty or None, contributes no package of its own.

    Note: `keep_details` is never used, as we are not keeping details of
    packages as this has no value. The summary is always read and stored with
    `as_attribute=False`.
    """
    # Use a default so that a resource without a `packages` attribute is
    # handled like one without packages instead of raising AttributeError.
    current_packages = getattr(resource, 'packages', None) or []

    if TRACE_LIGHT and current_packages:
        from packagedcode.models import Package
        packs = [Package.create(**p) for p in current_packages]
        logger_debug('package_summarizer: for:', resource,
                     'current_packages are:', packs)

    # add_files() may return any iterable: materialize it as the new list
    packages = list(add_files(current_packages, resource))

    if TRACE_LIGHT and packages:
        # NOTE(review): argument-less call kept from the original; presumably
        # it emits an empty separator line -- confirm against logger_debug
        logger_debug()
        # imported again as the first trace block may not have run
        from packagedcode.models import Package  # NOQA
        packs = [Package.create(**p) for p in packages]
        logger_debug('package_summarizer: for:', resource,
                     'packages are:', packs)

    # Collect direct children packages summaries
    for child in children:
        child_summaries = get_resource_summary(
            child, key='packages', as_attribute=False) or []
        packages.extend(child_summaries)

    set_resource_summary(
        resource, key='packages', value=packages, as_attribute=False)
    return packages