def _tally_pb_only_entities(docs):
    """Count bioentity docs with a single annotation class, overall, per type and per taxon.

    Docs whose ``annotation_class_list`` holds more than one entry are skipped.
    Returns a ``(count, by_type, by_taxon)`` tuple where the last two are
    plain ``{key: count}`` dicts.
    """
    by_type = {}
    by_taxon = {}
    count = 0
    for doc in docs:
        if len(doc['annotation_class_list']) > 1:
            continue
        count += 1
        by_type[doc['type']] = by_type.get(doc['type'], 0) + 1
        # The taxon tally must be looked up in the taxon dict; checking the
        # type dict here left every taxon stuck at a count of 1.
        by_taxon[doc['taxon']] = by_taxon.get(doc['taxon'], 0) + 1
    return count, by_type, by_taxon


def _subtract_from_facet(facet, removed):
    """Subtract ``removed[key]`` from a flat ``[key, count, key, count, ...]`` facet list, in place."""
    for i in range(0, len(facet), 2):
        facet[i + 1] -= removed.get(facet[i], 0)


def compute_stats(golr_url, release_date, exclude_pb_only=False):
    """
    Compute stats on GO annotations.

    Can specify if we include or exclude annotations to protein binding only.

    Args:
        golr_url: GOLR base url; also stored in the module-level
            ``golr_base_url`` so the other fetch helpers of this module use it.
        release_date: value passed through to ``create_stats``.
        exclude_pb_only: when true, annotations are fetched with the
            "no protein binding" query and the bioentity totals / type facet /
            taxon facet are reduced by the bioentities tallied from the
            protein-binding bioentity query.

    Returns:
        The stats structure built by ``create_stats``.
    """
    global golr_base_url
    golr_base_url = golr_url
    print("Will use golr url: ", golr_base_url)

    print("1 / 4 - Fetching GO terms...")
    all_terms = utils.golr_fetch(golr_base_url, golr_select_ontology)
    print("Done.")

    print("2 / 4 - Fetching GO annotations...")
    if exclude_pb_only:
        all_annotations = utils.golr_fetch(
            golr_base_url, golr_select_annotations_no_pbinding)
    else:
        all_annotations = utils.golr_fetch(golr_base_url,
                                           golr_select_annotations)
    print("Done.")

    print("3 / 4 - Fetching GO bioentities...")
    all_entities = utils.golr_fetch(golr_base_url, golr_select_bioentities)

    # We have to manually update the facets of the first query if we want to
    # remove the bioentities annotated only to protein binding.
    if exclude_pb_only:
        all_entities_no_pb = utils.golr_fetch(golr_base_url,
                                              golr_select_bioentities_pb)
        count, entities_type_no_pb, entities_taxon_no_pb = \
            _tally_pb_only_entities(all_entities_no_pb['response']['docs'])

        # The facet lists are modified in place inside all_entities.
        facet_fields = all_entities['facet_counts']['facet_fields']
        _subtract_from_facet(facet_fields['type'], entities_type_no_pb)
        all_entities['response']['numFound'] -= count
        _subtract_from_facet(facet_fields['taxon'], entities_taxon_no_pb)
    print("Done.")

    qualifiers = utils.golr_fetch(golr_base_url, golr_select_qualifiers)
    qualifiers = utils.build_map(
        qualifiers['facet_counts']['facet_fields']['qualifier'])

    print("4 / 4 - Creating Stats...")
    prepare_globals(all_annotations)
    print("\t4a - globals prepared")
    stats = create_stats(all_terms, all_annotations, all_entities,
                         release_date, qualifiers, exclude_pb_only)
    print("Done.")

    return stats
def get_references():
    """Fetch the reference facet from GOLR and return it converted by ``utils.build_map``.

    Uses the module-level ``golr_base_url``, which ``compute_stats`` sets.
    """
    response = utils.golr_fetch(golr_base_url, golr_select_references)
    reference_facet = response['facet_counts']['facet_fields']['reference']
    return utils.build_map(reference_facet)
def _merge_aspect_facets(responses, facet_name):
    """Merge the ALL/BP/MF/CC responses' ``facet_name`` facets into ``{key: {"A", "P", "F", "C"}}``.

    Only keys present in the ALL facet are kept; a key missing from an
    aspect-specific facet gets 0 for that aspect.
    """
    all_map = utils.build_map(
        responses[ALL]['facet_counts']['facet_fields'][facet_name])
    bp_map = utils.build_map(
        responses[BP]['facet_counts']['facet_fields'][facet_name])
    mf_map = utils.build_map(
        responses[MF]['facet_counts']['facet_fields'][facet_name])
    cc_map = utils.build_map(
        responses[CC]['facet_counts']['facet_fields'][facet_name])
    return {
        key: {
            "A": value,
            "P": bp_map.get(key, 0),
            "F": mf_map.get(key, 0),
            "C": cc_map.get(key, 0),
        }
        for key, value in all_map.items()
    }


def _count_references(reference_facet):
    """Return ``(nb_references, nb_pmids)`` for a flat ``[key, count, ...]`` reference facet list."""
    # The facet list alternates key and count, hence half its length.
    total = len(reference_facet) // 2
    pmids = len(
        utils.extract_map(utils.build_map(reference_facet), "PMID:"))
    return total, pmids


def create_stats(all_terms,
                 all_annotations,
                 all_entities,
                 release_date,
                 qualifiers,
                 exclude_pb_only=False):
    """
    Assemble the stats structure from the already fetched GOLR responses.

    Additional per-taxon / per-group queries are issued through the
    module-level fetch helpers while building the result.

    Args:
        all_terms: GOLR response for the ontology terms (``response.docs`` is read).
        all_annotations: GOLR response for the annotations (``response.numFound``
            and the aspect / type / taxon / evidence_type / assigned_by /
            reference facets are read).
        all_entities: GOLR response for the bioentities (``response.numFound``
            and the type facet are read).
        release_date: stored as-is under ``"release_date"``.
        qualifiers: stored as-is under ``annotations["by_qualifier"]``.
        exclude_pb_only: forwarded to
            ``golr_fetch_annotation_by_evidence_by_species``.

    Returns:
        dict with the keys ``release_date``, ``terms``, ``annotations``,
        ``taxa``, ``bioentities`` and ``references``.
    """
    stats = {}

    valid_terms = 0
    obsoleted = 0
    terms_by_aspect = {"P": 0, "F": 0, "C": 0}
    aspect_markers = (("biological_process", "P"),
                      ("molecular_function", "F"),
                      ("cellular_component", "C"))
    for doc in all_terms['response']['docs']:
        if doc['is_obsolete']:
            obsoleted += 1
            continue
        valid_terms += 1
        # NOTE(review): aspect tallies are taken for non-obsolete terms only;
        # confirm this nesting matches the intended layout.
        # some obsoleted annotations don't have a source
        if 'source' not in doc:
            continue
        for marker, aspect in aspect_markers:
            if marker in doc['source']:
                terms_by_aspect[aspect] += 1

    terms = {
        "total": all_terms['response']['numFound'],
        "valid": valid_terms,
        "obsolete": obsoleted,
        "by_aspect": terms_by_aspect
    }
    print("\t4b - terms computed")

    all_bioentities_by_taxon = {}
    cluster_bioentities_by_taxon = {}
    for taxon in usable_taxons:
        responses = golr_fetch_bioentities_taxon(taxon)
        all_bioentities_by_taxon[taxon] = _merge_aspect_facets(
            responses, 'type')
        cluster_bioentities_by_taxon[taxon] = utils.cluster_complex_map(
            all_bioentities_by_taxon[taxon], bioentity_type_cluster)
    print("\t4c - bioentities computed")

    references_by_taxon = {}
    pmids_by_taxon = {}
    for taxon in usable_taxons:
        res = golr_fetch_references_taxon(taxon)
        references_by_taxon[taxon], pmids_by_taxon[taxon] = \
            _count_references(res['facet_counts']['facet_fields']['reference'])
    references_by_taxon = utils.ordered_map(references_by_taxon)
    pmids_by_taxon = utils.ordered_map(pmids_by_taxon)
    print("\t4d - taxa computed")

    references_by_group = {}
    pmids_by_group = {}
    for group in groups:
        res = golr_fetch_references_group(group)
        references_by_group[group], pmids_by_group[group] = \
            _count_references(res['facet_counts']['facet_fields']['reference'])
    references_by_group = utils.ordered_map(references_by_group)
    pmids_by_group = utils.ordered_map(pmids_by_group)
    print("\t4e - references computed")

    ref_genome_annotation_evidences = {}
    for taxon in reference_genomes_ids:
        responses = golr_fetch_annotation_by_evidence_by_species(
            taxon, exclude_pb_only)
        by_evidence = _merge_aspect_facets(responses, 'evidence_type')
        organism_stats = {"by_evidence": by_evidence}
        organism_stats["by_evidence_cluster"] = utils.cluster_complex_map(
            by_evidence, reverse_evidence_groups)

        # adding qualifiers for each model organism
        response_qualifiers = utils.golr_fetch_by_taxon(
            golr_base_url, golr_select_qualifiers, taxon)
        organism_stats["by_qualifier"] = utils.build_map(
            response_qualifiers['facet_counts']['facet_fields']['qualifier'])
        ref_genome_annotation_evidences[taxon] = organism_stats

    annotation_facets = all_annotations['facet_counts']['facet_fields']
    # Each cluster_map call receives its own freshly built map, so the map
    # stored under "all" is never the object handed to cluster_map.
    annotations = {
        "total": all_annotations['response']['numFound'],
        "by_aspect": utils.build_map(annotation_facets['aspect']),
        "by_bioentity_type": {
            "all": utils.build_map(annotation_facets['type']),
            "cluster": utils.cluster_map(
                utils.build_map(annotation_facets['type']),
                bioentity_type_cluster)
        },
        "by_qualifier": qualifiers,
        "by_taxon": utils.build_map(annotation_facets['taxon']),
        "by_evidence": {
            "all": utils.build_map(annotation_facets['evidence_type']),
            "cluster": utils.cluster_map(
                utils.build_map(annotation_facets['evidence_type']),
                reverse_evidence_groups)
        },
        "by_model_organism": ref_genome_annotation_evidences,
        "by_group": utils.build_map(annotation_facets['assigned_by'])
    }
    annotations = add_taxon_label(annotations)

    taxa = {
        "total": len(annotation_facets['taxon']) // 2,
        "filtered": len(usable_taxons),
    }

    entity_type_facet = all_entities['facet_counts']['facet_fields']['type']
    bioentities = {
        "total": all_entities['response']['numFound'],
        "by_type": {
            "all": utils.build_map(entity_type_facet),
            "cluster": utils.cluster_map(
                utils.build_map(entity_type_facet), bioentity_type_cluster)
        },
        "by_filtered_taxon": {
            "all": all_bioentities_by_taxon,
            "cluster": cluster_bioentities_by_taxon
        }
        # A "by_taxon" split (all / experimental) can not work here: it would
        # require an evidence field in the GOLR bioentity docs.
    }
    bioentities = add_taxon_label(bioentities)

    total_references, total_pmids = _count_references(
        annotation_facets['reference'])
    references = {
        "all": {
            "total": total_references,
            "by_filtered_taxon": references_by_taxon,
            "by_group": references_by_group
        },
        "pmids": {
            "total": total_pmids,
            "by_filtered_taxon": pmids_by_taxon,
            "by_group": pmids_by_group
        }
    }
    references = add_taxon_label(references)

    stats["release_date"] = release_date
    stats["terms"] = terms
    stats["annotations"] = annotations
    stats["taxa"] = taxa
    stats["bioentities"] = bioentities
    stats["references"] = references

    return stats