Ejemplo n.º 1
0
def _tally_single_class_entities(docs):
    """Count bioentity docs that list at most one annotation class.

    Args:
        docs: iterable of bioentity documents; each must provide
            'annotation_class_list', 'type' and 'taxon'.

    Returns:
        A tuple ``(count, by_type, by_taxon)`` where ``count`` is the number
        of retained docs and the two dicts map each type / taxon value to the
        number of retained docs carrying it.
    """
    count = 0
    by_type = {}
    by_taxon = {}
    for doc in docs:
        # Entities with more than one annotation class are kept in the stats.
        if len(doc['annotation_class_list']) > 1:
            continue
        count += 1
        by_type[doc['type']] = by_type.get(doc['type'], 0) + 1
        by_taxon[doc['taxon']] = by_taxon.get(doc['taxon'], 0) + 1
    return count, by_type, by_taxon


def _subtract_from_facet(facet, removed):
    """Subtract per-value counts from a flat facet list, in place.

    Args:
        facet: flat list laid out as ``[value, count, value, count, ...]``.
        removed: dict mapping a facet value to the amount to subtract from
            its count; values absent from the dict are left unchanged.
    """
    for i in range(0, len(facet), 2):
        facet[i + 1] -= removed.get(facet[i], 0)


def compute_stats(golr_url, release_date, exclude_pb_only=False):
    """
    Compute stats on GO annotations.

    Fetches terms, annotations, bioentities and qualifiers from GOLr, then
    delegates to ``prepare_globals`` and ``create_stats``.

    Args:
        golr_url: GOLr base url; stored in the module-level ``golr_base_url``
            so that the other functions of this module use the same endpoint.
        release_date: passed through unchanged to ``create_stats``.
        exclude_pb_only: when true, annotations are fetched with the
            ``golr_select_annotations_no_pbinding`` query and the bioentity
            totals / facets are corrected to remove the bioentities annotated
            only to protein binding.

    Returns:
        The stats object returned by ``create_stats``.
    """
    global golr_base_url
    golr_base_url = golr_url

    print("Will use golr url: ", golr_base_url)

    print("1 / 4 - Fetching GO terms...")
    all_terms = utils.golr_fetch(golr_base_url, golr_select_ontology)
    print("Done.")

    print("2 / 4 - Fetching GO annotations...")
    if exclude_pb_only:
        all_annotations = utils.golr_fetch(
            golr_base_url, golr_select_annotations_no_pbinding)
    else:
        all_annotations = utils.golr_fetch(golr_base_url,
                                           golr_select_annotations)
    print("Done.")

    print("3 / 4 - Fetching GO bioentities...")
    all_entities = utils.golr_fetch(golr_base_url, golr_select_bioentities)

    # The bioentity query cannot exclude protein-binding-only entities by
    # itself, so its total and facets are corrected manually.
    if exclude_pb_only:
        # NOTE(review): presumably this query returns the bioentities
        # annotated to protein binding; those with a single annotation class
        # are treated as protein-binding-only -- confirm against the query.
        pb_entities = utils.golr_fetch(golr_base_url,
                                       golr_select_bioentities_pb)
        pb_only_count, pb_only_by_type, pb_only_by_taxon = (
            _tally_single_class_entities(pb_entities['response']['docs']))

        facet_fields = all_entities['facet_counts']['facet_fields']

        # The facet lists are mutated in place, so no re-assignment is needed.
        _subtract_from_facet(facet_fields['type'], pb_only_by_type)

        all_entities['response']['numFound'] = (
            all_entities['response']['numFound'] - pb_only_count)

        # The taxon tally previously tested membership against the type
        # dictionary, so every taxon was counted at most once; it is now
        # counted per document.
        _subtract_from_facet(facet_fields['taxon'], pb_only_by_taxon)

    print("Done.")

    qualifiers = utils.golr_fetch(golr_base_url, golr_select_qualifiers)
    qualifiers = utils.build_map(
        qualifiers['facet_counts']['facet_fields']['qualifier'])

    print("4 / 4 - Creating Stats...")
    prepare_globals(all_annotations)
    print("\t4a - globals prepared")
    stats = create_stats(all_terms, all_annotations, all_entities,
                         release_date, qualifiers, exclude_pb_only)
    print("Done.")

    return stats
Ejemplo n.º 2
0
def get_references():
    """Return the map built from the 'reference' facet of the references query.

    Runs ``golr_select_references`` against the module-level
    ``golr_base_url`` and feeds the resulting facet to ``utils.build_map``.
    """
    response = utils.golr_fetch(golr_base_url, golr_select_references)
    reference_facet = response['facet_counts']['facet_fields']['reference']
    return utils.build_map(reference_facet)
Ejemplo n.º 3
0
def _merge_facet_by_aspect(responses, facet_field):
    """Combine the ALL / BP / MF / CC facet maps of one facet field.

    Args:
        responses: container indexed by the module constants ALL, BP, MF and
            CC, each entry being a GOLr response.
        facet_field: name of the facet field to read in every response.

    Returns:
        A dict mapping each key of the ALL map to
        ``{"A": .., "P": .., "F": .., "C": ..}``; an aspect count defaults
        to 0 when the key is missing from that aspect's map.
    """
    all_counts = utils.build_map(
        responses[ALL]['facet_counts']['facet_fields'][facet_field])
    bp_counts = utils.build_map(
        responses[BP]['facet_counts']['facet_fields'][facet_field])
    mf_counts = utils.build_map(
        responses[MF]['facet_counts']['facet_fields'][facet_field])
    cc_counts = utils.build_map(
        responses[CC]['facet_counts']['facet_fields'][facet_field])

    return {
        key: {
            "A": total,
            "P": bp_counts.get(key, 0),
            "F": mf_counts.get(key, 0),
            "C": cc_counts.get(key, 0)
        }
        for key, total in all_counts.items()
    }


def _reference_counts(reference_facet):
    """Return ``(reference count, PMID count)`` for a flat 'reference' facet.

    The facet is a flat ``[value, count, ...]`` list, hence the halving; the
    PMID count is the size of the "PMID:" extraction of the built map.
    """
    reference_total = len(reference_facet) // 2
    pmid_total = len(
        utils.extract_map(utils.build_map(reference_facet), "PMID:"))
    return reference_total, pmid_total


def create_stats(all_terms,
                 all_annotations,
                 all_entities,
                 release_date,
                 qualifiers,
                 exclude_pb_only=False):
    """Assemble the statistics dictionary from GOLr responses.

    Besides the responses passed in, this issues additional GOLr queries for
    every taxon in ``usable_taxons``, every group in ``groups`` and every
    taxon in ``reference_genomes_ids`` (module-level names).
    NOTE(review): those globals are presumably filled by ``prepare_globals``
    beforehand -- confirm.

    Args:
        all_terms: ontology response; its docs and 'numFound' are read.
        all_annotations: annotation response; 'numFound' and the 'aspect',
            'type', 'taxon', 'evidence_type', 'assigned_by' and 'reference'
            facets are read.
        all_entities: bioentity response; 'numFound' and the 'type' facet
            are read.
        release_date: stored as-is under "release_date".
        qualifiers: stored as-is under annotations "by_qualifier".
        exclude_pb_only: forwarded to
            ``golr_fetch_annotation_by_evidence_by_species``.

    Returns:
        A dict with the keys "release_date", "terms", "annotations", "taxa",
        "bioentities" and "references".
    """
    # --- terms ------------------------------------------------------------
    valid_count = 0
    obsolete_count = 0
    aspect_counts = {"P": 0, "F": 0, "C": 0}
    aspect_markers = (("biological_process", "P"),
                      ("molecular_function", "F"),
                      ("cellular_component", "C"))

    for term_doc in all_terms['response']['docs']:
        if term_doc['is_obsolete']:
            obsolete_count += 1
            continue

        valid_count += 1

        # some docs don't have a source and cannot be assigned to an aspect
        if 'source' not in term_doc:
            continue

        source = term_doc['source']
        for marker, aspect in aspect_markers:
            if marker in source:
                aspect_counts[aspect] += 1

    terms = {
        "total": all_terms['response']['numFound'],
        "valid": valid_count,
        "obsolete": obsolete_count,
        "by_aspect": aspect_counts
    }
    print("\t4b - terms computed")

    # --- bioentities per filtered taxon -------------------------------------
    all_bioentities_by_taxon = {}
    cluster_bioentities_by_taxon = {}
    for taxon in usable_taxons:
        by_type = _merge_facet_by_aspect(golr_fetch_bioentities_taxon(taxon),
                                         'type')
        all_bioentities_by_taxon[taxon] = by_type
        cluster_bioentities_by_taxon[taxon] = utils.cluster_complex_map(
            by_type, bioentity_type_cluster)
    print("\t4c - bioentities computed")

    # --- references per filtered taxon --------------------------------------
    references_by_taxon = {}
    pmids_by_taxon = {}
    for taxon in usable_taxons:
        taxon_response = golr_fetch_references_taxon(taxon)
        references_by_taxon[taxon], pmids_by_taxon[taxon] = _reference_counts(
            taxon_response['facet_counts']['facet_fields']['reference'])
    references_by_taxon = utils.ordered_map(references_by_taxon)
    pmids_by_taxon = utils.ordered_map(pmids_by_taxon)
    print("\t4d - taxa computed")

    # --- references per group -----------------------------------------------
    references_by_group = {}
    pmids_by_group = {}
    for group in groups:
        group_response = golr_fetch_references_group(group)
        references_by_group[group], pmids_by_group[group] = _reference_counts(
            group_response['facet_counts']['facet_fields']['reference'])
    references_by_group = utils.ordered_map(references_by_group)
    pmids_by_group = utils.ordered_map(pmids_by_group)
    print("\t4e - references computed")

    # --- evidence and qualifiers per reference genome -----------------------
    ref_genome_annotation_evidences = {}
    for taxon in reference_genomes_ids:
        by_evidence = _merge_facet_by_aspect(
            golr_fetch_annotation_by_evidence_by_species(
                taxon, exclude_pb_only), 'evidence_type')

        organism_stats = {"by_evidence": by_evidence}
        organism_stats["by_evidence_cluster"] = utils.cluster_complex_map(
            by_evidence, reverse_evidence_groups)

        # adding qualifiers for each model organism
        qualifier_response = utils.golr_fetch_by_taxon(
            golr_base_url, golr_select_qualifiers, taxon)
        organism_stats["by_qualifier"] = utils.build_map(
            qualifier_response['facet_counts']['facet_fields']['qualifier'])

        ref_genome_annotation_evidences[taxon] = organism_stats

    # --- annotations ----------------------------------------------------------
    annotation_facets = all_annotations['facet_counts']['facet_fields']
    annotations = {
        "total": all_annotations['response']['numFound'],
        "by_aspect": utils.build_map(annotation_facets['aspect']),
        "by_bioentity_type": {
            "all":
            utils.build_map(annotation_facets['type']),
            "cluster":
            utils.cluster_map(utils.build_map(annotation_facets['type']),
                              bioentity_type_cluster)
        },
        "by_qualifier": qualifiers,
        "by_taxon": utils.build_map(annotation_facets['taxon']),
        "by_evidence": {
            "all":
            utils.build_map(annotation_facets['evidence_type']),
            "cluster":
            utils.cluster_map(
                utils.build_map(annotation_facets['evidence_type']),
                reverse_evidence_groups)
        },
        "by_model_organism": ref_genome_annotation_evidences,
        "by_group": utils.build_map(annotation_facets['assigned_by'])
    }
    annotations = add_taxon_label(annotations)

    # --- taxa -----------------------------------------------------------------
    taxa = {
        "total": len(annotation_facets['taxon']) // 2,
        "filtered": len(usable_taxons),
    }

    # --- bioentities ----------------------------------------------------------
    # A per-taxon "experimental" breakdown is not provided here: it would
    # require an evidence field in the GOLr bioentity docs.
    entity_type_facet = all_entities['facet_counts']['facet_fields']['type']
    bioentities = {
        "total": all_entities['response']['numFound'],
        "by_type": {
            "all":
            utils.build_map(entity_type_facet),
            "cluster":
            utils.cluster_map(utils.build_map(entity_type_facet),
                              bioentity_type_cluster)
        },
        "by_filtered_taxon": {
            "all": all_bioentities_by_taxon,
            "cluster": cluster_bioentities_by_taxon
        }
    }
    bioentities = add_taxon_label(bioentities)

    # --- references -----------------------------------------------------------
    reference_total, pmid_total = _reference_counts(
        annotation_facets['reference'])
    references = {
        "all": {
            "total": reference_total,
            "by_filtered_taxon": references_by_taxon,
            "by_group": references_by_group
        },
        "pmids": {
            "total": pmid_total,
            "by_filtered_taxon": pmids_by_taxon,
            "by_group": pmids_by_group
        }
    }
    references = add_taxon_label(references)

    return {
        "release_date": release_date,
        "terms": terms,
        "annotations": annotations,
        "taxa": taxa,
        "bioentities": bioentities,
        "references": references
    }