def main(argv):
    """Compare two GO stats files and write the changes as JSON and TSV.

    The current and previous stats are downloaded with ``utils.fetch`` and
    decoded as JSON, compared with ``compute_changes``, post-processed with
    ``alter_annotation_changes`` (no reference id lists are supplied here),
    and saved as ``go-stats-changes.json`` / ``go-stats-changes.tsv`` in the
    output directory.

    Args:
        argv: command-line arguments without the program name. Options:
            ``-c/--current`` URL of the current stats,
            ``-p/--previous`` URL of the previous stats,
            ``-o/--orep`` output directory,
            ``-h/--help`` print the help and exit.

    Exits with status 2 (after printing the help) when fewer than three
    arguments are given or when the options cannot be parsed.
    """
    current_stats_url = ''
    previous_stats_url = ''
    output_rep = ''

    if len(argv) < 3:
        print_help()
        sys.exit(2)

    try:
        # "h"/"help" must be declared here, otherwise getopt rejects -h and
        # the help branch below can never be reached.
        opts, _ = getopt.getopt(
            argv, "hc:p:o:", ["help", "current=", "previous=", "orep="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-c", "--current"):
            current_stats_url = arg
        elif opt in ("-p", "--previous"):
            previous_stats_url = arg
        # getopt reports long options with two dashes; the former "-orep"
        # comparison silently ignored --orep.
        elif opt in ("-o", "--orep"):
            output_rep = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        # makedirs also creates missing parent directories
        os.makedirs(output_rep)

    output_json = output_rep + "go-stats-changes.json"
    output_tsv = output_rep + "go-stats-changes.tsv"

    print("Will write stats changes to " + output_json + " and " + output_tsv)

    current_stats = utils.fetch(current_stats_url).json()
    previous_stats = utils.fetch(previous_stats_url).json()
    json_changes = compute_changes(current_stats, previous_stats)

    # No current/previous reference id lists are available in this
    # standalone script, hence the two None arguments.
    json_changes = alter_annotation_changes(current_stats, previous_stats,
                                            None, None, json_changes)

    print("Saving Stats to <" + output_json + "> ...")
    utils.write_json(output_json, json_changes)
    print("Done.")

    print("Saving Stats to <" + output_tsv + "> ...")
    tsv_changes = create_text_report(json_changes)
    utils.write_text(output_tsv, tsv_changes)
    print("Done.")
def main(argv):
    """Compute the changes between two ontology (OBO) releases and save them.

    The two OBO URLs are handed to ``compute_changes``; the result is saved
    as ``go-ontology-changes.json`` and ``go-ontology-changes.tsv``, and its
    ``["summary"]["current"]`` part as ``go-ontology-stats.json``, all in the
    output directory.

    Args:
        argv: command-line arguments without the program name. Options:
            ``-c/--cobo`` URL of the current OBO file,
            ``-p/--pobo`` URL of the previous OBO file,
            ``-o/--orep`` output directory,
            ``-h/--help`` print the help and exit.

    Exits with status 2 (after printing the help) when fewer than six
    arguments are given or when the options cannot be parsed.
    """
    current_obo_url = ''
    previous_obo_url = ''
    output_rep = ''

    if len(argv) < 6:
        print_help()
        sys.exit(2)

    try:
        # "h"/"help" must be declared here, otherwise getopt rejects -h and
        # the help branch below can never be reached.
        opts, _ = getopt.getopt(
            argv, "hc:p:o:", ["help", "cobo=", "pobo=", "orep="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        # getopt reports long options with two dashes; the former "-orep"
        # comparison silently ignored --orep.
        elif opt in ("-o", "--orep"):
            output_rep = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        # makedirs also creates missing parent directories
        os.makedirs(output_rep)

    output_json = output_rep + "go-ontology-changes.json"
    output_stats_json = output_rep + "go-ontology-stats.json"
    output_tsv = output_rep + "go-ontology-changes.tsv"

    print("Will write ontology changes to " + output_json + " and " +
          output_tsv)

    json_changes = compute_changes(current_obo_url, previous_obo_url)

    print("Saving Stats to <" + output_json + "> ...")
    utils.write_json(output_json, json_changes)
    # The summary of the current release is also saved on its own.
    utils.write_json(output_stats_json, json_changes["summary"]["current"])
    print("Done.")

    print("Saving Stats to <" + output_tsv + "> ...")
    tsv_changes = create_text_report(json_changes)
    utils.write_text(output_tsv, tsv_changes)
    print("Done.")
Beispiel #3
0
def main(argv):
    """Run the whole GO release statistics pipeline and write every report.

    Steps, in order (numbering follows the progress messages):
      1. compute the current stats through ``go_stats.compute_stats`` twice
         (the second call passes ``True``; per the progress messages this is
         the "excluding protein binding" variant), fetch the previous
         references list and write the current references / PMID files;
      2. compute and save the ontology changes (JSON + TSV);
      3. compute the annotation changes against the previous stats, for both
         stats variants;
      4. add the ontology summary to both stats objects, save them, then save
         the annotation changes and a condensed ``go-stats-summary.json``.

    Args:
        argv: command-line arguments without the program name. Options, each
            taking a value: ``-g/--golrurl``, ``-s/--pstats``,
            ``-n/--pnstats``, ``-c/--cobo``, ``-p/--pobo``, ``-r/--ref``,
            ``-o/--orep``, ``-d/--date``.

    Exits with status 2 (after printing the help) when fewer than 16
    arguments are given or when the options cannot be parsed.
    """
    golr_url = ''
    previous_stats_url = ''
    previous_stats_no_pb_url = ''
    current_obo_url = ''
    previous_obo_url = ''
    previous_references_url = ''
    output_rep = ''
    release_date = ''

    # NOTE(review): prints the raw argument count; looks like leftover
    # debugging output.
    print(len(argv))
    # 8 options, each with a value -> 16 tokens when written as "-x value".
    # NOTE(review): long options given as "--name=value" are one token each,
    # so a complete command line in that form can fail this check.
    if len(argv) < 16:
        print_help()
        sys.exit(2)

    try:
        opts, argv = getopt.getopt(argv, "g:s:n:c:p:o:d:r:", [
            "golrurl=", "pstats=", "pnstats=", "cobo=", "pobo=", "orep=",
            "date=", "ref="
        ])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        # NOTE(review): "h" is not declared in the getopt call above, so "-h"
        # raises GetoptError there and this branch looks unreachable.
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
            # the GOLr URL is normalized to end with a slash
            if not golr_url.endswith("/"):
                golr_url = golr_url + "/"
        elif opt in ("-s", "--pstats"):
            previous_stats_url = arg
        elif opt in ("-n", "--pnstats"):
            previous_stats_no_pb_url = arg
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-r", "--ref"):
            previous_references_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    # The output directory is used as a plain string prefix below, so it must
    # end with a slash.
    if not output_rep.endswith("/"):
        output_rep += "/"

    # os.mkdir creates a single directory level only.
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    # actual names of the files to be generated - can change here if needed
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_references = output_rep + "go-references.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"
    output_ontology_changes = output_rep + "go-ontology-changes.json"
    output_ontology_changes_tsv = output_rep + "go-ontology-changes.tsv"
    output_stats_summary = output_rep + "go-stats-summary.json"
    output_annotation_changes = output_rep + "go-annotation-changes.json"
    output_annotation_changes_tsv = output_rep + "go-annotation-changes.tsv"
    output_annotation_changes_no_pb = output_rep + "go-annotation-changes_no_pb.json"
    output_annotation_changes_no_pb_tsv = output_rep + "go-annotation-changes_no_pb.tsv"

    # 1 - Executing go_stats script
    print(
        "\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(golr_url, release_date)
    print("DONE.")

    print(
        "\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    # Same computation with the third argument set to True.
    json_stats_no_pb = go_stats.compute_stats(golr_url, release_date, True)
    print("DONE.")

    print(
        "\n\n1c - EXECUTING GO_STATS SCRIPT (RETRIEVING PREVIOUS REFERENCES LIST)...\n"
    )
    # The previous references file is read as text, one record per line; only
    # the first tab-separated column of each line is kept.
    previous_references_ids = utils.fetch(previous_references_url).text
    previous_references_ids = previous_references_ids.split("\n")
    previous_references_ids = list(
        map(lambda x: x.split("\t")[0], previous_references_ids))
    print("DONE.")

    print(
        "\n\n1d - EXECUTING GO_STATS SCRIPT (CREATING CURRENT REFERENCES LIST)...\n"
    )
    # One "<key>\t<value>" line per entry returned by go_stats.get_references().
    references = go_stats.get_references()
    references_lines = []
    for k, v in references.items():
        references_lines.append(k + "\t" + str(v))
    current_references_ids = list(
        map(lambda x: x.split("\t")[0], references_lines))

    # Lines containing "PMID:" are kept for the PMID file; the id file holds
    # what follows the first ":" of the first column.
    pmids_lines = list(filter(lambda x: "PMID:" in x, references_lines))
    pmids_ids = list(map(lambda x: x.split("\t")[0].split(":")[1],
                         pmids_lines))

    utils.write_text(output_references, "\n".join(references_lines))
    utils.write_text(output_pmids, "\n".join(pmids_lines))
    utils.write_text(output_pubmed_pmids, "\n".join(pmids_ids))
    print("DONE.")

    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    json_onto_changes = go_ontology_changes.compute_changes(
        current_obo_url, previous_obo_url)
    utils.write_json(output_ontology_changes, json_onto_changes)

    tsv_onto_changes = go_ontology_changes.create_text_report(
        json_onto_changes)
    utils.write_text(output_ontology_changes_tsv, tsv_onto_changes)
    print("DONE.")

    # 3 - Executing go_annotation_changes script
    print(
        "\n\n3a - EXECUTING GO_ANNOTATION_CHANGES SCRIPT (INCLUDING PROTEIN BINDING)...\n"
    )
    previous_stats = utils.fetch(previous_stats_url).json()
    json_annot_changes = go_annotation_changes.compute_changes(
        json_stats, previous_stats)
    print("DONE.")

    print(
        "\n\n3b - EXECUTING GO_ANNOTATION_CHANGES SCRIPT (EXCLUDING PROTEIN BINDING)...\n"
    )
    previous_stats_no_pb = utils.fetch(previous_stats_no_pb_url).json(
    )  # WE STILL NEED TO CORRECT THAT: 1 FILE OR SEVERAL FILE ? IF SEVERAL, ONE MORE PARAMETER
    json_annot_no_pb_changes = go_annotation_changes.compute_changes(
        json_stats_no_pb, previous_stats_no_pb)
    print("DONE.")

    # 4 - Refining go-stats with ontology stats
    print("\n\n4 - EXECUTING GO_REFINE_STATS SCRIPT...\n")
    # NOTE(review): the annotation changes are replaced by the result of
    # utils.merge_dict(json_stats, json_annot_changes); what that merge keeps
    # or overrides is not visible here - confirm in utils.
    merged_annotations_diff = utils.merge_dict(json_stats, json_annot_changes)
    json_annot_changes = merged_annotations_diff

    # Build the "ontology" section: a shallow copy of the current ontology
    # summary without its release date, plus the change counters under
    # "changes_*" keys.
    ontology = json_onto_changes["summary"]["current"].copy()
    del ontology["release_date"]
    ontology["changes_created_terms"] = json_onto_changes["summary"][
        "changes"]["created_terms"]
    ontology["changes_valid_terms"] = json_onto_changes["summary"]["changes"][
        "valid_terms"]
    ontology["changes_obsolete_terms"] = json_onto_changes["summary"][
        "changes"]["obsolete_terms"]
    ontology["changes_merged_terms"] = json_onto_changes["summary"]["changes"][
        "merged_terms"]

    ontology["changes_biological_process_terms"] = json_onto_changes[
        "summary"]["changes"]["biological_process_terms"]
    ontology["changes_molecular_function_terms"] = json_onto_changes[
        "summary"]["changes"]["molecular_function_terms"]
    ontology["changes_cellular_component_terms"] = json_onto_changes[
        "summary"]["changes"]["cellular_component_terms"]

    # Rebuild json_stats with the ontology section inserted. The nested
    # values are the same objects as before (no copy is made).
    json_stats = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": json_stats["annotations"],
        "taxa": json_stats["taxa"],
        "bioentities": json_stats["bioentities"],
        "references": json_stats["references"]
    }
    print("\n4a - SAVING GO-STATS...\n")
    utils.write_json(output_stats, json_stats)
    print("DONE.")

    # Same restructuring for the second stats object; it shares the very same
    # `ontology` dict.
    json_stats_no_pb = {
        "release_date": json_stats_no_pb["release_date"],
        "ontology": ontology,
        "annotations": json_stats_no_pb["annotations"],
        "taxa": json_stats_no_pb["taxa"],
        "bioentities": json_stats_no_pb["bioentities"],
        "references": json_stats_no_pb["references"]
    }
    print("\n4b - SAVING GO-STATS-NO-PB...\n")
    utils.write_json(output_stats_no_pb, json_stats_no_pb)
    print("DONE.")

    # This name refers to the object nested inside json_stats, not to a copy:
    # the "B" entries added below are therefore also added to json_stats
    # itself (after go-stats.json has already been written above).
    # "B" = "F" count of json_stats minus "F" count of json_stats_no_pb
    # (presumably the protein-binding share of the F aspect - confirm).
    annotations_by_reference_genome = json_stats["annotations"][
        "by_model_organism"]
    for taxon in annotations_by_reference_genome:
        for ecode in annotations_by_reference_genome[taxon]["by_evidence"]:
            annotations_by_reference_genome[taxon]["by_evidence"][ecode][
                "B"] = json_stats["annotations"]["by_model_organism"][taxon][
                    "by_evidence"][ecode]["F"] - json_stats_no_pb[
                        "annotations"]["by_model_organism"][taxon][
                            "by_evidence"][ecode]["F"]
        for ecode in annotations_by_reference_genome[taxon][
                "by_evidence_cluster"]:
            annotations_by_reference_genome[taxon]["by_evidence_cluster"][
                ecode]["B"] = json_stats["annotations"]["by_model_organism"][
                    taxon]["by_evidence_cluster"][ecode][
                        "F"] - json_stats_no_pb["annotations"][
                            "by_model_organism"][taxon]["by_evidence_cluster"][
                                ecode]["F"]

    # For each id in go_stats.reference_genomes_ids, pick the bioentity
    # cluster counts stored under its label, or {} when the label is absent.
    bioentities_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        bioentities_by_reference_genome[key] = json_stats["bioentities"][
            "by_filtered_taxon"]["cluster"][key] if key in json_stats[
                "bioentities"]["by_filtered_taxon"]["cluster"] else {}
        # TODO: we don't have a way to filter on bioentity documents without direct annotations to PB ?
        # for btype in bioentities_by_reference_genome[key]:
        #     val = json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]["F"] if (key in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"] and "F" in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]) else 0
        #     bioentities_by_reference_genome[key][btype]["B"] = bioentities_by_reference_genome[key][btype]["F"] - val

    # Same selection for the "all" references counts ...
    references_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        references_by_reference_genome[key] = json_stats["references"]["all"][
            "by_filtered_taxon"][key] if key in json_stats["references"][
                "all"]["by_filtered_taxon"] else {}

    # ... and for the "pmids" references counts.
    pmids_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        pmids_by_reference_genome[key] = json_stats["references"]["pmids"][
            "by_filtered_taxon"][key] if key in json_stats["references"][
                "pmids"]["by_filtered_taxon"] else {}

    # This is to modify the structure of the annotation changes based on recent requests
    print("\n4c - SAVING GO-ANNOTATION-CHANGES...\n")
    json_annot_changes = go_annotation_changes.alter_annotation_changes(
        json_stats, previous_stats, current_references_ids,
        previous_references_ids, json_annot_changes)
    utils.write_json(output_annotation_changes, json_annot_changes)
    tsv_annot_changes = go_annotation_changes.create_text_report(
        json_annot_changes)
    utils.write_text(output_annotation_changes_tsv, tsv_annot_changes)
    print("DONE.")

    print("\n4d - SAVING GO-ANNOTATION-NO-PB-CHANGES...\n")
    # Both variants are compared using the same current/previous reference
    # id lists.
    json_annot_no_pb_changes = go_annotation_changes.alter_annotation_changes(
        json_stats_no_pb, previous_stats_no_pb, current_references_ids,
        previous_references_ids, json_annot_no_pb_changes)
    utils.write_json(output_annotation_changes_no_pb, json_annot_no_pb_changes)
    tsv_annot_changes_no_pb = go_annotation_changes.create_text_report(
        json_annot_no_pb_changes)
    utils.write_text(output_annotation_changes_no_pb_tsv,
                     tsv_annot_changes_no_pb)
    print("DONE.")

    # Condensed summary: selected totals/clusters from both stats objects,
    # the per-organism dictionaries built above, and the added/removed
    # reference counts taken from the annotation changes.
    json_stats_summary = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": {
            "total":
            json_stats["annotations"]["total"],
            "total_no_pb":
            json_stats_no_pb["annotations"]["total"],
            "total_pb":
            json_stats["annotations"]["total"] -
            json_stats_no_pb["annotations"]["total"],
            "by_aspect": {
                "P":
                json_stats["annotations"]["by_aspect"]["P"],
                "F":
                json_stats["annotations"]["by_aspect"]["F"],
                "C":
                json_stats["annotations"]["by_aspect"]["C"],
                "B":
                json_stats["annotations"]["by_aspect"]["F"] -
                json_stats_no_pb["annotations"]["by_aspect"]["F"]
            },
            "by_bioentity_type_cluster":
            json_stats["annotations"]["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb":
            json_stats_no_pb["annotations"]["by_bioentity_type"]["cluster"],
            "by_qualifier":
            json_stats["annotations"]["by_qualifier"],
            "by_evidence_cluster":
            json_stats["annotations"]["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb":
            json_stats_no_pb["annotations"]["by_evidence"]["cluster"],
            "by_model_organism":
            annotations_by_reference_genome
        },
        "taxa": {
            "total": json_stats["taxa"]["total"],
            "filtered": json_stats["taxa"]["filtered"],
        },
        "bioentities": {
            "total":
            json_stats["bioentities"]["total"],
            "total_no_pb":
            json_stats_no_pb["bioentities"]["total"],
            "by_type_cluster":
            json_stats["bioentities"]["by_type"]["cluster"],
            "by_type_cluster_no_pb":
            json_stats_no_pb["bioentities"]["by_type"]["cluster"],
            "by_model_organism":
            bioentities_by_reference_genome
        },
        "references": {
            "all": {
                "total":
                json_stats["references"]["all"]["total"],
                "total_no_pb":
                json_stats_no_pb["references"]["all"]["total"],
                "added":
                json_annot_changes["summary"]["changes"]["references"]
                ["added"],
                "removed":
                json_annot_changes["summary"]["changes"]["references"]
                ["removed"],
                "by_model_organism":
                references_by_reference_genome
            },
            "pmids": {
                "total":
                json_stats["references"]["pmids"]["total"],
                "total_no_pb":
                json_stats_no_pb["references"]["pmids"]["total"],
                "added":
                json_annot_changes["summary"]["changes"]["pmids"]["added"],
                "removed":
                json_annot_changes["summary"]["changes"]["pmids"]["removed"],
                "by_model_organism":
                pmids_by_reference_genome
            }
        },
    }

    # removing by_reference_genome.by_evidence
    # (the summary shares these per-organism objects with json_stats, so the
    # key is removed from json_stats as well; go-stats.json is already on
    # disk at this point)
    for gen in json_stats_summary["annotations"]["by_model_organism"]:
        del json_stats_summary["annotations"]["by_model_organism"][gen][
            "by_evidence"]
    print("\n4e - SAVING GO-STATS-SUMMARY...\n")
    utils.write_json(output_stats_summary, json_stats_summary)
    print("DONE.")

    # Indicate all processes finished
    print("SUCCESS.")
Beispiel #4
0
def main(argv):
    """Compute the GO release stats and ontology changes and write the reports.

    Steps, in order:
      1. compute the current stats through ``go_stats.compute_stats`` twice
         (the second call passes ``True``; per the progress messages this is
         the "excluding protein binding" variant);
      2. compute and save the ontology changes (JSON + TSV);
      3. add the ontology summary to both stats objects, save them, and save
         a condensed ``go-stats-summary.json``;
      finally write the references, PMID and PubMed id files.

    Args:
        argv: command-line arguments without the program name. Options, each
            taking a value: ``-g/--golrurl``, ``-c/--cobo``, ``-p/--pobo``,
            ``-o/--orep``, ``-d/--date``.

    Exits with status 2 (after printing the help) when fewer than 10
    arguments are given or when the options cannot be parsed.
    """
    golr_url = ''
    current_obo_url = ''
    previous_obo_url = ''
    output_rep = ''
    release_date = ''

    # NOTE(review): prints the raw argument count; looks like leftover
    # debugging output.
    print(len(argv))
    # 5 options, each with a value -> 10 tokens when written as "-x value".
    # NOTE(review): long options given as "--name=value" are one token each,
    # so a complete command line in that form can fail this check.
    if len(argv) < 10:
        print_help()
        sys.exit(2)

    try:
        opts, argv = getopt.getopt(
            argv, "g:c:p:o:d:",
            ["golrurl=", "cobo=", "pobo=", "orep=", "date="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        # NOTE(review): "h" is not declared in the getopt call above, so "-h"
        # raises GetoptError there and this branch looks unreachable.
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
            # the GOLr URL is normalized to end with a slash
            if not golr_url.endswith("/"):
                golr_url = golr_url + "/"
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    # The output directory is used as a plain string prefix below, so it must
    # end with a slash.
    if not output_rep.endswith("/"):
        output_rep += "/"

    # os.mkdir creates a single directory level only.
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    # actual names of the files to be generated - can change here if needed
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_references = output_rep + "go-references.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"
    output_ontology_changes = output_rep + "go-ontology-changes.json"
    output_ontology_changes_tsv = output_rep + "go-ontology-changes.tsv"
    output_stats_summary = output_rep + "go-stats-summary.json"

    # 1 - Executing go_stats script
    print(
        "\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(golr_url, release_date)
    print("DONE.")

    print(
        "\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    # Same computation with the third argument set to True.
    json_stats_no_pb = go_stats.compute_stats(golr_url, release_date, True)
    print("DONE.")

    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    json_onto_changes = go_ontology_changes.compute_changes(
        current_obo_url, previous_obo_url)
    utils.write_json(output_ontology_changes, json_onto_changes)

    tsv_onto_changes = go_ontology_changes.create_text_report(
        json_onto_changes)
    utils.write_text(output_ontology_changes_tsv, tsv_onto_changes)
    print("DONE.")

    # 3 - Refining go-stats with ontology stats
    print("\n\n3 - EXECUTING GO_REFINE_STATS SCRIPT...\n")
    # Build the "ontology" section: a shallow copy of the current ontology
    # summary without its release date, plus the change counters under
    # "changes_*" keys.
    ontology = json_onto_changes["summary"]["current"].copy()
    del ontology["release_date"]
    ontology["changes_created_terms"] = json_onto_changes["summary"][
        "changes"]["created_terms"]
    ontology["changes_valid_terms"] = json_onto_changes["summary"]["changes"][
        "valid_terms"]
    ontology["changes_obsolete_terms"] = json_onto_changes["summary"][
        "changes"]["obsolete_terms"]
    ontology["changes_merged_terms"] = json_onto_changes["summary"]["changes"][
        "merged_terms"]

    ontology["changes_biological_process_terms"] = json_onto_changes[
        "summary"]["changes"]["biological_process_terms"]
    ontology["changes_molecular_function_terms"] = json_onto_changes[
        "summary"]["changes"]["molecular_function_terms"]
    ontology["changes_cellular_component_terms"] = json_onto_changes[
        "summary"]["changes"]["cellular_component_terms"]

    # Rebuild json_stats with the ontology section inserted. The nested
    # values are the same objects as before (no copy is made).
    json_stats = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": json_stats["annotations"],
        "taxa": json_stats["taxa"],
        "bioentities": json_stats["bioentities"],
        "references": json_stats["references"]
    }
    utils.write_json(output_stats, json_stats)

    # Same restructuring for the second stats object; it shares the very same
    # `ontology` dict.
    json_stats_no_pb = {
        "release_date": json_stats_no_pb["release_date"],
        "ontology": ontology,
        "annotations": json_stats_no_pb["annotations"],
        "taxa": json_stats_no_pb["taxa"],
        "bioentities": json_stats_no_pb["bioentities"],
        "references": json_stats_no_pb["references"]
    }
    utils.write_json(output_stats_no_pb, json_stats_no_pb)

    # This name refers to the object nested inside json_stats, not to a copy:
    # the "B" entries added below are therefore also added to json_stats
    # itself (after go-stats.json has already been written above).
    # "B" = "F" count of json_stats minus "F" count of json_stats_no_pb
    # (presumably the protein-binding share of the F aspect - confirm).
    annotations_by_reference_genome = json_stats["annotations"][
        "by_model_organism"]
    for taxon in annotations_by_reference_genome:
        for ecode in annotations_by_reference_genome[taxon]["by_evidence"]:
            annotations_by_reference_genome[taxon]["by_evidence"][ecode][
                "B"] = json_stats["annotations"]["by_model_organism"][taxon][
                    "by_evidence"][ecode]["F"] - json_stats_no_pb[
                        "annotations"]["by_model_organism"][taxon][
                            "by_evidence"][ecode]["F"]
        for ecode in annotations_by_reference_genome[taxon][
                "by_evidence_cluster"]:
            annotations_by_reference_genome[taxon]["by_evidence_cluster"][
                ecode]["B"] = json_stats["annotations"]["by_model_organism"][
                    taxon]["by_evidence_cluster"][ecode][
                        "F"] - json_stats_no_pb["annotations"][
                            "by_model_organism"][taxon]["by_evidence_cluster"][
                                ecode]["F"]

    # For each id in go_stats.reference_genomes_ids, pick the bioentity
    # cluster counts stored under its label, or {} when the label is absent.
    bioentities_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        bioentities_by_reference_genome[key] = json_stats["bioentities"][
            "by_filtered_taxon"]["cluster"][key] if key in json_stats[
                "bioentities"]["by_filtered_taxon"]["cluster"] else {}
        # TODO: we don't have a way to filter on bioentity documents without direct annotations to PB ?
        # for btype in bioentities_by_reference_genome[key]:
        #     val = json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]["F"] if (key in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"] and "F" in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]) else 0
        #     bioentities_by_reference_genome[key][btype]["B"] = bioentities_by_reference_genome[key][btype]["F"] - val

    # Same selection for the "all" references counts ...
    references_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        references_by_reference_genome[key] = json_stats["references"]["all"][
            "by_filtered_taxon"][key] if key in json_stats["references"][
                "all"]["by_filtered_taxon"] else {}

    # ... and for the "pmids" references counts.
    pmids_by_reference_genome = {}
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        pmids_by_reference_genome[key] = json_stats["references"]["pmids"][
            "by_filtered_taxon"][key] if key in json_stats["references"][
                "pmids"]["by_filtered_taxon"] else {}

    # Condensed summary: selected totals/clusters from both stats objects and
    # the per-organism dictionaries built above.
    json_stats_summary = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": {
            "total":
            json_stats["annotations"]["total"],
            "total_no_pb":
            json_stats_no_pb["annotations"]["total"],
            "total_pb":
            json_stats["annotations"]["total"] -
            json_stats_no_pb["annotations"]["total"],
            "by_aspect": {
                "P":
                json_stats["annotations"]["by_aspect"]["P"],
                "F":
                json_stats["annotations"]["by_aspect"]["F"],
                "C":
                json_stats["annotations"]["by_aspect"]["C"],
                "B":
                json_stats["annotations"]["by_aspect"]["F"] -
                json_stats_no_pb["annotations"]["by_aspect"]["F"]
            },
            "by_bioentity_type_cluster":
            json_stats["annotations"]["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb":
            json_stats_no_pb["annotations"]["by_bioentity_type"]["cluster"],
            "by_evidence_cluster":
            json_stats["annotations"]["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb":
            json_stats_no_pb["annotations"]["by_evidence"]["cluster"],
            "by_model_organism":
            annotations_by_reference_genome
        },
        "taxa": {
            "total": json_stats["taxa"]["total"],
            "filtered": json_stats["taxa"]["filtered"],
        },
        "bioentities": {
            "total":
            json_stats["bioentities"]["total"],
            "total_no_pb":
            json_stats_no_pb["bioentities"]["total"],
            "by_type_cluster":
            json_stats["bioentities"]["by_type"]["cluster"],
            "by_type_cluster_no_pb":
            json_stats_no_pb["bioentities"]["by_type"]["cluster"],
            "by_model_organism":
            bioentities_by_reference_genome
        },
        "references": {
            "all": {
                "total": json_stats["references"]["all"]["total"],
                "total_no_pb": json_stats_no_pb["references"]["all"]["total"],
                "by_model_organism": references_by_reference_genome
            },
            "pmids": {
                "total": json_stats["references"]["pmids"]["total"],
                "total_no_pb":
                json_stats_no_pb["references"]["pmids"]["total"],
                "by_model_organism": pmids_by_reference_genome
            }
        },
    }

    # removing by_reference_genome.by_evidence
    # (the summary shares these per-organism objects with json_stats, so the
    # key is removed from json_stats as well; go-stats.json is already on
    # disk at this point)
    for gen in json_stats_summary["annotations"]["by_model_organism"]:
        del json_stats_summary["annotations"]["by_model_organism"][gen][
            "by_evidence"]
    utils.write_json(output_stats_summary, json_stats_summary)

    print("Saving references file to <" + output_pmids +
          "> and PubMed PMID file to <" + output_pubmed_pmids + ">")
    # One "<key>\t<value>" line per entry returned by go_stats.get_references().
    references = go_stats.get_references()
    references_lines = []
    for k, v in references.items():
        references_lines.append(k + "\t" + str(v))

    # Lines containing "PMID:" are kept for the PMID file; the id file holds
    # what follows the first ":" of the first column.
    pmids_lines = list(filter(lambda x: "PMID:" in x, references_lines))
    pmids_ids = list(map(lambda x: x.split("\t")[0].split(":")[1],
                         pmids_lines))

    utils.write_text(output_references, "\n".join(references_lines))
    utils.write_text(output_pmids, "\n".join(pmids_lines))
    utils.write_text(output_pubmed_pmids, "\n".join(pmids_ids))
    print("Done.")

    print("SUCCESS.")
Beispiel #5
0
def main(argv):
    golr_url = ''
    current_obo_url = ''
    previous_obo_url = ''    
    output_rep = ''
    release_date = ''

    if len(argv) < 10:
        print_help()
        sys.exit(2)

    try:
        opts, argv = getopt.getopt(argv,"g:c:p:o:d:",["golrurl=", "cobo=", "pobo=", "orep=", "date="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        os.mkdir(output_rep)


    # 1 - Executing go_stats script
    print("\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(golr_url, release_date)
    # data = None
    # with open('newtest/go-stats.json', 'r') as myfile:
    #     data=myfile.read()
    # json_stats = json.loads(data)


    print("DONE.")

    print("\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    json_stats_no_pb = go_stats.compute_stats(golr_url, release_date, True)
    # with open('newtest/go-stats-no-pb.json', 'r') as myfile:
    #     data=myfile.read()
    # json_stats_no_pb = json.loads(data)    
    print("DONE.")


    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    # with open('newtest/go-ontology-changes.json', 'r') as myfile:
    #     data=myfile.read()
    # json_onto_changes = json.loads(data)
    
    json_onto_changes = go_ontology_changes.compute_changes(current_obo_url, previous_obo_url)
    utils.write_json(output_rep + "go-ontology-changes.json", json_onto_changes)

    tsv_onto_changes = go_ontology_changes.create_text_report(json_onto_changes) 
    utils.write_text(output_rep + "go-ontology-changes.tsv", tsv_onto_changes)
    print("DONE.")


    # 4 - Refining go-stats with ontology stats
    print("\n\n4 - EXECUTING GO_REFINE_STATS SCRIPT...\n")

    ontology = json_onto_changes["summary"]["current"].copy()
    del ontology["release_date"]
    ontology["changes_created_terms"] = json_onto_changes["summary"]["changes"]["created_terms"]
    ontology["changes_valid_terms"] = json_onto_changes["summary"]["changes"]["valid_terms"]
    ontology["changes_obsolete_terms"] = json_onto_changes["summary"]["changes"]["obsolete_terms"]
    ontology["changes_merged_terms"] = json_onto_changes["summary"]["changes"]["merged_terms"]

    ontology["changes_biological_process_terms"] = json_onto_changes["summary"]["changes"]["biological_process_terms"]
    ontology["changes_molecular_function_terms"] = json_onto_changes["summary"]["changes"]["molecular_function_terms"]
    ontology["changes_cellular_component_terms"] = json_onto_changes["summary"]["changes"]["cellular_component_terms"]


    json_stats = {
        "release_date" : json_stats["release_date"],
        "ontology" : ontology,
        "annotations" : json_stats["annotations"],
        "taxa" : json_stats["taxa"],
        "bioentities" : json_stats["bioentities"],
        "references" : json_stats["references"]
    }
    utils.write_json(output_rep + "go-stats.json", json_stats)


    json_stats_no_pb = {
        "release_date" : json_stats_no_pb["release_date"],
        "ontology" : ontology,
        "annotations" : json_stats_no_pb["annotations"],
        "taxa" : json_stats_no_pb["taxa"],
        "bioentities" : json_stats_no_pb["bioentities"],
        "references" : json_stats_no_pb["references"]
    }
    utils.write_json(output_rep + "go-stats-no-pb.json", json_stats_no_pb)


    annotations_by_reference_genome = json_stats["annotations"]["by_model_organism"]
    for taxon in annotations_by_reference_genome:
        for ecode in annotations_by_reference_genome[taxon]["by_evidence"]:
            annotations_by_reference_genome[taxon]["by_evidence"][ecode]["B"] = json_stats["annotations"]["by_model_organism"][taxon]["by_evidence"][ecode]["F"] - json_stats_no_pb["annotations"]["by_model_organism"][taxon]["by_evidence"][ecode]["F"]
        for ecode in annotations_by_reference_genome[taxon]["by_evidence_cluster"]:
            annotations_by_reference_genome[taxon]["by_evidence_cluster"][ecode]["B"] = json_stats["annotations"]["by_model_organism"][taxon]["by_evidence_cluster"][ecode]["F"] - json_stats_no_pb["annotations"]["by_model_organism"][taxon]["by_evidence_cluster"][ecode]["F"]

    bioentities_by_reference_genome = { }
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        bioentities_by_reference_genome[key] = json_stats["bioentities"]["by_filtered_taxon"]["cluster"][key] if key in json_stats["bioentities"]["by_filtered_taxon"]["cluster"] else { }
        # TODO: we don't have a way to filter on bioentity documents without direct annotations to PB ?
        # for btype in bioentities_by_reference_genome[key]:
        #     val = json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]["F"] if (key in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"] and "F" in json_stats_no_pb["bioentities"]["by_filtered_taxon"]["cluster"][key]) else 0
        #     bioentities_by_reference_genome[key][btype]["B"] = bioentities_by_reference_genome[key][btype]["F"] - val

    references_by_reference_genome = { }
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        references_by_reference_genome[key] = json_stats["references"]["all"]["by_filtered_taxon"][key] if key in json_stats["references"]["all"]["by_filtered_taxon"] else { }

    pmids_by_reference_genome = { }
    for taxon in go_stats.reference_genomes_ids:
        key = go_stats.taxon_label(taxon)
        pmids_by_reference_genome[key] = json_stats["references"]["pmids"]["by_filtered_taxon"][key] if key in json_stats["references"]["pmids"]["by_filtered_taxon"] else { }
        
    json_stats_summary = {
        "release_date" : json_stats["release_date"],
        "ontology" : ontology,
        "annotations" : {
            "total" : json_stats["annotations"]["total"],
            "total_no_pb" : json_stats_no_pb["annotations"]["total"],
            "total_pb" : json_stats["annotations"]["total"] - json_stats_no_pb["annotations"]["total"],
            "by_aspect" : {
                "P" : json_stats["annotations"]["by_aspect"]["P"],
                "F" : json_stats["annotations"]["by_aspect"]["F"],
                "C" : json_stats["annotations"]["by_aspect"]["C"],
                "B" : json_stats["annotations"]["by_aspect"]["F"] - json_stats_no_pb["annotations"]["by_aspect"]["F"]
            },
            "by_bioentity_type_cluster" : json_stats["annotations"]["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb" : json_stats_no_pb["annotations"]["by_bioentity_type"]["cluster"],
            "by_evidence_cluster" : json_stats["annotations"]["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb" : json_stats_no_pb["annotations"]["by_evidence"]["cluster"],
            "by_model_organism" : annotations_by_reference_genome
        },
        "taxa" : {
            "total" : json_stats["taxa"]["total"],
            "filtered" : json_stats["taxa"]["filtered"],
        },
        "bioentities" : {
            "total" : json_stats["bioentities"]["total"],
            "total_no_pb" : json_stats_no_pb["bioentities"]["total"],
            "by_type_cluster" : json_stats["bioentities"]["by_type"]["cluster"],
            "by_type_cluster_no_pb" : json_stats_no_pb["bioentities"]["by_type"]["cluster"],
            "by_model_organism" : bioentities_by_reference_genome
        },
        "references" : {
            "all" : {
                "total" : json_stats["references"]["all"]["total"],
                "total_no_pb" : json_stats_no_pb["references"]["all"]["total"],
                "by_model_organism" : references_by_reference_genome
            },
            "pmids" : {
                "total" : json_stats["references"]["pmids"]["total"],
                "total_no_pb" : json_stats_no_pb["references"]["pmids"]["total"],
                "by_model_organism" : pmids_by_reference_genome
            }
        },
    }
    utils.write_json(output_rep + "go-stats-summary.json", json_stats_summary)

    print("DONE.")
Beispiel #6
0
def _write_gmt_files(prefix, gmt_by_aspect):
    """Write one ``.gmt`` file per non-empty (aspect, evidence group) entry.

    prefix: path prefix of the files; ``-<aspect>-<evgroup>.gmt`` (both
        lower-cased) is appended to it.
    gmt_by_aspect: two-level mapping, aspect -> evidence group -> content
        handed to ``utils.write_text``. Empty contents are skipped.
    """
    for aspect in gmt_by_aspect:
        for evgroup in gmt_by_aspect[aspect]:
            content = gmt_by_aspect[aspect][evgroup]
            if len(content) > 0:
                utils.write_text(
                    prefix + "-" + aspect.lower() + "-" + evgroup.lower() +
                    ".gmt", content)


def main(argv):
    """Command-line entry point: generate GMT files, full and per slim.

    argv: command-line arguments without the program name. Recognized
        options: ``-g/--golrurl`` (GOLr base URL), ``-o/--orep`` (output
        directory), ``-s/--slim`` (base URL of the slim OBO files) and
        ``-h/--help``. The first three are all required.

    Exits with status 2 (after printing the help) on an unknown option or a
    missing required option, and with status 0 when help is requested.
    """
    golr_base_url = ''
    output_rep = ''
    slim_base_url = ''

    try:
        # "h"/"help" must be declared here, otherwise getopt rejects -h and
        # the help branch below can never run.
        opts, _ = getopt.getopt(argv, "hg:o:s:",
                                ["help", "golrurl=", "orep=", "slim="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_base_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-s", "--slim"):
            slim_base_url = arg

    # All three values are concatenated into URLs / paths below; an empty one
    # would silently turn into "/" (e.g. writing into the filesystem root).
    if not (golr_base_url and output_rep and slim_base_url):
        print_help()
        sys.exit(2)

    if not golr_base_url.endswith("/"):
        golr_base_url += "/"
    if not slim_base_url.endswith("/"):
        slim_base_url += "/"
    if not output_rep.endswith("/"):
        output_rep += "/"

    # makedirs also creates missing parent directories, unlike os.mkdir.
    os.makedirs(output_rep, exist_ok=True)

    print("\n1 - Creating ontology map...")
    ontology_map = create_ontology_map(golr_base_url)
    print("Ontology map created with ", len(ontology_map), " terms")

    slims = ["goslim_agr.obo", "goslim_generic.obo", "goslim_chembl.obo"]
    print("\n2 - Loading ", len(slims),
          " slims to create the slim-specific GMTs...")
    slim_obos = {}

    for slim in slims:
        response = utils.fetch(slim_base_url + slim)
        slim_obos[slim] = OBO_Parser(response.text)
    print("Slims loaded: ", len(slim_obos))

    # taxa = utils.REFERENCE_GENOME_IDS
    taxa = ["NCBITaxon:9606", "NCBITaxon:10090"]
    print("\n3 - Creating the GMTs for ", len(taxa), " taxa")
    for taxon in taxa:
        taxon_id = taxon.split(":")[1]
        gmt_taxon = gmt(ontology_map, golr_base_url, taxon)

        # Files are named "<output dir>/<numeric taxon id>-...".
        output = output_rep + taxon_id

        # Complete GMTs for the taxon.
        _write_gmt_files(output, gmt_taxon)

        # Same GMTs restricted to the valid terms of each slim.
        for slim_name, slim_obo in slim_obos.items():
            terms = slim_obo.get_terms(TermState.VALID).keys()
            gmt_taxon_slim = filter_slim(gmt_taxon, terms)
            slim_key = slim_name.replace(".obo", "")
            _write_gmt_files(output + "-" + slim_key, gmt_taxon_slim)
Beispiel #7
0
def main(argv):
    """Command-line entry point: compute GO stats and write all report files.

    argv: command-line arguments without the program name. Recognized
        options: ``-g/--golrurl`` (GOLr URL, required), ``-o/--orep`` (output
        directory, required), ``-d/--date`` (release date) and ``-h/--help``.
        ``-b <value>`` is still accepted for backward compatibility, but its
        value is not used by this function.

    Exits with status 2 (after printing the help) on an unknown option, on
    fewer than 6 arguments, or when the GOLr URL / output directory is
    missing; exits with status 0 when help is requested.
    """
    golr_url = ''
    output_rep = ''
    release_date = ''

    try:
        # "h"/"help" must be declared here, otherwise getopt rejects -h and
        # the help branch below can never run.
        opts, _ = getopt.getopt(argv, "hg:b:o:d:",
                                ["help", "golrurl=", "orep=", "date="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    # The historical minimum argument count is kept so existing invocations
    # are accepted or rejected as before; additionally refuse an empty URL or
    # output directory (an empty directory would become "/" below).
    if len(argv) < 6 or not golr_url or not output_rep:
        print_help()
        sys.exit(2)

    if not golr_url.endswith("/"):
        golr_url += "/"
    if not output_rep.endswith("/"):
        output_rep += "/"

    # makedirs also creates missing parent directories, unlike os.mkdir.
    os.makedirs(output_rep, exist_ok=True)

    # actual names of the files to be generated - can change here if needed
    output_meta = output_rep + "go-meta.json"
    output_meta_no_pb = output_rep + "go-meta-no-pb.json"
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_stats_tsv = output_rep + "go-stats.tsv"
    output_stats_no_pb_tsv = output_rep + "go-stats-no-pb.tsv"
    # NOTE(review): nothing in this function writes to output_references;
    # confirm whether a go-references.tsv export is still expected.
    output_references = output_rep + "go-references.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"

    # Full stats (third argument False), as JSON then as a TSV report.
    print("Will write stats to " + output_stats + " and " + output_stats_tsv)
    json_stats = compute_stats(golr_url, release_date, False)
    print("Saving Stats to <" + output_stats + "> ...")
    utils.write_json(output_stats, json_stats)
    print("Done.")

    print("Saving Stats to <" + output_stats_tsv + "> ...")
    tsv_stats = create_text_report(json_stats)
    utils.write_text(output_stats_tsv, tsv_stats)
    print("Done.")

    # Same stats with protein binding excluded (third argument True).
    print("Will write stats (excluding protein binding) to " +
          output_stats_no_pb + " and " + output_stats_no_pb_tsv)
    json_stats_no_pb = compute_stats(golr_url, release_date, True)
    print("Saving Stats to <" + output_stats_no_pb + "> ...")
    utils.write_json(output_stats_no_pb, json_stats_no_pb)
    print("Done.")

    print("Saving Stats (excluding protein binding) to <" +
          output_stats_no_pb_tsv + "> ...")
    tsv_stats_no_pb = create_text_report(json_stats_no_pb)
    utils.write_text(output_stats_no_pb_tsv, tsv_stats_no_pb)
    print("Done.")

    # Meta files derived from each stats document.
    json_meta = create_meta(json_stats)
    print("Saving META to <" + output_meta + "> ...")
    utils.write_json(output_meta, json_meta)
    print("Done.")

    json_meta_no_pb = create_meta(json_stats_no_pb)
    print("Saving META to <" + output_meta_no_pb + "> ...")
    utils.write_json(output_meta_no_pb, json_meta_no_pb)
    print("Done.")

    print("Saving PMID file to <" + output_pmids +
          "> and PubMed PMID file to <" + output_pubmed_pmids + ">")
    references = get_references()
    # Keep only the references whose key contains "PMID:".
    pmids = {k: v for k, v in references.items() if "PMID:" in k}
    # Part after the first ":" of each key, one per line in the GO.uid file.
    pmids_ids = [key.split(":")[1] for key in pmids]
    # One "<key>\t<value>" line per PMID reference.
    pmids_lines = [k + "\t" + str(v) for k, v in pmids.items()]

    utils.write_text(output_pmids, "\n".join(pmids_lines))
    utils.write_text(output_pubmed_pmids, "\n".join(pmids_ids))
    print("Done.")