def main(argv):
    """Compute the changes between two GO stats files and save them.

    Fetches the current and previous stats (``utils.fetch(url).json()``),
    computes and restructures the changes, then writes
    ``go-stats-changes.json`` and ``go-stats-changes.tsv`` into the output
    directory.

    Options:
        -c, --current   URL of the current stats JSON.
        -p, --previous  URL of the previous stats JSON.
        -o, --orep      Output directory (created if missing).
        -h, --help      Print the help and exit.

    Args:
        argv: Command-line arguments in the form expected by
            ``getopt.getopt`` (i.e. without the program name).

    Exits with status 2, after printing the help, when fewer than three
    arguments are given or when the options cannot be parsed.
    """
    current_stats_url = ''
    previous_stats_url = ''
    output_rep = ''

    if len(argv) < 3:
        print_help()
        sys.exit(2)

    try:
        # "h"/"help" must be declared here, otherwise the help branch below
        # can never be reached.
        opts, argv = getopt.getopt(
            argv, "hc:p:o:", ["help", "current=", "previous=", "orep="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-c", "--current"):
            current_stats_url = arg
        elif opt in ("-p", "--previous"):
            previous_stats_url = arg
        elif opt in ("-o", "--orep"):
            # getopt reports long options with two dashes; the former
            # "-orep" spelling never matched, so --orep was silently ignored.
            output_rep = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        # makedirs also creates missing parent directories.
        os.makedirs(output_rep)

    output_json = output_rep + "go-stats-changes.json"
    output_tsv = output_rep + "go-stats-changes.tsv"

    print("Will write stats changes to " + output_json + " and " + output_tsv)

    current_stats = utils.fetch(current_stats_url).json()
    previous_stats = utils.fetch(previous_stats_url).json()
    json_changes = compute_changes(current_stats, previous_stats)
    # No reference id lists are available in this standalone mode, hence the
    # two None arguments.
    json_changes = alter_annotation_changes(
        current_stats, previous_stats, None, None, json_changes)

    print("Saving Stats to <" + output_json + "> ...")
    utils.write_json(output_json, json_changes)
    print("Done.")

    print("Saving Stats to <" + output_tsv + "> ...")
    tsv_changes = create_text_report(json_changes)
    utils.write_text(output_tsv, tsv_changes)
    print("Done.")
def main(argv):
    """Compute the changes between two ontology (OBO) releases and save them.

    Calls ``compute_changes`` with the current and previous OBO URLs and
    writes three files into the output directory:
    ``go-ontology-changes.json`` (the full result),
    ``go-ontology-stats.json`` (the ``["summary"]["current"]`` part) and
    ``go-ontology-changes.tsv`` (the text report).

    Options:
        -c, --cobo   URL of the current OBO file.
        -p, --pobo   URL of the previous OBO file.
        -o, --orep   Output directory (created if missing).
        -h, --help   Print the help and exit.

    Args:
        argv: Command-line arguments in the form expected by
            ``getopt.getopt`` (i.e. without the program name).

    Exits with status 2, after printing the help, when fewer than six
    arguments are given or when the options cannot be parsed.
    """
    current_obo_url = ''
    previous_obo_url = ''
    output_rep = ''

    if len(argv) < 6:
        print_help()
        sys.exit(2)

    try:
        # "h"/"help" must be declared here, otherwise the help branch below
        # can never be reached.
        opts, argv = getopt.getopt(
            argv, "hc:p:o:", ["help", "cobo=", "pobo=", "orep="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-o", "--orep"):
            # getopt reports long options with two dashes; the former
            # "-orep" spelling never matched, so --orep was silently ignored.
            output_rep = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        # makedirs also creates missing parent directories.
        os.makedirs(output_rep)

    output_json = output_rep + "go-ontology-changes.json"
    output_stats_json = output_rep + "go-ontology-stats.json"
    output_tsv = output_rep + "go-ontology-changes.tsv"

    print("Will write ontology changes to " + output_json + " and " + output_tsv)

    json_changes = compute_changes(current_obo_url, previous_obo_url)

    print("Saving Stats to <" + output_json + "> ...")
    utils.write_json(output_json, json_changes)
    # The stats file only carries the summary of the current release.
    utils.write_json(output_stats_json, json_changes["summary"]["current"])
    print("Done.")

    print("Saving Stats to <" + output_tsv + "> ...")
    tsv_changes = create_text_report(json_changes)
    utils.write_text(output_tsv, tsv_changes)
    print("Done.")
def _ontology_summary(json_onto_changes):
    """Return the current ontology summary plus the ``changes_*`` counters.

    The result is a shallow copy of ``["summary"]["current"]`` without its
    ``release_date`` entry, extended with one ``changes_<name>`` entry per
    counter found in ``["summary"]["changes"]``.
    """
    summary = json_onto_changes["summary"]
    ontology = summary["current"].copy()
    del ontology["release_date"]
    for target_key, source_key in (
        ("changes_created_terms", "created_terms"),
        ("changes_valid_terms", "valid_terms"),
        ("changes_obsolete_terms", "obsolete_terms"),
        ("changes_merged_terms", "merged_terms"),
        ("changes_biological_process_terms", "biological_process_terms"),
        ("changes_molecular_function_terms", "molecular_function_terms"),
        ("changes_cellular_component_terms", "cellular_component_terms"),
    ):
        ontology[target_key] = summary["changes"][source_key]
    return ontology


def _with_ontology(stats, ontology):
    """Return a new stats dict with ``ontology`` placed after release_date."""
    return {
        "release_date": stats["release_date"],
        "ontology": ontology,
        "annotations": stats["annotations"],
        "taxa": stats["taxa"],
        "bioentities": stats["bioentities"],
        "references": stats["references"],
    }


def _add_protein_binding_counts(annotations, annotations_no_pb):
    """Add a "B" count to every evidence entry of every model organism.

    "B" is the "F" count of ``annotations`` minus the "F" count of
    ``annotations_no_pb`` (presumably the number of protein binding
    annotations - confirm against go_stats). ``annotations`` is modified
    in place.
    """
    for taxon, organism in annotations["by_model_organism"].items():
        for section in ("by_evidence", "by_evidence_cluster"):
            for ecode, counts in organism[section].items():
                no_pb = annotations_no_pb["by_model_organism"][taxon][section][ecode]
                counts["B"] = counts["F"] - no_pb["F"]


def _by_reference_genome(by_filtered_taxon, labels):
    """Map each label to its entry in ``by_filtered_taxon`` ({} if absent)."""
    return {label: by_filtered_taxon.get(label, {}) for label in labels}


def main(argv):
    """Run the whole stats pipeline, including the annotation changes.

    Steps, in order (see the progress messages printed along the way):
      1. compute the stats with and without protein binding, fetch the
         previous references list and write the current references / PMID
         files;
      2. compute and save the ontology changes;
      3. compute the annotation changes against the previous stats;
      4. add the ontology summary to the stats, save them, save the
         restructured annotation changes and finally the stats summary.

    Options (each takes a value):
        -g, --golrurl   GOlr URL (a trailing "/" is appended if missing).
        -s, --pstats    URL of the previous stats JSON.
        -n, --pnstats   URL of the previous stats JSON without protein binding.
        -c, --cobo      URL of the current OBO file.
        -p, --pobo      URL of the previous OBO file.
        -r, --ref       URL of the previous references list (TSV).
        -o, --orep      Output directory (created if missing).
        -d, --date      Release date passed to ``go_stats.compute_stats``.
        -h, --help      Print the help and exit (takes no value).

    Args:
        argv: Command-line arguments in the form expected by
            ``getopt.getopt`` (i.e. without the program name).

    Exits with status 2, after printing the help, when fewer than 16
    arguments are given or when the options cannot be parsed.
    """
    golr_url = ''
    previous_stats_url = ''
    previous_stats_no_pb_url = ''
    current_obo_url = ''
    previous_obo_url = ''
    previous_references_url = ''
    output_rep = ''
    release_date = ''

    if len(argv) < 16:
        print_help()
        sys.exit(2)

    try:
        # "h"/"help" must be declared here, otherwise the help branch below
        # can never be reached.
        opts, argv = getopt.getopt(argv, "hg:s:n:c:p:o:d:r:", [
            "help", "golrurl=", "pstats=", "pnstats=", "cobo=", "pobo=",
            "orep=", "date=", "ref="
        ])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
            if not golr_url.endswith("/"):
                golr_url = golr_url + "/"
        elif opt in ("-s", "--pstats"):
            previous_stats_url = arg
        elif opt in ("-n", "--pnstats"):
            previous_stats_no_pb_url = arg
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-r", "--ref"):
            previous_references_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        # makedirs also creates missing parent directories.
        os.makedirs(output_rep)

    # actual names of the files to be generated - can change here if needed
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_references = output_rep + "go-references.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"
    output_ontology_changes = output_rep + "go-ontology-changes.json"
    output_ontology_changes_tsv = output_rep + "go-ontology-changes.tsv"
    output_stats_summary = output_rep + "go-stats-summary.json"
    output_annotation_changes = output_rep + "go-annotation-changes.json"
    output_annotation_changes_tsv = output_rep + "go-annotation-changes.tsv"
    output_annotation_changes_no_pb = output_rep + "go-annotation-changes_no_pb.json"
    output_annotation_changes_no_pb_tsv = output_rep + "go-annotation-changes_no_pb.tsv"

    # 1 - Executing go_stats script
    print("\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(golr_url, release_date)
    print("DONE.")

    print("\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    json_stats_no_pb = go_stats.compute_stats(golr_url, release_date, True)
    print("DONE.")

    print("\n\n1c - EXECUTING GO_STATS SCRIPT (RETRIEVING PREVIOUS REFERENCES LIST)...\n")
    # Only the first tab-separated column (the reference id) of each line is kept.
    previous_references_ids = [
        line.split("\t")[0]
        for line in utils.fetch(previous_references_url).text.split("\n")
    ]
    print("DONE.")

    print("\n\n1d - EXECUTING GO_STATS SCRIPT (CREATING CURRENT REFERENCES LIST)...\n")
    references = go_stats.get_references()
    references_lines = [k + "\t" + str(v) for k, v in references.items()]
    current_references_ids = [line.split("\t")[0] for line in references_lines]
    pmids_lines = [line for line in references_lines if "PMID:" in line]
    # "PMID:123<TAB>n" -> "123"
    pmids_ids = [line.split("\t")[0].split(":")[1] for line in pmids_lines]
    utils.write_text(output_references, "\n".join(references_lines))
    utils.write_text(output_pmids, "\n".join(pmids_lines))
    utils.write_text(output_pubmed_pmids, "\n".join(pmids_ids))
    print("DONE.")

    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    json_onto_changes = go_ontology_changes.compute_changes(
        current_obo_url, previous_obo_url)
    utils.write_json(output_ontology_changes, json_onto_changes)
    tsv_onto_changes = go_ontology_changes.create_text_report(json_onto_changes)
    utils.write_text(output_ontology_changes_tsv, tsv_onto_changes)
    print("DONE.")

    # 3 - Executing go_annotation_changes script
    print("\n\n3a - EXECUTING GO_ANNOTATION_CHANGES SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    previous_stats = utils.fetch(previous_stats_url).json()
    json_annot_changes = go_annotation_changes.compute_changes(
        json_stats, previous_stats)
    print("DONE.")

    print("\n\n3b - EXECUTING GO_ANNOTATION_CHANGES SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    # TODO: still to be decided - one previous stats file or several?
    # If several, one more parameter is needed.
    previous_stats_no_pb = utils.fetch(previous_stats_no_pb_url).json()
    json_annot_no_pb_changes = go_annotation_changes.compute_changes(
        json_stats_no_pb, previous_stats_no_pb)
    print("DONE.")

    # 4 - Refining go-stats with ontology stats
    print("\n\n4 - EXECUTING GO_REFINE_STATS SCRIPT...\n")

    # Must run on the raw stats, before they are rebuilt below.
    json_annot_changes = utils.merge_dict(json_stats, json_annot_changes)

    ontology = _ontology_summary(json_onto_changes)

    json_stats = _with_ontology(json_stats, ontology)
    print("\n4a - SAVING GO-STATS...\n")
    utils.write_json(output_stats, json_stats)
    print("DONE.")

    json_stats_no_pb = _with_ontology(json_stats_no_pb, ontology)
    print("\n4b - SAVING GO-STATS-NO-PB...\n")
    utils.write_json(output_stats_no_pb, json_stats_no_pb)
    print("DONE.")

    # The "B" counts are added after the stats files were written, so they
    # only show up in the files saved from here on.
    annotations_by_reference_genome = json_stats["annotations"]["by_model_organism"]
    _add_protein_binding_counts(
        json_stats["annotations"], json_stats_no_pb["annotations"])

    reference_genome_labels = [
        go_stats.taxon_label(taxon) for taxon in go_stats.reference_genomes_ids
    ]
    # TODO: we don't have a way to filter on bioentity documents without
    # direct annotations to PB, so no "B" count is computed for bioentities.
    bioentities_by_reference_genome = _by_reference_genome(
        json_stats["bioentities"]["by_filtered_taxon"]["cluster"],
        reference_genome_labels)
    references_by_reference_genome = _by_reference_genome(
        json_stats["references"]["all"]["by_filtered_taxon"],
        reference_genome_labels)
    pmids_by_reference_genome = _by_reference_genome(
        json_stats["references"]["pmids"]["by_filtered_taxon"],
        reference_genome_labels)

    # This is to modify the structure of the annotation changes based on recent requests
    print("\n4c - SAVING GO-ANNOTATION-CHANGES...\n")
    json_annot_changes = go_annotation_changes.alter_annotation_changes(
        json_stats, previous_stats, current_references_ids,
        previous_references_ids, json_annot_changes)
    utils.write_json(output_annotation_changes, json_annot_changes)
    tsv_annot_changes = go_annotation_changes.create_text_report(
        json_annot_changes)
    utils.write_text(output_annotation_changes_tsv, tsv_annot_changes)
    print("DONE.")

    print("\n4d - SAVING GO-ANNOTATION-NO-PB-CHANGES...\n")
    json_annot_no_pb_changes = go_annotation_changes.alter_annotation_changes(
        json_stats_no_pb, previous_stats_no_pb, current_references_ids,
        previous_references_ids, json_annot_no_pb_changes)
    utils.write_json(output_annotation_changes_no_pb, json_annot_no_pb_changes)
    tsv_annot_changes_no_pb = go_annotation_changes.create_text_report(
        json_annot_no_pb_changes)
    utils.write_text(output_annotation_changes_no_pb_tsv, tsv_annot_changes_no_pb)
    print("DONE.")

    # Short aliases, taken only now so they reflect the state after the
    # alter_annotation_changes calls above.
    annotations = json_stats["annotations"]
    annotations_no_pb = json_stats_no_pb["annotations"]
    bioentities = json_stats["bioentities"]
    bioentities_no_pb = json_stats_no_pb["bioentities"]
    refs = json_stats["references"]
    refs_no_pb = json_stats_no_pb["references"]
    changes = json_annot_changes["summary"]["changes"]

    json_stats_summary = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": {
            "total": annotations["total"],
            "total_no_pb": annotations_no_pb["total"],
            "total_pb": annotations["total"] - annotations_no_pb["total"],
            "by_aspect": {
                "P": annotations["by_aspect"]["P"],
                "F": annotations["by_aspect"]["F"],
                "C": annotations["by_aspect"]["C"],
                "B": annotations["by_aspect"]["F"]
                - annotations_no_pb["by_aspect"]["F"],
            },
            "by_bioentity_type_cluster": annotations["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb": annotations_no_pb["by_bioentity_type"]["cluster"],
            "by_qualifier": annotations["by_qualifier"],
            "by_evidence_cluster": annotations["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb": annotations_no_pb["by_evidence"]["cluster"],
            "by_model_organism": annotations_by_reference_genome,
        },
        "taxa": {
            "total": json_stats["taxa"]["total"],
            "filtered": json_stats["taxa"]["filtered"],
        },
        "bioentities": {
            "total": bioentities["total"],
            "total_no_pb": bioentities_no_pb["total"],
            "by_type_cluster": bioentities["by_type"]["cluster"],
            "by_type_cluster_no_pb": bioentities_no_pb["by_type"]["cluster"],
            "by_model_organism": bioentities_by_reference_genome,
        },
        "references": {
            "all": {
                "total": refs["all"]["total"],
                "total_no_pb": refs_no_pb["all"]["total"],
                "added": changes["references"]["added"],
                "removed": changes["references"]["removed"],
                "by_model_organism": references_by_reference_genome,
            },
            "pmids": {
                "total": refs["pmids"]["total"],
                "total_no_pb": refs_no_pb["pmids"]["total"],
                "added": changes["pmids"]["added"],
                "removed": changes["pmids"]["removed"],
                "by_model_organism": pmids_by_reference_genome,
            },
        },
    }

    # removing by_reference_genome.by_evidence (this also removes it from
    # json_stats, since the summary shares the same nested dicts)
    for organism in json_stats_summary["annotations"]["by_model_organism"].values():
        del organism["by_evidence"]

    print("\n4e - SAVING GO-STATS-SUMMARY...\n")
    utils.write_json(output_stats_summary, json_stats_summary)
    print("DONE.")

    # Indicate all processes finished
    print("SUCCESS.")
def _ontology_summary(json_onto_changes):
    """Return the current ontology summary plus the ``changes_*`` counters.

    The result is a shallow copy of ``["summary"]["current"]`` without its
    ``release_date`` entry, extended with one ``changes_<name>`` entry per
    counter found in ``["summary"]["changes"]``.
    """
    summary = json_onto_changes["summary"]
    ontology = summary["current"].copy()
    del ontology["release_date"]
    for target_key, source_key in (
        ("changes_created_terms", "created_terms"),
        ("changes_valid_terms", "valid_terms"),
        ("changes_obsolete_terms", "obsolete_terms"),
        ("changes_merged_terms", "merged_terms"),
        ("changes_biological_process_terms", "biological_process_terms"),
        ("changes_molecular_function_terms", "molecular_function_terms"),
        ("changes_cellular_component_terms", "cellular_component_terms"),
    ):
        ontology[target_key] = summary["changes"][source_key]
    return ontology


def _with_ontology(stats, ontology):
    """Return a new stats dict with ``ontology`` placed after release_date."""
    return {
        "release_date": stats["release_date"],
        "ontology": ontology,
        "annotations": stats["annotations"],
        "taxa": stats["taxa"],
        "bioentities": stats["bioentities"],
        "references": stats["references"],
    }


def _add_protein_binding_counts(annotations, annotations_no_pb):
    """Add a "B" count to every evidence entry of every model organism.

    "B" is the "F" count of ``annotations`` minus the "F" count of
    ``annotations_no_pb`` (presumably the number of protein binding
    annotations - confirm against go_stats). ``annotations`` is modified
    in place.
    """
    for taxon, organism in annotations["by_model_organism"].items():
        for section in ("by_evidence", "by_evidence_cluster"):
            for ecode, counts in organism[section].items():
                no_pb = annotations_no_pb["by_model_organism"][taxon][section][ecode]
                counts["B"] = counts["F"] - no_pb["F"]


def _by_reference_genome(by_filtered_taxon, labels):
    """Map each label to its entry in ``by_filtered_taxon`` ({} if absent)."""
    return {label: by_filtered_taxon.get(label, {}) for label in labels}


def main(argv):
    """Run the stats pipeline: stats, ontology changes, summary, references.

    Steps, in order:
      1. compute the stats with and without protein binding;
      2. compute and save the ontology changes (JSON and TSV);
      3. add the ontology summary to both stats and save them, then build
         and save the stats summary;
      4. write the references, PMID and PubMed id files.

    Options (each takes a value):
        -g, --golrurl   GOlr URL (a trailing "/" is appended if missing).
        -c, --cobo      URL of the current OBO file.
        -p, --pobo      URL of the previous OBO file.
        -o, --orep      Output directory (created if missing).
        -d, --date      Release date passed to ``go_stats.compute_stats``.
        -h, --help      Print the help and exit (takes no value).

    Args:
        argv: Command-line arguments in the form expected by
            ``getopt.getopt`` (i.e. without the program name).

    Exits with status 2, after printing the help, when fewer than 10
    arguments are given or when the options cannot be parsed.
    """
    golr_url = ''
    current_obo_url = ''
    previous_obo_url = ''
    output_rep = ''
    release_date = ''

    if len(argv) < 10:
        print_help()
        sys.exit(2)

    try:
        # "h"/"help" must be declared here, otherwise the help branch below
        # can never be reached.
        opts, argv = getopt.getopt(
            argv, "hg:c:p:o:d:",
            ["help", "golrurl=", "cobo=", "pobo=", "orep=", "date="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_url = arg
            if not golr_url.endswith("/"):
                golr_url = golr_url + "/"
        elif opt in ("-c", "--cobo"):
            current_obo_url = arg
        elif opt in ("-p", "--pobo"):
            previous_obo_url = arg
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-d", "--date"):
            release_date = arg

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        # makedirs also creates missing parent directories.
        os.makedirs(output_rep)

    # actual names of the files to be generated - can change here if needed
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_references = output_rep + "go-references.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"
    output_ontology_changes = output_rep + "go-ontology-changes.json"
    output_ontology_changes_tsv = output_rep + "go-ontology-changes.tsv"
    output_stats_summary = output_rep + "go-stats-summary.json"

    # 1 - Executing go_stats script
    print("\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(golr_url, release_date)
    print("DONE.")

    print("\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    json_stats_no_pb = go_stats.compute_stats(golr_url, release_date, True)
    print("DONE.")

    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    json_onto_changes = go_ontology_changes.compute_changes(
        current_obo_url, previous_obo_url)
    utils.write_json(output_ontology_changes, json_onto_changes)
    tsv_onto_changes = go_ontology_changes.create_text_report(json_onto_changes)
    utils.write_text(output_ontology_changes_tsv, tsv_onto_changes)
    print("DONE.")

    # 3 - Refining go-stats with ontology stats
    print("\n\n3 - EXECUTING GO_REFINE_STATS SCRIPT...\n")

    ontology = _ontology_summary(json_onto_changes)

    json_stats = _with_ontology(json_stats, ontology)
    utils.write_json(output_stats, json_stats)

    json_stats_no_pb = _with_ontology(json_stats_no_pb, ontology)
    utils.write_json(output_stats_no_pb, json_stats_no_pb)

    annotations = json_stats["annotations"]
    annotations_no_pb = json_stats_no_pb["annotations"]
    bioentities = json_stats["bioentities"]
    bioentities_no_pb = json_stats_no_pb["bioentities"]
    refs = json_stats["references"]
    refs_no_pb = json_stats_no_pb["references"]

    # The "B" counts are added after the stats files were written, so they
    # only appear in the summary.
    annotations_by_reference_genome = annotations["by_model_organism"]
    _add_protein_binding_counts(annotations, annotations_no_pb)

    reference_genome_labels = [
        go_stats.taxon_label(taxon) for taxon in go_stats.reference_genomes_ids
    ]
    # TODO: we don't have a way to filter on bioentity documents without
    # direct annotations to PB, so no "B" count is computed for bioentities.
    bioentities_by_reference_genome = _by_reference_genome(
        bioentities["by_filtered_taxon"]["cluster"], reference_genome_labels)
    references_by_reference_genome = _by_reference_genome(
        refs["all"]["by_filtered_taxon"], reference_genome_labels)
    pmids_by_reference_genome = _by_reference_genome(
        refs["pmids"]["by_filtered_taxon"], reference_genome_labels)

    json_stats_summary = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": {
            "total": annotations["total"],
            "total_no_pb": annotations_no_pb["total"],
            "total_pb": annotations["total"] - annotations_no_pb["total"],
            "by_aspect": {
                "P": annotations["by_aspect"]["P"],
                "F": annotations["by_aspect"]["F"],
                "C": annotations["by_aspect"]["C"],
                "B": annotations["by_aspect"]["F"]
                - annotations_no_pb["by_aspect"]["F"],
            },
            "by_bioentity_type_cluster": annotations["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb": annotations_no_pb["by_bioentity_type"]["cluster"],
            "by_evidence_cluster": annotations["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb": annotations_no_pb["by_evidence"]["cluster"],
            "by_model_organism": annotations_by_reference_genome,
        },
        "taxa": {
            "total": json_stats["taxa"]["total"],
            "filtered": json_stats["taxa"]["filtered"],
        },
        "bioentities": {
            "total": bioentities["total"],
            "total_no_pb": bioentities_no_pb["total"],
            "by_type_cluster": bioentities["by_type"]["cluster"],
            "by_type_cluster_no_pb": bioentities_no_pb["by_type"]["cluster"],
            "by_model_organism": bioentities_by_reference_genome,
        },
        "references": {
            "all": {
                "total": refs["all"]["total"],
                "total_no_pb": refs_no_pb["all"]["total"],
                "by_model_organism": references_by_reference_genome,
            },
            "pmids": {
                "total": refs["pmids"]["total"],
                "total_no_pb": refs_no_pb["pmids"]["total"],
                "by_model_organism": pmids_by_reference_genome,
            },
        },
    }

    # removing by_reference_genome.by_evidence (this also removes it from
    # json_stats, since the summary shares the same nested dicts)
    for organism in json_stats_summary["annotations"]["by_model_organism"].values():
        del organism["by_evidence"]

    utils.write_json(output_stats_summary, json_stats_summary)

    print("Saving references file to <" + output_pmids + "> and PubMed PMID file to <" + output_pubmed_pmids + ">")
    references = go_stats.get_references()
    references_lines = [k + "\t" + str(v) for k, v in references.items()]
    pmids_lines = [line for line in references_lines if "PMID:" in line]
    # "PMID:123<TAB>n" -> "123"
    pmids_ids = [line.split("\t")[0].split(":")[1] for line in pmids_lines]
    utils.write_text(output_references, "\n".join(references_lines))
    utils.write_text(output_pmids, "\n".join(pmids_lines))
    utils.write_text(output_pubmed_pmids, "\n".join(pmids_ids))
    print("Done.")

    print("SUCCESS.")
def main(argv):
    """Compute the GO stats, the ontology changes and the stats summary.

    Recognised options (each takes a value): -g/--golrurl, -c/--cobo,
    -p/--pobo, -o/--orep (output directory) and -d/--date.

    Files written into the output directory: go-ontology-changes.json,
    go-ontology-changes.tsv, go-stats.json, go-stats-no-pb.json and
    go-stats-summary.json.

    Prints the help and exits with status 2 when fewer than 10 arguments
    are given or when the options cannot be parsed.
    """
    if len(argv) < 10:
        print_help()
        sys.exit(2)

    try:
        parsed, argv = getopt.getopt(argv, "g:c:p:o:d:", ["golrurl=", "cobo=", "pobo=", "orep=", "date="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    # Both spellings of an option feed the same setting.
    setting_of = {
        "-g": "golr", "--golrurl": "golr",
        "-c": "cobo", "--cobo": "cobo",
        "-p": "pobo", "--pobo": "pobo",
        "-o": "orep", "--orep": "orep",
        "-d": "date", "--date": "date",
    }
    settings = dict.fromkeys(("golr", "cobo", "pobo", "orep", "date"), '')
    for flag, value in parsed:
        if flag == '-h':
            print_help()
            sys.exit()
        name = setting_of.get(flag)
        if name is not None:
            settings[name] = value

    out_dir = settings["orep"]
    if not out_dir.endswith("/"):
        out_dir += "/"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # 1 - Executing go_stats script
    print("\n\n1a - EXECUTING GO_STATS SCRIPT (INCLUDING PROTEIN BINDING)...\n")
    json_stats = go_stats.compute_stats(settings["golr"], settings["date"])
    print("DONE.")

    print("\n\n1b - EXECUTING GO_STATS SCRIPT (EXCLUDING PROTEIN BINDING)...\n")
    json_stats_no_pb = go_stats.compute_stats(settings["golr"], settings["date"], True)
    print("DONE.")

    # 2 - Executing go_ontology_changes script
    print("\n\n2 - EXECUTING GO_ONTOLOGY_CHANGES SCRIPT...\n")
    json_onto_changes = go_ontology_changes.compute_changes(settings["cobo"], settings["pobo"])
    utils.write_json(out_dir + "go-ontology-changes.json", json_onto_changes)
    utils.write_text(
        out_dir + "go-ontology-changes.tsv",
        go_ontology_changes.create_text_report(json_onto_changes))
    print("DONE.")

    # 4 - Refining go-stats with ontology stats
    print("\n\n4 - EXECUTING GO_REFINE_STATS SCRIPT...\n")

    onto_summary = json_onto_changes["summary"]
    ontology = onto_summary["current"].copy()
    del ontology["release_date"]
    for target_key, source_key in (
        ("changes_created_terms", "created_terms"),
        ("changes_valid_terms", "valid_terms"),
        ("changes_obsolete_terms", "obsolete_terms"),
        ("changes_merged_terms", "merged_terms"),
        ("changes_biological_process_terms", "biological_process_terms"),
        ("changes_molecular_function_terms", "molecular_function_terms"),
        ("changes_cellular_component_terms", "cellular_component_terms"),
    ):
        ontology[target_key] = onto_summary["changes"][source_key]

    # Rebuild both stats dicts so that "ontology" sits right after the date.
    json_stats = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": json_stats["annotations"],
        "taxa": json_stats["taxa"],
        "bioentities": json_stats["bioentities"],
        "references": json_stats["references"],
    }
    utils.write_json(out_dir + "go-stats.json", json_stats)

    json_stats_no_pb = {
        "release_date": json_stats_no_pb["release_date"],
        "ontology": ontology,
        "annotations": json_stats_no_pb["annotations"],
        "taxa": json_stats_no_pb["taxa"],
        "bioentities": json_stats_no_pb["bioentities"],
        "references": json_stats_no_pb["references"],
    }
    utils.write_json(out_dir + "go-stats-no-pb.json", json_stats_no_pb)

    annotations = json_stats["annotations"]
    annotations_no_pb = json_stats_no_pb["annotations"]

    # "B" = "F" count with protein binding minus "F" count without it;
    # added in place, after go-stats.json has been written.
    annotations_by_reference_genome = annotations["by_model_organism"]
    for taxon in annotations_by_reference_genome:
        organism = annotations_by_reference_genome[taxon]
        for section in ("by_evidence", "by_evidence_cluster"):
            for ecode in organism[section]:
                counts = organism[section][ecode]
                counts["B"] = counts["F"] - annotations_no_pb["by_model_organism"][taxon][section][ecode]["F"]

    # TODO: we don't have a way to filter on bioentity documents without
    # direct annotations to PB, so bioentities get no "B" count.
    bioentities_by_reference_genome = {
        label: json_stats["bioentities"]["by_filtered_taxon"]["cluster"].get(label, {})
        for label in map(go_stats.taxon_label, go_stats.reference_genomes_ids)
    }
    references_by_reference_genome = {
        label: json_stats["references"]["all"]["by_filtered_taxon"].get(label, {})
        for label in map(go_stats.taxon_label, go_stats.reference_genomes_ids)
    }
    pmids_by_reference_genome = {
        label: json_stats["references"]["pmids"]["by_filtered_taxon"].get(label, {})
        for label in map(go_stats.taxon_label, go_stats.reference_genomes_ids)
    }

    bioentities = json_stats["bioentities"]
    bioentities_no_pb = json_stats_no_pb["bioentities"]
    refs = json_stats["references"]
    refs_no_pb = json_stats_no_pb["references"]

    json_stats_summary = {
        "release_date": json_stats["release_date"],
        "ontology": ontology,
        "annotations": {
            "total": annotations["total"],
            "total_no_pb": annotations_no_pb["total"],
            "total_pb": annotations["total"] - annotations_no_pb["total"],
            "by_aspect": {
                "P": annotations["by_aspect"]["P"],
                "F": annotations["by_aspect"]["F"],
                "C": annotations["by_aspect"]["C"],
                "B": annotations["by_aspect"]["F"] - annotations_no_pb["by_aspect"]["F"],
            },
            "by_bioentity_type_cluster": annotations["by_bioentity_type"]["cluster"],
            "by_bioentity_type_cluster_no_pb": annotations_no_pb["by_bioentity_type"]["cluster"],
            "by_evidence_cluster": annotations["by_evidence"]["cluster"],
            "by_evidence_cluster_no_pb": annotations_no_pb["by_evidence"]["cluster"],
            "by_model_organism": annotations_by_reference_genome,
        },
        "taxa": {
            "total": json_stats["taxa"]["total"],
            "filtered": json_stats["taxa"]["filtered"],
        },
        "bioentities": {
            "total": bioentities["total"],
            "total_no_pb": bioentities_no_pb["total"],
            "by_type_cluster": bioentities["by_type"]["cluster"],
            "by_type_cluster_no_pb": bioentities_no_pb["by_type"]["cluster"],
            "by_model_organism": bioentities_by_reference_genome,
        },
        "references": {
            "all": {
                "total": refs["all"]["total"],
                "total_no_pb": refs_no_pb["all"]["total"],
                "by_model_organism": references_by_reference_genome,
            },
            "pmids": {
                "total": refs["pmids"]["total"],
                "total_no_pb": refs_no_pb["pmids"]["total"],
                "by_model_organism": pmids_by_reference_genome,
            },
        },
    }
    utils.write_json(out_dir + "go-stats-summary.json", json_stats_summary)
    print("DONE.")
def main(argv):
    """Create the GMT files for a fixed list of taxa, overall and per slim.

    Recognised options (each takes a value): -g/--golrurl, -o/--orep
    (output directory) and -s/--slim (base URL of the slim OBO files).
    A trailing "/" is appended to the two URLs and to the output directory
    when missing.

    For each taxon one ``<taxon id>-<aspect>-<evidence group>.gmt`` file is
    written per non-empty entry, plus one
    ``<taxon id>-<slim>-<aspect>-<evidence group>.gmt`` file per slim.

    Prints the help and exits with status 2 when fewer than 6 arguments are
    given or when the options cannot be parsed.
    """
    if len(argv) < 6:
        print_help()
        sys.exit(2)

    try:
        parsed, argv = getopt.getopt(argv, "g:o:s:", ["golrurl=", "orep=", "slim="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    golr_base_url = output_rep = slim_base_url = ''
    for flag, value in parsed:
        if flag == '-h':
            print_help()
            sys.exit()
        if flag in ("-g", "--golrurl"):
            golr_base_url = value if value.endswith("/") else value + "/"
        elif flag in ("-o", "--orep"):
            output_rep = value
        elif flag in ("-s", "--slim"):
            slim_base_url = value if value.endswith("/") else value + "/"

    if not output_rep.endswith("/"):
        output_rep += "/"
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    def write_gmts(prefix, gmts):
        # One file per (aspect, evidence group) pair that has content.
        for aspect in gmts:
            by_evgroup = gmts[aspect]
            for evgroup in by_evgroup:
                content = by_evgroup[evgroup]
                if len(content) > 0:
                    utils.write_text(
                        prefix + "-" + aspect.lower() + "-" + evgroup.lower() + ".gmt",
                        content)

    print("\n1 - Creating ontology map...")
    ontology_map = create_ontology_map(golr_base_url)
    print("Ontology map created with ", len(ontology_map), " terms")

    slims = ["goslim_agr.obo", "goslim_generic.obo", "goslim_chembl.obo"]
    print("\n2 - Loading ", len(slims), " slims to create the slim-specific GMTs...")
    slim_obos = {}
    for slim_file in slims:
        slim_obos[slim_file] = OBO_Parser(utils.fetch(slim_base_url + slim_file).text)
    print("Slims loaded: ", len(slim_obos))

    # Fixed taxon list; utils.REFERENCE_GENOME_IDS is the broader alternative.
    taxa = ["NCBITaxon:9606", "NCBITaxon:10090"]
    print("\n3 - Creating the GMTs for ", len(taxa), " taxa")
    for taxon in taxa:
        gmt_taxon = gmt(ontology_map, golr_base_url, taxon)
        # Files are prefixed with the numeric part of the taxon id.
        output = output_rep + taxon.split(":")[1]

        write_gmts(output, gmt_taxon)

        for slim_file, slim_obo in slim_obos.items():
            valid_terms = slim_obo.get_terms(TermState.VALID).keys()
            gmt_taxon_slim = filter_slim(gmt_taxon, valid_terms)
            write_gmts(output + "-" + slim_file.replace(".obo", ""), gmt_taxon_slim)
def main(argv):
    """Compute the GO stats and save them with their meta and PMID files.

    Recognised options (each takes a value): -g/--golrurl (a trailing "/"
    is appended when missing), -o/--orep (output directory) and -d/--date.

    Stats are computed twice, with ``compute_stats(..., False)`` and
    ``compute_stats(..., True)`` (the latter excludes protein binding, per
    the progress messages), and each result is saved as JSON and TSV. The
    meta files and the PMID / PubMed id files are written afterwards.

    Prints the help and exits with status 2 when fewer than 6 arguments are
    given or when the options cannot be parsed.
    """
    if len(argv) < 6:
        print_help()
        sys.exit(2)

    try:
        parsed, argv = getopt.getopt(argv, "g:b:o:d:", ["golrurl=", "orep=", "date="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    golr_url = output_rep = release_date = ''
    for flag, value in parsed:
        if flag == '-h':
            print_help()
            sys.exit()
        if flag in ("-g", "--golrurl"):
            golr_url = value if value.endswith("/") else value + "/"
        elif flag in ("-o", "--orep"):
            output_rep = value
        elif flag in ("-d", "--date"):
            release_date = value

    if not output_rep.endswith("/"):
        output_rep += "/"
    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    # actual names of the files to be generated - can change here if needed
    output_meta = output_rep + "go-meta.json"
    output_meta_no_pb = output_rep + "go-meta-no-pb.json"
    output_stats = output_rep + "go-stats.json"
    output_stats_no_pb = output_rep + "go-stats-no-pb.json"
    output_stats_tsv = output_rep + "go-stats.tsv"
    output_stats_no_pb_tsv = output_rep + "go-stats-no-pb.tsv"
    output_pmids = output_rep + "go-pmids.tsv"
    output_pubmed_pmids = output_rep + "GO.uid"

    # Stats including protein binding.
    print("Will write stats to " + output_stats + " and " + output_stats_tsv)
    json_stats = compute_stats(golr_url, release_date, False)
    print("Saving Stats to <" + output_stats + "> ...")
    utils.write_json(output_stats, json_stats)
    print("Done.")
    print("Saving Stats to <" + output_stats_tsv + "> ...")
    utils.write_text(output_stats_tsv, create_text_report(json_stats))
    print("Done.")

    # Stats excluding protein binding.
    print("Will write stats (excluding protein binding) to " + output_stats_no_pb + " and " + output_stats_no_pb_tsv)
    json_stats_no_pb = compute_stats(golr_url, release_date, True)
    print("Saving Stats to <" + output_stats_no_pb + "> ...")
    utils.write_json(output_stats_no_pb, json_stats_no_pb)
    print("Done.")
    print("Saving Stats (excluding protein binding) to <" + output_stats_no_pb_tsv + "> ...")
    utils.write_text(output_stats_no_pb_tsv, create_text_report(json_stats_no_pb))
    print("Done.")

    # Meta files, one per stats variant.
    for meta_path, stats in ((output_meta, json_stats),
                             (output_meta_no_pb, json_stats_no_pb)):
        meta = create_meta(stats)
        print("Saving META to <" + meta_path + "> ...")
        utils.write_json(meta_path, meta)
        print("Done.")

    print("Saving PMID file to <" + output_pmids + "> and PubMed PMID file to <" + output_pubmed_pmids + ">")
    # Keep only the references whose id contains "PMID:".
    pmids = {}
    for ref_id, count in get_references().items():
        if "PMID:" in ref_id:
            pmids[ref_id] = count
    pmid_rows = [ref_id + "\t" + str(count) for ref_id, count in pmids.items()]
    # PubMed ids are the part after the first ":" of each reference id.
    pubmed_ids = [ref_id.split(":")[1] for ref_id in pmids]
    utils.write_text(output_pmids, "\n".join(pmid_rows))
    utils.write_text(output_pubmed_pmids, "\n".join(pubmed_ids))
    print("Done.")