def run(cancer_types, long_names, out_dir):
    """Write one metrics-reference Dataset JSON file per cancer type.

    For each entry of ``cancer_types`` a new id is requested from
    ``id_generator.IDGenerator`` (the second element of every result is fed
    back into the next request, starting from "000007S"), a document
    describing the reference dataset is assembled and it is dumped to
    ``out_dir + "Dataset_Metrics_Ref_<cancer>_<id>.json"``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        long_names: mapping indexed by cancer acronym; the value is embedded
            in the dataset name and description.
        out_dir: output location; it is concatenated with the file name
            as-is, so it must already end with a path separator.
    """
    id_source = id_generator.IDGenerator()
    previous_id = "000007S"

    for cancer in cancer_types:
        # get ref dataset id
        dataset_id, previous_id = id_source.get_new_OEB_id(
            "002", "D", previous_id)

        challenge_ref = "TCGA:2018-04-05_" + cancer
        full_name = long_names[cancer]

        document = {
            "_id": challenge_ref + "_M",
            "datalink": {
                "uri": "https://portal.gdc.cancer.gov/",
                "attrs": ["archive"],
                "validation_date": "2018-04-05T00:00:00Z",
                "status": "ok",
            },
            "type": "metrics_reference",
            "challenge_ids": [challenge_ref],
            "visibility": "public",
            "version": "unknown",
            "name": "Metrics Reference Dataset for " + full_name,
            "description": "List of genes (described by TCGA community) that can be used as 'gold standard' in " + full_name + " benchmark ",
            "dates": {
                "creation": "2018-04-05T00:00:00Z",
                "modification": "2018-04-05T14:00:00Z",
            },
            "depends_on": {
                "rel_dataset_ids": [
                    {"dataset_id": "TCGA:2018-04-05_input"},
                ],
            },
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
            "community_ids": ["OEBC001"],
            "dataset_contact_ids": [
                "Matthew.Bailey",
                "Eduard.Porta",
                "Collin.Tokheim",
            ],
        }

        target = out_dir + "Dataset_Metrics_Ref_" + cancer + "_" + dataset_id + ".json"
        with open(target, 'w') as handle:
            json.dump(document, handle, sort_keys=True, indent=4,
                      separators=(',', ': '))
def run(cancer_types, mongo_ids, out_dir):
    """Write one TestEvent JSON file per (cancer type, participant) pair.

    Participants are the entries of the hard-coded ``input/participants``
    directory, listed again for every cancer type. For each pair the function
    requests a test-event id from ``id_generator.IDGenerator`` and dumps a
    TestAction document to
    ``out_dir + "TestEvent_<cancer>_<participant>_<event id>.json"``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        mongo_ids: container of known participants; when a participant is
            present its value is looked up, otherwise a temporary tool id is
            generated.
        out_dir: output location, concatenated with the file name as-is.
    """
    challenge_cursor = "0000000"
    event_cursor = "0000000"
    dataset_cursor = "0000000"
    tool_cursor = "0000008"
    id_source = id_generator.IDGenerator()

    for cancer in cancer_types:
        # NOTE(review): the challenge, tool and participant-dataset ids are
        # requested but never written into the document; the calls are kept
        # so the sequence of generator requests is unchanged.
        challenge_id, challenge_cursor = id_source.get_new_OEB_id(
            "002", "X", challenge_cursor)
        challenge_ref = "TCGA:2018-04-05_" + cancer

        for participant in os.listdir(
                "/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants"):
            # participants unknown to mongo get a new temporary id
            if participant not in mongo_ids:
                tool_id, tool_cursor = id_source.get_new_OEB_id(
                    "002", "T", tool_cursor)
            else:
                tool_id = mongo_ids[participant]

            # test event id (used in the output file name)
            event_id, event_cursor = id_source.get_new_OEB_id(
                "002", "A", event_cursor)
            # participant dataset id
            participant_data_id, dataset_cursor = id_source.get_new_OEB_id(
                "002", "D", dataset_cursor)

            incoming = {
                "dataset_id": "TCGA:2018-04-05_input",
                "role": "incoming",
            }
            outgoing = {
                "dataset_id": challenge_ref + "_P_" + participant,
                "role": "outgoing",
            }
            document = {
                "_id": challenge_ref + "_testEvent_" + participant,
                "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/TestAction",
                "action_type": "TestEvent",
                "tool_id": "TCGA:" + participant,
                "involved_datasets": [incoming, outgoing],
                "challenge_id": challenge_ref,
                "dates": {
                    "creation": "2018-04-05T00:00:00Z",
                    "reception": "2018-04-05T00:00:00Z",
                },
                "test_contact_ids": [
                    "Matthew.Bailey",
                    "Eduard.Porta",
                    "Collin.Tokheim",
                ],
            }

            target = out_dir + "TestEvent_" + cancer + "_" + participant + "_" + event_id + ".json"
            with open(target, 'w') as handle:
                json.dump(document, handle, sort_keys=True, indent=4,
                          separators=(',', ': '))
# Root of the benchmark inputs; every file this block reads lives below it.
_TCGA_INPUT_DIR = "/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/"
_TCGA_PARTICIPANTS_DIR = _TCGA_INPUT_DIR + "participants"

# One row per assessment metric:
# (tag used in ids / file names, full name used in the description,
#  short label used in the dataset name).
_ASSESSMENT_METRICS = (
    ("TPR", "True Positive Rate", "TPR"),
    ("precision", "Positive Predictive Value", "precision-PPV"),
)


def _lookup_or_global(value, name):
    """Return ``value``, or the module-level variable ``name`` if it is None.

    Keeps old five-argument callers of ``get_metrics_across_all_cancers``
    working: they relied on ``long_names``, ``mongo_tool_ids`` and
    ``tool_contact`` being module globals.
    """
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            "get_metrics_across_all_cancers() needs '%s': pass it as a "
            "keyword argument" % name)


def _write_assessment_dataset(out_dir, cancer, cancer_long_name, participant,
                              contact, dataset_id, metric, value):
    """Dump one assessment Dataset document as JSON into ``out_dir``.

    ``metric`` is one row of ``_ASSESSMENT_METRICS``; ``value`` is stored as
    the inline data of the dataset. The file name is printed before writing.
    """
    tag, title, label = metric
    challenge_ref = "TCGA:2018-04-05_" + cancer
    info = {
        "_id": challenge_ref + "_A_" + tag + "_" + participant,
        "description": "Assessment dataset for applying Metric '" + title + "' to " + participant + " predictions in " + cancer_long_name,
        "dates": {
            "creation": "2018-04-05T00:00:00Z",
            "modification": "2018-04-05T14:00:00Z",
        },
        "type": "assessment",
        "visibility": "public",
        "datalink": {
            "inline_data": {"value": value},
        },
        "depends_on": {
            "tool_id": "TCGA:" + participant,
            "metrics_id": "TCGA:" + tag,
            "rel_dataset_ids": [
                {"dataset_id": challenge_ref + "_P_" + participant},
                {"dataset_id": challenge_ref + "_M"},
            ],
        },
        "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
        "community_ids": ["OEBC001"],
        "challenge_ids": [challenge_ref],
        "version": "1",
        "name": "Assesment of Metric " + label + " in " + participant,
        "dataset_contact_ids": [contact],
    }

    filename = "Dataset_assessment_" + cancer + "_" + participant + "_" + tag + "_" + dataset_id + ".json"
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(filename)
    with open(out_dir + filename, 'w') as f:
        json.dump(info, f, sort_keys=True, indent=4, separators=(',', ': '))


def run(cancer_types, long_names, mongo_tool_ids, tool_contact, mongo_datRef_ids, out_dir):
    """Write the TPR and precision assessment datasets of every participant.

    For each cancer type the gold standard is read from
    ``<input dir>/<cancer>.txt`` (first column, ``#`` lines skipped),
    ``compute_metrics`` is called, and two JSON files per participant are
    written to ``out_dir``. Afterwards ``get_metrics_across_all_cancers`` is
    called with the predictions accumulated in ``all_cancer_genes``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        long_names: mapping indexed by cancer acronym, used in descriptions.
        mongo_tool_ids: container of known participants and their tool ids.
        tool_contact: mapping indexed by participant; the value becomes the
            dataset contact.
        mongo_datRef_ids: accepted for interface compatibility; not used.
        out_dir: output location, concatenated with the file name as-is.
    """
    # The listing does not depend on the cancer type, so it is read once.
    participants = os.listdir(_TCGA_PARTICIPANTS_DIR)

    # dict that will store info about all combined cancer types
    all_cancer_genes = {participant: [] for participant in participants}

    last_challenge = "0000000"
    last_participant_dataset = "0000000"
    last_ref_dataset = "000007S"
    last_tool = "0000008"
    last_assessment_dataset = "000008R"

    IDGenerator = id_generator.IDGenerator()

    for cancer in cancer_types:
        # NOTE(review): challenge, reference-dataset, tool and
        # participant-dataset ids are requested but never written out. The
        # calls are kept because it is not visible here whether the generator
        # is stateless - confirm before removing them.
        challenge_id, last_challenge = IDGenerator.get_new_OEB_id("002", "X", last_challenge)
        ref_data_id, last_ref_dataset = IDGenerator.get_new_OEB_id("002", "D", last_ref_dataset)

        data = pandas.read_csv(_TCGA_INPUT_DIR + cancer + ".txt", comment="#", header=None)
        gold_standard = data.iloc[:, 0].values

        participants_datasets, all_cancer_genes = compute_metrics(
            _TCGA_INPUT_DIR, gold_standard, cancer, all_cancer_genes)

        for participant in participants:
            # if participant is not in mongo, assign new temporary id
            if participant in mongo_tool_ids:
                tool_id = mongo_tool_ids[participant]
            else:
                tool_id, last_tool = IDGenerator.get_new_OEB_id("002", "T", last_tool)

            participant_data_id, last_participant_dataset = IDGenerator.get_new_OEB_id(
                "002", "D", last_participant_dataset)

            # Both values are fetched before anything is written, so a
            # malformed result fails without leaving a half-written pair.
            metric_values = (participants_datasets[participant][0],
                             participants_datasets[participant][1])

            for metric, value in zip(_ASSESSMENT_METRICS, metric_values):
                A_data_id, last_assessment_dataset = IDGenerator.get_new_OEB_id(
                    "002", "D", last_assessment_dataset)
                _write_assessment_dataset(
                    out_dir, cancer, long_names[cancer], participant,
                    tool_contact[participant], A_data_id, metric, value)

    # The lookups are passed explicitly instead of relying on module globals.
    get_metrics_across_all_cancers(
        all_cancer_genes, last_assessment_dataset, last_participant_dataset,
        last_tool, out_dir, long_names=long_names,
        mongo_tool_ids=mongo_tool_ids, tool_contact=tool_contact)


def get_metrics_across_all_cancers(all_cancer_genes, last_assessment_dataset,
                                   last_participant_dataset, last_tool, out_dir,
                                   long_names=None, mongo_tool_ids=None,
                                   tool_contact=None):
    """Write assessment datasets for the combined ("ALL") cancer challenge.

    The gold standard is the first column of ``<input dir>/ALL.txt``. For each
    participant directory entry, TPR is the share of gold-standard genes found
    in the participant's accumulated predictions, and precision is the share
    of those predictions found in the gold standard (0 when there are no
    predictions). Two JSON files per participant are written to ``out_dir``.

    Args:
        all_cancer_genes: mapping participant -> collection of predicted genes.
        last_assessment_dataset: id after which assessment ids continue.
        last_participant_dataset: id after which participant ids continue.
        last_tool: id after which temporary tool ids continue.
        out_dir: output location, concatenated with the file name as-is.
        long_names: mapping that must contain the key "ALL".
        mongo_tool_ids: container of known participants and their tool ids.
        tool_contact: mapping indexed by participant.

    The last three arguments fall back to module-level variables of the same
    name when omitted, which is how the function resolved them previously.

    Raises:
        NameError: if a lookup is neither passed nor defined at module level.
    """
    long_names = _lookup_or_global(long_names, "long_names")
    mongo_tool_ids = _lookup_or_global(mongo_tool_ids, "mongo_tool_ids")
    tool_contact = _lookup_or_global(tool_contact, "tool_contact")

    IDGenerator = id_generator.IDGenerator()

    data = pandas.read_csv(_TCGA_INPUT_DIR + "ALL.txt", comment="#", header=None)
    gold_standard = data.iloc[:, 0].values

    cancer = "ALL"

    for participant in os.listdir(_TCGA_PARTICIPANTS_DIR):
        # predicted genes stored in all_cancer_genes
        predicted_genes = all_cancer_genes[participant]
        overlapping_genes = set(predicted_genes).intersection(gold_standard)

        # float() forces true division: on Python 2 the plain "/" between two
        # ints truncates, which would reduce both metrics to 0 or 1.
        TPR = float(len(overlapping_genes)) / len(gold_standard)
        if len(predicted_genes) == 0:
            acc = 0
        else:
            acc = float(len(overlapping_genes)) / len(predicted_genes)

        # if participant is not in mongo, assign new temporary id
        if participant in mongo_tool_ids:
            tool_id = mongo_tool_ids[participant]
        else:
            tool_id, last_tool = IDGenerator.get_new_OEB_id("002", "T", last_tool)

        participant_data_id, last_participant_dataset = IDGenerator.get_new_OEB_id(
            "002", "D", last_participant_dataset)

        for metric, value in zip(_ASSESSMENT_METRICS, (TPR, acc)):
            A_data_id, last_assessment_dataset = IDGenerator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)
            _write_assessment_dataset(
                out_dir, cancer, long_names[cancer], participant,
                tool_contact[participant], A_data_id, metric, value)
def run(cancer_types, long_names, urls, out_dir):
    """Write one Challenge JSON file per cancer type.

    Each document lists the assessment metrics (TPR, precision) and the
    aggregation metric of the challenge and is dumped to
    ``out_dir + "Challenge_<cancer>_<challenge id>.json"``, where the id comes
    from ``id_generator.IDGenerator``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        long_names: mapping indexed by cancer acronym, used in name and
            descriptions.
        urls: mapping indexed by cancer acronym; the value becomes "url".
        out_dir: output location, concatenated with the file name as-is.
    """
    id_source = id_generator.IDGenerator()
    cursor = "0000000"

    for cancer in cancer_types:
        # get schema alphanumeric id
        challenge_id, cursor = id_source.get_new_OEB_id("002", "X", cursor)
        full_name = long_names[cancer]

        assessment_category = {
            "category": "assessment",
            # NOTE(review): "datatseta" looks like a typo for "datasets"; it
            # is emitted as-is because it is part of the generated data.
            "description": "metrics used to benchmark the performance of cancer genes predictors in Challenge " + full_name + ", generating the assessment datatseta",
            "metrics": [
                {"metrics_id": "TCGA:TPR", "tool_id": "TCGA:compute_TPR"},
                {"metrics_id": "TCGA:precision", "tool_id": "TCGA:compute_precision"},
            ],
        }
        aggregation_category = {
            "category": "aggregation",
            "description": "metrics used to aggregate the assessment data of all cancer genes predictors participating in challenge " + full_name + " in a consolidated Aggregation dataset",
            "metrics": [
                {"metrics_id": "TCGA:aggregation", "tool_id": "TCGA:aggregate_benchmark"},
            ],
        }

        document = {
            "_id": "TCGA:2018-04-05_" + cancer,
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Challenge",
            "acronym": cancer,
            "name": "Cancer Driver Genes Prediction Benchmark in " + full_name,
            "benchmarking_event_id": "TCGA:2018-04-05",
            "is_automated": False,
            # NOTE(review): benchmark_stop (02:00) is earlier than
            # benchmark_start (05:00) - confirm the intended timestamps.
            "dates": {
                "creation": "2018-04-05T00:00:00Z",
                "modification": "2018-04-05T14:00:00Z",
                "benchmark_start": "2018-04-05T05:00:00Z",
                "benchmark_stop": "2018-04-05T02:00:00Z",
            },
            "metrics_categories": [assessment_category, aggregation_category],
            "url": urls[cancer],
            "challenge_contact_ids": [
                "Matthew.Bailey",
                "Eduard.Porta",
                "Collin.Tokheim",
            ],
            "references": ["doi:10.1016/j.cell.2018.02.060"],
        }

        target = out_dir + "Challenge_" + cancer + "_" + challenge_id + ".json"
        with open(target, 'w') as handle:
            json.dump(document, handle, sort_keys=True, indent=4,
                      separators=(',', ': '))
def _read_inline_value(path):
    """Return ``datalink.inline_data.value`` from the JSON file at ``path``."""
    with io.open(path, mode='r', encoding="utf-8") as f:
        return json.load(f)["datalink"]["inline_data"]["value"]


def run(cancer_types, long_names, mongo_tool_ids, tool_contact, out_dir,
        assessment_dir="out/assessment_datasets/"):
    """Write one aggregation (summary) Dataset JSON file per cancer type.

    For every participant directory entry the TPR and precision values are
    read back from the assessment files in ``assessment_dir``. Those file
    names are rebuilt from ids requested from ``id_generator.IDGenerator``,
    so the request sequence here must match the one used when the assessment
    files were written. The values are collected as 2D-plot points and the
    document is dumped to
    ``out_dir + "Dataset_Aggregation_<cancer>_<dataset id>.json"``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        long_names: mapping indexed by cancer acronym, used in name and
            description.
        mongo_tool_ids: container of known participants and their tool ids.
        tool_contact: accepted for interface compatibility; not used.
        out_dir: output location, concatenated with the file name as-is.
        assessment_dir: directory prefix of the assessment files; defaults to
            the previously hard-coded relative path.
    """
    last_challenge = "0000000"
    last_tool = "0000008"
    last_assessment_dataset = "000008R"
    last_challenge_dataset = "00000OB"
    last_ref_dataset = "000007S"

    IDGenerator = id_generator.IDGenerator()

    # The listing does not depend on the cancer type, so it is read once.
    participants = os.listdir("/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants")

    for cancer in cancer_types:
        # NOTE(review): challenge, reference-dataset and tool ids are
        # requested but never written out; the calls are kept because it is
        # not visible here whether the generator is stateless.
        challenge_id, last_challenge = IDGenerator.get_new_OEB_id("002", "X", last_challenge)
        ref_data_id, last_ref_dataset = IDGenerator.get_new_OEB_id("002", "D", last_ref_dataset)
        # challenge dataset id (used in the output file name)
        challenge_data_id, last_challenge_dataset = IDGenerator.get_new_OEB_id(
            "002", "D", last_challenge_dataset)

        challenge_ref = "TCGA:2018-04-05_" + cancer

        # related datasets plus the per-participant metric results
        involved_datasets = []
        inline_data = {
            "visualization": {
                "type": "2D-plot",
                "x_axis": "OEBM0020000002",
                "y_axis": "OEBM0020000001",
            },
            "challenge_participants": [],
        }

        for participant in participants:
            # if participant is not in mongo, assign new temporary id
            if participant in mongo_tool_ids:
                tool_id = mongo_tool_ids[participant]
            else:
                tool_id, last_tool = IDGenerator.get_new_OEB_id("002", "T", last_tool)

            # assessment dataset ids for metric 1 (TPR) and 2 (precision)
            A_data_id_TPR, last_assessment_dataset = IDGenerator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)
            A_data_id_precision, last_assessment_dataset = IDGenerator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)

            # read the files which contain the metric values
            file_stem = assessment_dir + "Dataset_assessment_" + cancer + "_" + participant
            metric1 = _read_inline_value(file_stem + "_TPR_" + A_data_id_TPR + ".json")
            metric2 = _read_inline_value(file_stem + "_precision_" + A_data_id_precision + ".json")

            inline_data["challenge_participants"].append({
                "tool_id": "TCGA:" + participant,
                "metric_x": metric1,
                "metric_y": metric2,
            })

            involved_datasets.append({"dataset_id": challenge_ref + "_A_TPR_" + participant})
            involved_datasets.append({"dataset_id": challenge_ref + "_A_precision_" + participant})

        # append reference and input datasets
        involved_datasets.append({"dataset_id": "TCGA:2018-04-05_input"})
        involved_datasets.append({"dataset_id": challenge_ref + "_M"})

        info = {
            "_id": challenge_ref + "_Aggregation",
            "datalink": {
                "inline_data": inline_data,
                "schema_url": "https://raw.githubusercontent.com/inab/OpenEBench_scientific_visualizer/js/benchmarking_data_model/inline_data_visualizer.json",
            },
            "type": "aggregation",
            "visibility": "public",
            "version": "unknown",
            "name": "Summary dataset for challenge: " + long_names[cancer],
            # This document summarises the whole challenge. The text used to
            # end with " in participant <name>", built from the loop variable
            # left over from the participant loop: it named whichever entry
            # was listed last and raised NameError for an empty directory.
            "description": "Summary dataset with information about challenge " + long_names[cancer] + " (e.g. input/output datasets, metrics...)",
            "dates": {
                "creation": "2018-04-05T00:00:00Z",
                "modification": "2018-04-05T14:00:00Z",
            },
            "depends_on": {
                "tool_id": "TCGA:aggregate_benchmark",
                "rel_dataset_ids": involved_datasets,
            },
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
            "community_ids": ["OEBC001"],
            "challenge_ids": [challenge_ref],
            "dataset_contact_ids": [
                "Eduard.Porta",
                "Matthew.Bailey",
                "Collin.Tokheim",
                "Loris.Mularoni",
                "Juri.Reimand",
                "David.Tamborero",
                "Nathan.Dees",
            ],
        }

        filename = "Dataset_Aggregation_" + cancer + "_" + challenge_data_id + ".json"
        with open(out_dir + filename, 'w') as f:
            json.dump(info, f, sort_keys=True, indent=4, separators=(',', ': '))
def run(cancer_types, long_names, mongo_ids, tool_contact, download_urls, out_dir):
    """Write one participant Dataset JSON file per (cancer, participant) pair.

    Participants are the entries of the hard-coded ``input/participants``
    directory, listed again for every cancer type. Each document is dumped to
    ``out_dir + "Dataset_participant_<cancer>_<participant>_<id>.json"``,
    where the id is requested from ``id_generator.IDGenerator``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        long_names: mapping indexed by cancer acronym, used in name and
            description.
        mongo_ids: container of known participants and their tool ids.
        tool_contact: mapping indexed by participant; the value becomes the
            dataset contact.
        download_urls: mapping indexed by participant; the value becomes the
            datalink uri.
        out_dir: output location, concatenated with the file name as-is.
    """
    challenge_cursor = "0000000"
    dataset_cursor = "0000000"
    tool_cursor = "0000008"
    id_source = id_generator.IDGenerator()

    for cancer in cancer_types:
        # NOTE(review): challenge and tool ids are requested but never
        # written into the document; the calls are kept so the sequence of
        # generator requests is unchanged.
        challenge_id, challenge_cursor = id_source.get_new_OEB_id(
            "002", "X", challenge_cursor)
        challenge_ref = "TCGA:2018-04-05_" + cancer

        for participant in os.listdir(
                "/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants"):
            # participants unknown to mongo get a new temporary id
            if participant not in mongo_ids:
                tool_id, tool_cursor = id_source.get_new_OEB_id(
                    "002", "T", tool_cursor)
            else:
                tool_id = mongo_ids[participant]

            # participant dataset id (used in the output file name)
            dataset_id, dataset_cursor = id_source.get_new_OEB_id(
                "002", "D", dataset_cursor)

            full_name = long_names[cancer]
            document = {
                "_id": challenge_ref + "_P_" + participant,
                "name": "Cancer Driver Genes in " + full_name,
                "description": "List of Cancer Driver Genes predicted by tool " + participant + " in " + full_name,
                "dates": {
                    "creation": "2018-04-05T00:00:00Z",
                    "modification": "2018-04-05T14:00:00Z",
                },
                "datalink": {
                    "uri": download_urls[participant],
                    "attrs": ["archive"],
                    "validation_date": "2018-04-05T00:00:00Z",
                    "status": "ok",
                },
                "type": "participant",
                "visibility": "public",
                "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
                "community_ids": ["OEBC001"],
                "challenge_ids": [challenge_ref],
                "depends_on": {
                    "tool_id": "TCGA:" + participant,
                    "rel_dataset_ids": [
                        {"dataset_id": "TCGA:2018-04-05_input"},
                    ],
                },
                "version": "unknown",
                "dataset_contact_ids": [tool_contact[participant]],
            }

            target = out_dir + "Dataset_participant_" + cancer + "_" + participant + "_" + dataset_id + ".json"
            with open(target, 'w') as handle:
                json.dump(document, handle, sort_keys=True, indent=4,
                          separators=(',', ': '))
def run(cancer_types, out_dir):
    """Write one AggregationEvent JSON file per cancer type.

    The event lists the TPR and precision assessment datasets of every entry
    of the hard-coded ``input/participants`` directory as incoming and the
    challenge's aggregation dataset as outgoing. The document is dumped to
    ``out_dir + "AggregationEvent_<cancer>_<event id>.json"``, where the id
    is requested from ``id_generator.IDGenerator``.

    Args:
        cancer_types: iterable of cancer acronyms (strings).
        out_dir: output location, concatenated with the file name as-is.
    """
    challenge_cursor = "0000000"
    event_cursor = "00000NC"
    id_source = id_generator.IDGenerator()

    for cancer in cancer_types:
        # NOTE(review): the challenge id is requested but never written out;
        # the call is kept so the generator request sequence is unchanged.
        challenge_id, challenge_cursor = id_source.get_new_OEB_id(
            "002", "X", challenge_cursor)
        # stat event id (used in the output file name)
        event_id, event_cursor = id_source.get_new_OEB_id(
            "002", "A", event_cursor)

        challenge_ref = "TCGA:2018-04-05_" + cancer

        # all incoming assessment datasets, then the outgoing aggregation one
        linked = []
        for participant in os.listdir(
                "/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants"):
            for metric_tag in ("TPR", "precision"):
                linked.append({
                    "dataset_id": challenge_ref + "_A_" + metric_tag + "_" + participant,
                    "role": "incoming",
                })
        linked.append({
            "dataset_id": challenge_ref + "_Aggregation",
            "role": "outgoing",
        })

        document = {
            "_id": challenge_ref + "_do_aggregation",
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/TestAction",
            "tool_id": "TCGA:aggregate_benchmark",
            "action_type": "AggregationEvent",
            "involved_datasets": linked,
            "challenge_id": challenge_ref,
            "dates": {
                "creation": "2018-04-05T00:00:00Z",
                "reception": "2018-04-05T00:00:00Z",
            },
            "test_contact_ids": [
                "Matthew.Bailey",
                "Eduard.Porta",
                "Collin.Tokheim",
            ],
        }

        target = out_dir + "AggregationEvent_" + cancer + "_" + event_id + ".json"
        with open(target, 'w') as handle:
            json.dump(document, handle, sort_keys=True, indent=4,
                      separators=(',', ': '))