def get_test_record():
    """Build a ``clinvar.ClinvarRecord`` from the bundled JSON fixture.

    The fixture is ``resources/test_clinvar_record.json``, resolved relative to
    the directory containing this module.

    Returns:
        The ``clinvar.ClinvarRecord`` constructed from the parsed fixture.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
    fixture_path = os.path.join(resources_dir, 'test_clinvar_record.json')
    with utilities.open_file(fixture_path, "rt") as fixture:
        record_payload = json.load(fixture)
    return clinvar.ClinvarRecord(record_payload)
def get_input_data_for_evidence_string_generation():
    """Prepares mock input data necessary for the evidence string generation.

    Returns:
        A 5-tuple ``(clinvar_record, clinvar_record_measure, report, trait,
        consequence_type)`` where:

        * ``clinvar_record`` is a ``clinvar.ClinvarRecord`` parsed from
          ``config.test_clinvar_record_file``;
        * ``clinvar_record_measure`` is the first entry of
          ``clinvar_record.measures``;
        * ``report`` is a fresh ``clinvar_to_evidence_strings.Report``;
        * ``trait`` is a ``SimpleNamespace`` stand-in carrying the attributes
          ``trait_counter``, ``clinvar_name``, ``ontology_id`` and
          ``ontology_label``;
        * ``consequence_type`` is the first consequence type registered for
          variant ``14:67729209:A:G`` in the test mappings.
    """
    # Read the fixture through a context manager so the handle is closed
    # deterministically instead of being left for the garbage collector.
    with open(config.test_clinvar_record_file) as record_file:
        clinvar_record = clinvar.ClinvarRecord(json.load(record_file))

    report = clinvar_to_evidence_strings.Report()

    trait = SimpleNamespace(
        trait_counter=0,
        clinvar_name='',
        ontology_id='http://www.orpha.net/ORDO/Orphanet_88991',
        ontology_label=None,
    )

    consequence_type = test_clinvar_to_evidence_strings.MAPPINGS.consequence_type_dict[
        '14:67729209:A:G'][0]

    return clinvar_record, clinvar_record.measures[0], report, trait, consequence_type
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings, json_file, ot_schema):
    """Build evidence strings for every record in ``json_file`` and collect them in a report.

    For each measure of each ClinVar record, one evidence string is created per
    combination of (consequence type, trait, converted allele origin) that
    ``skip_record`` does not reject. Each evidence string is handed to
    ``report.add_evidence_string`` together with the parsed schema.

    NOTE(review): a later definition with the same name (taking an additional
    ``output_evidence_strings`` parameter) appears further down; if both live in
    the same module, that one shadows this one. Confirm which is intended.

    Args:
        allowed_clinical_significance: Passed through to ``skip_record``.
        mappings: Object exposing ``trait_2_efo`` and ``consequence_type_dict``.
        json_file: Passed to ``cellbase_records.CellbaseRecords``.
        ot_schema: Path of a JSON file; its parsed contents are passed to
            ``report.add_evidence_string``.

    Returns:
        The populated ``Report``.

    Raises:
        AssertionError: If a converted allele origin is neither ``'germline'``
            nor ``'somatic'``.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Context manager so the schema file handle is not leaked.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    for cellbase_record in cell_recs:
        report.counters["record_counter"] += 1
        if report.counters["record_counter"] % 1000 == 0:
            logger.info("{} records processed".format(report.counters["record_counter"]))
        n_ev_strings_per_record = 0
        clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

        for clinvar_record_measure in clinvar_record.measures:
            # Boolean results are added as 0/1 to the counters.
            report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id is not None)
            append_nsv(report.nsv_list, clinvar_record_measure)
            report.counters["n_multiple_allele_origin"] += (
                len(clinvar_record.allele_origins) > 1)
            traits = create_traits(clinvar_record.traits, mappings.trait_2_efo, report)
            converted_allele_origins = convert_allele_origins(clinvar_record.allele_origins)

            for consequence_type, trait, allele_origin in itertools.product(
                    get_consequence_types(clinvar_record_measure,
                                          mappings.consequence_type_dict),
                    traits,
                    converted_allele_origins):

                if skip_record(clinvar_record, clinvar_record_measure, consequence_type,
                               allele_origin, allowed_clinical_significance, report):
                    continue

                if allele_origin == 'germline':
                    evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                elif allele_origin == 'somatic':
                    evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                else:
                    # Without this branch an unexpected origin would either hit an
                    # unbound local or silently reuse the previous iteration's
                    # evidence string.
                    raise AssertionError(
                        'Unknown allele_origin present in the data: {}'.format(allele_origin))

                report.add_evidence_string(evidence_string, clinvar_record, trait,
                                           consequence_type.ensembl_gene_id,
                                           ot_schema_contents)
                report.evidence_list.append([
                    clinvar_record.accession,
                    clinvar_record_measure.rs_id,
                    trait.clinvar_name,
                    trait.ontology_id,
                ])
                report.counters["n_valid_rs_and_nsv"] += (
                    clinvar_record_measure.nsv_id is not None)
                report.traits.add(trait.ontology_id)
                report.remove_trait_mapping(trait.clinvar_name)
                report.ensembl_gene_id_uris.add(
                    evidence_strings.get_ensembl_gene_id_uri(
                        consequence_type.ensembl_gene_id))

                n_ev_strings_per_record += 1

        # NOTE(review): the flattened source does not show whether this tally sat
        # inside the per-measure loop. It is placed at record level here because
        # the counter is reset once per record, so each record is counted once.
        # Confirm against the original layout.
        if n_ev_strings_per_record > 0:
            report.counters["n_processed_clinvar_records"] += 1
            if n_ev_strings_per_record > 1:
                report.counters["n_multiple_evidence_strings"] += 1

    return report
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings, json_file, ot_schema,
                                output_evidence_strings):
    """Build evidence strings for every record in ``json_file`` and stream them to a file.

    For each measure of each ClinVar record, one evidence string is created per
    combination of (consequence type, trait, converted allele origin) that
    ``skip_record`` does not reject. Each evidence string is validated with
    ``validate_evidence_string`` and written immediately as one JSON document
    per line, so the full set is never held in memory.

    Args:
        allowed_clinical_significance: Passed through to ``skip_record``.
        mappings: Object exposing ``trait_2_efo`` and ``consequence_type_dict``.
        json_file: Passed to ``cellbase_records.CellbaseRecords``.
        ot_schema: Path of a JSON file; its parsed contents are passed to
            ``validate_evidence_string``.
        output_evidence_strings: Destination passed to ``utilities.open_file``
            in ``'wt'`` mode.

    Returns:
        The populated ``Report``.

    Raises:
        AssertionError: If a converted allele origin is neither ``'germline'``
            nor ``'somatic'``.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Context manager so the schema file handle is not leaked.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    # The output file is managed by ``with`` so it is flushed and closed even when
    # validation or the allele-origin check raises part-way through.
    with utilities.open_file(output_evidence_strings, 'wt') as output_evidence_strings_file:
        for cellbase_record in cell_recs:
            report.counters["record_counter"] += 1
            if report.counters["record_counter"] % 1000 == 0:
                logger.info("{} records processed".format(report.counters["record_counter"]))
            n_ev_strings_per_record = 0
            clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

            for clinvar_record_measure in clinvar_record.measures:
                # Boolean results are added as 0/1 to the counters.
                report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id is not None)
                append_nsv(report.nsv_list, clinvar_record_measure)
                report.counters["n_multiple_allele_origin"] += (
                    len(clinvar_record.allele_origins) > 1)
                traits = create_traits(clinvar_record.traits, mappings.trait_2_efo, report)
                converted_allele_origins = convert_allele_origins(
                    clinvar_record.allele_origins)

                for consequence_type, trait, allele_origin in itertools.product(
                        get_consequence_types(clinvar_record_measure,
                                              mappings.consequence_type_dict),
                        traits,
                        converted_allele_origins):

                    if skip_record(clinvar_record, clinvar_record_measure, consequence_type,
                                   allele_origin, allowed_clinical_significance, report):
                        continue

                    if allele_origin == 'germline':
                        evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                            clinvar_record, clinvar_record_measure, report, trait,
                            consequence_type)
                    elif allele_origin == 'somatic':
                        evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                            clinvar_record, clinvar_record_measure, report, trait,
                            consequence_type)
                    else:
                        raise AssertionError(
                            'Unknown allele_origin present in the data: {}'.format(
                                allele_origin))

                    # Validate and immediately output the evidence string
                    # (not keeping everything in memory)
                    validate_evidence_string(evidence_string, clinvar_record, trait,
                                             consequence_type.ensembl_gene_id,
                                             ot_schema_contents)
                    output_evidence_strings_file.write(json.dumps(evidence_string) + '\n')
                    report.evidence_string_count += 1

                    report.evidence_list.append([
                        clinvar_record.accession,
                        clinvar_record_measure.rs_id,
                        trait.clinvar_name,
                        trait.ontology_id,
                    ])
                    report.counters["n_valid_rs_and_nsv"] += (
                        clinvar_record_measure.nsv_id is not None)
                    report.traits.add(trait.ontology_id)
                    report.remove_trait_mapping(trait.clinvar_name)
                    report.ensembl_gene_id_uris.add(
                        evidence_strings.get_ensembl_gene_id_uri(
                            consequence_type.ensembl_gene_id))

                    n_ev_strings_per_record += 1

            # NOTE(review): the flattened source does not show whether this tally
            # sat inside the per-measure loop. It is placed at record level here
            # because the counter is reset once per record, so each record is
            # counted once. Confirm against the original layout.
            if n_ev_strings_per_record > 0:
                report.counters["n_processed_clinvar_records"] += 1
                if n_ev_strings_per_record > 1:
                    report.counters["n_multiple_evidence_strings"] += 1

    return report