def test_genetics_evidence_string(self):
     """Verify the generated genetics evidence string against the stored one.

     Builds a ``CTTVGeneticsEvidenceString`` from ``self.test_args``,
     serialises it as JSON with sorted keys and a 2-space indent, and compares
     the text with the contents of the file at
     ``config.expected_genetics_evidence_string``.
     """
     evidence_string = json.dumps(
         evidence_strings.CTTVGeneticsEvidenceString(*self.test_args),
         sort_keys=True,
         indent=2)
     # Read through a context manager so the file handle is closed even when
     # the assertion below fails.
     with open(config.expected_genetics_evidence_string) as expected_file:
         expected_evidence_string = expected_file.read()
     self.assertEqual(evidence_string, expected_evidence_string)
Esempio n. 2
0
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings,
                                json_file, ot_schema):
    """Convert ClinVar records into evidence strings collected in a report.

    Iterates over the records produced by ``CellbaseRecords(json_file=...)``.
    For every measure of every record, one evidence string is attempted for
    each (consequence type, trait, allele origin) combination; combinations
    rejected by ``skip_record`` are ignored.

    Args:
        allowed_clinical_significance: Passed unchanged to ``skip_record``.
        mappings: Object providing ``trait_2_efo`` and
            ``consequence_type_dict``.
        json_file: Passed to ``cellbase_records.CellbaseRecords``.
        ot_schema: Path of a JSON schema file; its parsed contents are handed
            to ``report.add_evidence_string``.

    Returns:
        The populated ``Report`` instance.

    Raises:
        AssertionError: If an allele origin other than ``'germline'`` or
            ``'somatic'`` reaches evidence string construction.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Use a context manager so the schema file handle is not leaked.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    for cellbase_record in cell_recs:
        report.counters["record_counter"] += 1
        if report.counters["record_counter"] % 1000 == 0:
            logger.info("{} records processed".format(
                report.counters["record_counter"]))

        n_ev_strings_per_record = 0
        clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

        for clinvar_record_measure in clinvar_record.measures:
            # Booleans are added as 0/1, so these counters tally how often
            # each condition holds.
            report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id
                                          is not None)
            append_nsv(report.nsv_list, clinvar_record_measure)
            report.counters["n_multiple_allele_origin"] += (len(
                clinvar_record.allele_origins) > 1)
            traits = create_traits(clinvar_record.traits, mappings.trait_2_efo,
                                   report)
            converted_allele_origins = convert_allele_origins(
                clinvar_record.allele_origins)

            for consequence_type, trait, allele_origin in itertools.product(
                    get_consequence_types(clinvar_record_measure,
                                          mappings.consequence_type_dict),
                    traits, converted_allele_origins):

                if skip_record(clinvar_record, clinvar_record_measure,
                               consequence_type, allele_origin,
                               allowed_clinical_significance, report):
                    continue

                if allele_origin == 'germline':
                    evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                elif allele_origin == 'somatic':
                    evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                else:
                    # Without this branch an unexpected origin would either
                    # fail with an unbound local or silently re-add the
                    # evidence string built in the previous iteration.
                    raise AssertionError(
                        'Unknown allele_origin present in the data: {}'.format(
                            allele_origin))

                report.add_evidence_string(evidence_string, clinvar_record,
                                           trait,
                                           consequence_type.ensembl_gene_id,
                                           ot_schema_contents)
                report.evidence_list.append([
                    clinvar_record.accession, clinvar_record_measure.rs_id,
                    trait.clinvar_name, trait.ontology_id
                ])
                report.counters["n_valid_rs_and_nsv"] += (
                    clinvar_record_measure.nsv_id is not None)
                report.traits.add(trait.ontology_id)
                report.remove_trait_mapping(trait.clinvar_name)
                report.ensembl_gene_id_uris.add(
                    evidence_strings.get_ensembl_gene_id_uri(
                        consequence_type.ensembl_gene_id))

                n_ev_strings_per_record += 1

            # NOTE(review): this check runs once per measure while the counter
            # is reset once per record, so a record with several measures may
            # be counted more than once - confirm this is intended.
            if n_ev_strings_per_record > 0:
                report.counters["n_processed_clinvar_records"] += 1
                if n_ev_strings_per_record > 1:
                    report.counters["n_multiple_evidence_strings"] += 1

    return report
 def setUp(self):
     """Prepare the shared inputs used by the tests.

     Stores the evidence string generation arguments, a
     ``CTTVGeneticsEvidenceString`` built from them, and the schema parsed
     from the gzip-compressed JSON file at ``config.open_targets_schema_gz``.
     """
     self.test_args = get_input_data_for_evidence_string_generation()
     self.test_ges = evidence_strings.CTTVGeneticsEvidenceString(
         *self.test_args)
     # Close the gzip handle deterministically instead of leaving it to the
     # garbage collector.
     with gzip.open(config.open_targets_schema_gz) as schema_file:
         self.ot_schema_contents = json.loads(
             schema_file.read().decode('utf-8'))
 def test_validate(self):
     """Check that a freshly built genetics evidence string validates.

     The evidence string is validated against ``self.ot_schema_contents``.
     """
     generation_args = get_input_data_for_evidence_string_generation()
     genetics_evidence = evidence_strings.CTTVGeneticsEvidenceString(
         *generation_args)
     is_valid = genetics_evidence.validate(self.ot_schema_contents)
     self.assertTrue(is_valid)
Esempio n. 5
0
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings,
                                json_file, ot_schema, output_evidence_strings):
    """Convert ClinVar records into evidence strings written out one per line.

    Iterates over the records produced by ``CellbaseRecords(json_file=...)``.
    For every measure of every record, one evidence string is attempted for
    each (consequence type, trait, allele origin) combination; combinations
    rejected by ``skip_record`` are ignored. Each evidence string is validated
    and immediately written as a single JSON line.

    Args:
        allowed_clinical_significance: Passed unchanged to ``skip_record``.
        mappings: Object providing ``trait_2_efo`` and
            ``consequence_type_dict``.
        json_file: Passed to ``cellbase_records.CellbaseRecords``.
        ot_schema: Path of a JSON schema file; its parsed contents are handed
            to ``validate_evidence_string``.
        output_evidence_strings: Destination handed to
            ``utilities.open_file`` in ``'wt'`` mode.

    Returns:
        The populated ``Report`` instance.

    Raises:
        AssertionError: If an allele origin other than ``'germline'`` or
            ``'somatic'`` reaches evidence string construction.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Use a context manager so the schema file handle is not leaked.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)
    output_evidence_strings_file = utilities.open_file(output_evidence_strings,
                                                       'wt')
    # try/finally guarantees the output file is closed (and its buffered data
    # flushed) even when validation, serialisation or the allele origin check
    # raises part-way through.
    try:
        for cellbase_record in cell_recs:
            report.counters["record_counter"] += 1
            if report.counters["record_counter"] % 1000 == 0:
                logger.info("{} records processed".format(
                    report.counters["record_counter"]))

            n_ev_strings_per_record = 0
            clinvar_record = clinvar.ClinvarRecord(
                cellbase_record['clinvarSet'])

            for clinvar_record_measure in clinvar_record.measures:
                # Booleans are added as 0/1, so these counters tally how
                # often each condition holds.
                report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id
                                              is not None)
                append_nsv(report.nsv_list, clinvar_record_measure)
                report.counters["n_multiple_allele_origin"] += (len(
                    clinvar_record.allele_origins) > 1)
                traits = create_traits(clinvar_record.traits,
                                       mappings.trait_2_efo, report)
                converted_allele_origins = convert_allele_origins(
                    clinvar_record.allele_origins)

                for consequence_type, trait, allele_origin in itertools.product(
                        get_consequence_types(clinvar_record_measure,
                                              mappings.consequence_type_dict),
                        traits, converted_allele_origins):

                    if skip_record(clinvar_record, clinvar_record_measure,
                                   consequence_type, allele_origin,
                                   allowed_clinical_significance, report):
                        continue

                    if allele_origin == 'germline':
                        evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                            clinvar_record, clinvar_record_measure, report,
                            trait, consequence_type)
                    elif allele_origin == 'somatic':
                        evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                            clinvar_record, clinvar_record_measure, report,
                            trait, consequence_type)
                    else:
                        raise AssertionError(
                            'Unknown allele_origin present in the data: {}'.format(
                                allele_origin))

                    # Validate and immediately output the evidence string
                    # (not keeping everything in memory)
                    validate_evidence_string(evidence_string, clinvar_record,
                                             trait,
                                             consequence_type.ensembl_gene_id,
                                             ot_schema_contents)
                    output_evidence_strings_file.write(
                        json.dumps(evidence_string) + '\n')
                    report.evidence_string_count += 1

                    report.evidence_list.append([
                        clinvar_record.accession,
                        clinvar_record_measure.rs_id,
                        trait.clinvar_name, trait.ontology_id
                    ])
                    report.counters["n_valid_rs_and_nsv"] += (
                        clinvar_record_measure.nsv_id is not None)
                    report.traits.add(trait.ontology_id)
                    report.remove_trait_mapping(trait.clinvar_name)
                    report.ensembl_gene_id_uris.add(
                        evidence_strings.get_ensembl_gene_id_uri(
                            consequence_type.ensembl_gene_id))

                    n_ev_strings_per_record += 1

                # NOTE(review): this check runs once per measure while the
                # counter is reset once per record, so a record with several
                # measures may be counted more than once - confirm this is
                # intended.
                if n_ev_strings_per_record > 0:
                    report.counters["n_processed_clinvar_records"] += 1
                    if n_ev_strings_per_record > 1:
                        report.counters["n_multiple_evidence_strings"] += 1
    finally:
        output_evidence_strings_file.close()

    return report