Beispiel #1
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 ``case`` and ``bcc_participant`` json.

    Args:
        item_paths: iterable of source file paths; each file's base name
            (without extension) is recorded as the record ``source``.
        output_dir: directory handed to ``emitter`` and ``obscure_dates``.
        experiment_code: submitter_id of the experiment every case links to.
        compresslevel: accepted for interface compatibility; unused here.
    """
    # Fields stripped from every bcc_participant before it is written.
    stripped_fields = (
        'FirstName', 'MRN', 'LastName', 'DateOfBirth',
        '_labkeyurl_Gender_ID', '_labkeyurl_ParticipantID', 'Gender_ID',
    )
    cases_emitter = emitter('case', output_dir=output_dir)
    bcc_cases_emitter = emitter('bcc_participant', output_dir=output_dir)
    cases = {}
    bcc_cases = {}
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            submitter_id = line.get('participantid',
                                    line.get('ParticipantID', None))
            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            primary_site = line.get('site', None)
            # The most recent line for a participant wins for the gen3 case.
            cases[submitter_id] = {
                'type': 'case',
                'experiments': {
                    'submitter_id': experiment_code
                },
                'primary_site': primary_site,
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID
            }
            if bcc_submitter_id in bcc_cases:
                # merge dupes: keep accumulating into the existing record
                bcc_case = bcc_cases[bcc_submitter_id]
            else:
                bcc_case = {
                    'type': 'bcc_participant',
                    'case': {
                        'submitter_id': submitter_id
                    },
                    'source': source,
                    'submitter_id': bcc_submitter_id,
                    'project_id': DEFAULT_PROJECT_ID
                }
            bcc_case.update(line)
            bcc_cases[bcc_submitter_id] = bcc_case
    for k in cases:
        cases_emitter.write(
            obscure_dates(cases[k], participantid=k, output_dir=output_dir))
    for k in bcc_cases:
        bcc_case = bcc_cases[k]
        for field in stripped_fields:
            # pop() instead of del: a source row that lacks one of these
            # fields must not abort the whole transform with a KeyError.
            bcc_case.pop(field, None)
        bcc_case = obscure_dates(bcc_case, output_dir=output_dir)
        bcc_cases_emitter.write(bcc_case)
    cases_emitter.close()
    bcc_cases_emitter.close()
Beispiel #2
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Convert bcc labkey json into gen3 demographic / bcc_demographic json.

    One ``demographic`` is kept per participant and one ``bcc_demographic``
    per participant and source file; repeated rows for the same
    bcc_demographic are merged into a single record before writing.
    """
    demographics_emitter = emitter('demographic', output_dir=output_dir)
    bcc_demographics_emitter = emitter('bcc_demographic',
                                       output_dir=output_dir)
    demographics = {}
    bcc_demographics = {}

    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            case_id = record['participantid']
            demographic_id = '{}-demographic'.format(case_id)
            bcc_id = '{}-{}'.format(demographic_id, source)

            # Later rows replace the gen3 demographic for the same id.
            demographics[demographic_id] = {
                'type': 'demographic',
                'cases': {
                    'submitter_id': case_id
                },
                'submitter_id': demographic_id,
                'project_id': DEFAULT_PROJECT_ID
            }

            # Reuse the record collected so far, or start a new one.
            merged = bcc_demographics.get(bcc_id)
            if merged is None:
                merged = {
                    'type': 'bcc_demographic',
                    'demographic': {
                        'submitter_id': demographic_id
                    },
                    'source': source,
                    'submitter_id': bcc_id,
                    'project_id': DEFAULT_PROJECT_ID
                }
                bcc_demographics[bcc_id] = merged
            merged.update(record)

    for demographic in demographics.values():
        demographics_emitter.write(
            obscure_dates(
                demographic,
                output_dir=output_dir,
                participantid=demographic['cases']['submitter_id']))
    demographics_emitter.close()

    for bcc_demographic in bcc_demographics.values():
        bcc_demographics_emitter.write(
            obscure_dates(bcc_demographic, output_dir=output_dir))
    bcc_demographics_emitter.close()
Beispiel #3
0
def transform_surgery(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Read bcc labkey json and write gen3 ``bcc_surgery`` json.

    Args:
        item_paths: iterable of ``(path, <ignored>, callback)`` tuples;
            ``callback`` (if truthy) is applied to every line read from
            ``path``.
        output_dir: directory handed to ``emitter`` and ``obscure_dates``.
        project_id: project_id stamped on every emitted record.
        treatment_ids: collection of known treatment submitter_ids; lines
            whose treatment is not in it are skipped.
        compresslevel: accepted for interface compatibility; unused here.

    Raises:
        AssertionError: if a line carries no participant id.
    """
    bcc_treatment_emitter = emitter('bcc_surgery', output_dir=output_dir)
    # A set keeps the duplicate check O(1) per line.
    seen_submitter_ids = set()
    # The middle tuple element is unused; it is not bound to ``type`` so the
    # builtin stays usable inside this function.
    for p, _unused, callback in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            participantid = line.get('ParticipantID', line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-Surgery-{}'.format(diagnosis_submitter_id, get_uniq(line))
            bcc_treatment_submitter_id = '{}-bcc_surgery'.format(treatment_submitter_id)
            if treatment_submitter_id not in treatment_ids:
                # no parent treatment was emitted for this line
                continue
            if bcc_treatment_submitter_id in seen_submitter_ids:
                # first occurrence wins; later duplicates are dropped
                continue
            seen_submitter_ids.add(bcc_treatment_submitter_id)
            bcc_treatment = {
                'type': 'bcc_surgery',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_submitter_id},
                'submitter_id': bcc_treatment_submitter_id
                }
            # For this one source file a 'type' column in the line would
            # overwrite the record's 'type' in update() below, so drop it.
            if 'type' in line and p == 'source/bcc/vResectionDate.json':
                del line['type']
            bcc_treatment.update(line)
            bcc_treatment = obscure_dates(bcc_treatment, output_dir=output_dir)
            bcc_treatment_emitter.write(bcc_treatment)
    bcc_treatment_emitter.close()
Beispiel #4
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Create gen3 ``treatment`` records and return the set of their ids.

    Args:
        item_paths: iterable of ``(path, treatment_type, callback)`` tuples;
            the callback is not used by this function.
        output_dir: directory that already holds ``diagnosis.json`` and that
            is handed to ``emitter`` and ``obscure_dates``.
        project_id: project_id passed to ``default_treatment``.
        compresslevel: accepted for interface compatibility; unused here.

    Returns:
        set of treatment submitter_ids that were written.

    Raises:
        AssertionError: if a line carries no participant id.
    """
    diagnoses = {line['submitter_id'] for line in reader('{}/diagnosis.json'.format(output_dir))}
    treatment_emitter = emitter('treatment', output_dir=output_dir)
    treatment_ids = set()
    missing_diagnoses = []
    for p, treatment_type, _callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID', line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-{}-{}'.format(diagnosis_submitter_id, treatment_type, get_uniq(line))
            if diagnosis_submitter_id not in diagnoses:
                # Record the orphan so it can be reported, then skip it.
                missing_diagnoses.append(missing_parent(parent_id=diagnosis_submitter_id, parent_type='diagnosis', child_id=treatment_submitter_id, child_type='treatment'))
                print('skipping missing diagnosis', treatment_submitter_id)
                continue
            if treatment_submitter_id in treatment_ids:
                print('skipping ',treatment_submitter_id, p, line.keys())
                continue
            treatment_ids.add(treatment_submitter_id)
            treatment = default_treatment(treatment_submitter_id, diagnosis_submitter_id, treatment_type, project_id)
            treatment = obscure_dates(treatment, output_dir=output_dir, participantid=participantid)
            treatment_emitter.write(treatment)
    # Close the emitter like the sibling transforms do; the original left it
    # open, so its output relied on interpreter teardown.
    treatment_emitter.close()
    save_missing_parents(missing_diagnoses)
    return treatment_ids
Beispiel #5
0
def my_callback(line):
    """Normalize one labkey row in place and return it with dates obscured.

    Steps, in order:
      1. drop keys that start with ``_``;
      2. rename keys containing ``/`` to the part after the first ``/``;
      3. replace every ``<name>_id`` key by ``<name>``, resolved through
         ``LOOKUPS[<name>]`` (keys with a falsy value are just dropped);
      4. strip ``chr`` from ``chromosome`` and rename ``gene`` to
         ``gene_symbol``;
      5. pass the row through ``obscure_dates``.

    Raises:
        Whatever the lookup raises (e.g. KeyError for an unknown table or
        id), after printing the offending key for diagnosis.
    """
    for k in [k for k in line if k.startswith('_')]:
        del line[k]

    for k in [k for k in line if '/' in k]:
        line[k.split('/')[1]] = line[k]
        del line[k]

    for k in [k for k in line if k.endswith('_id')]:
        # Strip only the trailing '_id'; str.replace would also remove an
        # '_id' in the middle of the key and yield the wrong lookup name.
        lup = k[:-len('_id')]
        if line[k]:
            try:
                line[lup] = LOOKUPS[lup][line[k]]
            except Exception:
                print(lup, k, line[k])
                print('******')
                # Guard the diagnostic so a missing table does not raise a
                # second error while the first one is being reported.
                print(LOOKUPS[lup] if lup in LOOKUPS else None)
                print('******')
                raise
        del line[k]
    if 'chromosome' in line:
        line['chromosome'] = str(line['chromosome'].replace('chr',''))
    if 'gene' in line:
        line['gene_symbol'] = line['gene']
        del line['gene']

    line = obscure_dates(line)
    return line
Beispiel #6
0
def transform(item_paths,
              output_dir,
              experiment_code,
              compresslevel=0,
              callback=None):
    """Read bcc labkey json and write gen3 ``genetrails_variant`` json.

    Args:
        item_paths: iterable of source file paths; each file's base name
            (without extension) is stored on the line as ``source``.
        output_dir: directory handed to ``emitter`` and ``obscure_dates``.
        experiment_code: accepted for interface compatibility; unused here.
        compresslevel: accepted for interface compatibility; unused here.
        callback: optional callable applied to every line before the
            record is built; it must return the (possibly new) line.

    Raises:
        ValueError: if a non-blank line of the gene lookup file does not
            split into exactly two whitespace-separated fields.
    """
    genetrails_emitter = emitter('genetrails_variant', output_dir=output_dir)
    with open('output/reference/gene_lookup.tsv') as f:
        # Two whitespace-separated columns per row; blank rows (e.g. a
        # trailing newline) are skipped instead of breaking the load.
        gene_lookup = dict(row.split() for row in f if row.strip())

    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            genetrails_variant = {
                'type': 'genetrails_variant',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': {
                    'submitter_id': '{}-aliquot'.format(line['sample_code'])
                },
                'submitter_id': line['lsid']
            }
            # A missing or empty gene_symbol simply produces no gene link.
            symbol = line.get('gene_symbol')
            symbol = symbol.lower() if symbol else None
            if symbol in gene_lookup:
                line['gene'] = {
                    'submitter_id': gene_lookup[symbol],
                    'project_id': 'smmart-reference'
                }
            genetrails_variant.update(line)
            genetrails_variant = obscure_dates(genetrails_variant,
                                               output_dir=output_dir)
            genetrails_emitter.write(genetrails_variant)
    genetrails_emitter.close()
Beispiel #7
0
def _bcc_biomarker_record(b, case_lookup, project_id):
    """Turn one raw biomarker row into a bcc_biomarker record (mutates ``b``)."""
    case_submitter_id = case_lookup[b['MRN']]
    submitter_id = '{}-{}-bcc_biomarker'.format(
        case_submitter_id, b['ID_Event'])
    # Columns that are dropped from the row.
    for p in (
            "MRN",
            "Participant ID",
            "_not_available_notes",
            "_not_available_reason_id",
            "cBiomarker Label dont use",
    ):
        del b[p]
    # Columns that are renamed to lower snake_case.
    for p in (
            "CA19 Values After Specimen Collection",
            "Order Proc ID",
            "assay version id",
            "biomarker level",
            "unit of measure id",
    ):
        new_p = p.replace(' ', '_').lower()
        b[new_p] = b[p]
        del b[p]
    b['cbiomarker_label'] = b["cBiomarker Label use this"]
    del b["cBiomarker Label use this"]
    biomarker = {
        'type': 'bcc_biomarker',
        'cases': {
            'submitter_id': case_submitter_id
        },
        'submitter_id': submitter_id,
        'project_id': project_id
    }
    biomarker.update(b)
    return biomarker


def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write gen3 ``bcc_biomarker`` records for biomarkers with a known case.

    Biomarker rows are matched to cases through their ``MRN`` using
    ``source/bcc/bcc-cases.tsv`` (columns ``MRN`` and ``OPTR``); rows whose
    MRN is not in that file are counted but not written.

    Args:
        item_paths: iterable of biomarker source file paths.
        output_dir: directory handed to ``emitter``.
        project_id: project_id stamped on every emitted record.
        compresslevel: accepted for interface compatibility; unused here.

    Returns:
        None. Records are written through the ``bcc_biomarker`` emitter.
    """
    case_lookup = {
        line['MRN']: line['OPTR']
        for line in reader('{}/bcc-cases.tsv'.format('source/bcc'))
    }

    biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for item_path in item_paths:
        biomarkers = list(reader(item_path))
        biomarkers_with_case = [
            _bcc_biomarker_record(b, case_lookup, project_id)
            for b in biomarkers if b['MRN'] in case_lookup
        ]
        print('there are', len(biomarkers_with_case),
              'biomarkers with cases, out of ', len(biomarkers), 'biomarkers')
        for b in biomarkers_with_case:
            biomarker_emitter.write(obscure_dates(b))
    biomarker_emitter.close()
Beispiel #8
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Convert bcc labkey json rows into gen3 ``bcc_aliquot`` json.

    Every row is tagged with the base name of its source file, optionally
    passed through ``callback``, wrapped in a bcc_aliquot record linked to
    ``<sample_code>-aliquot`` and written after date obscuring.
    """
    bcc_aliquot_emitter = emitter('bcc_aliquot', output_dir=output_dir)

    for path in item_paths:
        file_name = os.path.basename(path)
        source = os.path.splitext(file_name)[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)
            aliquot_link = {'submitter_id': '{}-aliquot'.format(record['sample_code'])}
            bcc_aliquot = {
                'type': 'bcc_aliquot',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': aliquot_link,
                'submitter_id': record['lsid'],
            }
            # Row fields are laid over the skeleton, so they win on clashes.
            bcc_aliquot.update(record)
            bcc_aliquot_emitter.write(
                obscure_dates(bcc_aliquot, output_dir=output_dir))
    bcc_aliquot_emitter.close()
Beispiel #9
0
def transform(item_paths, output_dir, experiment_code, project_id, compresslevel=0, callback=None):
    """Convert bcc labkey json rows into gen3 ``allele`` json.

    Rows are grouped by ``lsid``: the first row for an lsid creates the
    allele, later rows are merged into it. All alleles are written, with
    dates obscured, once every input file has been read.
    """
    alleles_emitter = emitter('allele', output_dir=output_dir)
    alleles = {}
    for path in item_paths:
        for record in reader(path):
            if callback:
                record = callback(record)
            # Built for every row (as before) so a row without sample_code
            # or lsid still fails here, even when it would only be merged.
            fresh = {
                'type': 'allele',
                'aliquots': {'submitter_id': '{}-aliquot'.format(record['sample_code'])},
                'projects': {'code': 'reference'},
                'submitter_id': record['lsid'],
            }
            allele = alleles.setdefault(record['lsid'], fresh)
            allele['project_id'] = project_id
            allele.update(record)
    for allele in alleles.values():
        alleles_emitter.write(obscure_dates(allele, output_dir=output_dir))
    alleles_emitter.close()
Beispiel #10
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Create gen3 ``observation`` records and return the set of their ids.

    Args:
        item_paths: iterable of ``(path, observation_type, callback)``
            tuples; the callback is not used by this function.
        output_dir: directory that already holds ``case.json`` and that is
            handed to ``emitter`` and ``obscure_dates``.
        project_id: project_id passed to ``default_observation``.
        compresslevel: accepted for interface compatibility; unused here.

    Returns:
        set of observation submitter_ids that were written.

    Raises:
        AssertionError: if a line carries no participant id.
    """
    cases = {
        line['submitter_id']
        for line in reader('{}/case.json'.format(output_dir))
    }
    observation_emitter = emitter('observation', output_dir=output_dir)
    observation_ids = set()
    missing_cases = []
    for p, observation_type, _callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID',
                                     line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(
                p, line.keys())
            case_submitter_id = participantid
            observation = default_observation(case_submitter_id, project_id,
                                              line['date'], observation_type,
                                              line)
            observation_submitter_id = observation['submitter_id']
            if case_submitter_id not in cases:
                # Record the orphan so it can be reported, then skip it.
                missing_cases.append(
                    missing_parent(parent_id=case_submitter_id,
                                   parent_type='case',
                                   child_id=observation_submitter_id,
                                   child_type='observation'))
                continue
            if observation_submitter_id in observation_ids:
                # first occurrence wins; later duplicates are dropped
                continue
            observation_ids.add(observation_submitter_id)
            observation = obscure_dates(
                observation,
                output_dir=output_dir,
                participantid=observation['cases']['submitter_id'])
            observation_emitter.write(observation)
    # Close the emitter like the sibling transforms do; the original left it
    # open, so its output relied on interpreter teardown.
    observation_emitter.close()
    save_missing_parents(missing_cases)
    return observation_ids
Beispiel #11
0
def transform_biomarker(item_paths,
                        output_dir,
                        project_id,
                        observation_ids,
                        compresslevel=0):
    """Convert bcc labkey json rows into gen3 ``bcc_biomarker`` json.

    Each row is mapped to its observation through ``default_observation``;
    rows whose observation id is not in ``observation_ids`` are reported
    and skipped, the rest are written as bcc_biomarker records linked to
    that observation.
    """
    bcc_biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for path, observation_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            participantid = record.get('ParticipantID',
                                       record.get('participantid', None))
            observation = default_observation(participantid, project_id,
                                              record['date'],
                                              observation_type, record)
            observation_id = observation['submitter_id']
            biomarker_id = '{}-bcc_biomarker'.format(observation_id)

            known_observation = observation_id in observation_ids
            if not known_observation:
                message = 'transform_biomarker {} not in observation_ids, skipping.'
                print(message.format(biomarker_id))
                continue

            bcc_biomarker = {
                'type': 'bcc_biomarker',
                'project_id': project_id,
                'observation': {
                    'submitter_id': observation_id
                },
                'submitter_id': biomarker_id
            }
            # The source tag is added only after the observation lookup.
            record['source'] = source
            if callback:
                record = callback(record)
            bcc_biomarker.update(record)
            bcc_biomarker_emitter.write(
                obscure_dates(bcc_biomarker, output_dir=output_dir))
    bcc_biomarker_emitter.close()
Beispiel #12
0
def transform_chemotherapy(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Convert bcc labkey json rows into gen3 ``bcc_chemotherapy`` json.

    ``item_paths`` yields ``(path, <ignored>, callback)`` tuples. Rows whose
    derived treatment id is not in ``treatment_ids`` are skipped silently;
    all others are written as bcc_chemotherapy records linked to that
    treatment.
    """
    bcc_treatment_emitter = emitter('bcc_chemotherapy', output_dir=output_dir)
    for path, _unused, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)
            diagnosis_id = '{}-diagnosis'.format(record['ParticipantID'])
            treatment_id = '{}-Chemotherapy-{}'.format(diagnosis_id, get_uniq(record))
            if treatment_id not in treatment_ids:
                continue
            # submitter_id: treatment id + day offset + description, falling
            # back to the agent name and finally to 'na'.
            days = record['days']
            label = record.get('treatment_description', record.get('treatment_agent', 'na'))
            bcc_treatment = {
                'type': 'bcc_chemotherapy',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_id},
                'submitter_id': '{}-{}-{}'.format(treatment_id, days, label),
            }
            bcc_treatment.update(record)
            bcc_treatment_emitter.write(
                obscure_dates(bcc_treatment, output_dir=output_dir))
    bcc_treatment_emitter.close()
Beispiel #13
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 diagnosis / bcc_diagnosis json.

    Besides the diagnoses, a default ``case`` is appended to ``case.json``
    for every participant that has a diagnosis row but no existing case.

    Args:
        item_paths: iterable of source file paths; each file's base name
            (without extension) is recorded as the record ``source``.
        output_dir: directory that already holds ``case.json`` and that is
            handed to ``emitter`` and ``obscure_dates``.
        experiment_code: accepted for interface compatibility; unused here
            (``DEFAULT_EXPERIMENT_CODE`` is used for the default cases).
        compresslevel: accepted for interface compatibility; unused here.
    """
    cases = set([])
    for line in reader('{}/case.json'.format(output_dir)):
        cases.add(line['submitter_id'])
    diagnoses_emitter = emitter('diagnosis', output_dir=output_dir)
    bcc_diagnosis_emitter = emitter('bcc_diagnosis', output_dir=output_dir)
    diagnosises = {}
    bcc_diagnosises = {}
    missing_cases = set([])
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            case_submitter_id = line['participantid']
            bcc_submitter_id = '{}-{}'.format(case_submitter_id, source)
            diagnosis = default_diagnosis(case_submitter_id,
                                          project_id=DEFAULT_PROJECT_ID,
                                          line=line)
            submitter_id = diagnosis['submitter_id']
            bcc_diagnosis = {
                'type': 'bcc_diagnosis',
                'diagnosis': {
                    'submitter_id': submitter_id
                },
                'source': source,
                'submitter_id': bcc_submitter_id,
                'project_id': DEFAULT_PROJECT_ID
            }

            if bcc_submitter_id in bcc_diagnosises:
                # merge dupes: keep accumulating into the existing record
                bcc_diagnosis = bcc_diagnosises[bcc_submitter_id]

            # we will use the name 'diagnosis' as a link back to gen3.diagnosis
            # pop() tolerates rows without a 'diagnosis' column, which the
            # previous get()+del pair turned into a KeyError.
            line['diagnosis_name'] = line.pop('diagnosis', None)
            bcc_diagnosis.update(line)

            diagnosises[submitter_id] = diagnosis
            bcc_diagnosises[bcc_submitter_id] = bcc_diagnosis
            if case_submitter_id not in cases:
                print('no case for: >{}<'.format(case_submitter_id))
                missing_cases.add(case_submitter_id)

    for k in diagnosises:
        diagnosises[k] = obscure_dates(
            diagnosises[k],
            output_dir=output_dir,
            participantid=diagnosises[k]['cases']['submitter_id'])
        diagnoses_emitter.write(diagnosises[k])

    # NOTE(review): missing_cases only ever receives ids that are absent
    # from `cases`, so this difference equals missing_cases itself. The
    # intent may have been "cases without any diagnosis" - confirm.
    undiagnosed = missing_cases - cases
    print('missing diagnosis for {} cases'.format(len(undiagnosed)))
    for participantid in undiagnosed:
        diagnosis = default_diagnosis(participantid,
                                      project_id=DEFAULT_PROJECT_ID)
        diagnosis = obscure_dates(diagnosis, output_dir=output_dir)
        diagnoses_emitter.write(diagnosis)
    diagnoses_emitter.close()

    print('missing cases for {} cases'.format(len(missing_cases)))
    cases_emitter = emitter('case', output_dir=output_dir, append=True)
    for participantid in missing_cases:
        case = default_case(DEFAULT_EXPERIMENT_CODE, participantid,
                            DEFAULT_PROJECT_ID)
        case = obscure_dates(case, output_dir=output_dir)
        cases_emitter.write(case)
    # The original never closed this emitter.
    cases_emitter.close()

    # The merged bcc_diagnosis records were collected but never written, and
    # the emitter opened for them was left unused and unclosed.
    for k in bcc_diagnosises:
        bcc_diagnosis_emitter.write(
            obscure_dates(bcc_diagnosises[k], output_dir=output_dir))
    bcc_diagnosis_emitter.close()
Beispiel #14
0
        diagnosis = obscure_dates(diagnosis, output_dir=output_dir)
        diagnoses_emitter.write(diagnosis)
    diagnoses_emitter.close()

    print('missing cases for {} cases'.format(len(missing_cases)))
    cases_emitter = emitter('case', output_dir=output_dir, append=True)
    for participantid in missing_cases:
        case = default_case(DEFAULT_EXPERIMENT_CODE, participantid,
                            DEFAULT_PROJECT_ID)
        case = obscure_dates(case, output_dir=output_dir)
        cases_emitter.write(case)
    cases_emitter.close()

    bcc_diagnosises_emitter = emitter('bcc_diagnosis', output_dir=output_dir)
    for k in bcc_diagnosises:
        bcc_diagnosises[k] = obscure_dates(bcc_diagnosises[k],
                                           output_dir=output_dir)
        bcc_diagnosises_emitter.write(bcc_diagnosises[k])
    bcc_diagnosises_emitter.close()


if __name__ == "__main__":
    # Script entry point: transform the oncology diagnosis export using the
    # command-line options, then check that diagnosis.json was produced.
    item_paths = ['source/bcc/voncologdiagnosis.json']
    parser = default_parser(DEFAULT_OUTPUT_DIR, DEFAULT_EXPERIMENT_CODE,
                            DEFAULT_PROJECT_ID)
    args = parser.parse_args()
    transform(
        item_paths,
        output_dir=args.output_dir,
        experiment_code=args.experiment_code,
    )

    diagnosis_path = os.path.join(args.output_dir, 'diagnosis.json')
    assert os.path.isfile(diagnosis_path), \
        'should have an output file {}'.format(diagnosis_path)
    print(diagnosis_path)
Beispiel #15
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 ``sample`` / ``bcc_sample`` json.

    Args:
        item_paths: iterable of source file paths; each file's base name
            (without extension) is recorded as the record ``source``.
        output_dir: directory that already holds ``case.json`` and
            ``diagnosis.json`` and that is handed to ``emitter`` and
            ``obscure_dates``.
        experiment_code: not used in the visible part of this function.
        compresslevel: not used in the visible part of this function.

    NOTE(review): neither emitter is closed in the visible part of this
    function - confirm that happens further down or in the caller.
    """
    # Submitter ids of the cases and diagnoses written by earlier steps.
    cases = set([])
    for line in reader('{}/case.json'.format(output_dir)):
        cases.add(line['submitter_id'])

    diagnoses = set([])
    for line in reader('{}/diagnosis.json'.format(output_dir)):
        diagnoses.add(line['submitter_id'])

    missing_cases = set([])
    print('cases len {}'.format(len(cases)))

    # dedup
    # NOTE(review): `samples` is a list, so the membership test below is a
    # linear scan per line; a set would give the same result faster.
    samples = []
    samples_emitter = emitter('sample', output_dir=output_dir)
    bcc_samples_emitter = emitter('bcc_sample', output_dir=output_dir)
    # Despite the name, this collects missing case parents as well as
    # missing diagnosis parents.
    missing_diagnoses = []
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            case_submitter_id = line.get('participantid',
                                         line.get('ParticipantID'))
            sample = default_sample(case_submitter_id,
                                    line=line,
                                    project_id=DEFAULT_PROJECT_ID)
            submitter_id = sample['submitter_id']
            if case_submitter_id not in cases:
                # Sample without a known case: report it and skip the line.
                missing_diagnoses.append(
                    missing_parent(child_id=submitter_id,
                                   child_type='sample',
                                   parent_id=case_submitter_id,
                                   parent_type='case'))
                continue
            if submitter_id in samples:
                # Already written; later duplicates are dropped.
                continue
            if sample['diagnoses']['submitter_id'] not in diagnoses:
                # Unknown diagnosis: report it and write the sample without
                # the diagnosis submitter_id.
                missing_diagnoses.append(
                    missing_parent(
                        child_id=submitter_id,
                        child_type='sample',
                        parent_id=sample['diagnoses']['submitter_id'],
                        parent_type='diagnosis'))
                del sample['diagnoses']['submitter_id']

            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            samples_emitter.write(sample)
            samples.append(submitter_id)

            bcc_sample = {
                'type': 'bcc_sample',
                'sample': {
                    'submitter_id': submitter_id
                },
                'source': source,
                'submitter_id': bcc_submitter_id,
                'project_id': DEFAULT_PROJECT_ID
            }
            # Source fields are laid over the skeleton, so they win on clashes.
            bcc_sample.update(line)
            # Resolve sample_type_id through LOOKUPS when the labkey url
            # column for it is present, then drop both raw columns.
            if '_labkeyurl_sample_type_id' in bcc_sample:
                bcc_sample['sample_type'] = LOOKUPS['sample_type'][
                    bcc_sample['sample_type_id']]
                del bcc_sample['sample_type_id']
                del bcc_sample['_labkeyurl_sample_type_id']

            bcc_sample = obscure_dates(bcc_sample, output_dir=output_dir)

            bcc_samples_emitter.write(bcc_sample)

            # NOTE(review): unreachable as written - lines whose case is not
            # in `cases` already hit `continue` above, so `missing_cases`
            # stays empty in the visible code.
            if case_submitter_id not in cases:
                missing_cases.add(case_submitter_id)
                cases.add(case_submitter_id)
    save_missing_parents(missing_diagnoses)