def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 ``case`` and ``bcc_participant`` json.

    One ``case`` is kept per participant id and one ``bcc_participant`` per
    (participant id, source file) pair; rows that repeat a pair are merged
    into the same ``bcc_participant``.

    Args:
        item_paths: paths of the source files to read; the base name of each
            path (extension removed) is stored as the record ``source``.
        output_dir: passed to the emitters and to ``obscure_dates``.
        experiment_code: submitter_id of the experiment each case links to.
        compresslevel: not used by this function.
    """
    # Source columns removed from every bcc_participant before it is written.
    # NOTE(review): these look like direct identifiers - confirm the list is
    # complete for every source file.
    stripped_fields = (
        'FirstName', 'MRN', 'LastName', 'DateOfBirth',
        '_labkeyurl_Gender_ID', '_labkeyurl_ParticipantID', 'Gender_ID',
    )
    cases_emitter = emitter('case', output_dir=output_dir)
    bcc_cases_emitter = emitter('bcc_participant', output_dir=output_dir)
    cases = {}
    bcc_cases = {}
    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for line in reader(path):
            submitter_id = line.get('participantid',
                                    line.get('ParticipantID', None))
            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            # The last row seen for a participant defines its case.
            cases[submitter_id] = {
                'type': 'case',
                'experiments': {'submitter_id': experiment_code},
                'primary_site': line.get('site', None),
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            # Merge dupes: reuse the record already built for this id.
            bcc_case = bcc_cases.get(bcc_submitter_id)
            if bcc_case is None:
                bcc_case = {
                    'type': 'bcc_participant',
                    'case': {'submitter_id': submitter_id},
                    'source': source,
                    'submitter_id': bcc_submitter_id,
                    'project_id': DEFAULT_PROJECT_ID,
                }
            bcc_case.update(line)
            bcc_cases[bcc_submitter_id] = bcc_case

    for participantid, case in cases.items():
        cases_emitter.write(
            obscure_dates(case, participantid=participantid,
                          output_dir=output_dir))

    for bcc_case in bcc_cases.values():
        for field in stripped_fields:
            # pop() instead of del: a row that lacks one of these columns
            # must not abort the whole run with a KeyError.
            bcc_case.pop(field, None)
        bcc_cases_emitter.write(obscure_dates(bcc_case, output_dir=output_dir))

    cases_emitter.close()
    bcc_cases_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 demographic / bcc_demographic json.

    A ``demographic`` is kept per participant and a ``bcc_demographic`` per
    (demographic, source file) pair; repeated rows are merged into the
    existing ``bcc_demographic``. ``experiment_code`` and ``compresslevel``
    are not used by this function.
    """
    demographic_out = emitter('demographic', output_dir=output_dir)
    bcc_demographic_out = emitter('bcc_demographic', output_dir=output_dir)
    demographic_by_id = {}
    bcc_demographic_by_id = {}

    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for row in reader(path):
            case_id = row['participantid']
            demographic_id = '{}-demographic'.format(case_id)
            bcc_id = '{}-{}'.format(demographic_id, source)

            demographic_by_id[demographic_id] = {
                'type': 'demographic',
                'cases': {'submitter_id': case_id},
                'submitter_id': demographic_id,
                'project_id': DEFAULT_PROJECT_ID,
            }

            # First sighting creates the record; later rows only merge in.
            if bcc_id not in bcc_demographic_by_id:
                bcc_demographic_by_id[bcc_id] = {
                    'type': 'bcc_demographic',
                    'demographic': {'submitter_id': demographic_id},
                    'source': source,
                    'submitter_id': bcc_id,
                    'project_id': DEFAULT_PROJECT_ID,
                }
            bcc_demographic_by_id[bcc_id].update(row)

    for demographic in demographic_by_id.values():
        obscured = obscure_dates(
            demographic,
            output_dir=output_dir,
            participantid=demographic['cases']['submitter_id'])
        demographic_out.write(obscured)
    demographic_out.close()

    for bcc_demographic in bcc_demographic_by_id.values():
        bcc_demographic_out.write(
            obscure_dates(bcc_demographic, output_dir=output_dir))
    bcc_demographic_out.close()
def transform_surgery(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Read bcc labkey json and write gen3 ``bcc_surgery`` json.

    Args:
        item_paths: iterable of ``(path, treatment_type, callback)`` triples;
            ``callback`` may be falsy, otherwise it is applied to every row
            and its return value replaces the row.
        output_dir: passed to the emitter and to ``obscure_dates``.
        project_id: project_id stamped on every emitted record.
        treatment_ids: ids of known treatments; rows whose derived treatment
            id is not in this collection are skipped.
        compresslevel: not used by this function.

    Raises:
        AssertionError: when a row carries neither ``ParticipantID`` nor
            ``participantid``.
    """
    bcc_treatment_emitter = emitter('bcc_surgery', output_dir=output_dir)
    # A set gives O(1) duplicate checks; only membership is ever needed.
    seen_submitter_ids = set()
    # The second element of each triple is unused here; it is named with a
    # leading underscore so the builtin ``type`` is not shadowed.
    for p, _treatment_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            participantid = line.get('ParticipantID',
                                     line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-Surgery-{}'.format(
                diagnosis_submitter_id, get_uniq(line))
            bcc_treatment_submitter_id = '{}-bcc_surgery'.format(
                treatment_submitter_id)
            # Skip rows without a parent treatment and rows already emitted.
            if treatment_submitter_id not in treatment_ids:
                continue
            if bcc_treatment_submitter_id in seen_submitter_ids:
                continue
            seen_submitter_ids.add(bcc_treatment_submitter_id)
            bcc_treatment = {
                'type': 'bcc_surgery',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_submitter_id},
                'submitter_id': bcc_treatment_submitter_id,
            }
            # For this one source file the row's own 'type' field is dropped
            # so that update() below cannot overwrite the record type.
            if 'type' in line and p == 'source/bcc/vResectionDate.json':
                del line['type']
            bcc_treatment.update(line)
            bcc_treatment = obscure_dates(bcc_treatment, output_dir=output_dir)
            bcc_treatment_emitter.write(bcc_treatment)
    bcc_treatment_emitter.close()
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Create gen3 ``treatment`` records and return the set of treatment ids.

    Args:
        item_paths: iterable of ``(path, treatment_type, callback)`` triples;
            the callback element is not used by this function.
        output_dir: directory holding ``diagnosis.json`` (read to learn the
            known diagnoses) and passed to the emitter / ``obscure_dates``.
        project_id: project_id handed to ``default_treatment``.
        compresslevel: not used by this function.

    Returns:
        set of submitter_ids of the treatments that were written.

    Raises:
        AssertionError: when a row carries neither ``ParticipantID`` nor
            ``participantid``.
    """
    diagnoses = {
        line['submitter_id']
        for line in reader('{}/diagnosis.json'.format(output_dir))
    }
    treatment_emitter = emitter('treatment', output_dir=output_dir)
    treatment_ids = set()
    missing_diagnoses = []
    for p, treatment_type, _callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID',
                                     line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-{}-{}'.format(
                diagnosis_submitter_id, treatment_type, get_uniq(line))
            if diagnosis_submitter_id not in diagnoses:
                # Record the orphan so it can be reported, then skip it.
                missing_diagnoses.append(
                    missing_parent(parent_id=diagnosis_submitter_id,
                                   parent_type='diagnosis',
                                   child_id=treatment_submitter_id,
                                   child_type='treatment'))
                print('skipping missing diagnosis', treatment_submitter_id)
                continue
            if treatment_submitter_id in treatment_ids:
                print('skipping ', treatment_submitter_id, p, line.keys())
                continue
            treatment_ids.add(treatment_submitter_id)
            treatment = default_treatment(treatment_submitter_id,
                                          diagnosis_submitter_id,
                                          treatment_type, project_id)
            treatment = obscure_dates(treatment, output_dir=output_dir,
                                      participantid=participantid)
            treatment_emitter.write(treatment)
    # Close the emitter like the sibling transforms do; previously it was
    # left open when the function returned.
    treatment_emitter.close()
    save_missing_parents(missing_diagnoses)
    return treatment_ids
def my_callback(line):
    """Normalise one source row in place and return it with dates obscured.

    Drops keys starting with ``_``, renames ``a/b`` keys to ``b``, replaces
    ``*_id`` keys by their ``LOOKUPS`` value, strips ``chr`` from
    ``chromosome`` and renames ``gene`` to ``gene_symbol``.
    """
    # Private fields are removed outright.
    for name in list(line):
        if name.startswith('_'):
            del line[name]

    # 'prefix/field' keys keep only the part after the first slash.
    for name in list(line):
        if '/' in name:
            line[name.split('/')[1]] = line.pop(name)

    # Resolve id columns through the lookup tables; the id column itself is
    # removed whether or not it held a value.
    id_columns = [name for name in line if name.endswith('_id')]
    for name in id_columns:
        lup = name.replace('_id', '')
        raw = line[name]
        if raw:
            try:
                line[lup] = LOOKUPS[lup][raw]
            except Exception as e:
                # Dump the failing lookup before propagating the error.
                print(lup, name, raw)
                print('******')
                print(LOOKUPS[lup])
                print('******')
                raise e
        del line[name]

    if 'chromosome' in line:
        line['chromosome'] = str(line['chromosome'].replace('chr', ''))
    if 'gene' in line:
        line['gene_symbol'] = line.pop('gene')
    return obscure_dates(line)
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Read bcc labkey json and write gen3 ``genetrails_variant`` json.

    Each row is tagged with its source file name, optionally passed through
    ``callback``, linked to its aliquot and, when its gene symbol is found in
    the gene lookup file, linked to the reference gene.
    ``experiment_code`` and ``compresslevel`` are not used by this function.
    """
    variant_out = emitter('genetrails_variant', output_dir=output_dir)

    # Two whitespace-separated columns per row: lookup key, gene submitter id.
    with open('output/reference/gene_lookup.tsv') as lookup_file:
        gene_lookup = dict(row.split() for row in lookup_file)

    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)

            variant = {
                'type': 'genetrails_variant',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': {
                    'submitter_id': '{}-aliquot'.format(record['sample_code'])
                },
                'submitter_id': record['lsid'],
            }

            if 'gene_symbol' in record:
                symbol = record['gene_symbol'].lower()
                if symbol in gene_lookup:
                    record['gene'] = {
                        'submitter_id': gene_lookup[symbol],
                        'project_id': 'smmart-reference',
                    }

            variant.update(record)
            variant_out.write(obscure_dates(variant, output_dir=output_dir))

    variant_out.close()
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write gen3 ``bcc_biomarker`` records for biomarkers with a known case.

    Biomarker rows are matched to cases through their ``MRN`` using
    ``source/bcc/bcc-cases.tsv`` (``MRN`` -> ``OPTR``); rows whose MRN is not
    in that file are dropped. Nothing is returned.

    Args:
        item_paths: paths of the biomarker source files.
        output_dir: passed to the emitter.
        project_id: project_id stamped on every emitted record.
        compresslevel: not used by this function.
    """
    case_lookup = {
        line['MRN']: line['OPTR']
        for line in reader('source/bcc/bcc-cases.tsv')
    }
    # Columns removed from every row before it is emitted.
    dropped_columns = (
        "MRN",
        "Participant ID",
        "_not_available_notes",
        "_not_available_reason_id",
        "cBiomarker Label dont use",
    )
    # Columns renamed to lower snake case.
    renamed_columns = (
        "CA19 Values After Specimen Collection",
        "Order Proc ID",
        "assay version id",
        "biomarker level",
        "unit of measure id",
    )

    def add_case(b):
        """Turn source row ``b`` (mutated in place) into a bcc_biomarker."""
        case_submitter_id = case_lookup[b['MRN']]
        submitter_id = '{}-{}-bcc_biomarker'.format(
            case_submitter_id, b['ID_Event'])
        for p in dropped_columns:
            del b[p]
        for p in renamed_columns:
            b[p.replace(' ', '_').lower()] = b.pop(p)
        b['cbiomarker_label'] = b.pop("cBiomarker Label use this")
        biomarker = {
            'type': 'bcc_biomarker',
            'cases': {'submitter_id': case_submitter_id},
            'submitter_id': submitter_id,
            'project_id': project_id,
        }
        biomarker.update(b)
        return biomarker

    biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for item_path in item_paths:
        biomarkers = list(reader(item_path))
        biomarkers_with_case = [
            add_case(b) for b in biomarkers if b['MRN'] in case_lookup
        ]
        print('there are', len(biomarkers_with_case),
              'biomarkers with cases, out of ', len(biomarkers), 'biomarkers')
        # Plain loop: the writes are side effects, not a list to build.
        for biomarker in biomarkers_with_case:
            biomarker_emitter.write(obscure_dates(biomarker))
    biomarker_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Read bcc labkey json and write gen3 ``bcc_aliquot`` json.

    Every row is tagged with its source file name, optionally transformed by
    ``callback``, linked to its aliquot and emitted with dates obscured.
    ``experiment_code`` and ``compresslevel`` are not used by this function.
    """
    aliquot_out = emitter('bcc_aliquot', output_dir=output_dir)
    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)
            aliquot_link = {
                'submitter_id': '{}-aliquot'.format(record['sample_code'])
            }
            bcc_aliquot = {
                'type': 'bcc_aliquot',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': aliquot_link,
                'submitter_id': record['lsid'],
            }
            # Row fields are layered on top of the skeleton record.
            bcc_aliquot.update(record)
            aliquot_out.write(
                obscure_dates(bcc_aliquot, output_dir=output_dir))
    aliquot_out.close()
def transform(item_paths, output_dir, experiment_code, project_id, compresslevel=0, callback=None):
    """Read bcc labkey json and write gen3 ``allele`` json.

    Rows sharing an ``lsid`` are merged into a single allele; all alleles are
    written, with dates obscured, once every input has been read.
    ``experiment_code`` and ``compresslevel`` are not used by this function.
    """
    allele_out = emitter('allele', output_dir=output_dir)
    allele_by_lsid = {}

    for path in item_paths:
        for record in reader(path):
            if callback:
                record = callback(record)
            lsid = record['lsid']
            # Create the skeleton on first sight, otherwise merge into it.
            if lsid not in allele_by_lsid:
                allele_by_lsid[lsid] = {
                    'type': 'allele',
                    'aliquots': {
                        'submitter_id': '{}-aliquot'.format(record['sample_code'])
                    },
                    'projects': {'code': 'reference'},
                    'submitter_id': lsid,
                }
            merged = allele_by_lsid[lsid]
            # project_id is set before the row is applied, so a row value
            # with the same key still wins.
            merged['project_id'] = project_id
            merged.update(record)

    for merged in allele_by_lsid.values():
        allele_out.write(obscure_dates(merged, output_dir=output_dir))
    allele_out.close()
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Create gen3 ``observation`` records and return the set of their ids.

    Args:
        item_paths: iterable of ``(path, observation_type, callback)``
            triples; the callback element is not used by this function.
        output_dir: directory holding ``case.json`` (read to learn the known
            cases) and passed to the emitter / ``obscure_dates``.
        project_id: project_id handed to ``default_observation``.
        compresslevel: not used by this function.

    Returns:
        set of submitter_ids of the observations that were written.

    Raises:
        AssertionError: when a row carries neither ``ParticipantID`` nor
            ``participantid``.
    """
    cases = {
        line['submitter_id']
        for line in reader('{}/case.json'.format(output_dir))
    }
    observation_emitter = emitter('observation', output_dir=output_dir)
    observation_ids = set()
    missing_cases = []
    for p, observation_type, _callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID',
                                     line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(
                p, line.keys())
            case_submitter_id = participantid
            observation = default_observation(case_submitter_id, project_id,
                                              line['date'], observation_type,
                                              line)
            observation_submitter_id = observation['submitter_id']
            if case_submitter_id not in cases:
                # Record the orphan so it can be reported, then skip it.
                missing_cases.append(
                    missing_parent(parent_id=case_submitter_id,
                                   parent_type='case',
                                   child_id=observation_submitter_id,
                                   child_type='observation'))
                continue
            if observation_submitter_id in observation_ids:
                continue
            observation_ids.add(observation_submitter_id)
            observation = obscure_dates(
                observation,
                output_dir=output_dir,
                participantid=observation['cases']['submitter_id'])
            observation_emitter.write(observation)
    # Close the emitter like the sibling transforms do; previously it was
    # left open when the function returned.
    observation_emitter.close()
    save_missing_parents(missing_cases)
    return observation_ids
def transform_biomarker(item_paths, output_dir, project_id, observation_ids, compresslevel=0):
    """Read bcc labkey json and write gen3 ``bcc_biomarker`` json.

    ``item_paths`` yields ``(path, observation_type, callback)`` triples. A
    row is emitted only when the observation id derived from it is present in
    ``observation_ids``; ``compresslevel`` is not used by this function.
    """
    biomarker_out = emitter('bcc_biomarker', output_dir=output_dir)
    for path, observation_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            participantid = record.get('ParticipantID',
                                       record.get('participantid', None))
            # The observation is rebuilt only to learn its submitter_id.
            parent = default_observation(participantid, project_id,
                                         record['date'], observation_type,
                                         record)
            parent_id = parent['submitter_id']
            biomarker_id = '{}-bcc_biomarker'.format(parent_id)

            if parent_id not in observation_ids:
                print(
                    'transform_biomarker {} not in observation_ids, skipping.'.
                    format(biomarker_id))
                continue

            bcc_biomarker = {
                'type': 'bcc_biomarker',
                'project_id': project_id,
                'observation': {'submitter_id': parent_id},
                'submitter_id': biomarker_id,
            }
            # Source tagging and the callback run only for rows that are kept.
            record['source'] = source
            if callback:
                record = callback(record)
            bcc_biomarker.update(record)
            biomarker_out.write(
                obscure_dates(bcc_biomarker, output_dir=output_dir))
    biomarker_out.close()
def transform_chemotherapy(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Read bcc labkey json and write gen3 ``bcc_chemotherapy`` json.

    Args:
        item_paths: iterable of ``(path, treatment_type, callback)`` triples;
            ``callback`` may be falsy, otherwise it is applied to every row
            and its return value replaces the row.
        output_dir: passed to the emitter and to ``obscure_dates``.
        project_id: project_id stamped on every emitted record.
        treatment_ids: ids of known treatments; rows whose derived treatment
            id is not in this collection are skipped.
        compresslevel: not used by this function.

    Raises:
        KeyError: when a row carries neither ``ParticipantID`` nor
            ``participantid``, or when a kept row has no ``days`` field.
    """
    bcc_treatment_emitter = emitter('bcc_chemotherapy', output_dir=output_dir)
    # The second element of each triple is unused here; it is named with a
    # leading underscore so the builtin ``type`` is not shadowed.
    for p, _treatment_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            # Accept both spellings of the participant key, as the sibling
            # treatment transforms in this file do.
            try:
                participantid = line['ParticipantID']
            except KeyError:
                participantid = line['participantid']
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-Chemotherapy-{}'.format(
                diagnosis_submitter_id, get_uniq(line))
            if treatment_submitter_id not in treatment_ids:
                continue
            # The agent falls back from description to agent name to 'na'.
            agent = line.get('treatment_description',
                             line.get('treatment_agent', 'na'))
            bcc_treatment = {
                'type': 'bcc_chemotherapy',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_submitter_id},
                'submitter_id': '{}-{}-{}'.format(
                    treatment_submitter_id, line['days'], agent),
            }
            bcc_treatment.update(line)
            bcc_treatment = obscure_dates(bcc_treatment, output_dir=output_dir)
            bcc_treatment_emitter.write(bcc_treatment)
    bcc_treatment_emitter.close()
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 diagnosis / bcc_diagnosis json.

    Known cases are read from ``case.json`` in ``output_dir``. For every
    source row a ``diagnosis`` (via ``default_diagnosis``) and a
    ``bcc_diagnosis`` are built; participants that are not known cases are
    additionally appended to the ``case`` output as default cases.

    Args:
        item_paths: paths of the source files to read; the base name of each
            path (extension removed) is stored as the record ``source``.
        output_dir: directory of ``case.json``; also passed to the emitters
            and to ``obscure_dates``.
        experiment_code: not used; ``DEFAULT_EXPERIMENT_CODE`` is used for
            the default cases.
        compresslevel: not used by this function.
    """
    cases = set([])
    for line in reader('{}/case.json'.format(output_dir)):
        cases.add(line['submitter_id'])

    diagnoses_emitter = emitter('diagnosis', output_dir=output_dir)
    diagnosises = {}
    bcc_diagnosises = {}
    missing_cases = set([])
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            case_submitter_id = line['participantid']
            bcc_submitter_id = '{}-{}'.format(case_submitter_id, source)
            diagnosis = default_diagnosis(case_submitter_id,
                                          project_id=DEFAULT_PROJECT_ID,
                                          line=line)
            submitter_id = diagnosis['submitter_id']
            bcc_diagnosis = {
                'type': 'bcc_diagnosis',
                'diagnosis': {'submitter_id': submitter_id},
                'source': source,
                'submitter_id': bcc_submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            if bcc_submitter_id in bcc_diagnosises:
                bcc_diagnosis = bcc_diagnosises[bcc_submitter_id]
            # we will use the name 'diagnosis' as a link back to
            # gen3.diagnosis, so the row's own value is moved aside.
            # pop() tolerates rows without a 'diagnosis' field.
            line['diagnosis_name'] = line.pop('diagnosis', None)
            bcc_diagnosis.update(line)
            diagnosises[submitter_id] = diagnosis
            bcc_diagnosises[bcc_submitter_id] = bcc_diagnosis
            if case_submitter_id not in cases:
                print('no case for: >{}<'.format(case_submitter_id))
                missing_cases.add(case_submitter_id)

    for k in diagnosises:
        diagnosises[k] = obscure_dates(
            diagnosises[k],
            output_dir=output_dir,
            participantid=diagnosises[k]['cases']['submitter_id'])
        diagnoses_emitter.write(diagnosises[k])

    # NOTE(review): missing_cases only ever holds ids that are not in cases,
    # so this difference equals missing_cases; confirm whether "cases without
    # any diagnosis" was intended instead.
    cases = missing_cases - cases
    print('missing diagnosis for {} cases'.format(len(cases)))
    for participantid in cases:
        diagnosis = default_diagnosis(participantid,
                                      project_id=DEFAULT_PROJECT_ID)
        diagnosis = obscure_dates(diagnosis, output_dir=output_dir)
        diagnoses_emitter.write(diagnosis)
    diagnoses_emitter.close()

    print('missing cases for {} cases'.format(len(missing_cases)))
    cases_emitter = emitter('case', output_dir=output_dir, append=True)
    for participantid in missing_cases:
        case = default_case(DEFAULT_EXPERIMENT_CODE, participantid,
                            DEFAULT_PROJECT_ID)
        case = obscure_dates(case, output_dir=output_dir)
        cases_emitter.write(case)
    cases_emitter.close()

    # Single bcc_diagnosis emitter, opened where it is used; the original
    # also opened one at the top that was never written to or closed.
    bcc_diagnosises_emitter = emitter('bcc_diagnosis', output_dir=output_dir)
    for k in bcc_diagnosises:
        bcc_diagnosises[k] = obscure_dates(bcc_diagnosises[k],
                                           output_dir=output_dir)
        bcc_diagnosises_emitter.write(bcc_diagnosises[k])
    bcc_diagnosises_emitter.close()


if __name__ == "__main__":
    item_paths = ['source/bcc/voncologdiagnosis.json']
    args = default_parser(DEFAULT_OUTPUT_DIR, DEFAULT_EXPERIMENT_CODE,
                          DEFAULT_PROJECT_ID).parse_args()
    transform(item_paths, output_dir=args.output_dir,
              experiment_code=args.experiment_code)
    # Sanity check: the run must have produced the diagnosis output file.
    p = os.path.join(args.output_dir, 'diagnosis.json')
    assert os.path.isfile(p), 'should have an output file {}'.format(p)
    print(p)
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 ``sample`` / ``bcc_sample`` json.

    Known cases and diagnoses are read from ``case.json`` and
    ``diagnosis.json`` in ``output_dir``. Rows for unknown cases and repeated
    sample ids are skipped; a sample whose diagnosis is unknown is still
    written, without its diagnosis link.
    """
    known_cases = set([])
    for entry in reader('{}/case.json'.format(output_dir)):
        known_cases.add(entry['submitter_id'])
    known_diagnoses = set([])
    for entry in reader('{}/diagnosis.json'.format(output_dir)):
        known_diagnoses.add(entry['submitter_id'])
    missing_cases = set([])
    print('cases len {}'.format(len(known_cases)))

    # Ids of samples already written, used to de-duplicate.
    written_sample_ids = []
    # NOTE(review): neither emitter is closed in the visible part of this
    # function - confirm whether that happens further down.
    samples_emitter = emitter('sample', output_dir=output_dir)
    bcc_samples_emitter = emitter('bcc_sample', output_dir=output_dir)
    missing_parents = []

    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for row in reader(path):
            case_id = row.get('participantid', row.get('ParticipantID'))
            sample = default_sample(case_id, line=row,
                                    project_id=DEFAULT_PROJECT_ID)
            sample_id = sample['submitter_id']

            if case_id not in known_cases:
                missing_parents.append(
                    missing_parent(child_id=sample_id,
                                   child_type='sample',
                                   parent_id=case_id,
                                   parent_type='case'))
                continue
            if sample_id in written_sample_ids:
                continue

            diagnosis_id = sample['diagnoses']['submitter_id']
            if diagnosis_id not in known_diagnoses:
                missing_parents.append(
                    missing_parent(child_id=sample_id,
                                   child_type='sample',
                                   parent_id=diagnosis_id,
                                   parent_type='diagnosis'))
                # Drop the dangling link but keep the sample.
                del sample['diagnoses']['submitter_id']

            bcc_sample_id = '{}-{}'.format(sample_id, source)
            samples_emitter.write(sample)
            written_sample_ids.append(sample_id)

            bcc_sample = {
                'type': 'bcc_sample',
                'sample': {'submitter_id': sample_id},
                'source': source,
                'submitter_id': bcc_sample_id,
                'project_id': DEFAULT_PROJECT_ID,
            }
            bcc_sample.update(row)
            if '_labkeyurl_sample_type_id' in bcc_sample:
                # Replace the numeric sample type id by its lookup value.
                type_id = bcc_sample['sample_type_id']
                bcc_sample['sample_type'] = LOOKUPS['sample_type'][type_id]
                del bcc_sample['sample_type_id']
                del bcc_sample['_labkeyurl_sample_type_id']
            bcc_sample = obscure_dates(bcc_sample, output_dir=output_dir)
            bcc_samples_emitter.write(bcc_sample)

            # Unknown cases were already skipped above, so this test cannot
            # succeed here; it is kept as in the original.
            if case_id not in known_cases:
                missing_cases.add(case_id)
                known_cases.add(case_id)

    save_missing_parents(missing_parents)