Beispiel #1
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write gen3 ``bcc_biomarker`` records for biomarkers that have a case.

    The MRN -> OPTR mapping is loaded from ``source/bcc/bcc-cases.tsv``.
    Every row read from ``item_paths`` whose ``MRN`` is in that mapping is
    turned into a ``bcc_biomarker`` record linked to the mapped case; rows
    with an unknown ``MRN`` are dropped.

    Args:
        item_paths: iterable of paths, each one handed to ``reader``.
        output_dir: directory passed to ``emitter``.
        project_id: stored as ``project_id`` on every record.
        compresslevel: not used by this function.

    Returns:
        None. Records are written through the ``bcc_biomarker`` emitter.
    """
    case_lookup = {
        line['MRN']: line['OPTR']
        for line in reader('source/bcc/bcc-cases.tsv')
    }

    # Columns removed from each row before it is emitted.
    dropped_columns = (
        "MRN",
        "Participant ID",
        "_not_available_notes",
        "_not_available_reason_id",
        "cBiomarker Label dont use",
    )
    # Columns renamed to lower case with underscores instead of spaces.
    renamed_columns = (
        "CA19 Values After Specimen Collection",
        "Order Proc ID",
        "assay version id",
        "biomarker level",
        "unit of measure id",
    )

    def add_case(b):
        """Return the gen3 record for row ``b``; ``b`` is modified in place."""
        case_submitter_id = case_lookup[b['MRN']]
        submitter_id = '{}-{}-bcc_biomarker'.format(
            case_submitter_id, b['ID_Event'])
        for p in dropped_columns:
            del b[p]
        for p in renamed_columns:
            b[p.replace(' ', '_').lower()] = b.pop(p)
        b['cbiomarker_label'] = b.pop("cBiomarker Label use this")
        biomarker = {
            'type': 'bcc_biomarker',
            'cases': {
                'submitter_id': case_submitter_id
            },
            'submitter_id': submitter_id,
            'project_id': project_id
        }
        # Remaining row columns are copied over the defaults above.
        biomarker.update(b)
        return biomarker

    biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for item_path in item_paths:
        biomarkers = list(reader(item_path))
        biomarkers_with_case = [
            add_case(b) for b in biomarkers if b['MRN'] in case_lookup
        ]
        print('there are', len(biomarkers_with_case),
              'biomarkers with cases, out of ', len(biomarkers), 'biomarkers')
        for b in biomarkers_with_case:
            biomarker_emitter.write(obscure_dates(b))
    biomarker_emitter.close()
Beispiel #2
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Emit a default aliquot for every record of ``<output_dir>/sample.json``.

    ``item_paths``, ``experiment_code`` and ``compresslevel`` are not used.
    Each sample record must carry a ``submitter_id``; it is passed to
    ``default_aliquot`` together with ``DEFAULT_PROJECT_ID``.
    """
    writer = emitter('aliquot', output_dir=output_dir)
    samples_path = '{}/sample.json'.format(output_dir)
    for sample in reader(samples_path):
        assert 'submitter_id' in sample, sample
        aliquot = default_aliquot(sample['submitter_id'],
                                  project_id=DEFAULT_PROJECT_ID)
        writer.write(aliquot)
    writer.close()
Beispiel #3
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write a participant id / date of birth record for every input row.

    Every path in ``item_paths`` is read with ``reader``; for each row the
    ``ParticipantID`` and ``DateOfBirth`` values are written through the
    ``bcc_participant_dob`` emitter. ``experiment_code`` and
    ``compresslevel`` are not used.
    """
    out = emitter('bcc_participant_dob', output_dir=output_dir)

    for path in item_paths:
        for record in reader(path):
            participant = record['ParticipantID']
            birth_date = record['DateOfBirth']
            out.write({
                'participantid': participant,
                'DateOfBirth': birth_date
            })
    out.close()
Beispiel #4
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read the medable csv and write gen3 ``submitted_file`` records.

    Only ``item_paths[0]`` is read. A row is skipped when ``exclude_row``
    returns a true value, or when its ``c_public_user._id`` or
    ``c_file.ETag`` column is empty or missing.

    Args:
        item_paths: sequence whose first element is the csv path.
        output_dir: directory passed to ``emitter``.
        experiment_code: not used by this function.
        compresslevel: not used by this function.

    Returns:
        None. Records are written through the ``submitted_file`` emitter.
    """
    file_emitter = emitter('submitted_file', output_dir=output_dir)

    with open(item_paths[0], newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            if exclude_row(row):
                continue
            case_submitter_id = row['c_public_user._id']
            # Truthiness rather than len(): DictReader fills the missing
            # columns of a short row with None, and len(None) would raise.
            if not case_submitter_id:
                continue
            if not row['c_file.ETag']:
                continue

            submitter_id = '{}-sf'.format(row['_id'])

            # Template of the gen3 submitted_file node (required fields are
            # marked with *): *data_type, urls, *data_format, type,
            # object_id, *submitter_id, *data_category, *md5sum, *file_size,
            # aliquots.submitter_id, *file_name, cases.submitter_id,
            # project_id, state_comment, projects.code
            submitted_file = {
                'type': 'submitted_file',
                'cases': {
                    'submitter_id': case_submitter_id
                },
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID,
                'data_type': row['c_file.mime'],
                'md5sum': row['c_file.ETag'],
                'file_size': row['c_file.size'],
                'file_name': row['c_file.path'],
            }
            file_emitter.write(submitted_file)
    file_emitter.close()
Beispiel #5
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read somatic variant rows and write them as gen3 json.

    Only ``item_paths[0]`` is read. For every row the ``aliquot`` value is
    turned into a link dict, a ``submitter_id`` of the form
    ``<aliquot>-<allele_id>-<ensembl_transcript>`` is added, ``type`` is set
    to ``somatic_variant`` and the ``allele_id`` / ``ensembl_transcript``
    columns are removed before the row is emitted.

    Args:
        item_paths: sequence whose first element is the input path.
        output_dir: directory passed to ``emitter``.
        experiment_code: not used by this function.
        compresslevel: not used by this function.

    Returns:
        None. Rows are written through the ``somatic_variants2`` emitter.
    """
    somatic_variants_emitter = emitter('somatic_variants2',
                                       output_dir=output_dir)
    for line in reader(item_paths[0]):
        # Keep the plain aliquot id: once line['aliquot'] is replaced by the
        # link dict, formatting it would embed the dict's repr in the id.
        aliquot_id = line['aliquot']
        line['aliquot'] = {'submitter_id': aliquot_id}
        line['submitter_id'] = '{}-{}-{}'.format(aliquot_id,
                                                 line['allele_id'],
                                                 line['ensembl_transcript'])
        line['type'] = 'somatic_variant'
        del line['ensembl_transcript']
        del line['allele_id']
        somatic_variants_emitter.write(line)
    somatic_variants_emitter.close()
Beispiel #6
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write one gen3 ``sample`` record per id read from ``item_paths``.

    Each record yielded by ``reader`` is treated as a string holding one
    id; a trailing newline is stripped. The id is used as the linked case
    ``submitter_id`` and, prefixed with ``sample-``, as the sample's own
    ``submitter_id``.

    Args:
        item_paths: iterable of paths, each one handed to ``reader``.
        output_dir: directory passed to ``emitter``.
        experiment_code: not used by this function.
        compresslevel: not used by this function.

    Returns:
        None. Records are written through the ``sample`` emitter.
    """
    samples_emitter = emitter('sample', output_dir=output_dir)
    for p in item_paths:
        for line in reader(p):
            sample_id = line.rstrip('\n')
            sample = {
                'type': 'sample',
                'cases': {
                    'submitter_id': sample_id
                },
                'submitter_id': f"sample-{sample_id}",
                'project_id': DEFAULT_PROJECT_ID
            }
            samples_emitter.write(sample)
    samples_emitter.close()
Beispiel #7
0
def transform_old(item_paths, output_dir, experiment_code, compresslevel=0):
    """Merge input rows into one ``gene`` record per ``participantid``.

    Rows sharing a ``participantid`` are merged with ``dict.update``, so
    later rows overwrite equally named fields of earlier ones.

    NOTE(review): the merged records are only collected; nothing is written
    to the ``gene`` emitter. Confirm whether they were meant to be emitted.

    Args:
        item_paths: iterable of paths, each one handed to ``reader``.
        output_dir: directory passed to ``emitter``.
        experiment_code: stored as the linked experiment ``submitter_id``.
        compresslevel: not used by this function.

    Returns:
        None.
    """
    genes_emitter = emitter('gene', output_dir=output_dir)
    genes = {}
    for p in item_paths:
        for line in reader(p):
            participantid = line['participantid']
            case = genes.get(participantid)
            if case is None:
                case = {
                    'type': 'gene',
                    'experiments': {
                        'submitter_id': experiment_code
                    },
                    'submitter_id': participantid
                }
            case.update(line)
            genes[participantid] = case
    # Release the emitter that was opened above; it was previously leaked.
    genes_emitter.close()
Beispiel #8
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Write a gen3 ``bcc_aliquot`` record for every input row.

    Each row gets a ``source`` field (the input file name without directory
    and extension) and is then passed through ``callback`` when one is
    given. The record links to the aliquot ``<sample_code>-aliquot``, uses
    the row's ``lsid`` as ``submitter_id``, is overlaid with all row fields
    and goes through ``obscure_dates`` before being emitted.
    ``experiment_code`` and ``compresslevel`` are not used.
    """
    out = emitter('bcc_aliquot', output_dir=output_dir)

    for path in item_paths:
        stem, _extension = os.path.splitext(os.path.basename(path))
        for record in reader(path):
            record['source'] = stem
            if callback:
                record = callback(record)
            aliquot_link = {
                'submitter_id': '{}-aliquot'.format(record['sample_code'])
            }
            entry = dict(type='bcc_aliquot',
                         project_id=DEFAULT_PROJECT_ID,
                         aliquot=aliquot_link,
                         submitter_id=record['lsid'])
            entry.update(record)
            out.write(obscure_dates(entry, output_dir=output_dir))
    out.close()
Beispiel #9
0
def transform(item_paths, output_dir, experiment_code, project_id, compresslevel=0, callback=None):
    """Merge input rows into one gen3 ``allele`` record per ``lsid``.

    Rows are passed through ``callback`` when one is given. Rows sharing an
    ``lsid`` are merged with ``dict.update`` (later rows win). After all
    inputs are read, every merged record goes through ``obscure_dates`` and
    is written to the ``allele`` emitter. ``experiment_code`` and
    ``compresslevel`` are not used.
    """
    out = emitter('allele', output_dir=output_dir)
    merged = {}
    for path in item_paths:
        for record in reader(path):
            if callback:
                record = callback(record)
            aliquot_id = '{}-aliquot'.format(record['sample_code'])
            lsid = record['lsid']
            if lsid in merged:
                allele = merged[lsid]
            else:
                allele = {
                    'type': 'allele',
                    'aliquots': {'submitter_id': aliquot_id},
                    'projects': {'code': 'reference'},
                    'submitter_id': lsid,
                }
            allele['project_id'] = project_id
            allele.update(record)
            merged[lsid] = allele
    for allele in merged.values():
        out.write(obscure_dates(allele, output_dir=output_dir))
    out.close()
Beispiel #10
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write gen3 ``observation`` records and return their submitter ids.

    Known cases are loaded from ``<output_dir>/case.json``. For every input
    row an observation is built with ``default_observation``. Rows whose
    case is unknown are recorded through ``missing_parent`` /
    ``save_missing_parents`` and skipped; rows producing an already seen
    observation ``submitter_id`` are skipped as duplicates.

    Args:
        item_paths: iterable of ``(path, observation_type, callback)``
            tuples; ``callback`` is not used by this function.
        output_dir: directory passed to ``emitter`` and ``obscure_dates``.
        project_id: passed to ``default_observation``.
        compresslevel: not used by this function.

    Returns:
        set: ``submitter_id`` of every observation that was written.

    Raises:
        AssertionError: a row has neither ``ParticipantID`` nor
            ``participantid`` (or the value is empty).
    """
    cases = {
        line['submitter_id']
        for line in reader('{}/case.json'.format(output_dir))
    }
    observation_emitter = emitter('observation', output_dir=output_dir)
    observation_ids = set()
    missing_cases = []
    for p, observation_type, callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID',
                                     line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(
                p, line.keys())
            case_submitter_id = participantid
            observation = default_observation(case_submitter_id, project_id,
                                              line['date'], observation_type,
                                              line)
            observation_submitter_id = observation['submitter_id']
            if case_submitter_id not in cases:
                missing_cases.append(
                    missing_parent(parent_id=case_submitter_id,
                                   parent_type='case',
                                   child_id=observation_submitter_id,
                                   child_type='observation'))
                continue
            if observation_submitter_id in observation_ids:
                continue
            observation_ids.add(observation_submitter_id)
            observation = obscure_dates(
                observation,
                output_dir=output_dir,
                participantid=observation['cases']['submitter_id'])
            observation_emitter.write(observation)
    # Release the emitter once all inputs are processed; it was previously
    # left open.
    observation_emitter.close()
    save_missing_parents(missing_cases)
    return observation_ids
Beispiel #11
0
def transform_biomarker(item_paths,
                        output_dir,
                        project_id,
                        observation_ids,
                        compresslevel=0):
    """Write a gen3 ``bcc_biomarker`` record for rows with a known observation.

    ``item_paths`` yields ``(path, observation_type, callback)`` tuples. For
    each row the observation ``submitter_id`` is recomputed with
    ``default_observation``; rows whose observation id is not in
    ``observation_ids`` are reported on stdout and skipped. Remaining rows
    get a ``source`` field, go through ``callback`` when one is given, and
    are emitted after ``obscure_dates``. ``compresslevel`` is not used.
    """
    out = emitter('bcc_biomarker', output_dir=output_dir)
    for path, observation_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            fallback_id = record.get('participantid', None)
            participantid = record.get('ParticipantID', fallback_id)
            observation = default_observation(participantid, project_id,
                                              record['date'],
                                              observation_type, record)
            observation_id = observation['submitter_id']
            biomarker_id = '{}-bcc_biomarker'.format(observation_id)
            if observation_id not in observation_ids:
                message = 'transform_biomarker {} not in observation_ids, skipping.'
                print(message.format(biomarker_id))
                continue
            biomarker = {
                'type': 'bcc_biomarker',
                'project_id': project_id,
                'observation': {'submitter_id': observation_id},
                'submitter_id': biomarker_id,
            }
            record['source'] = source
            if callback:
                record = callback(record)
            biomarker.update(record)
            out.write(obscure_dates(biomarker, output_dir=output_dir))
    out.close()
Beispiel #12
0
def transform(item_paths,
              output_dir,
              experiment_code,
              compresslevel=0,
              callback=None):
    """Write a gen3 ``wes_result`` record for every input row.

    A gene lookup is loaded from ``output/reference/gene_lookup.tsv`` (two
    whitespace-separated fields per line). Each row gets a ``source`` field
    and goes through ``callback`` when one is given. When the row's
    lower-cased ``gene_symbol`` is in the lookup, a ``gene`` link is added
    to the row. ``experiment_code`` and ``compresslevel`` are not used.
    """
    out = emitter('wes_result', output_dir=output_dir)
    gene_lookup = {}
    with open('output/reference/gene_lookup.tsv') as lookup_file:
        for lookup_line in lookup_file:
            key, value = lookup_line.split()
            gene_lookup[key] = value

    for path in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)
            fallback_id = record.get('ParticipantID', None)
            participant = record.get('participantid', fallback_id)
            aliquot_id = '{}-sample-aliquot'.format(participant)

            variant = {
                'type': 'wes_result',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': {'submitter_id': aliquot_id},
                'submitter_id': record['lsid'],
            }
            if 'gene_symbol' in record:
                if record['gene_symbol'].lower() in gene_lookup:
                    record['gene'] = {
                        'submitter_id':
                        gene_lookup[record['gene_symbol'].lower()],
                        'project_id': 'smmart-reference'
                    }
            variant.update(record)
            out.write(variant)
    out.close()
Beispiel #13
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read the medable csv and write one gen3 ``case`` per distinct user.

    Only ``item_paths[0]`` is read. Rows rejected by ``exclude_row`` and
    rows with an empty or missing ``c_public_user._id`` are ignored; the
    remaining ids are de-duplicated and one ``case`` record is written for
    each.

    Args:
        item_paths: sequence whose first element is the csv path.
        output_dir: directory passed to ``emitter``.
        experiment_code: stored as the linked experiment ``submitter_id``.
        compresslevel: not used by this function.

    Returns:
        None. Records are written through the ``case`` emitter.
    """
    cases_emitter = emitter('case', output_dir=output_dir)
    cases = set()
    with open(item_paths[0], newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            if exclude_row(row):
                continue
            submitter_id = row['c_public_user._id']
            # Truthiness rather than len(): DictReader fills the missing
            # columns of a short row with None, and len(None) would raise.
            if not submitter_id:
                continue
            cases.add(submitter_id)

    for submitter_id in cases:
        case = {
            'type': 'case',
            'experiments': {
                'submitter_id': experiment_code
            },
            'submitter_id': submitter_id,
            'project_id': DEFAULT_PROJECT_ID
        }
        cases_emitter.write(case)
    # Release the emitter; it was previously left open.
    cases_emitter.close()
Beispiel #14
0
def transform_chemotherapy(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Write a gen3 ``bcc_chemotherapy`` record for rows with a known treatment.

    ``item_paths`` yields ``(path, type, callback)`` tuples; the middle
    element is not used. Each row gets a ``source`` field and goes through
    ``callback`` when one is given. The treatment id is
    ``<ParticipantID>-diagnosis-Chemotherapy-<get_uniq(row)>``; rows whose
    treatment id is not in ``treatment_ids`` are skipped silently.
    ``compresslevel`` is not used.
    """
    out = emitter('bcc_chemotherapy', output_dir=output_dir)
    for path, _kind, callback in item_paths:
        source = os.path.splitext(os.path.basename(path))[0]
        for record in reader(path):
            record['source'] = source
            if callback:
                record = callback(record)
            diagnosis_id = '{}-diagnosis'.format(record['ParticipantID'])
            treatment_id = '{}-Chemotherapy-{}'.format(diagnosis_id,
                                                       get_uniq(record))
            if treatment_id not in treatment_ids:
                continue
            agent = record.get('treatment_description',
                               record.get('treatment_agent', 'na'))
            chemotherapy = {
                'type': 'bcc_chemotherapy',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_id},
                'submitter_id': '{}-{}-{}'.format(treatment_id,
                                                  record['days'], agent),
            }
            chemotherapy.update(record)
            out.write(obscure_dates(chemotherapy, output_dir=output_dir))
    out.close()
Beispiel #15
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write gen3 ``diagnosis`` and ``bcc_diagnosis`` records.

    Known cases are loaded from ``<output_dir>/case.json``. For each input
    row a diagnosis is built with ``default_diagnosis`` and a
    ``bcc_diagnosis`` keyed by ``<participantid>-<source>`` is created or
    merged (later rows overwrite equally named fields). Participants that
    are not known cases get an additional default diagnosis, and a default
    case is appended to the ``case`` output for each of them.

    Args:
        item_paths: iterable of paths, each one handed to ``reader``.
        output_dir: directory passed to ``emitter`` and ``obscure_dates``.
        experiment_code: not used by this function.
        compresslevel: not used by this function.

    Returns:
        None. Records are written through the ``diagnosis``,
        ``bcc_diagnosis`` and ``case`` emitters.
    """
    cases = {
        line['submitter_id']
        for line in reader('{}/case.json'.format(output_dir))
    }
    diagnoses_emitter = emitter('diagnosis', output_dir=output_dir)
    bcc_diagnosis_emitter = emitter('bcc_diagnosis', output_dir=output_dir)
    diagnosises = {}
    bcc_diagnosises = {}
    missing_cases = set()
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            case_submitter_id = line['participantid']
            bcc_submitter_id = '{}-{}'.format(case_submitter_id, source)
            diagnosis = default_diagnosis(case_submitter_id,
                                          project_id=DEFAULT_PROJECT_ID,
                                          line=line)
            submitter_id = diagnosis['submitter_id']
            bcc_diagnosis = bcc_diagnosises.get(bcc_submitter_id)
            if bcc_diagnosis is None:
                bcc_diagnosis = {
                    'type': 'bcc_diagnosis',
                    'diagnosis': {
                        'submitter_id': submitter_id
                    },
                    'source': source,
                    'submitter_id': bcc_submitter_id,
                    'project_id': DEFAULT_PROJECT_ID
                }

            # we will use the name 'diagnosis' as a link back to
            # gen3.diagnosis, so the row's own value moves to
            # 'diagnosis_name'. pop() tolerates rows without the column,
            # where a plain del would raise KeyError.
            line['diagnosis_name'] = line.pop('diagnosis', None)
            bcc_diagnosis.update(line)

            diagnosises[submitter_id] = diagnosis
            bcc_diagnosises[bcc_submitter_id] = bcc_diagnosis
            if case_submitter_id not in cases:
                print('no case for: >{}<'.format(case_submitter_id))
                missing_cases.add(case_submitter_id)

    for k in diagnosises:
        diagnosises[k] = obscure_dates(
            diagnosises[k],
            output_dir=output_dir,
            participantid=diagnosises[k]['cases']['submitter_id'])
        diagnoses_emitter.write(diagnosises[k])

    cases = missing_cases - cases
    print('missing diagnosis for {} cases'.format(len(cases)))
    for participantid in cases:
        diagnosis = default_diagnosis(participantid,
                                      project_id=DEFAULT_PROJECT_ID)
        diagnosis = obscure_dates(diagnosis, output_dir=output_dir)
        diagnoses_emitter.write(diagnosis)
    diagnoses_emitter.close()

    print('missing cases for {} cases'.format(len(missing_cases)))
    cases_emitter = emitter('case', output_dir=output_dir, append=True)
    for participantid in missing_cases:
        case = default_case(DEFAULT_EXPERIMENT_CODE, participantid,
                            DEFAULT_PROJECT_ID)
        case = obscure_dates(case, output_dir=output_dir)
        cases_emitter.write(case)
    cases_emitter.close()

    # Emit the merged bcc_diagnosis records through the emitter opened at
    # the top; previously they were collected but never written.
    for k in bcc_diagnosises:
        bcc_diagnosises[k] = obscure_dates(bcc_diagnosises[k],
                                           output_dir=output_dir)
        bcc_diagnosis_emitter.write(bcc_diagnosises[k])
    bcc_diagnosis_emitter.close()
Beispiel #16
0
        diagnosis = default_diagnosis(participantid,
                                      project_id=DEFAULT_PROJECT_ID)
        diagnosis = obscure_dates(diagnosis, output_dir=output_dir)
        diagnoses_emitter.write(diagnosis)
    diagnoses_emitter.close()

    print('missing cases for {} cases'.format(len(missing_cases)))
    cases_emitter = emitter('case', output_dir=output_dir, append=True)
    for participantid in missing_cases:
        case = default_case(DEFAULT_EXPERIMENT_CODE, participantid,
                            DEFAULT_PROJECT_ID)
        case = obscure_dates(case, output_dir=output_dir)
        cases_emitter.write(case)
    cases_emitter.close()

    bcc_diagnosises_emitter = emitter('bcc_diagnosis', output_dir=output_dir)
    for k in bcc_diagnosises:
        bcc_diagnosises[k] = obscure_dates(bcc_diagnosises[k],
                                           output_dir=output_dir)
        bcc_diagnosises_emitter.write(bcc_diagnosises[k])
    bcc_diagnosises_emitter.close()


if __name__ == "__main__":
    # Script entry point: parse the command line with the project's default
    # parser and run the transform on the fixed diagnosis export.
    parser = default_parser(DEFAULT_OUTPUT_DIR, DEFAULT_EXPERIMENT_CODE,
                            DEFAULT_PROJECT_ID)
    item_paths = ['source/bcc/voncologdiagnosis.json']
    args = parser.parse_args()
    transform(
        item_paths,
        output_dir=args.output_dir,
        experiment_code=args.experiment_code,
    )
Beispiel #17
0
def transform(item_paths, output_dir, compresslevel=0):
    """Transform the bmeg ccle input into the gen3 output directory.

    Two passes are made:

    1. The ``InProject`` edge files under ``source/ccle`` are read to build
       the project, experiment and case records. Projects and experiments
       are written immediately.
    2. Every path in ``item_paths`` is read as individual records; for each
       one the matching case is written together with a ``demographic``
       record carrying the individual's gender.

    Args:
        item_paths: iterable of paths with individual records, each one
            handed to ``reader``.
        output_dir: directory passed to ``emitter``.
        compresslevel: not used by this function.

    Returns:
        None. Records are written through the ``project``, ``experiment``,
        ``case`` and ``demographic`` emitters.

    Raises:
        KeyError: an individual's ``gid`` did not appear as the ``from`` end
            of any edge read in the first pass.
    """
    projects_emitter = emitter('project', output_dir=output_dir)
    experiments_emitter = emitter('experiment', output_dir=output_dir)
    cases_emitter = emitter('case', output_dir=output_dir)
    demographics_emitter = emitter('demographic', output_dir=output_dir)
    cases = {}
    projects = {}
    experiments = {}

    edge_paths = (
        'source/ccle/InProject.Edge.json.gz',
        'source/ccle/maf.InProject.Edge.json.gz',
    )
    for p in edge_paths:
        for line in reader(p):
            # The project named by the edge only labels the experiment;
            # every record is filed under the single gen3 project 'ccle'.
            source_project = line['to'].replace('Project:', '')
            project_name = 'ccle'
            projects[project_name] = {
                'type': 'project',
                "code": project_name,
                "name": project_name,
                "state": "open",
                "availability_type": "Open",
                "dbgap_accession_number": project_name
            }

            experiment_submitter_id = "experiment-{}".format(source_project)
            experiments[experiment_submitter_id] = {
                "type": "experiment",
                "projects": [{
                    "code": project_name
                }],
                "submitter_id": experiment_submitter_id,
                "experimental_description": source_project
            }

            case_submitter_id = line['from']
            cases[case_submitter_id] = {
                'type': 'case',
                '*experiments': {
                    'submitter_id': experiment_submitter_id
                },
                'submitter_id': case_submitter_id
            }

    for project in projects.values():
        projects_emitter.write(project)
    for experiment in experiments.values():
        experiments_emitter.write(experiment)

    projects_emitter.close()
    experiments_emitter.close()

    for p in item_paths:
        for line in reader(p):
            # Example record:
            # {"_id": "Individual:CCLE:ACH-001665", "gid": "Individual:CCLE:ACH-001665", "label": "Individual", "data": {"individual_id": "CCLE:ACH-001665", "ccle_attributes": {"gender": "Male"}}}
            case_submitter_id = line['gid']
            cases_emitter.write(cases[case_submitter_id])

            # demographic fields: type, project_id, *submitter_id,
            # *cases.submitter_id, ethnicity, gender, race, year_of_birth,
            # year_of_death
            demographic = {
                'type': 'demographic',
                '*submitter_id': 'demographic-{}'.format(case_submitter_id),
                '*cases': {
                    'submitter_id': case_submitter_id
                }
            }
            data = line['data']
            # In the example record gender is nested under
            # 'ccle_attributes'; reading only data['gender'] would report
            # every individual as 'unknown'. The top-level key is still
            # honoured first when present.
            attributes = data.get('ccle_attributes') or {}
            gender = data.get('gender') or attributes.get('gender') or 'unknown'
            gender = gender.lower()
            if gender not in ('male', 'female'):
                gender = 'unknown'
            demographic['gender'] = gender
            demographics_emitter.write(demographic)
            # NOTE: no gen3 diagnosis record is emitted for these
            # individuals.

    cases_emitter.close()
    demographics_emitter.close()
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Read bcc labkey json and write gen3 ``sample`` / ``bcc_sample`` json.

    Known case and diagnosis ids are loaded from ``<output_dir>/case.json``
    and ``<output_dir>/diagnosis.json``. For each input row a sample is
    built with ``default_sample``. Rows whose case is unknown are recorded
    as missing parents and skipped; rows producing an already written
    sample ``submitter_id`` are skipped. ``experiment_code`` and
    ``compresslevel`` are not used in the lines shown here.
    """
    cases = set([])
    for line in reader('{}/case.json'.format(output_dir)):
        cases.add(line['submitter_id'])

    diagnoses = set([])
    for line in reader('{}/diagnosis.json'.format(output_dir)):
        diagnoses.add(line['submitter_id'])

    missing_cases = set([])
    print('cases len {}'.format(len(cases)))

    # dedup: submitter ids of the samples written so far.
    # NOTE(review): this is a list, so the membership test below is a
    # linear scan per row.
    samples = []
    samples_emitter = emitter('sample', output_dir=output_dir)
    bcc_samples_emitter = emitter('bcc_sample', output_dir=output_dir)
    # Collects missing-parent records for both unknown cases and unknown
    # diagnoses, despite the name.
    missing_diagnoses = []
    for p in item_paths:
        # Input file name without directory and extension.
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            case_submitter_id = line.get('participantid',
                                         line.get('ParticipantID'))
            sample = default_sample(case_submitter_id,
                                    line=line,
                                    project_id=DEFAULT_PROJECT_ID)
            submitter_id = sample['submitter_id']
            if case_submitter_id not in cases:
                # print('no case {} for sample {} - skipping.'.format(case_submitter_id, submitter_id))
                missing_diagnoses.append(
                    missing_parent(child_id=submitter_id,
                                   child_type='sample',
                                   parent_id=case_submitter_id,
                                   parent_type='case'))
                continue
            if submitter_id in samples:
                continue
            if sample['diagnoses']['submitter_id'] not in diagnoses:
                missing_diagnoses.append(
                    missing_parent(
                        child_id=submitter_id,
                        child_type='sample',
                        parent_id=sample['diagnoses']['submitter_id'],
                        parent_type='diagnosis'))
                # The sample is still written, but with the link's
                # submitter_id removed ('diagnoses' stays as an empty dict).
                del sample['diagnoses']['submitter_id']

            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            samples_emitter.write(sample)
            samples.append(submitter_id)

            bcc_sample = {
                'type': 'bcc_sample',
                'sample': {
                    'submitter_id': submitter_id
                },
                'source': source,
                'submitter_id': bcc_submitter_id,
                'project_id': DEFAULT_PROJECT_ID
            }
            # Row fields are copied over the defaults above.
            bcc_sample.update(line)
            # Replace the sample type id with its name from LOOKUPS.
            # NOTE(review): the guard tests '_labkeyurl_sample_type_id' but
            # the lookup reads 'sample_type_id' - assumes both columns are
            # always present together; confirm.
            if '_labkeyurl_sample_type_id' in bcc_sample:
                bcc_sample['sample_type'] = LOOKUPS['sample_type'][
                    bcc_sample['sample_type_id']]
                del bcc_sample['sample_type_id']
                del bcc_sample['_labkeyurl_sample_type_id']

            bcc_sample = obscure_dates(bcc_sample, output_dir=output_dir)

            bcc_samples_emitter.write(bcc_sample)

            # NOTE(review): this branch looks unreachable - rows whose case
            # is not in `cases` already hit `continue` above, and `cases`
            # is only extended here.
            if case_submitter_id not in cases:
                missing_cases.add(case_submitter_id)
                cases.add(case_submitter_id)
    # NOTE(review): neither emitter is closed in the lines shown here.
    save_missing_parents(missing_diagnoses)