Beispiel #1
0
def transform(output_dir, compresslevel=0):
    """Read bmeg CallsetFor edges and write gen3 read_group json.

    One ``read_group`` record is written per distinct callset (the edge's
    ``from`` value) and is linked to the aliquot named by the edge's ``to``
    value.  When the same callset appears on several edges, only the first
    edge encountered is used.

    Args:
        output_dir: directory that receives ``read_group.json``.
        compresslevel: forwarded to ``JSONEmitter``.  The default (0) is the
            value that used to be hard-coded, so callers relying on the
            default see no change.

    Raises:
        KeyError: if an edge lacks a ``from`` or ``to`` key.
    """
    read_groups_emitter = JSONEmitter(
        os.path.join(output_dir, 'read_group.json'),
        compresslevel=compresslevel)
    try:
        read_groups = {}

        # Edge keys: [ "_id", "data", "from", "gid", "label", "to" ]
        # {"_id": "(Callset:ccle:ACH-001270:None)--CallsetFor->(Aliquot:ACH-001270)", "gid": "(Callset:ccle:ACH-001270:None)--CallsetFor->(Aliquot:ACH-001270)", "label": "CallsetFor", "from": "Callset:ccle:ACH-001270:None", "to": "Aliquot:ACH-001270", "data": {}}
        #
        # Target read_group columns (only type, *submitter_id and *aliquots
        # are populated here):
        #   *type, project_id, *submitter_id, *aliquots.submitter_id, RIN,
        #   adapter_name, adapter_sequence, barcoding_applied,
        #   base_caller_name, base_caller_version, experiment_name,
        #   flow_cell_barcode, includes_spike_ins, instrument_model,
        #   is_paired_end, library_name,
        #   library_preparation_kit_catalog_number,
        #   library_preparation_kit_name, library_preparation_kit_vendor,
        #   library_preparation_kit_version, library_selection,
        #   library_strand, library_strategy, platform, read_group_name,
        #   read_length, sequencing_center, sequencing_date,
        #   size_selection_range, spike_ins_concentration, spike_ins_fasta,
        #   target_capture_kit_catalog_number, target_capture_kit_name,
        #   target_capture_kit_target_region, target_capture_kit_vendor,
        #   target_capture_kit_version, to_trim_adapter_sequence
        for line in reader('source/ccle/maf.CallsetFor.Edge.json.gz'):
            read_group_submitter_id = 'read_group-{}'.format(line['from'])
            if read_group_submitter_id in read_groups:
                # First edge wins; later duplicates of a callset are ignored.
                continue
            read_groups[read_group_submitter_id] = {
                'type': 'read_group',
                '*aliquots': {
                    'submitter_id': line['to']
                },
                '*submitter_id': read_group_submitter_id,
            }

        # Records are emitted only after the whole edge file has been read.
        for read_group in read_groups.values():
            read_groups_emitter.write(read_group)
    finally:
        # Close the emitter even when reading or writing fails part-way.
        read_groups_emitter.close()
Beispiel #2
0
def transform(output_dir, compresslevel=0):
    """Read bmeg File vertices and write gen3 submitted_somatic_mutation json.

    ``DerivedFrom`` edges are loaded first to map each file gid (the edge's
    ``from``) to a read_group submitter id built from the edge's ``to``.
    Every File vertex is then written as one ``submitted_somatic_mutation``
    record that points at its read group.

    Args:
        output_dir: directory that receives
            ``submitted_somatic_mutation.json``.
        compresslevel: forwarded to ``JSONEmitter``.  The default (0) is the
            value that used to be hard-coded, so callers relying on the
            default see no change.

    Raises:
        KeyError: if a File vertex has no matching DerivedFrom edge, or a
            record lacks one of the keys read below.
    """
    ssm_emitter = JSONEmitter(
        os.path.join(output_dir, 'submitted_somatic_mutation.json'),
        compresslevel=compresslevel)
    try:
        # Edge keys: [ "_id", "data", "from", "gid", "label", "to" ]
        # A later edge with the same ``from`` overwrites an earlier one.
        read_groups = {
            line['from']: 'read_group-{}'.format(line['to'])
            for line in reader('source/ccle/DerivedFrom.Edge.json.gz')
        }

        for line in reader('source/ccle/File.Vertex.json.gz'):
            ssm_submitter_id = line['gid']
            # Deliberately a hard lookup: a file without a read group is an
            # input error and surfaces as KeyError.
            read_group_submitter_id = read_groups[ssm_submitter_id]
            file_data = line['data']
            ssm_emitter.write({
                'type': 'submitted_somatic_mutation',
                '*read_groups': {
                    'submitter_id': read_group_submitter_id
                },
                '*submitter_id': ssm_submitter_id,
                'md5sum': file_data['md5'],
                'file_size': file_data['size'],
                'file_name': file_data['path'],
                'experimental_strategy': 'etl',
                'data_type': 'maf like',
                'data_format': 'tsv',
                'data_category': 'omics',
            })
    finally:
        # Close the emitter even when reading or writing fails part-way.
        ssm_emitter.close()
Beispiel #3
0
def emitter(type=None, output_dir=DEFAULT_OUTPUT_DIR, **kwargs):
    """Create a default JSONEmitter writing to ``<output_dir>/<type>.json``.

    Args:
        type: record type; used verbatim as the file stem.  Note that the
            default ``None`` yields a file named ``None.json``.
        output_dir: directory for the output file.
        **kwargs: passed through to ``JSONEmitter``.  ``compresslevel``
            defaults to 0 but may be overridden here; previously supplying
            it raised ``TypeError`` because it was also passed explicitly.

    Returns:
        The newly constructed ``JSONEmitter``.
    """
    # kwargs is a fresh dict for every call, so updating it in place is safe.
    kwargs.setdefault('compresslevel', 0)
    return JSONEmitter(
        os.path.join(output_dir, '{}.json'.format(type)), **kwargs)
Beispiel #4
0
def transform(output_dir, compresslevel=0):
    """Read bmeg json and write gen3 sample, aliquot and diagnosis json.

    Three passes are made over the ccle source files:

    1. ``BiosampleFor`` edges build, in memory, one sample and one diagnosis
       per biosample (edge ``from``), both linked to the case (edge ``to``).
    2. ``AliquotFor`` edges are written straight out as aliquots linked to
       their sample.
    3. ``BioSample`` vertices trigger the write of the matching sample and of
       its diagnosis, whose primary diagnosis is taken from the vertex data.

    Args:
        output_dir: directory that receives ``samples.json``,
            ``aliquots.json`` and ``diagnosis.json``.
        compresslevel: forwarded to each ``JSONEmitter``.  The default (0) is
            the value that used to be hard-coded, so callers relying on the
            default see no change.

    Raises:
        KeyError: if a BioSample vertex has no matching BiosampleFor edge, or
            a record lacks one of the keys read below.
    """

    def _record(line):
        """Return ``line`` as a dict, decoding it when it is raw JSON text."""
        # The original indexed the reader's output directly in the first
        # pass but json.loads()-ed it in the other two, so one of them had
        # to fail.  Decoding only when needed works for either reader.
        if isinstance(line, (str, bytes, bytearray)):
            return json.loads(line)
        return line

    samples_emitter = JSONEmitter(os.path.join(output_dir, 'samples.json'),
                                  compresslevel=compresslevel)
    aliquots_emitter = JSONEmitter(os.path.join(output_dir, 'aliquots.json'),
                                   compresslevel=compresslevel)
    diagnosis_emitter = JSONEmitter(os.path.join(output_dir, 'diagnosis.json'),
                                    compresslevel=compresslevel)
    try:
        samples = {}
        diagnoses = {}

        # Pass 1: biosample -> case edges.
        for p in [
                'source/ccle/BiosampleFor.Edge.json.gz',
                'source/ccle/maf.BiosampleFor.Edge.json.gz'
        ]:
            for line in reader(p):
                line = _record(line)
                case_submitter_id = line['to']
                sample_submitter_id = line['from']
                diagnosis_submitter_id = 'diagnosis-{}'.format(
                    case_submitter_id)

                samples[sample_submitter_id] = {
                    'type': 'sample',
                    '*cases': {'submitter_id': case_submitter_id},
                    '*submitter_id': sample_submitter_id,
                    '*diagnoses': {'submitter_id': diagnosis_submitter_id},
                }
                # Diagnoses are keyed by *sample* id so pass 3 can find them
                # from the BioSample vertex.
                diagnoses[sample_submitter_id] = {
                    'type': 'diagnosis',
                    '*submitter_id': diagnosis_submitter_id,
                    '*cases': {
                        'submitter_id': case_submitter_id
                    }
                }

        # Pass 2: aliquot -> sample edges, written as they are read.
        for p in [
                'source/ccle/maf.AliquotFor.Edge.json.gz',
                'source/ccle/AliquotFor.Edge.json.gz'
        ]:
            for line in reader(p):
                line = _record(line)
                aliquots_emitter.write({
                    'type': 'aliquot',
                    '*samples': {
                        'submitter_id': line['to']
                    },
                    '*submitter_id': line['from'],
                })

        # Pass 3: biosample vertices supply the diagnosis attributes.
        # Target sample columns, for reference:
        #   type, project_id, *submitter_id, *cases.submitter_id,
        #   diagnoses.submitter_id, biosample_anatomic_site, composition,
        #   current_weight, days_to_collection, days_to_sample_procurement,
        #   diagnosis_pathologically_confirmed, freezing_method,
        #   initial_weight, intermediate_dimension, is_ffpe,
        #   longest_dimension, method_of_sample_procurement, oct_embedded,
        #   preservation_method, sample_type, sample_type_id, sample_volume,
        #   shortest_dimension, time_between_clamping_and_freezing,
        #   time_between_excision_and_freezing, tissue_type, tumor_code,
        #   tumor_code_id, tumor_descriptor
        for p in [
                'source/ccle/BioSample.Vertex.json.gz',
                'source/ccle/maf.BioSample.Vertex.json.gz'
        ]:
            for line in reader(p):
                line = _record(line)
                sample = samples[line['gid']]
                samples_emitter.write(sample)

                diagnosis = diagnoses[sample['*submitter_id']]
                ccle_attributes = line['data']
                # Prefer "Primary Disease", then "Subtype Disease".
                diagnosis['*primary_diagnosis'] = ccle_attributes.get(
                    "Primary Disease",
                    ccle_attributes.get("Subtype Disease", 'unknown'))

                # The remaining starred fields get fixed placeholder values.
                diagnosis.update({
                    '*age_at_diagnosis': None,
                    '*classification_of_tumor': 'Unknown',
                    '*days_to_last_follow_up': None,
                    '*days_to_last_known_disease_status': None,
                    '*days_to_recurrence': None,
                    '*last_known_disease_status': 'Unknown tumor status',
                    '*morphology': 'unknown',
                    '*progression_or_recurrence': 'unknown',
                    '*site_of_resection_or_biopsy': 'unknown',
                    '*tissue_or_organ_of_origin': 'unknown',
                    '*tumor_grade': 'unknown',
                    '*tumor_stage': 'unknown',
                    '*vital_status': 'unknown',
                })

                diagnosis_emitter.write(diagnosis)
    finally:
        # Close every emitter even when a pass fails part-way.
        samples_emitter.close()
        aliquots_emitter.close()
        diagnosis_emitter.close()