Beispiel #1
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write gen3 treatment records and return the ids that were written.

    Args:
        item_paths: iterable of (path, treatment_type, callback) tuples; the
            callback element is not used by this function.
        output_dir: directory that holds diagnosis.json and is passed to the
            treatment emitter and to obscure_dates.
        project_id: project id handed to default_treatment.
        compresslevel: accepted for signature compatibility; unused.

    Returns:
        set of treatment submitter ids that were emitted.

    Raises:
        AssertionError: if a row has neither ParticipantID nor participantid.
    """
    diagnoses = {
        line['submitter_id']
        for line in reader('{}/diagnosis.json'.format(output_dir))
    }
    treatment_emitter = emitter('treatment', output_dir=output_dir)
    treatment_ids = set()
    missing_diagnoses = []
    for p, treatment_type, _callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID', line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-{}-{}'.format(
                diagnosis_submitter_id, treatment_type, get_uniq(line))
            if diagnosis_submitter_id not in diagnoses:
                # Record the orphan so it can be reported, then drop the row.
                missing_diagnoses.append(
                    missing_parent(parent_id=diagnosis_submitter_id,
                                   parent_type='diagnosis',
                                   child_id=treatment_submitter_id,
                                   child_type='treatment'))
                print('skipping missing diagnosis', treatment_submitter_id)
                continue
            if treatment_submitter_id in treatment_ids:
                # Same id already emitted: keep the first occurrence only.
                print('skipping ', treatment_submitter_id, p, line.keys())
                continue
            treatment_ids.add(treatment_submitter_id)
            treatment = default_treatment(treatment_submitter_id,
                                          diagnosis_submitter_id,
                                          treatment_type, project_id)
            treatment = obscure_dates(treatment, output_dir=output_dir,
                                      participantid=participantid)
            treatment_emitter.write(treatment)
    # Release the emitter once every record is written; it was previously
    # left open.
    treatment_emitter.close()
    save_missing_parents(missing_diagnoses)
    return treatment_ids
Beispiel #2
0
def transform(output_dir, compresslevel=0):
    """Convert CCLE bmeg File vertices into gen3 submitted_somatic_mutation json.

    Each File vertex is linked to the read group named by the DerivedFrom
    edge whose "from" equals the vertex gid.

    NOTE(review): the compresslevel argument is not forwarded; the emitter is
    always created with compresslevel=0.

    Raises:
        KeyError: if a File vertex gid has no DerivedFrom edge.
    """
    out_path = os.path.join(output_dir, 'submitted_somatic_mutation.json')
    ssm_emitter = JSONEmitter(out_path, compresslevel=0)

    # edge keys: [ "_id", "data", "from", "gid", "label", "to" ]
    read_groups = {
        edge['from']: 'read_group-{}'.format(edge['to'])
        for edge in reader('source/ccle/DerivedFrom.Edge.json.gz')
    }

    for vertex in reader('source/ccle/File.Vertex.json.gz'):
        gid = vertex['gid']
        read_group_id = read_groups[gid]
        data = vertex['data']
        ssm_emitter.write({
            'type': 'submitted_somatic_mutation',
            '*read_groups': {
                'submitter_id': read_group_id
            },
            '*submitter_id': gid,
            'md5sum': data['md5'],
            'file_size': data['size'],
            'file_name': data['path'],
            'experimental_strategy': 'etl',
            'data_type': 'maf like',
            'data_format': 'tsv',
            'data_category': 'omics',
        })

    ssm_emitter.close()
Beispiel #3
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write bcc_biomarker records for biomarkers whose MRN maps to a case.

    Args:
        item_paths: paths of biomarker files to read.
        output_dir: directory passed to the bcc_biomarker emitter.
        project_id: stored as project_id on every record.
        compresslevel: unused.

    Returns:
        None.
    """
    # MRN -> OPTR; the OPTR value is used as the case submitter id.
    case_lookup = {}
    for row in reader('{}/bcc-cases.tsv'.format('source/bcc')):
        case_lookup[row['MRN']] = row['OPTR']

    dropped_keys = (
        "MRN",
        "Participant ID",
        "_not_available_notes",
        "_not_available_reason_id",
        "cBiomarker Label dont use",
    )
    # These columns are renamed to lower-case with underscores.
    renamed_keys = (
        "CA19 Values After Specimen Collection",
        "Order Proc ID",
        "assay version id",
        "biomarker level",
        "unit of measure id",
    )

    def add_case(b):
        # Mutates b in place, then merges it into the gen3 envelope.
        case_submitter_id = case_lookup[b['MRN']]
        submitter_id = '{}-{}-bcc_biomarker'.format(case_submitter_id,
                                                    b['ID_Event'])
        for key in dropped_keys:
            del b[key]
        for key in renamed_keys:
            b[key.replace(' ', '_').lower()] = b.pop(key)
        b['cbiomarker_label'] = b.pop("cBiomarker Label use this")
        biomarker = {
            'type': 'bcc_biomarker',
            'cases': {
                'submitter_id': case_submitter_id
            },
            'submitter_id': submitter_id,
            'project_id': project_id
        }
        biomarker.update(b)
        return biomarker

    biomarker_emitter = emitter('bcc_biomarker', output_dir=output_dir)
    for item_path in item_paths:
        biomarkers = list(reader(item_path))
        # Rows whose MRN has no case are silently left out.
        biomarkers_with_case = [
            add_case(b) for b in biomarkers if b['MRN'] in case_lookup
        ]
        print('there are', len(biomarkers_with_case),
              'biomarkers with cases, out of ', len(biomarkers), 'biomarkers')
        for record in biomarkers_with_case:
            biomarker_emitter.write(obscure_dates(record))
    biomarker_emitter.close()
Beispiel #4
0
def transform(item_paths,
              output_dir,
              experiment_code,
              compresslevel=0,
              callback=None):
    """Write genetrails_variant records, linking each to a gene when known.

    Args:
        item_paths: paths of source files to read.
        output_dir: passed to the emitter and to obscure_dates.
        experiment_code: unused.
        compresslevel: unused.
        callback: optional function applied to every row; its return value
            replaces the row.
    """
    genetrails_emitter = emitter('genetrails_variant', output_dir=output_dir)

    # Two whitespace-separated columns per row; first is the lookup key.
    gene_lookup = {}
    with open('output/reference/gene_lookup.tsv') as f:
        for row in f:
            symbol, gene_id = row.split()
            gene_lookup[symbol] = gene_id

    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            aliquot_id = '{}-aliquot'.format(line['sample_code'])
            genetrails_variant = {
                'type': 'genetrails_variant',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': {
                    'submitter_id': aliquot_id
                },
                'submitter_id': line['lsid']
            }
            if 'gene_symbol' in line:
                symbol_key = line['gene_symbol'].lower()
                if symbol_key in gene_lookup:
                    line['gene'] = {
                        'submitter_id': gene_lookup[symbol_key],
                        'project_id': 'smmart-reference'
                    }
            # Row fields override the defaults set above.
            genetrails_variant.update(line)
            genetrails_emitter.write(
                obscure_dates(genetrails_variant, output_dir=output_dir))
    genetrails_emitter.close()
Beispiel #5
0
def transform_surgery(item_paths, output_dir, project_id, treatment_ids, compresslevel=0):
    """Write bcc_surgery records for surgeries whose treatment already exists.

    Args:
        item_paths: iterable of (path, type, callback) tuples; only the path
            and the callback are used.
        output_dir: passed to the emitter and to obscure_dates.
        project_id: stored as project_id on every record.
        treatment_ids: container of known treatment submitter ids; rows whose
            derived treatment id is not in it are skipped.
        compresslevel: unused.

    Raises:
        AssertionError: if a row has neither ParticipantID nor participantid.
    """
    bcc_treatment_emitter = emitter('bcc_surgery', output_dir=output_dir)
    # A set keeps the duplicate check constant-time; a list made every check
    # scan all ids seen so far.
    seen_submitter_ids = set()
    for p, _item_type, callback in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        # Only rows from this one source have their own 'type' field removed,
        # so it cannot override the record type below.
        drop_type = p == 'source/bcc/vResectionDate.json'
        for line in reader(p):
            line['source'] = source
            if callback:
                line = callback(line)
            participantid = line.get('ParticipantID', line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(p, line.keys())
            diagnosis_submitter_id = '{}-diagnosis'.format(participantid)
            treatment_submitter_id = '{}-Surgery-{}'.format(diagnosis_submitter_id, get_uniq(line))
            bcc_treatment_submitter_id = '{}-bcc_surgery'.format(treatment_submitter_id)
            if treatment_submitter_id not in treatment_ids:
                continue
            if bcc_treatment_submitter_id in seen_submitter_ids:
                continue
            seen_submitter_ids.add(bcc_treatment_submitter_id)
            bcc_treatment = {
                'type': 'bcc_surgery',
                'project_id': project_id,
                'treatment': {'submitter_id': treatment_submitter_id},
                'submitter_id': bcc_treatment_submitter_id,
            }
            if drop_type and 'type' in line:
                del line['type']
            bcc_treatment.update(line)
            bcc_treatment = obscure_dates(bcc_treatment, output_dir=output_dir)
            bcc_treatment_emitter.write(bcc_treatment)
    bcc_treatment_emitter.close()
Beispiel #6
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write one bcc_submitted_file record per row of the given files.

    Args:
        item_paths: paths of source files to read.
        output_dir: directory passed to the submitted_file emitter.
        project_id: stored as project_id on every record.
        compresslevel: unused.

    Raises:
        KeyError: if a row lacks participantid, document or one of the
            _labkeyurl_* keys that are removed.
    """
    labkey_url_keys = (
        "_labkeyurl_data_owner",
        "_labkeyurl_doctype_id",
        "_labkeyurl_document",
        "_labkeyurl_participantid",
    )
    submitted_file_emitter = emitter('submitted_file', output_dir=output_dir)
    for item_path in item_paths:
        for line in reader(item_path):
            participant_id = line['participantid']
            submitted_file = {
                'type': 'bcc_submitted_file',
                'cases': {
                    'submitter_id': participant_id
                },
                'submitter_id': '{}-{}'.format(participant_id,
                                               line['document']),
                'project_id': project_id
            }
            # Row fields override the defaults set above.
            submitted_file.update(line)
            for key in labkey_url_keys:
                del submitted_file[key]
            submitted_file_emitter.write(submitted_file)
    submitted_file_emitter.close()
Beispiel #7
0
def transform(item_paths,
              output_dir,
              project_id,
              type,
              callback=None,
              compresslevel=0):
    """Write a \\x01-delimited node TSV named <type>.tsv into output_dir.

    Each output row is: node_id, '{}', '{}', compact json of the row's data.
    The node id is a uuid5 of the lower-cased gid; repeated ids are skipped.

    Args:
        item_paths: paths of source files to read.
        output_dir: directory that receives the TSV.
        project_id: stored in data['project_id'].
        type: base name of the output file.
        callback: optional function applied to each row; its return value
            replaces the row before it is serialized.
        compresslevel: unused.
    """
    tsv_path = os.path.join(output_dir, '{}.tsv'.format(type))
    seen_node_ids = set()
    with open(tsv_path, 'w') as output_file:
        for p in item_paths:
            for line in reader(p):
                submitter_id = line['gid'].lower()
                node_id = uuid.uuid5(uuid.NAMESPACE_DNS, submitter_id)
                if node_id in seen_node_ids:
                    continue
                seen_node_ids.add(node_id)
                line['data']['project_id'] = project_id
                line['data']['submitter_id'] = submitter_id
                line['node_id'] = node_id
                if callback:
                    line = callback(line)
                # Loaded with: copy node_gene(node_id, acl, _sysan, _props)
                # from stdin csv delimiter E'\x01' quote E'\x02'
                props = json.dumps(line['data'], separators=(',', ':'))
                output_file.write(
                    '\x01'.join([str(node_id), '{}', '{}', props]) + '\n')
Beispiel #8
0
def transform(output_dir, compresslevel=0):
    """Convert CCLE CallsetFor edges into gen3 read_group json.

    One read_group is written per distinct edge "from" value; the first edge
    seen for a given "from" wins.

    NOTE(review): the compresslevel argument is not forwarded; the emitter is
    always created with compresslevel=0.
    """
    out_path = os.path.join(output_dir, 'read_group.json')
    read_groups_emitter = JSONEmitter(out_path, compresslevel=0)

    # edge keys: [ "_id", "data", "from", "gid", "label", "to" ]
    # e.g. "from": "Callset:ccle:ACH-001270:None", "to": "Aliquot:ACH-001270"
    read_groups = {}
    for edge in reader('source/ccle/maf.CallsetFor.Edge.json.gz'):
        submitter_id = 'read_group-{}'.format(edge['from'])
        if submitter_id not in read_groups:
            read_groups[submitter_id] = {
                'type': 'read_group',
                '*aliquots': {
                    'submitter_id': edge['to']
                },
                '*submitter_id': submitter_id,
            }

    for read_group in read_groups.values():
        read_groups_emitter.write(read_group)

    read_groups_emitter.close()
Beispiel #9
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write a default aliquot for every record of <output_dir>/sample.json.

    item_paths, experiment_code and compresslevel are not used.

    Raises:
        AssertionError: if a sample record has no submitter_id.
    """
    aliquots_emitter = emitter('aliquot', output_dir=output_dir)
    sample_path = '{}/sample.json'.format(output_dir)
    for sample_record in reader(sample_path):
        assert 'submitter_id' in sample_record, sample_record
        aliquot = default_aliquot(sample_record['submitter_id'],
                                  project_id=DEFAULT_PROJECT_ID)
        aliquots_emitter.write(aliquot)
    aliquots_emitter.close()
Beispiel #10
0
def sample(item_paths, limit=100):
    """Yield at most `limit` records from each file in item_paths."""
    for path in item_paths:
        for count, record in enumerate(reader(path)):
            if count >= limit:
                # Enough taken from this file; move on to the next one.
                break
            yield record
Beispiel #11
0
def lookups():
    """Build {lookup name: {id value: display name}} from LOOKUP_PATHS.

    The lookup name is the path without 'source/bcc/' and '.json'. For each
    row the id is the value of the first key that ends with '_id' and does
    not start with '_'.

    Raises:
        IndexError: if a row has no such key.
    """
    look_ups = {}
    for p in LOOKUP_PATHS:
        c = p.replace('source/bcc/','').replace('.json','')
        table = {}
        look_ups[c] = table
        print(p, c)
        for row in reader(p):
            name = row.get('display_name', row.get('alt_display_name', None))
            id_values = [
                value for key, value in row.items()
                if key.endswith('_id') and not key.startswith('_')
            ]
            table[id_values[0]] = name
    return look_ups
Beispiel #12
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write a participantid / DateOfBirth record for every source row.

    experiment_code and compresslevel are not used.

    Raises:
        KeyError: if a row lacks ParticipantID or DateOfBirth.
    """
    dob_emitter = emitter('bcc_participant_dob', output_dir=output_dir)
    for p in item_paths:
        for row in reader(p):
            record = {
                'participantid': row['ParticipantID'],
                'DateOfBirth': row['DateOfBirth']
            }
            dob_emitter.write(record)
    dob_emitter.close()
Beispiel #13
0
def _DOBs(output_dir):
    """Return the participantid -> date-of-birth cache, loading it on first use.

    The cache lives in the module-level DOBs variable. While it is empty or
    unset it is rebuilt from <output_dir>/gdan-tmp_participant_dob.json, if
    that file exists; a missing file yields an empty dict.

    Args:
        output_dir: directory that may contain the date-of-birth file.

    Returns:
        dict mapping participantid to a datetime parsed with DATE_FORMAT.
    """
    global DOBs
    if DOBs:
        return DOBs
    # load date of birth cache
    DOBs = {}
    # The variable name must be a valid identifier: the hyphenated spelling
    # was parsed as a subtraction of undefined names and raised NameError.
    dob_path = '{}/gdan-tmp_participant_dob.json'.format(output_dir)
    if os.path.isfile(dob_path):
        for line in reader(dob_path):
            DOBs[line['participantid']] = datetime.strptime(line['DateOfBirth'], DATE_FORMAT)
    return DOBs
Beispiel #14
0
def upload(path, program, project, submission_client, batch_size, delete_first):
    """Read gen3 json files and submit their records through submission_client.

    Files whose first record in a batch has type 'project' create the program
    and each project instead of submitting nodes.

    Args:
        path: glob pattern of the files to read.
        program: program name used for program creation and submissions.
        project: project code; replaced by the first record's project code
            once an 'experiment' batch has been read.
        submission_client: client exposing create_program and create_project;
            also handed to delete_all and create_node.
        batch_size: batch size handed to grouper.
        delete_first: when true, delete_all is called once per file, for the
            node type of the first submitted batch.

    Raises:
        AssertionError: if program or project creation returns an unexpected
            response.
    """
    # No process pool is created here any more: the old one was never used
    # and never closed, so its worker processes leaked.

    def collect_result(response):
        # Log every entity error and transactional error of one submission.
        is_error = False
        for entity in response['entities']:
            for error in entity.get('errors', []):
                logger.error('{} {} {}'.format(error['type'], entity['type'], entity))
                is_error = True
        for error in response['transactional_errors']:
            logger.error('transactional_error {}'.format(error))
            logger.error(json.dumps(response))
            is_error = True
        if is_error:
            logger.debug(response)

    for p in glob(path):
        deleted = False
        print(p)
        for lines in grouper(batch_size, reader(p)):
            nodes = list(lines)

            if nodes[0]['type'] == 'project':
                for node in nodes:
                    print('creating program')
                    response = submission_client.create_program({'name': program, 'dbgap_accession_number': program, 'type': 'program'})
                    # The message formats the response itself; it used to
                    # reference an undefined name, turning a failed check
                    # into a NameError.
                    assert response, 'could not parse response {}'.format(response)
                    assert 'id' in response, 'could not create {} program'.format(response)
                    assert program in response['name'], 'could not create {} program'.format(response)

                    response = submission_client.create_project(program, node)
                    assert response, 'could not parse response'
                    assert 'code' in response, f'Unexpected response {response}'
                    assert response['code'] == 200, 'could not create {} {}'.format(nodes[0]['type'], response)
                    assert 'successful' in response['message'], 'could not create {} {}'.format(nodes[0]['type'], response)
                    print('Created project {}'.format(node['code']), file=sys.stderr)
                continue

            if nodes[0]['type'] == 'experiment':
                project = nodes[0]['projects'][0]['code']

            if not deleted and delete_first:
                delete_all(submission_client, program, project, types=[nodes[0]['type']])
                deleted = True

            collect_result(create_node(submission_client, program, project, nodes))
Beispiel #15
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write gen3 case records and per-source bcc_participant records.

    Cases are keyed by participant id (last row wins). bcc_participant
    records are keyed by '<participant id>-<source>', where source is the
    file's base name; rows with the same key are merged into one record.

    Args:
        item_paths: paths of source files to read.
        output_dir: passed to the emitters and to obscure_dates.
        experiment_code: stored as the experiment submitter_id of each case.
        compresslevel: unused.

    Raises:
        KeyError: if a merged bcc_participant record lacks one of the
            identifying fields that are removed before writing.
    """
    cases_emitter = emitter('case', output_dir=output_dir)
    bcc_cases_emitter = emitter('bcc_participant', output_dir=output_dir)
    cases = {}
    bcc_cases = {}
    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for line in reader(p):
            submitter_id = line.get('participantid',
                                    line.get('ParticipantID', None))
            bcc_submitter_id = '{}-{}'.format(submitter_id, source)
            cases[submitter_id] = {
                'type': 'case',
                'experiments': {
                    'submitter_id': experiment_code
                },
                'primary_site': line.get('site', None),
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID
            }
            bcc_case = bcc_cases.get(bcc_submitter_id)
            if bcc_case is None:
                bcc_case = {
                    'type': 'bcc_participant',
                    'case': {
                        'submitter_id': submitter_id
                    },
                    'source': source,
                    'submitter_id': bcc_submitter_id,
                    'project_id': DEFAULT_PROJECT_ID
                }
                bcc_cases[bcc_submitter_id] = bcc_case
            # Duplicates are merged: later rows overwrite earlier fields.
            bcc_case.update(line)

    for participant_id, case in cases.items():
        cases_emitter.write(
            obscure_dates(case, participantid=participant_id,
                          output_dir=output_dir))

    removed_fields = (
        'FirstName', 'MRN', 'LastName', 'DateOfBirth',
        '_labkeyurl_Gender_ID', '_labkeyurl_ParticipantID', 'Gender_ID'
    )
    for bcc_case in bcc_cases.values():
        for field in removed_fields:
            del bcc_case[field]
        bcc_cases_emitter.write(obscure_dates(bcc_case,
                                              output_dir=output_dir))
    cases_emitter.close()
    bcc_cases_emitter.close()
Beispiel #16
0
def diagnosis_lookup_values(paths):
    """Build {lookup name: {id: display name}} from diagnosis lookup files.

    The lookup name is the path with 'source/bcc/', 'genetrails_' and '.json'
    removed and 'diagnoses' replaced by 'diagnosis'.

    Args:
        paths: paths of lookup files to read.

    Returns:
        dict of dicts; the id comes from 'rowid' (else 'diagnosis_id') and the
        name from 'display_name' (else 'diagnosis').
    """
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('genetrails_', '').replace(
            '.json', '').replace('diagnoses', 'diagnosis')
        look_ups[c] = {}
        for line in reader(p):
            name = line.get('display_name', line.get('diagnosis'))
            val = line.get('rowid', line.get('diagnosis_id'))
            # Identity test is the idiomatic None check.
            if val is None or name is None:
                # Incomplete rows are printed but still stored.
                print(line)
            look_ups[c][val] = name
    return look_ups
Beispiel #17
0
def treatment_lookup_values(paths):
    """Build {lookup name: {id value: display name}} from lookup files.

    The lookup name is the path without 'source/bcc/' and '.json'. Every key
    of a row that ends with '_id' and does not start with '_' contributes its
    value as an id mapped to the row's display name.
    """
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('.json', '')
        table = {}
        look_ups[c] = table
        for row in reader(p):
            name = row.get('display_name', row.get('alt_display_name', None))
            for key in row:
                if key.endswith('_id') and not key.startswith('_'):
                    table[row[key]] = name
    return look_ups
Beispiel #18
0
def sample_lookup_values(paths):
    """Build {lookup name: {sample_type_id: display name}} from lookup files.

    The lookup name is the path with 'source/bcc/', 'genetrails_' and '.json'
    removed and 'sample_type' replaced by 'sample_type_id'.

    Args:
        paths: paths of lookup files to read.

    Returns:
        dict of dicts keyed by lookup name, then by sample_type_id.
    """
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('genetrails_', '').replace(
            '.json', '').replace('sample_type', 'sample_type_id')
        look_ups[c] = {}
        for line in reader(p):
            name = line.get('display_name')
            val = line.get('sample_type_id')
            # Identity test is the idiomatic None check.
            if val is None or name is None:
                # Incomplete rows are printed but still stored.
                print(line)
            look_ups[c][val] = name
    return look_ups
Beispiel #19
0
def transform_gen3(item_paths, output_dir, project_id, compresslevel=0):
    """Write gen3 observation records and return the ids that were written.

    Args:
        item_paths: iterable of (path, observation_type, callback) tuples;
            the callback element is not used by this function.
        output_dir: directory that holds case.json and is passed to the
            observation emitter and to obscure_dates.
        project_id: project id handed to default_observation.
        compresslevel: accepted for signature compatibility; unused.

    Returns:
        set of observation submitter ids that were emitted.

    Raises:
        AssertionError: if a row has neither ParticipantID nor participantid.
        KeyError: if a row has no 'date' field.
    """
    cases = {
        line['submitter_id']
        for line in reader('{}/case.json'.format(output_dir))
    }
    observation_emitter = emitter('observation', output_dir=output_dir)
    observation_ids = set()
    missing_cases = []
    for p, observation_type, _callback in item_paths:
        for line in reader(p):
            participantid = line.get('ParticipantID',
                                     line.get('participantid', None))
            assert participantid, 'ParticipantID not in {} {}'.format(
                p, line.keys())
            case_submitter_id = participantid
            observation = default_observation(case_submitter_id, project_id,
                                              line['date'], observation_type,
                                              line)
            observation_submitter_id = observation['submitter_id']
            if case_submitter_id not in cases:
                # Record the orphan so it can be reported, then drop the row.
                missing_cases.append(
                    missing_parent(parent_id=case_submitter_id,
                                   parent_type='case',
                                   child_id=observation_submitter_id,
                                   child_type='observation'))
                continue
            if observation_submitter_id in observation_ids:
                # Same id already emitted: keep the first occurrence only.
                continue
            observation_ids.add(observation_submitter_id)
            observation = obscure_dates(
                observation,
                output_dir=output_dir,
                participantid=observation['cases']['submitter_id'])
            observation_emitter.write(observation)
    # Release the emitter once every record is written; it was previously
    # left open.
    observation_emitter.close()
    save_missing_parents(missing_cases)
    return observation_ids
Beispiel #20
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write somatic_variant records from the first file in item_paths.

    Each row gets a submitter_id of '<aliquot>-<allele_id>-<ensembl_transcript>',
    its aliquot replaced by a {'submitter_id': ...} link, and the allele_id
    and ensembl_transcript fields removed.

    Args:
        item_paths: sequence of paths; only item_paths[0] is read.
        output_dir: directory passed to the emitter.
        experiment_code: unused.
        compresslevel: unused.

    Raises:
        KeyError: if a row lacks aliquot, allele_id or ensembl_transcript.
    """
    somatic_variants_emitter = emitter('somatic_variants2',
                                       output_dir=output_dir)
    for line in reader(item_paths[0]):
        # Capture the plain aliquot id before it is wrapped in a link dict;
        # formatting the dict would embed its repr in the submitter_id.
        aliquot_id = line['aliquot']
        line['submitter_id'] = '{}-{}-{}'.format(aliquot_id,
                                                 line['allele_id'],
                                                 line['ensembl_transcript'])
        line['aliquot'] = {'submitter_id': aliquot_id}
        line['type'] = 'somatic_variant'
        del line['ensembl_transcript']
        del line['allele_id']
        somatic_variants_emitter.write(line)
    somatic_variants_emitter.close()
Beispiel #21
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write gen3 demographic records and per-source bcc_demographic records.

    Demographics are keyed by '<participantid>-demographic' (last row wins).
    bcc_demographic records are keyed by that id plus '-<source>', where
    source is the file's base name; rows sharing a key are merged.

    Args:
        item_paths: paths of source files to read.
        output_dir: passed to the emitters and to obscure_dates.
        experiment_code: unused.
        compresslevel: unused.
    """
    demographics_emitter = emitter('demographic', output_dir=output_dir)
    bcc_demographics_emitter = emitter('bcc_demographic',
                                       output_dir=output_dir)
    demographics = {}
    bcc_demographics = {}

    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for row in reader(p):
            case_submitter_id = row['participantid']
            submitter_id = '{}-demographic'.format(case_submitter_id)
            bcc_submitter_id = '{}-{}'.format(submitter_id, source)

            demographics[submitter_id] = {
                'type': 'demographic',
                'cases': {
                    'submitter_id': case_submitter_id
                },
                'submitter_id': submitter_id,
                'project_id': DEFAULT_PROJECT_ID
            }
            bcc_demographic = bcc_demographics.get(bcc_submitter_id)
            if bcc_demographic is None:
                bcc_demographic = {
                    'type': 'bcc_demographic',
                    'demographic': {
                        'submitter_id': submitter_id
                    },
                    'source': source,
                    'submitter_id': bcc_submitter_id,
                    'project_id': DEFAULT_PROJECT_ID
                }
                bcc_demographics[bcc_submitter_id] = bcc_demographic
            # Later rows overwrite earlier fields of the merged record.
            bcc_demographic.update(row)

    for demographic in demographics.values():
        demographics_emitter.write(
            obscure_dates(
                demographic,
                output_dir=output_dir,
                participantid=demographic['cases']['submitter_id']))
    demographics_emitter.close()

    for bcc_demographic in bcc_demographics.values():
        bcc_demographics_emitter.write(
            obscure_dates(bcc_demographic, output_dir=output_dir))
    bcc_demographics_emitter.close()
Beispiel #22
0
def observation_lookup_values(paths):
    """Build {lookup name: {id: display name}} from observation lookup files.

    The lookup name c is the path with 'source/bcc/', 'genetrails_' and
    '.json' removed. The id is read from '<c>_id', falling back to c; the
    name from 'display_name', falling back to 'type_name'.

    Args:
        paths: paths of lookup files to read.

    Returns:
        dict of dicts keyed by lookup name, then by id.
    """
    look_ups = {}
    for p in paths:
        c = p.replace('source/bcc/', '').replace('genetrails_',
                                                 '').replace('.json', '')
        look_ups[c] = {}
        for line in reader(p):
            name = line.get('display_name')
            val = line.get(f'{c}_id')
            # Identity tests are the idiomatic None checks.
            if val is None:
                val = line.get(c)
            if name is None:
                name = line.get('type_name', None)
            if val is None or name is None:
                # Incomplete rows are reported by lookup name but still stored.
                print(c)
            look_ups[c][val] = name
    return look_ups
Beispiel #23
0
def transform_old(item_paths, output_dir, experiment_code, compresslevel=0):
    """Merge source rows into one 'gene' record per participantid.

    NOTE(review): the merged records are neither written nor returned, and
    the emitter created here is never used or closed - confirm whether this
    function is still needed.
    """
    genes_emitter = emitter('gene', output_dir=output_dir)
    genes = {}
    for p in item_paths:
        for row in reader(p):
            participant_id = row['participantid']
            record = genes.get(participant_id)
            if record is None:
                record = {
                    'type': 'gene',
                    'experiments': {
                        'submitter_id': experiment_code
                    },
                    'submitter_id': participant_id
                }
                genes[participant_id] = record
            # Rows for the same participant are merged; later fields win.
            record.update(row)
Beispiel #24
0
def transform(item_paths, output_dir, project_id, compresslevel=0):
    """Write gene.tsv (node rows) and gene_lookup.tsv into output_dir.

    gene.tsv rows are \\x01-delimited: node_id, '{}', '{}', compact json of
    the row's data. gene_lookup.tsv rows are '<symbol>\\t<gene_id>', both
    lower-cased. The node id is a uuid5 of the lower-cased gene id.

    Args:
        item_paths: paths of source files to read.
        output_dir: directory that receives both files.
        project_id: stored in data['project_id'].
        compresslevel: unused.
    """
    node_path = os.path.join(output_dir, 'gene.tsv')
    lookup_path = os.path.join(output_dir, 'gene_lookup.tsv')
    with open(lookup_path, 'w') as lookup_file, \
            open(node_path, 'w') as output_file:
        for p in item_paths:
            for row in reader(p):
                data = row['data']
                gene_id = data['gene_id'].lower()
                symbol = data['symbol'].lower()
                node_id = uuid.uuid5(uuid.NAMESPACE_DNS, gene_id)
                data['project_id'] = project_id
                data['submitter_id'] = gene_id
                # Columns: node_id, acl, _sysan, _props
                props = json.dumps(data, separators=(',', ':'))
                output_file.write(
                    '\x01'.join([str(node_id), '{}', '{}', props]) + '\n')
                lookup_file.write('{}\t{}\n'.format(symbol, gene_id))
Beispiel #25
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0):
    """Write one gen3 sample record per line of the given files.

    Each line, minus trailing newlines, is used both as the case
    submitter_id and, prefixed with 'sample-', as the sample submitter_id.
    # NOTE(review): assumes reader yields plain strings here - confirm.

    experiment_code and compresslevel are not used.
    """
    samples_emitter = emitter('sample', output_dir=output_dir)
    for p in item_paths:
        for raw in reader(p):
            sample_id = raw.rstrip('\n')
            samples_emitter.write({
                'type': 'sample',
                'cases': {
                    'submitter_id': sample_id
                },
                'submitter_id': f"sample-{sample_id}",
                'project_id': DEFAULT_PROJECT_ID
            })
    samples_emitter.close()
Beispiel #26
0
def transform(item_paths, output_dir, experiment_code, compresslevel=0, callback=None):
    """Write one bcc_aliquot record per source row.

    Args:
        item_paths: paths of source files to read.
        output_dir: passed to the emitter and to obscure_dates.
        experiment_code: unused.
        compresslevel: unused.
        callback: optional function applied to every row; its return value
            replaces the row.

    Raises:
        KeyError: if a row lacks sample_code or lsid.
    """
    bcc_aliquot_emitter = emitter('bcc_aliquot', output_dir=output_dir)

    for p in item_paths:
        source = os.path.splitext(os.path.basename(p))[0]
        for row in reader(p):
            row['source'] = source
            if callback:
                row = callback(row)
            aliquot_link = {
                'submitter_id': '{}-aliquot'.format(row['sample_code'])
            }
            bcc_aliquot = {
                'type': 'bcc_aliquot',
                'project_id': DEFAULT_PROJECT_ID,
                'aliquot': aliquot_link,
                'submitter_id': row['lsid'],
            }
            # Row fields override the defaults set above.
            bcc_aliquot.update(row)
            bcc_aliquot_emitter.write(
                obscure_dates(bcc_aliquot, output_dir=output_dir))
    bcc_aliquot_emitter.close()
Beispiel #27
0
def upload(path, program, project, submission_client, delete_first,
           output_dir):
    """Transform submission records into node and edge TSV files.

    For every file matching path, the first record decides the node table and
    link tables (via get_tables). One TSV per table is written under
    <output_dir>/<project>/, and the psql commands to load them are printed.

    Args:
        path: glob pattern of the files to read.
        program: program name; combined with project into the project_id.
        project: project code; also the output sub-directory name.
        submission_client: passed to get_tables and write_edge.
        delete_first: when true, a psql delete command for the node table is
            printed.
        output_dir: base directory of the generated TSV files.

    Raises:
        AssertionError: if a record has no submitter_id.
    """
    project_id = '{}-{}'.format(program, project)
    for p in glob(path):
        tables = None
        for line in reader(p):
            if 'project_id' not in line:
                line['project_id'] = project_id
            assert 'project_id' in line, 'must have project_id'
            assert 'submitter_id' in line, 'must have submitter_id'
            if tables is None:
                # First record of the file: resolve tables and open outputs.
                tables = get_tables(submission_client, line)
                tables['handle'] = open(
                    '{}/{}/{}.tsv'.format(output_dir, project,
                                          tables['node_table']), 'w')
                for l in tables['links']:
                    l['handle'] = open(
                        '{}/{}/{}.tsv'.format(output_dir, project,
                                              l['edge_table']), 'w')
                if delete_first:
                    print(
                        "$psql -c \"delete from {} where _props->>'project_id' = '{}-{}'  ;\""
                        .format(tables['node_table'], program, project))

            for l in tables['links']:
                line = write_edge(l, line, submission_client, project_id)
            write_node(tables['handle'], line)

        if tables is None:
            # The file had no records, so nothing was opened; without this
            # guard the close below raised TypeError on None.
            continue

        tables['handle'].close()
        node_path = '{}/{}/{}.tsv'.format(output_dir, project,
                                          tables['node_table'])
        print(
            "cat  $DATA/{} | $psql -c \"copy {}(node_id, acl, _sysan,  _props) from stdin  csv delimiter E'\\x01' quote E'\\x02' ;\""
            .format(node_path, tables['node_table']))
        for l in tables['links']:
            l['handle'].close()
            edge_path = '{}/{}/{}.tsv'.format(output_dir, project,
                                              l['edge_table'])
            print(
                "cat  $DATA/{} | $psql -c \"copy {}(src_id, dst_id, acl, _sysan, _props) from stdin  csv delimiter E'\\x01' quote E'\\x02' ;\""
                .format(edge_path, l['edge_table']))
Beispiel #28
0
def transform(item_paths, output_dir, project_id, type, filter=None):
    """Read bcc labkey json and write a postgres TSV with embedded gen3 json.

    Every record yielded by ``reader`` for each path in ``item_paths`` must
    provide the keys ``from``, ``to`` and ``data``.  The edge endpoints are
    turned into deterministic ids with ``uuid.uuid5`` (DNS namespace) over
    the lower-cased ``from`` / ``to`` values; an edge whose (source,
    destination) id pair was already written is skipped.

    Args:
        item_paths: iterable of paths handed to ``reader``.
        output_dir: directory in which ``<type>.tsv`` is created.
        project_id: accepted for interface compatibility; not used here.
        type: edge table name, used as the output file's base name.
        filter: optional predicate; records for which it returns a falsy
            value are dropped.
    """
    tsv_path = os.path.join(output_dir, '{}.tsv'.format(type))
    written_pairs = set()
    # Row layout matches the intended load statement:
    # copy $type (src_id, dst_id, acl, _sysan, _props) from stdin
    #   csv delimiter E'\x01' quote E'\x02' ;
    row_template = '{}\x01{}\x01{}\x01{}\x01{}\n'
    with open(tsv_path, 'w') as tsv:
        for item_path in item_paths:
            for edge in reader(item_path):
                if filter and not filter(edge):
                    continue
                source_uuid = uuid.uuid5(uuid.NAMESPACE_DNS,
                                         edge['from'].lower())
                target_uuid = uuid.uuid5(uuid.NAMESPACE_DNS,
                                         edge['to'].lower())
                pair = (source_uuid, target_uuid)
                if pair in written_pairs:
                    continue
                # Keep the original endpoint identifiers inside the props.
                props = edge['data']
                props['from'] = edge['from']
                props['to'] = edge['to']
                props_json = json.dumps(props, separators=(',', ':'))
                # acl and _sysan are always written as empty json objects.
                tsv.write(row_template.format(
                    source_uuid, target_uuid, '{}', '{}', props_json))
                written_pairs.add(pair)
Beispiel #29
0
def transform(item_paths, output_dir, experiment_code, project_id, compresslevel=0, callback=None):
    """Read bcc labkey json and write gen3 ``allele`` json.

    Records are grouped by their ``lsid``: the first record for an lsid
    seeds an allele (linked to ``<sample_code>-aliquot`` and to the
    ``reference`` project), and every record for that lsid, including the
    first, is merged on top of it, so later values overwrite earlier ones.
    Once all inputs have been read, each merged allele is passed through
    ``obscure_dates`` and written to the emitter.

    Args:
        item_paths: iterable of paths handed to ``reader``.
        output_dir: forwarded to ``emitter`` and ``obscure_dates``.
        experiment_code: accepted for interface compatibility; not used here.
        project_id: stored on each allele as ``project_id`` before the
            record's own fields are merged in.
        compresslevel: accepted for interface compatibility; not used here.
        callback: optional function applied to each record before merging;
            its return value replaces the record.
    """
    allele_writer = emitter('allele', output_dir=output_dir)
    by_lsid = {}
    for item_path in item_paths:
        for record in reader(item_path):
            if callback:
                record = callback(record)
            # Both keys are required on every record, even for a repeated lsid.
            aliquot_link = {
                'submitter_id': '{}-aliquot'.format(record['sample_code'])}
            lsid = record['lsid']
            merged = by_lsid.get(lsid)
            if merged is None:
                merged = {
                    'type': 'allele',
                    'aliquots': aliquot_link,
                    'projects': {'code': 'reference'},
                    'submitter_id': lsid,
                }
            merged['project_id'] = project_id
            merged.update(record)
            by_lsid[lsid] = merged
    for lsid, merged in by_lsid.items():
        obscured = obscure_dates(merged, output_dir=output_dir)
        by_lsid[lsid] = obscured
        allele_writer.write(obscured)
    allele_writer.close()
Beispiel #30
0
def transform_biomarker(item_paths,
                        output_dir,
                        project_id,
                        observation_ids,
                        compresslevel=0):
    """Read bcc labkey json and write gen3 ``bcc_biomarker`` json.

    For each record, the matching observation is derived with
    ``default_observation``; the biomarker is only emitted when that
    observation's ``submitter_id`` is present in ``observation_ids``,
    otherwise a message is printed and the record is skipped.

    Args:
        item_paths: iterable of ``(path, observation_type, callback)``
            tuples; ``callback`` may be falsy, otherwise it is applied to
            each record (after ``source`` has been set on it) and its
            return value replaces the record.
        output_dir: forwarded to ``emitter`` and ``obscure_dates``.
        project_id: stored on each biomarker and passed to
            ``default_observation``.
        observation_ids: container of known observation submitter ids.
        compresslevel: accepted for interface compatibility; not used here.
    """
    biomarker_writer = emitter('bcc_biomarker', output_dir=output_dir)
    for item_path, observation_type, post_process in item_paths:
        # The file's base name (without extension) tags each record's origin.
        file_name = os.path.basename(item_path)
        source = os.path.splitext(file_name)[0]
        for record in reader(item_path):
            lowercase_id = record.get('participantid', None)
            participantid = record.get('ParticipantID', lowercase_id)
            observation = default_observation(
                participantid, project_id, record['date'], observation_type,
                record)
            observation_submitter_id = observation['submitter_id']
            biomarker_submitter_id = '{}-bcc_biomarker'.format(
                observation_submitter_id)
            if observation_submitter_id not in observation_ids:
                message = (
                    'transform_biomarker {} not in observation_ids, skipping.'
                    .format(biomarker_submitter_id))
                print(message)
                continue
            biomarker = {
                'type': 'bcc_biomarker',
                'project_id': project_id,
                'observation': {'submitter_id': observation_submitter_id},
                'submitter_id': biomarker_submitter_id,
            }
            record['source'] = source
            if post_process:
                record = post_process(record)
            # Record fields are merged last, so they win on key collisions.
            biomarker.update(record)
            biomarker = obscure_dates(biomarker, output_dir=output_dir)
            biomarker_writer.write(biomarker)
    biomarker_writer.close()