Ejemplo n.º 1
0
def run_combiner(samples,
                 intervals,
                 out_file,
                 tmp_path,
                 header,
                 overwrite=True):
    """Combine a set of samples into a single matrix table at ``out_file``.

    ``samples`` is a sequence of ``(sample_name, path)`` pairs. The pairs are
    split into parallel lists, handed to ``stage_one`` and, if that returns any
    intermediate paths, those tables are repeatedly merged in groups of at most
    ``MAX_COMBINE_NUMBER`` until only one table remains.

    Args:
        samples: sequence of ``(sample_name, path)`` pairs.
        intervals: forwarded unchanged to ``stage_one``.
        out_file: destination of the final combined matrix table.
        tmp_path: base location for intermediate files; a unique
            ``combiner-temporary/<uuid>/`` suffix is appended to it.
        header: forwarded unchanged to ``stage_one``.
        overwrite: passed to the final ``write`` call.

    Returns:
        None.
    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    sample_names, paths = [list(x) for x in zip(*samples)]
    # stage_one receives one list of sample ids per input path.
    sample_names = [[n] for n in sample_names]
    assert len(paths) == len(samples)
    out_paths = stage_one(paths, sample_names, tmp_path, intervals, header,
                          out_file)
    if not out_paths:
        # NOTE(review): presumably stage_one has already written out_file when
        # it returns nothing -- confirm against stage_one.
        return
    # Keep the merge rounds in their own sub-directory, apart from stage one.
    tmp_path += f'{uuid.uuid4()}/'
    mts = [hl.read_matrix_table(path) for path in out_paths]
    combined_mts = [
        comb.combine_gvcfs(chunk) for chunk in chunks(mts, MAX_COMBINE_NUMBER)
    ]
    i = 0
    while len(combined_mts) > 1:
        tmp = tmp_path + f'{i}/'
        pad = len(str(len(combined_mts)))
        hl.experimental.write_matrix_tables(combined_mts, tmp, overwrite=True)
        # Rebuild the zero-padded file names so the tables can be read back.
        paths = [
            tmp + str(n).zfill(pad) + '.mt' for n in range(len(combined_mts))
        ]
        mts = [hl.read_matrix_table(path) for path in paths]
        # Combine each chunk on its own; passing the whole list here would
        # merge every table once per chunk and duplicate the data.
        combined_mts = [
            comb.combine_gvcfs(chunk)
            for chunk in chunks(mts, MAX_COMBINE_NUMBER)
        ]
        i += 1
    combined_mts[0].write(out_file, overwrite=overwrite)
Ejemplo n.º 2
0
def h(paths, sample_names, tmp_path, json, header, out_path, i, first):
    """Import, transform and combine one batch of gVCFs, then persist the result.

    Every table returned by ``hl.import_gvcfs`` is passed through
    ``comb.transform_one``; the transformed tables are then merged in groups of
    at most ``MAX_COMBINE_NUMBER``.

    If ``first`` is truthy and the batch holds no more than
    ``MAX_COMBINE_NUMBER`` paths, the single merged table is written straight
    to ``out_path`` and an empty list is returned. Otherwise the merged tables
    are written under ``tmp_path + f'{i}/'`` and the list of their paths is
    returned.
    """
    imported = hl.import_gvcfs(paths,
                               json,
                               array_elements_required=False,
                               _external_header=header,
                               _external_sample_ids=sample_names)
    transformed = [comb.transform_one(gvcf) for gvcf in imported]
    merged = [
        comb.combine_gvcfs(group)
        for group in chunks(transformed, MAX_COMBINE_NUMBER)
    ]

    # A lone table from the very first batch goes directly to its destination;
    # later batches must still be staged because other items already exist.
    if first and len(paths) <= MAX_COMBINE_NUMBER:
        merged[0].write(out_path, overwrite=True)
        return []

    stage_dir = tmp_path + f'{i}/'
    width = len(str(len(merged)))
    hl.experimental.write_matrix_tables(merged, stage_dir, overwrite=True)
    # File names are the zero-padded index of each table plus '.mt'.
    return [
        stage_dir + str(index).zfill(width) + '.mt'
        for index in range(len(merged))
    ]
Ejemplo n.º 3
0
def run_combiner(sample_list,
                 intervals,
                 out_path,
                 tmp_path,
                 summary_path=None,
                 overwrite=False):
    """Combine the VCFs in ``sample_list`` into one matrix table at ``out_path``.

    The inputs are imported, transformed with ``comb.transform_one`` and merged
    in groups of at most ``MAX_COMBINER_LENGTH``. While more than one merged
    table remains, the tables are written to a numbered sub-directory of the
    temporary path, read back and merged again. When ``summary_path`` is given,
    the written result is re-read and the rows of ``comb.summarize`` are
    written there.
    """
    import gc

    # The suffix ends with a slash so the temp path is always a directory.
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    imported = hl.import_vcfs(sample_list,
                              intervals,
                              array_elements_required=False)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [
        comb.combine_gvcfs(group)
        for group in chunks(transformed, MAX_COMBINER_LENGTH)
    ]

    if len(merged) != 1:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        round_no = 0
        while len(merged) > 1:
            round_dir = tmp_path + f'{round_no}/'
            width = len(str(len(merged)))
            hl.experimental.write_matrix_tables(merged,
                                                round_dir,
                                                overwrite=True)
            written = [
                round_dir + str(index).zfill(width) + '.mt'
                for index in range(len(merged))
            ]
            round_no += 1
            reloaded = [hl.read_matrix_table(p) for p in written]
            merged = [
                comb.combine_gvcfs(group)
                for group in chunks(reloaded, MAX_COMBINER_LENGTH)
            ]
            # Try to release memory held on the master between rounds.
            gc.collect()
    merged[0].write(out_path, overwrite=overwrite)

    if summary_path is None:
        return
    final_mt = hl.read_matrix_table(out_path)
    comb.summarize(final_mt).rows().write(summary_path, overwrite=overwrite)
Ejemplo n.º 4
0
def h(paths, sample_names, tmp_path, json, header, out_path, i):
    """Import, transform and combine one batch of VCFs, then persist the result.

    Returns an empty list after writing directly to ``out_path`` when the batch
    has at most ``MAX_COMBINE_NUMBER`` paths; otherwise writes the merged
    tables under ``tmp_path + f'{i}/'`` and returns the list of their paths.
    """
    imported = hl.import_vcfs(paths, json, array_elements_required=False,
                              _external_header=header,
                              _external_sample_ids=sample_names)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [comb.combine_gvcfs(group) for group in chunks(transformed, MAX_COMBINE_NUMBER)]

    # Small batch: only one merged table exists, so write it out directly.
    if len(paths) <= MAX_COMBINE_NUMBER:
        merged[0].write(out_path, overwrite=True)
        return []

    stage_dir = tmp_path + f'{i}/'
    width = len(str(len(merged)))
    hl.experimental.write_matrix_tables(merged, stage_dir, overwrite=True)
    # File names are the zero-padded index of each table plus '.mt'.
    return [stage_dir + str(index).zfill(width) + '.mt' for index in range(len(merged))]
Ejemplo n.º 5
0
def run_combiner(sample_list, json, out_path, tmp_path, summary_path=None, overwrite=False):
    """Combine the VCFs in ``sample_list`` into one matrix table at ``out_path``.

    Inputs are imported, transformed with ``comb.transform_one`` and merged in
    groups of at most ``MAX_COMBINER_LENGTH``; intermediate rounds are staged in
    numbered sub-directories of the temporary path until one table is left.
    When ``summary_path`` is given, the rows of ``comb.summarize`` over the
    written result are stored there.
    """
    import gc

    # The suffix ends with a slash so the temp path is always a directory.
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    imported = hl.import_vcfs(sample_list, json, array_elements_required=False)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [comb.combine_gvcfs(group) for group in chunks(transformed, MAX_COMBINER_LENGTH)]

    if len(merged) != 1:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        round_no = 0
        while len(merged) > 1:
            round_dir = tmp_path + f'{round_no}/'
            width = len(str(len(merged)))
            hl.experimental.write_matrix_tables(merged, round_dir, overwrite=True)
            written = [round_dir + str(index).zfill(width) + '.mt' for index in range(len(merged))]
            round_no += 1
            reloaded = [hl.read_matrix_table(p) for p in written]
            merged = [comb.combine_gvcfs(group) for group in chunks(reloaded, MAX_COMBINER_LENGTH)]
            # Try to release memory held on the master between rounds.
            gc.collect()
    merged[0].write(out_path, overwrite=overwrite)

    if summary_path is None:
        return
    final_mt = hl.read_matrix_table(out_path)
    comb.summarize(final_mt).rows().write(summary_path, overwrite=overwrite)
Ejemplo n.º 6
0
def python_only_10k_combine(path):
    """Combine 10,000 references to one transformed gVCF, chunk by chunk.

    The input from ``setup(path)`` is transformed once, repeated 10,000 times,
    and each chunk of ``COMBINE_GVCF_MAX`` entries is passed to
    ``comb.combine_gvcfs``. The combined results are discarded.
    """
    gvcf = setup(path)
    transformed = comb.transform_gvcf(gvcf)
    replicated = [transformed] * 10_000
    for batch in chunks(replicated, COMBINE_GVCF_MAX):
        comb.combine_gvcfs(batch)
Ejemplo n.º 7
0
def compile_2k_merge(path):
    """Write 20 references to one combined table into a temporary directory.

    The input from ``setup(path)`` is transformed once and repeated
    ``COMBINE_GVCF_MAX`` times to form a single combine; that combined table is
    repeated 20 times and written with ``hl.experimental.write_matrix_tables``.
    The temporary directory is removed when the function returns.
    """
    gvcf = setup(path)
    transformed = comb.transform_gvcf(gvcf)
    merged = comb.combine_gvcfs([transformed] * COMBINE_GVCF_MAX)
    outputs = [merged] * 20
    with TemporaryDirectory() as scratch:
        prefix = os.path.join(scratch, 'combiner-multi-write')
        hl.experimental.write_matrix_tables(outputs, prefix, overwrite=True)