Esempio n. 1
0
def h(paths, sample_names, tmp_path, json, header, out_path, i, first):
    """Inner part of stage one: import gVCFs, transform them, and combine in chunks.

    Each imported gVCF is converted into the combiner's format with
    ``comb.transform_one`` and the results are merged in groups of at most
    ``MAX_COMBINE_NUMBER`` via ``comb.combine_gvcfs``.

    Args:
        paths: gVCF paths handed to ``hl.import_gvcfs``.
        sample_names: forwarded as ``_external_sample_ids``.
        tmp_path: prefix under which intermediate matrix tables are written.
        json: forwarded unchanged as the second positional argument of
            ``hl.import_gvcfs``.
        header: forwarded as ``_external_header``.
        out_path: destination used when the result is written directly.
        i: index of this batch; used as the sub-directory name under
            ``tmp_path``.
        first: truthy when nothing has been written before this call.

    Returns:
        An empty list when the single combined table was written straight to
        ``out_path``; otherwise the paths of the intermediate ``.mt`` files.
    """
    imported = hl.import_gvcfs(paths,
                               json,
                               array_elements_required=False,
                               _external_header=header,
                               _external_sample_ids=sample_names)
    transformed = [comb.transform_one(gvcf) for gvcf in imported]

    merged = []
    for group in chunks(transformed, MAX_COMBINE_NUMBER):
        merged.append(comb.combine_gvcfs(group))

    # A single item can be written straight to the output, but only if no
    # other items have already been written.
    if first and len(paths) <= MAX_COMBINE_NUMBER:
        merged[0].write(out_path, overwrite=True)
        return []

    batch_dir = tmp_path + f'{i}/'
    # File names are the table index zero-padded to the width of the count;
    # this is expected to match the names write_matrix_tables produces.
    width = len(str(len(merged)))
    hl.experimental.write_matrix_tables(merged, batch_dir, overwrite=True)
    return [
        batch_dir + str(index).zfill(width) + '.mt'
        for index in range(len(merged))
    ]
Esempio n. 2
0
def h(paths, sample_names, tmp_path, json, header, out_path, i):
    """Import VCFs, transform each one, and combine them in chunks.

    Every table returned by ``hl.import_vcfs`` is passed through
    ``comb.transform_one``; the transformed tables are then merged in groups
    of at most ``MAX_COMBINE_NUMBER`` with ``comb.combine_gvcfs``.

    Args:
        paths: input paths handed to ``hl.import_vcfs``.
        sample_names: forwarded as ``_external_sample_ids``.
        tmp_path: prefix under which intermediate matrix tables are written.
        json: forwarded unchanged as the second positional argument of
            ``hl.import_vcfs``.
        header: forwarded as ``_external_header``.
        out_path: destination used when the result is written directly.
        i: index of this batch; used as the sub-directory name under
            ``tmp_path``.

    Returns:
        An empty list when the single combined table was written to
        ``out_path``; otherwise the paths of the intermediate ``.mt`` files.
    """
    transformed = []
    for vcf in hl.import_vcfs(paths, json, array_elements_required=False,
                              _external_header=header,
                              _external_sample_ids=sample_names):
        transformed.append(comb.transform_one(vcf))
    merged = [comb.combine_gvcfs(group)
              for group in chunks(transformed, MAX_COMBINE_NUMBER)]

    if len(paths) > MAX_COMBINE_NUMBER:
        # More than one combined table: stash them all under a per-batch
        # directory and report where they are expected to land.
        batch_dir = tmp_path + f'{i}/'
        width = len(str(len(merged)))
        hl.experimental.write_matrix_tables(merged, batch_dir, overwrite=True)
        return [batch_dir + str(index).zfill(width) + '.mt'
                for index in range(len(merged))]

    # Only one item, so just write it.
    merged[0].write(out_path, overwrite=True)
    return []
Esempio n. 3
0
def run_combiner(sample_list,
                 intervals,
                 out_path,
                 tmp_path,
                 summary_path=None,
                 overwrite=False):
    """Combine the inputs in ``sample_list`` into one matrix table at ``out_path``.

    The inputs are imported with ``hl.import_vcfs``, transformed with
    ``comb.transform_one`` and merged in groups of at most
    ``MAX_COMBINER_LENGTH``. While more than one merged table remains, the
    tables are written to a fresh sub-directory of the temporary path, read
    back, and merged again.

    Args:
        sample_list: inputs handed to ``hl.import_vcfs``.
        intervals: forwarded unchanged as the second positional argument of
            ``hl.import_vcfs``.
        out_path: where the final matrix table is written.
        tmp_path: base location; a unique ``combiner-temporary`` directory is
            appended to it for intermediate files.
        summary_path: when not ``None``, the rows of ``comb.summarize`` of the
            written output are written here.
        overwrite: forwarded to the final and summary writes.
    """
    import gc
    # make the temp path a directory, no matter what
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    imported = hl.import_vcfs(sample_list,
                              intervals,
                              array_elements_required=False)
    transformed = [comb.transform_one(vcf) for vcf in imported]
    merged = [
        comb.combine_gvcfs(group)
        for group in chunks(transformed, MAX_COMBINER_LENGTH)
    ]

    if len(merged) != 1:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        round_number = 0
        while len(merged) > 1:
            round_dir = tmp_path + f'{round_number}/'
            width = len(str(len(merged)))
            hl.experimental.write_matrix_tables(merged,
                                                round_dir,
                                                overwrite=True)
            round_number += 1
            # Read back what was just written, using zero-padded index names.
            reloaded = [
                hl.read_matrix_table(round_dir + str(n).zfill(width) + '.mt')
                for n in range(len(merged))
            ]
            merged = [
                comb.combine_gvcfs(group)
                for group in chunks(reloaded, MAX_COMBINER_LENGTH)
            ]
            gc.collect()  # need to try to free memory on the master

    # Both the single-pass and the multi-round paths end with this write.
    merged[0].write(out_path, overwrite=overwrite)

    if summary_path is None:
        return
    final_mt = hl.read_matrix_table(out_path)
    comb.summarize(final_mt).rows().write(summary_path, overwrite=overwrite)
Esempio n. 4
0
def run_combiner(sample_list, json, out_path, tmp_path, summary_path=None, overwrite=False):
    """Merge all inputs in ``sample_list`` and write the result to ``out_path``.

    Inputs are imported via ``hl.import_vcfs``, converted by
    ``comb.transform_one`` and combined in chunks of ``MAX_COMBINER_LENGTH``.
    If that leaves more than one table, rounds of write / read-back / combine
    are repeated under a unique temporary directory until one table remains.

    Args:
        sample_list: inputs handed to ``hl.import_vcfs``.
        json: forwarded unchanged as the second positional argument of
            ``hl.import_vcfs``.
        out_path: where the final matrix table is written.
        tmp_path: base location for the unique temporary directory.
        summary_path: optional destination for the rows of
            ``comb.summarize`` applied to the written output.
        overwrite: forwarded to the final and summary writes.
    """
    import gc
    # make the temp path a directory, no matter what
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'

    transformed = []
    for vcf in hl.import_vcfs(sample_list, json, array_elements_required=False):
        transformed.append(comb.transform_one(vcf))

    current = []
    for group in chunks(transformed, MAX_COMBINER_LENGTH):
        current.append(comb.combine_gvcfs(group))

    if len(current) == 1:
        current[0].write(out_path, overwrite=overwrite)
    else:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        level = 0
        while len(current) > 1:
            level_dir = tmp_path + f'{level}/'
            count = len(current)
            width = len(str(count))
            hl.experimental.write_matrix_tables(current, level_dir, overwrite=True)
            level += 1
            # Intermediate tables are addressed by zero-padded index.
            table_paths = [level_dir + str(n).zfill(width) + '.mt' for n in range(count)]
            reloaded = []
            for table_path in table_paths:
                reloaded.append(hl.read_matrix_table(table_path))
            current = [comb.combine_gvcfs(group)
                       for group in chunks(reloaded, MAX_COMBINER_LENGTH)]
            gc.collect()  # need to try to free memory on the master
        current[0].write(out_path, overwrite=overwrite)

    if summary_path is not None:
        written = hl.read_matrix_table(out_path)
        summary_rows = comb.summarize(written).rows()
        summary_rows.write(summary_path, overwrite=overwrite)