import os

import pandas as pd

# helpers, csvutils, classify, dtypes, run_correction_hmmcopy,
# run_hmmcopy_script and get_hierarchical_clustering_order are provided by
# other modules of the same pipeline; their exact import paths depend on the
# package layout and are not repeated here.


def run_hmmcopy(
        bam_file,
        corrected_reads_filename,
        segments_filename,
        parameters_filename,
        metrics_filename,
        hmmcopy_tar,
        cell_id,
        hmmparams,
        tempdir,
        docker_image
):

    # generate wig file for hmmcopy
    helpers.makedirs(tempdir)
    readcount_wig = os.path.join(tempdir, 'readcounter.wig')
    corrected_reads = os.path.join(tempdir, 'corrected_reads.csv')

    run_correction_hmmcopy(
        bam_file,
        corrected_reads,
        readcount_wig,
        hmmparams,
        docker_image
    )

    hmmcopy_tempdir = os.path.join(tempdir, '{}_hmmcopy'.format(cell_id))
    helpers.makedirs(hmmcopy_tempdir)

    run_hmmcopy_script(
        corrected_reads,
        hmmcopy_tempdir,
        cell_id,
        hmmparams,
        docker_image
    )

    # outputs are written to a per-multiplier subdirectory; use the '0' directory
    hmmcopy_outdir = os.path.join(hmmcopy_tempdir, str(0))

    csvutils.rewrite_csv_file(
        os.path.join(hmmcopy_outdir, "reads.csv"), corrected_reads_filename,
        dtypes=dtypes()['reads']
    )
    
    csvutils.rewrite_csv_file(
        os.path.join(hmmcopy_outdir, "params.csv"), parameters_filename,
        dtypes=dtypes()['params']
    )
 
    csvutils.rewrite_csv_file(
        os.path.join(hmmcopy_outdir, "segs.csv"), segments_filename,
        dtypes=dtypes()['segs']
    )
    
    csvutils.rewrite_csv_file(
        os.path.join(hmmcopy_outdir, "metrics.csv"), metrics_filename,
        dtypes=dtypes()['metrics']
    )

    helpers.make_tarfile(hmmcopy_tar, hmmcopy_tempdir)
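
# A minimal usage sketch for run_hmmcopy; the file names, cell id, hmmparams
# and docker image below are hypothetical placeholders, not values taken from
# the pipeline configuration:
#
#     run_hmmcopy(
#         'cell_0001.bam',
#         'corrected_reads.csv.gz',
#         'segments.csv.gz',
#         'params.csv.gz',
#         'metrics.csv.gz',
#         'hmmcopy_data.tar.gz',
#         cell_id='cell_0001',
#         hmmparams=hmmparams,  # hmmcopy section of the pipeline config
#         tempdir='/tmp/cell_0001_hmmcopy',
#         docker_image='quay.io/example/hmmcopy:latest',
#     )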


def add_quality(hmmcopy_metrics, alignment_metrics, multipliers, output, training_data, tempdir):
    """
    Train the cell quality classifier on training_data and write the hmmcopy
    metrics annotated with per-cell quality predictions to output.
    """
    helpers.makedirs(tempdir)

    hmmcopy_tables = ['/hmmcopy/metrics/{}'.format(mult) for mult in multipliers]

    model = classify.train_classifier(training_data)

    feature_names = model.feature_names_

    data = classify.load_data(hmmcopy_metrics, alignment_metrics,
                              hmmcopy_tables, '/alignment/metrics',
                              feature_names)

    for i, (hmmcopy_table, tabledata) in enumerate(data):
        intermediate_output = os.path.join(
            tempdir, '{}_metrics_with_quality.csv.gz'.format(i)
        )

        predictions = classify.classify(model, tabledata)

        classify.write_to_output(
            hmmcopy_metrics,
            hmmcopy_table,
            intermediate_output,
            predictions)

        csvutils.prep_csv_files(intermediate_output, output, dtypes=dtypes()['metrics'])


def concatenate_csv(inputs, output, data_type, low_memory=False):
    ref_dtypes = None
    if data_type:
        ref_dtypes = dtypes()[data_type]

    if low_memory:
        csvutils.concatenate_csv_files_quick_lowmem(inputs,
                                                    output,
                                                    dtypes=ref_dtypes)
    else:
        csvutils.concatenate_csv(inputs, output, dtypes=ref_dtypes)
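
# Example (hypothetical file names): merging per-cell metrics tables into one
# output, cast with the 'metrics' dtypes:
#
#     concatenate_csv(
#         ['cell_0001_metrics.csv.gz', 'cell_0002_metrics.csv.gz'],
#         'all_metrics.csv.gz',
#         data_type='metrics',
#     )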


def get_mappability_col(reads, annotated_reads):
    """Mark bins with mappability <= 0.9 as low-mappability and write the annotated reads table."""
    reads = csvutils.read_csv_and_yaml(reads, chunksize=100)

    alldata = []
    for read_data in reads:
        read_data['is_low_mappability'] = (read_data['map'] <= 0.9)
        alldata.append(read_data)

    alldata = pd.concat(alldata)

    csvutils.write_dataframe_to_csv_and_yaml(
        alldata, annotated_reads, dtypes()['reads'], write_header=True
    )


def add_clustering_order(
        reads, metrics, output, chromosomes=None, sample_info=None):
    """
    Compute a hierarchical clustering order of cells from the reads table and
    write the metrics annotated with that order (and any extra sample_info)
    to output.
    """

    order = get_hierarchical_clustering_order(
        reads, chromosomes=chromosomes
    )

    if not sample_info:
        sample_info = {}

    for cell_id, cell_order in order.items():
        if cell_id not in sample_info:
            sample_info[cell_id] = {}
        sample_info[cell_id]['order'] = cell_order

    csvutils.annotate_csv(metrics, sample_info, output, dtypes()['metrics'])
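

# For reference, dtypes() (defined elsewhere in the pipeline) is expected to
# return a mapping from table name to per-column pandas dtypes. The column
# names below are illustrative only, not an exhaustive or authoritative list:
#
#     {
#         'reads': {'chr': 'str', 'start': 'int', 'end': 'int', 'map': 'float', ...},
#         'segs': {'chr': 'str', 'start': 'int', 'end': 'int', 'state': 'int', ...},
#         'params': {'parameter': 'str', 'value': 'float', ...},
#         'metrics': {'cell_id': 'str', 'order': 'int', 'quality': 'float', ...},
#     }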