Example #1
def load_data(hmmcopy_filename, alignment_filename, colnames):
    hmmcopy_data = csvutils.read_csv_and_yaml(hmmcopy_filename)
    alignment_data = csvutils.read_csv_and_yaml(alignment_filename)

    hmmcopy_data = hmmcopy_data.set_index('cell_id')
    alignment_data = alignment_data.set_index('cell_id')

    data = []
    for colname in colnames:
        if colname in hmmcopy_data:
            coldata = hmmcopy_data[colname]
        else:
            coldata = alignment_data[colname]

        if colname == 'scaled_halfiness':
            # haploid poison adds inf, replace with big number since 0 is considered good
            # and we want the score to decrease
            coldata = coldata.replace(np.inf, 1e10)
        data.append(coldata)

    data = pd.concat(data, axis=1)

    data = data.replace(-np.inf, np.nan)
    data = data.fillna(0)

    return data
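
A minimal usage sketch for load_data; the file paths and column list below are hypothetical, and the files are assumed to be csvutils-style CSV/YAML pairs readable by read_csv_and_yaml:

# hypothetical inputs; columns present in the hmmcopy metrics take
# precedence over the alignment metrics, per the lookup order above
colnames = ['quality', 'scaled_halfiness', 'total_mapped_reads']

features = load_data(
    'hmmcopy_metrics.csv.gz',    # hypothetical path
    'alignment_metrics.csv.gz',  # hypothetical path
    colnames,
)
# features is a cell_id-indexed frame, one column per requested metric,
# with -inf and NaN already replaced by 0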
def compare_annotation(annotation, refannotation):
    annotation = csvutils.read_csv_and_yaml(annotation)
    refannotation = csvutils.read_csv_and_yaml(refannotation)

    common_cols = _check_for_missing_cols(annotation, refannotation)
    for col in common_cols:
        ann = annotation[col].dropna()
        ref = refannotation[col].dropna()
        assert set(ann) == set(ref)
def test_variant_calling(args):
    output_path = args[1]
    ref_path = args[2]

    ref_strelka, ref_museq, ref_snpeff = get_inputs(ref_path)
    strelka, museq, snpeff = get_inputs(output_path)

    compare.compare_variant_calls(ref_snpeff, snpeff)

    ref_strelka = csvutils.read_csv_and_yaml(ref_strelka)
    strelka = csvutils.read_csv_and_yaml(strelka)

    assert ref_strelka.empty and strelka.empty
Example #4
def read_data(filename, tablename, gzipped=True):
    fileformat = single_cell.utils.helpers.get_file_format(filename)

    if fileformat == 'h5':
        data = read_from_h5(filename, tablename)
    elif fileformat == 'csv':
        data = csvutils.read_csv_and_yaml(filename)
    elif fileformat == 'gzip':
        data = csvutils.read_csv_and_yaml(filename)
    else:
        raise Exception("unknown file format")

    return data
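
read_data dispatches on single_cell.utils.helpers.get_file_format; a plausible extension-based stand-in is sketched below, purely as an assumption — the real helper may detect formats differently:

import os

# hypothetical stand-in for single_cell.utils.helpers.get_file_format;
# the real helper may use different detection rules
def get_file_format(filename):
    ext = os.path.splitext(filename)[-1].lower()
    if ext in ('.h5', '.hdf5'):
        return 'h5'
    if ext == '.gz':
        return 'gzip'
    if ext == '.csv':
        return 'csv'
    raise Exception("unknown file format")

print(get_file_format('metrics.csv.gz'))  # 'gzip'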
def cell_cycle_classifier(hmmcopy_reads,
                          hmmcopy_metrics,
                          alignment_metrics,
                          output,
                          tempdir,
                          docker_image=None):
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, temp_output
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    cell_cycle_df = pd.read_csv(temp_output)

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    hmm_metrics_df = hmm_metrics_df.merge(cell_cycle_df,
                                          on=['cell_id'],
                                          how='outer')

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output)
def add_corrupt_tree_order(corrupt_tree, metrics, output):
    """
    adds corrupt tree order to metrics
    """

    with open(corrupt_tree) as newickfile:
        newickdata = newickfile.readline()
        assert newickfile.readline() == ''

    tree = Tree(newickdata, format=1)

    leaves = [node.name for node in tree.traverse("levelorder")]
    leaves = [val[len('cell_'):] for val in leaves if val.startswith("cell_")]

    ordering = {val: i for i, val in enumerate(leaves)}

    metrics = csvutils.read_csv_and_yaml(metrics)

    cells = metrics.cell_id

    for cellid in cells:
        order = ordering.get(cellid, float('nan'))
        metrics.loc[metrics["cell_id"] == cellid, "order_corrupt_tree"] = order

    csvutils.write_dataframe_to_csv_and_yaml(metrics,
                                             output,
                                             write_header=True)
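
A self-contained toy run of the ordering logic above, with made-up cell names. Note that tree.traverse("levelorder") yields internal nodes as well, so the startswith("cell_") filter is what restricts the list to leaf cells:

from ete3 import Tree

# toy tree with three leaves named like the pipeline's cell nodes
newick = "((cell_SA1_A1_R1_C1,cell_SA1_A1_R1_C2),cell_SA1_A1_R2_C1);"
tree = Tree(newick, format=1)

names = [node.name for node in tree.traverse("levelorder")]
cells = [val[len('cell_'):] for val in names if val.startswith("cell_")]

# breadth-first position of each cell in the tree
ordering = {val: i for i, val in enumerate(cells)}
print(ordering)
# e.g. {'SA1_A1_R2_C1': 0, 'SA1_A1_R1_C1': 1, 'SA1_A1_R1_C2': 2}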
Example #7
def add_contamination_status(infile,
                             outfile,
                             config,
                             reference='grch37',
                             threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)
    organisms = [genome['name'] for genome in config['genomes']]

    if reference not in organisms:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in organisms if not col == reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']

    data['is_contaminated'] = data['is_contaminated'].astype(col_type)
    csvutils.write_dataframe_to_csv_and_yaml(data, outfile,
                                             dtypes()['metrics'])
Example #8
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, genome_labels):
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, temp_output
    ]

    pypeliner.commandline.execute(*cmd)

    cell_cycle_df = pd.read_csv(temp_output)

    cols_cell_cycle = cell_cycle_df.columns.values

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    hmm_metrics_df = hmm_metrics_df.merge(cell_cycle_df,
                                          on=['cell_id'],
                                          how='outer')

    out_dtypes = dtypes(genome_labels)
    for colname in cols_cell_cycle:
        hmm_metrics_df[colname] = hmm_metrics_df[colname].astype(
            out_dtypes[colname])

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output,
                                             out_dtypes)
Example #9
def filter_plot_tar(metrics, src_tar, pass_tar, fail_tar, tempdir, filters):
    allplots = os.path.join(tempdir, 'allplots')
    helpers.makedirs(allplots)
    helpers.extract_tar(src_tar, allplots)

    metrics_data = csvutils.read_csv_and_yaml(metrics)
    all_cells = metrics_data.cell_id.tolist()

    metrics_data = helpers.filter_metrics(metrics_data, filters)
    good_cells = metrics_data.cell_id.tolist()
    bad_cells = [cell for cell in all_cells if cell not in good_cells]

    plotdir = os.path.join(tempdir, 'segs_pass')
    helpers.makedirs(plotdir)
    for cell in good_cells:
        src_path = os.path.join(allplots, 'segments',
                                '{}_{}.png'.format(cell, 'segments'))
        dest_path = os.path.join(plotdir, '{}_{}.png'.format(cell, 'segments'))
        shutil.copyfile(src_path, dest_path)
    helpers.make_tarfile(pass_tar, plotdir)

    plotdir = os.path.join(tempdir, 'segs_fail')
    helpers.makedirs(plotdir)
    for cell in bad_cells:
        src_path = os.path.join(allplots, 'segments',
                                '{}_{}.png'.format(cell, 'segments'))
        dest_path = os.path.join(plotdir, '{}_{}.png'.format(cell, 'segments'))
        shutil.copyfile(src_path, dest_path)
    helpers.make_tarfile(fail_tar, plotdir)
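
helpers.filter_metrics comes from the pipeline's helpers module; a minimal hypothetical reimplementation is sketched below, assuming filters arrive as (column, operator, value) triples — the real helper's filter format may differ:

import operator
import pandas as pd

# hypothetical stand-in for helpers.filter_metrics
_OPS = {'>': operator.gt, '>=': operator.ge,
        '<': operator.lt, '<=': operator.le, '==': operator.eq}

def filter_metrics(metrics, filters):
    """keep only rows that pass every (column, op, value) filter"""
    for column, op, value in filters:
        metrics = metrics[_OPS[op](metrics[column], value)]
    return metrics

metrics = pd.DataFrame({
    'cell_id': ['c1', 'c2', 'c3'],
    'quality': [0.9, 0.5, 0.8],
    'total_mapped_reads': [120000, 90000, 40000],
})
good = filter_metrics(metrics, [('quality', '>=', 0.75),
                                ('total_mapped_reads', '>', 50000)])
print(good.cell_id.tolist())  # ['c1']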
Example #10
def test_breakpoint_calling(args):
    output_path = args[1]
    ref_path = args[2]

    ref_must_exist, ref_lumpy, ref_destruct = get_inputs(ref_path)
    must_exist, lumpy, destruct = get_inputs(output_path)

    assert all(map(os.path.exists, ref_must_exist))
    assert all(map(os.path.exists, must_exist))

    compare.compare_breakpoint_calls(ref_lumpy, lumpy)

    ref_destruct = csvutils.read_csv_and_yaml(ref_destruct)
    destruct = csvutils.read_csv_and_yaml(destruct)

    assert ref_destruct.empty and destruct.empty
def load_data(infile, gc=False):
    df = csvutils.read_csv_and_yaml(infile)

    if gc:
        df.index = df.cell_id
        del df['cell_id']
        df.columns = df.columns.astype(int)

    return df
def _load(file, by, reindex=False):
    loaded = csvutils.read_csv_and_yaml(file)

    loaded = loaded.sort_values(by, ascending=[True] * len(by))

    if reindex:
        loaded = loaded.set_index(by)

    return loaded
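
The sort_values call in _load takes a list of key columns and a matching per-key ascending flag; the core of the function on an in-memory frame looks like this:

import pandas as pd

df = pd.DataFrame({'chr': ['2', '1', '1'], 'start': [100, 500, 100]})

by = ['chr', 'start']
# sort on both keys, then use them as the index (the reindex=True path)
loaded = df.sort_values(by, ascending=[True] * len(by)).set_index(by)
print(loaded.index.tolist())  # [('1', 100), ('1', 500), ('2', 100)]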
Example #13
def get_good_cells(metrics, cell_filters):
    metrics_data = csvutils.read_csv_and_yaml(metrics)

    if not cell_filters:
        return metrics_data.cell_id.tolist()

    metrics_data = helpers.filter_metrics(metrics_data, cell_filters)

    return metrics_data.cell_id.tolist()
Example #14
def load_hmmcopy_reads_data(readsfile):
    keepcols = ['ideal', 'valid', 'gc', 'map', 'state', 'cor_gc', 'copy']

    reads = csvutils.read_csv_and_yaml(readsfile)

    reads = reads.set_index(['cell_id', 'chr', 'start', 'end'])

    reads = reads[keepcols]

    return reads
def get_mappability_col(reads, annotated_reads):
    reads = csvutils.read_csv_and_yaml(reads, chunksize=100)

    alldata = []
    for read_data in reads:
        read_data['is_low_mappability'] = (read_data['map'] <= 0.9)
        alldata.append(read_data)

    alldata = pd.concat(alldata)

    csvutils.write_dataframe_to_csv_and_yaml(
        alldata, annotated_reads, dtypes()['reads'], write_header=True
    )
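
Passing chunksize makes read_csv_and_yaml yield DataFrames instead of returning one frame; the same streaming pattern with plain pandas looks like this sketch (in-memory CSV used so it runs standalone):

import io
import pandas as pd

csv_text = "cell_id,map\nc1,0.95\nc1,0.80\nc2,0.99\n"

alldata = []
# stream the file in chunks instead of loading it whole
for read_data in pd.read_csv(io.StringIO(csv_text), chunksize=2):
    # flag low-mappability bins without holding the whole table in memory
    read_data['is_low_mappability'] = read_data['map'] <= 0.9
    alldata.append(read_data)

annotated = pd.concat(alldata, ignore_index=True)
print(annotated)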
Example #16
def write_to_output(hmmcopy_filename, output, predictions):
    data = csvutils.read_csv_and_yaml(hmmcopy_filename)

    data['quality'] = data['cell_id'].map(predictions)
    data.quality = data.quality.astype(float)

    fileformat = single_cell.utils.helpers.get_file_format(output)

    if fileformat == 'csv':
        write_to_csv(output, data)
    elif fileformat == "gzip":
        write_to_csv(output, data, gzipped=True)
    else:
        raise Exception("unknown file format")
Example #17
def annotate_metrics(metrics, output, sample_info, cells):
    """
    adds sample information to metrics and writes the result to output
    """

    metrics = csvutils.read_csv_and_yaml(metrics)

    for cellid in cells:
        cellinfo = sample_info[cellid]

        for colname, value in cellinfo.items():
            metrics.loc[metrics["cell_id"] == cellid, colname] = value

    csvutils.write_dataframe_to_csv_and_yaml(metrics, output)
def get_hierarchical_clustering_order(
        reads_filename, chromosomes=None):
    data = []
    chunksize = 10 ** 5
    for chunk in csvutils.read_csv_and_yaml(
            reads_filename, chunksize=chunksize):
        chunk["bin"] = list(zip(chunk.chr, chunk.start, chunk.end))

        # for some reason pivot doesn't like an Int64 state col
        chunk['state'] = chunk['state'].astype('float')

        chunk = chunk.pivot(index='cell_id', columns='bin', values='state')

        data.append(chunk)

    # merge chunks, sum cells that get split across chunks
    table = pd.concat(data)
    table = table.groupby(table.index).sum()

    bins = pd.DataFrame(
        table.columns.values.tolist(),
        columns=[
            'chr',
            'start',
            'end'])

    bins['chr'] = bins['chr'].astype(str)

    bins = sort_bins(bins, chromosomes)

    table = table.sort_values(bins, axis=0)

    data_mat = np.array(table.values)

    data_mat[np.isnan(data_mat)] = -1

    row_linkage = hc.linkage(sp.distance.pdist(data_mat, 'cityblock'),
                             method='ward')

    order = hc.leaves_list(row_linkage)

    samps = table.index
    order = [samps[i] for i in order]
    order = {v: i for i, v in enumerate(order)}

    return order
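
A self-contained toy run of the clustering step above: city-block distances, Ward linkage, then leaves_list to recover a display order. The matrix is random here; in the real function each row is a cell and each column a genomic bin's copy-number state:

import numpy as np
import scipy.cluster.hierarchy as hc
import scipy.spatial as sp

rng = np.random.default_rng(0)
# 6 "cells" x 20 "bins" of fake copy-number states
data_mat = rng.integers(0, 5, size=(6, 20)).astype(float)

row_linkage = hc.linkage(sp.distance.pdist(data_mat, 'cityblock'),
                         method='ward')
order = hc.leaves_list(row_linkage)

# map each row index to its position in the dendrogram leaf order
ordering = {row: i for i, row in enumerate(order)}
print(ordering)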
Example #19
def test_contamination(tmpdir):
    data = {}

    cols = [
        'fastqscreen_nohit',
        'fastqscreen_grch37',
        'fastqscreen_grch37_multihit',
        'fastqscreen_mm10',
        'fastqscreen_mm10_multihit',
        'fastqscreen_salmon',
        'fastqscreen_salmon_multihit'
    ]

    for i in range(5):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = i * 10
        data[i]['fastqscreen_grch37'] = i * 1000
        data[i]['fastqscreen_mm10'] = i * 100

    for i in range(5, 10):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = (i * 10)
        data[i]['fastqscreen_grch37'] = i * 1000

    data = pd.DataFrame.from_dict(data, orient='index')
    data['total_reads'] = data[cols].sum(axis=1)

    dtypes = {col: 'int' for col in cols}
    dtypes['cell_id'] = 'str'
    dtypes['total_reads'] = 'int'

    infile = os.path.join(tmpdir, 'input.csv.gz')
    outfile = os.path.join(tmpdir, 'output.csv.gz')

    csvutils.write_dataframe_to_csv_and_yaml(data, infile, dtypes)

    config = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}, {'name': 'salmon'}]}

    tasks.add_contamination_status(infile, outfile, config)

    output = csvutils.read_csv_and_yaml(outfile)

    assert output['is_contaminated'].tolist() == [False] + [True] * 4 + [False] * 5
Example #20
    def read_input_data(self, infile, tablename):
        fileformat = helpers.get_file_format(infile)

        if fileformat == "csv" or fileformat == 'gzip':
            metrics = csvutils.read_csv_and_yaml(infile)
        else:
            with pd.HDFStore(infile, 'r') as metrics_store:
                metrics = metrics_store[tablename]
            metrics = metrics.reset_index()

        if 'cell_call' in metrics.columns.values:
        # plotting code doesn't work with nan
            # tenx data will have nan for cell call, experimental condition
            # row, col
            metrics['cell_call'] = metrics["cell_call"].fillna("nan")
            metrics['experimental_condition'] = metrics["experimental_condition"].fillna("nan")

        return metrics
    def load(self, fname):
        '''
        load a metrics file (h5 or csv) into a pandas data frame
        '''
        extension = os.path.splitext(fname)[-1]

        if extension in [".h5", ".hdf5"]:

            with pandas.HDFStore(fname, 'r') as metrics_store:
                data = metrics_store[self.tablename]

            data = data.reset_index()

        else:
            data = csvutils.read_csv_and_yaml(fname)

            # data['chromosome'] = data['chromosome'].astype(str)

        return data
def add_contamination_status(infile,
                             outfile,
                             reference='grch37',
                             ref_threshold=0.6,
                             alt_threshold=0.2,
                             strict_validation=True):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    fastqscreen_cols = [
        col for col in data.columns.values if col.startswith('fastqscreen_')
    ]

    reference = "fastqscreen_{}".format(reference)
    if reference not in fastqscreen_cols:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in fastqscreen_cols if not col == reference]

    data['is_contaminated'] = False

    perc_ref = data[reference] / data['total_reads']
    data.loc[perc_ref <= ref_threshold, 'is_contaminated'] = True

    for altcol in alts:
        perc_alt = data[altcol] / data['total_reads']
        data.loc[perc_alt > alt_threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(data,
                                             outfile,
                                             write_header=True,
                                             dtypes=dtypes()['metrics'])

    # get cells that are contaminated and have enough human reads
    check_df = data.loc[data['is_contaminated'] == True]
    check_df['perc_ref'] = data[reference] / data['total_reads']
    check_df = check_df[check_df['perc_ref'] > ref_threshold]
    if strict_validation and (len(check_df) / len(data) > 0.2):
        logging.error("over 20% of cells are contaminated")
Example #23
    def read_metrics(self):
        """
        read metrics and get cell to quality mapping
        """
        metrics = csvutils.read_csv_and_yaml(self.metrics)

        metrics = metrics.set_index("cell_id")
        cell_order = metrics.order.sort_values().index

        # assume all cells are good, don't filter
        if 'quality' not in metrics.columns.values:
            logging.getLogger("single_cell.hmmcopy.igv_seg").warning(
                "quality column missing in data")
            metrics['quality'] = 1

        qual_cell_map = {
            cell: quality
            for cell, quality in zip(metrics.index, metrics["quality"])
        }
        return qual_cell_map, cell_order
Example #24
def classify_fastqscreen(training_data_path, metrics_path, metrics_output, dtypes):
    df = csvutils.read_csv_and_yaml(metrics_path)
    features_train, feature_transformer, model = train(training_data_path)

    features = ["fastqscreen_nohit_ratio", "fastqscreen_grch37_ratio", "fastqscreen_mm10_ratio",
                "fastqscreen_salmon_ratio"]
    label_to_species = {0: "grch37", 1: "mm10", 2: "salmon"}
    # check that all the raw count columns exist; if so, make predictions.
    # feature[:-6] strips the '_ratio' suffix to recover the count column
    exist = all([feature[:-6] in df for feature in features])
    if exist:
        # make the feature columns
        for feature in features:
            df[feature] = df[feature[:-6]].divide(df["total_reads"])
        # replace infinities, then fill missing values with training means
        feature_test = df[features]
        feature_test = feature_test.replace([np.inf, -np.inf], np.nan)
        feature_test.fillna(features_train.mean(), inplace=True)
        # scale the features
        scaled_features = feature_transformer.transform(feature_test)
        df["species"] = model.predict(scaled_features)
        df["species"].replace(label_to_species, inplace=True)
    csvutils.write_dataframe_to_csv_and_yaml(df, metrics_output, dtypes)
Example #25
def add_contamination_status(infile,
                             outfile,
                             genome_labels,
                             reference='grch37',
                             threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    if reference not in genome_labels:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in genome_labels if not col == reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    data['is_contaminated'] = data['is_contaminated'].astype('bool')
    csvutils.write_dataframe_to_csv_and_yaml(data, outfile,
                                             dtypes(genome_labels))
Example #26
def load_metrics_data(filename):
    reads = csvutils.read_csv_and_yaml(filename)

    reads = reads.set_index(['cell_id'])

    return reads
Example #27
    def read_csv(self, infile):
        return csvutils.read_csv_and_yaml(infile)
def get_max_cn(reads):
    df = csvutils.read_csv_and_yaml(reads)
    max_cn = np.nanpercentile(df['copy'], 99)
    return max_cn
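
np.nanpercentile ignores NaN entries, so bins with missing copy values don't skew the cap; a quick check with fabricated values:

import numpy as np

copy = np.array([1.0, 2.0, np.nan, 3.0, 40.0])
# 99th percentile over the non-NaN values only
print(np.nanpercentile(copy, 99))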