Example #1
0
def add_corrupt_tree_order(corrupt_tree, metrics, output):
    """
    adds corrupt tree order to metrics

    Reads the first line of the newick file at ``corrupt_tree``, walks the
    resulting tree in level order and ranks every node whose name starts with
    ``cell_`` (prefix stripped) by its position in that walk. The rank is
    stored in the ``order_corrupt_tree`` column of the metrics table; cells
    that are not present in the tree get NaN before the final dtype cast.

    :param corrupt_tree: path to the newick file; nothing may follow its
        first line
    :param metrics: path to the input metrics csv, loaded through
        ``csvutils.read_csv_and_yaml``
    :param output: path the annotated metrics table is written to
    :raises AssertionError: if the newick file has content after its first
        line
    """

    with open(corrupt_tree) as newickfile:
        newickdata = newickfile.readline()
        # Explicit raise (same exception type as the former `assert`) so the
        # check is still performed when Python runs with -O.
        if newickfile.readline() != '':
            raise AssertionError(
                'expected a single-line newick file: {}'.format(corrupt_tree))

    tree = Tree(newickdata, format=1)

    # All nodes are visited, not only leaves; only the cell_-prefixed names
    # are kept and the prefix is dropped so they match metrics cell ids.
    cell_names = [
        node.name[len('cell_'):]
        for node in tree.traverse("levelorder")
        if node.name.startswith("cell_")
    ]

    # If a name occurs more than once the last position wins.
    ordering = {name: rank for rank, name in enumerate(cell_names)}

    metrics = csvutils.read_csv_and_yaml(metrics)

    # A single vectorised lookup replaces the per-cell boolean-mask loop:
    # ids missing from the tree become NaN, and the column is created even
    # when the table has no rows, so the cast below cannot hit a KeyError.
    metrics['order_corrupt_tree'] = metrics['cell_id'].map(ordering)

    col_dtype = dtypes()['metrics']['order_corrupt_tree']
    metrics['order_corrupt_tree'] = metrics['order_corrupt_tree'].astype(
        col_dtype)

    csvutils.write_dataframe_to_csv_and_yaml(metrics,
                                             output,
                                             dtypes()['metrics'],
                                             write_header=True)
Example #2
0
def add_contamination_status(infile,
                             outfile,
                             config,
                             reference='grch37',
                             threshold=0.05):
    """
    marks cells as contaminated based on non-reference genome read fractions

    The genome names are taken from ``config['genomes']``. For every genome
    other than ``reference`` the value returned by ``_get_col_data`` for that
    genome is divided by ``total_reads``; a cell is flagged in the
    ``is_contaminated`` column as soon as one of those fractions is greater
    than ``threshold``.

    :param infile: path to the input metrics csv
    :param outfile: path the updated metrics table is written to
    :param config: mapping with a ``genomes`` list whose entries carry a
        ``name`` key
    :param reference: name of the genome that is not treated as contamination
    :param threshold: fraction above which a cell is flagged
    :raises Exception: if ``reference`` is not one of the configured genomes
    """
    metrics = csvutils.read_csv_and_yaml(infile).set_index('cell_id',
                                                           drop=False)

    genome_names = []
    for genome in config['genomes']:
        genome_names.append(genome['name'])

    if reference not in genome_names:
        raise Exception("Could not find the fastq screen counts")

    # start from "clean" and only ever switch cells to contaminated
    metrics['is_contaminated'] = False

    for genome_name in genome_names:
        if genome_name == reference:
            continue
        alt_fraction = (_get_col_data(metrics, genome_name) /
                        metrics['total_reads'])
        over_threshold = alt_fraction > threshold
        metrics.loc[over_threshold, 'is_contaminated'] = True

    contaminated_dtype = dtypes()['metrics']['is_contaminated']
    metrics['is_contaminated'] = metrics['is_contaminated'].astype(
        contaminated_dtype)

    csvutils.write_dataframe_to_csv_and_yaml(metrics, outfile,
                                             dtypes()['metrics'])
Example #3
0
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, genome_labels):
    """
    runs the cell cycle classifier and merges its output into hmmcopy metrics

    The external ``cell_cycle_classifier train-classify`` command writes a
    csv into ``tempdir``. That csv is outer-joined on ``cell_id`` with the
    hmmcopy metrics, every column that came from the classifier csv is cast
    to the dtype listed in ``dtypes(genome_labels)``, and the merged table is
    written to ``output``.

    :param hmmcopy_reads: path to the hmmcopy reads file passed to the command
    :param hmmcopy_metrics: path to the hmmcopy metrics csv
    :param alignment_metrics: path to the alignment metrics file passed to
        the command
    :param output: path the merged metrics table is written to
    :param tempdir: scratch directory, created if needed
    :param genome_labels: forwarded to ``dtypes`` to build the dtype lookup
    """
    helpers.makedirs(tempdir)
    classifier_csv = os.path.join(tempdir, 'cell_cycle_output.csv')

    pypeliner.commandline.execute(
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, classifier_csv)

    predictions = pd.read_csv(classifier_csv)
    # remember which columns the classifier produced (cell_id included)
    predicted_columns = predictions.columns.values

    merged = csvutils.read_csv_and_yaml(hmmcopy_metrics)
    merged = merged.merge(predictions, on=['cell_id'], how='outer')

    column_dtypes = dtypes(genome_labels)
    for column in predicted_columns:
        target_dtype = column_dtypes[column]
        merged[column] = merged[column].astype(target_dtype)

    csvutils.write_dataframe_to_csv_and_yaml(merged, output, column_dtypes)
Example #4
0
def generate_qc_report(tempdir, reference_gc, fastqscreen_training_data,
                       metrics_df, gc_metrics_df, qc_report,
                       metrics_df_annotated):
    """
    classifies fastqscreen results and generates the html qc report

    First ``classify_fastqscreen`` is run with the training data, reading
    ``metrics_df`` and producing ``metrics_df_annotated``. Then the html
    report is generated; note that it is given ``metrics_df``, not the
    annotated file.

    :param tempdir: scratch directory, created if needed
    :param reference_gc: passed through to the html report generator
    :param fastqscreen_training_data: training data for the classifier
    :param metrics_df: input metrics
    :param gc_metrics_df: gc metrics passed to the html report generator
    :param qc_report: report target passed to the html report generator
    :param metrics_df_annotated: target of the fastqscreen classification
    """
    helpers.makedirs(tempdir)

    metrics_dtypes = dtypes()['metrics']
    fastqscreen_classify.classify_fastqscreen(
        fastqscreen_training_data,
        metrics_df,
        metrics_df_annotated,
        metrics_dtypes,
    )

    generate_qc.generate_html_report(
        tempdir,
        qc_report,
        reference_gc,
        metrics_df,
        gc_metrics_df,
    )
Example #5
0
def annotate_metrics(metrics, output, sample_info, cells):
    """
    adds sample information to metrics

    The metrics table is loaded from ``metrics``; for each cell in ``cells``
    every key/value pair of ``sample_info[cell]`` is written into the rows
    whose ``cell_id`` matches, using the key as column name. The result is
    written to ``output``.

    :param metrics: path to the input metrics csv
    :param output: path the annotated metrics table is written to
    :param sample_info: mapping of cell id to a mapping of column name to
        value
    :param cells: iterable of cell ids to annotate; each must be a key of
        ``sample_info``
    """

    annotated = csvutils.read_csv_and_yaml(metrics)

    for cell in cells:
        annotations = sample_info[cell]

        for column in annotations:
            # the row mask is rebuilt per column on purpose: an annotation
            # may itself target the cell_id column
            rows = annotated["cell_id"] == cell
            annotated.loc[rows, column] = annotations[column]

    metrics_dtypes = dtypes()['metrics']
    csvutils.write_dataframe_to_csv_and_yaml(annotated, output,
                                             metrics_dtypes)
Example #6
0
def add_quality(hmmcopy_metrics, alignment_metrics, output, training_data,
                tempdir, genome_labels):
    """
    trains the quality classifier and adds its predictions to the metrics

    A model is trained from ``training_data``, the features it names are
    loaded from the hmmcopy and alignment metrics, and the predictions are
    written next to the hmmcopy metrics into an intermediate csv under
    ``tempdir``. That csv is then rewritten to ``output`` with the dtypes
    from ``dtypes(genome_labels)``.

    :param hmmcopy_metrics: path to the hmmcopy metrics csv
    :param alignment_metrics: path to the alignment metrics csv
    :param output: path of the final csv
    :param training_data: training data handed to ``classify.train_classifier``
    :param tempdir: scratch directory, created if needed
    :param genome_labels: forwarded to ``dtypes`` to build the dtype lookup
    """
    helpers.makedirs(tempdir)
    scored_csv = os.path.join(tempdir, 'metrics_with_quality.csv')

    classifier = classify.train_classifier(training_data)
    features = classify.load_data(
        hmmcopy_metrics, alignment_metrics, classifier.feature_names_)
    quality = classify.classify(classifier, features)

    classify.write_to_output(hmmcopy_metrics, scored_csv, quality)

    output_dtypes = dtypes(genome_labels)
    csvutils.rewrite_csv_file(scored_csv, output, dtypes=output_dtypes)
Example #7
0
def add_contamination_status(infile,
                             outfile,
                             genome_labels,
                             reference='grch37',
                             threshold=0.05):
    """
    marks cells as contaminated based on non-reference genome read fractions

    For every label in ``genome_labels`` other than ``reference`` the value
    returned by ``_get_col_data`` for that label is divided by
    ``total_reads``; a cell is flagged in the boolean ``is_contaminated``
    column as soon as one of those fractions is greater than ``threshold``.

    :param infile: path to the input metrics csv
    :param outfile: path the updated metrics table is written to
    :param genome_labels: collection of genome labels, must contain
        ``reference``
    :param reference: label of the genome that is not treated as
        contamination
    :param threshold: fraction above which a cell is flagged
    :raises Exception: if ``reference`` is not in ``genome_labels``
    """
    table = csvutils.read_csv_and_yaml(infile)
    table = table.set_index('cell_id', drop=False)

    if reference not in genome_labels:
        raise Exception("Could not find the fastq screen counts")

    other_genomes = [label for label in genome_labels if label != reference]

    # start from "clean" and only ever switch cells to contaminated
    table['is_contaminated'] = False

    for label in other_genomes:
        fraction = _get_col_data(table, label) / table['total_reads']
        exceeds = fraction > threshold
        table.loc[exceeds, 'is_contaminated'] = True

    table['is_contaminated'] = table['is_contaminated'].astype('bool')

    output_dtypes = dtypes(genome_labels)
    csvutils.write_dataframe_to_csv_and_yaml(table, outfile, output_dtypes)