import os

import pandas as pd
import pypeliner
from ete3 import Tree

# csvutils, helpers, dtypes, classify, fastqscreen_classify and generate_qc are
# assumed to come from the surrounding package; their imports are not shown here.


def add_corrupt_tree_order(corrupt_tree, metrics, output):
    """ adds corrupt tree order to metrics """
    with open(corrupt_tree) as newickfile:
        newickdata = newickfile.readline()
        # the corrupt tree file is expected to contain a single newick line
        assert newickfile.readline() == ''

    tree = Tree(newickdata, format=1)

    # level-order traversal defines the ordering; keep only cell leaves
    leaves = [node.name for node in tree.traverse("levelorder")]
    leaves = [val[len('cell_'):] for val in leaves if val.startswith("cell_")]

    ordering = {val: i for i, val in enumerate(leaves)}

    metrics = csvutils.read_csv_and_yaml(metrics)

    cells = metrics.cell_id

    for cellid in cells:
        # cells missing from the tree get NaN
        order = ordering.get(cellid, float('nan'))
        metrics.loc[metrics["cell_id"] == cellid, "order_corrupt_tree"] = order

    col_dtype = dtypes()['metrics']['order_corrupt_tree']
    metrics['order_corrupt_tree'] = metrics['order_corrupt_tree'].astype(
        col_dtype)

    csvutils.write_dataframe_to_csv_and_yaml(
        metrics, output, dtypes()['metrics'], write_header=True)
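
# Toy illustration (not pipeline code) of how the level-order leaf ordering
# above behaves; the newick string and cell ids are invented.
def _example_corrupt_tree_ordering():
    from ete3 import Tree

    newick = "((cell_A:1,cell_B:1):1,cell_C:2);"
    tree = Tree(newick, format=1)
    names = [node.name for node in tree.traverse("levelorder")]
    cells = [val[len('cell_'):] for val in names if val.startswith("cell_")]
    # shallower leaves come first, e.g. {'C': 0, 'A': 1, 'B': 2}
    return {val: i for i, val in enumerate(cells)}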
def add_contamination_status(
        infile, outfile, config, reference='grch37', threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)
    data = data.set_index('cell_id', drop=False)

    organisms = [genome['name'] for genome in config['genomes']]

    if reference not in organisms:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in organisms if col != reference]

    # flag a cell as contaminated if more than `threshold` of its reads map
    # to any non-reference organism
    data['is_contaminated'] = False
    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(
        data, outfile, dtypes()['metrics'])
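
# _get_col_data is referenced above but not defined in this snippet. A minimal
# sketch of what it could look like, assuming per-organism fastq screen counts
# are stored in columns named 'fastqscreen_<organism>' (an assumption, not a
# confirmed schema):
def _get_col_data_sketch(df, organism):
    # reads assigned to the given organism by fastq screen
    return df['fastqscreen_{}'.format(organism)]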
def cell_cycle_classifier(
        hmmcopy_reads, hmmcopy_metrics, alignment_metrics, output, tempdir,
        genome_labels):
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    # run the external cell_cycle_classifier CLI to get per-cell predictions
    cmd = [
        'cell_cycle_classifier',
        'train-classify',
        hmmcopy_reads,
        hmmcopy_metrics,
        alignment_metrics,
        temp_output
    ]
    pypeliner.commandline.execute(*cmd)

    cell_cycle_df = pd.read_csv(temp_output)
    cols_cell_cycle = cell_cycle_df.columns.values

    # merge the predictions into the hmmcopy metrics, keeping all cells
    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)
    hmm_metrics_df = hmm_metrics_df.merge(
        cell_cycle_df, on=['cell_id'], how='outer')

    out_dtypes = dtypes(genome_labels)
    for colname in cols_cell_cycle:
        hmm_metrics_df[colname] = hmm_metrics_df[colname].astype(
            out_dtypes[colname])

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output, out_dtypes)
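
# Toy demonstration (not pipeline code) of the outer merge on 'cell_id' used
# above; the data frames are invented.
def _example_cell_cycle_merge():
    import pandas as pd

    hmm = pd.DataFrame({'cell_id': ['c1', 'c2'], 'quality': [0.9, 0.2]})
    cycle = pd.DataFrame({'cell_id': ['c1', 'c3'], 'cell_cycle_state': ['S', 'G1']})
    # cells present in only one table are kept, with NaN in the missing columns
    return hmm.merge(cycle, on=['cell_id'], how='outer')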
def generate_qc_report(
        tempdir, reference_gc, fastqscreen_training_data, metrics_df,
        gc_metrics_df, qc_report, metrics_df_annotated):
    helpers.makedirs(tempdir)

    fastqscreen_classify.classify_fastqscreen(
        fastqscreen_training_data, metrics_df, metrics_df_annotated,
        dtypes()['metrics'])

    generate_qc.generate_html_report(
        tempdir, qc_report, reference_gc, metrics_df, gc_metrics_df)
def annotate_metrics(metrics, output, sample_info, cells):
    """ adds per-cell sample information to the metrics table and writes it to output """
    metrics = csvutils.read_csv_and_yaml(metrics)

    for cellid in cells:
        cellinfo = sample_info[cellid]
        for colname, value in cellinfo.items():
            metrics.loc[metrics["cell_id"] == cellid, colname] = value

    csvutils.write_dataframe_to_csv_and_yaml(
        metrics, output, dtypes()['metrics'])
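
# Toy demonstration (not pipeline code) of the per-cell .loc update pattern
# used in annotate_metrics; the sample_info mapping is invented.
def _example_annotate_metrics():
    import pandas as pd

    metrics = pd.DataFrame({'cell_id': ['c1', 'c2']})
    sample_info = {'c1': {'sample_id': 'SA123'}, 'c2': {'sample_id': 'SA456'}}
    for cellid, cellinfo in sample_info.items():
        for colname, value in cellinfo.items():
            metrics.loc[metrics["cell_id"] == cellid, colname] = value
    return metrics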
def add_quality(
        hmmcopy_metrics, alignment_metrics, output, training_data, tempdir,
        genome_labels):
    helpers.makedirs(tempdir)
    intermediate_output = os.path.join(tempdir, 'metrics_with_quality.csv')

    # train (or load) the cell quality classifier and score every cell
    model = classify.train_classifier(training_data)
    feature_names = model.feature_names_

    data = classify.load_data(
        hmmcopy_metrics, alignment_metrics, feature_names)

    predictions = classify.classify(model, data)

    classify.write_to_output(
        hmmcopy_metrics, intermediate_output, predictions)

    csvutils.rewrite_csv_file(
        intermediate_output, output, dtypes=dtypes(genome_labels))
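
# classify is a project-internal module; as a rough, hypothetical stand-in for
# the train -> load features -> score shape used above (not the pipeline's
# actual implementation), a scikit-learn based sketch could be:
def _classify_stand_in(training_data, feature_matrix):
    import joblib

    # assumes training_data is a pickled, pre-fitted classifier exposing
    # predict_proba; the real classify module's contract is not shown here
    model = joblib.load(training_data)
    return model.predict_proba(feature_matrix)[:, 1]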
def add_contamination_status(
        infile, outfile, genome_labels, reference='grch37', threshold=0.05):
    # variant of add_contamination_status that takes a flat list of genome
    # labels instead of the full config dictionary
    data = csvutils.read_csv_and_yaml(infile)
    data = data.set_index('cell_id', drop=False)

    if reference not in genome_labels:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in genome_labels if col != reference]

    data['is_contaminated'] = False
    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    data['is_contaminated'] = data['is_contaminated'].astype('bool')

    csvutils.write_dataframe_to_csv_and_yaml(
        data, outfile, dtypes(genome_labels))
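
# Toy demonstration (not pipeline code) of the contamination rule: a cell is
# flagged when more than `threshold` of its reads map to a non-reference
# organism. Counts and column names are invented.
def _example_contamination_flagging(threshold=0.05):
    import pandas as pd

    data = pd.DataFrame({
        'cell_id': ['c1', 'c2'],
        'total_reads': [1000, 1000],
        'alt_organism_reads': [10, 200],
    })
    data['is_contaminated'] = False
    perc_alt = data['alt_organism_reads'] / data['total_reads']
    data.loc[perc_alt > threshold, 'is_contaminated'] = True
    return data  # c2 is flagged: 200/1000 = 20% > 5%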