def add_corrupt_tree_order(corrupt_tree, metrics, output):
    """ adds corrupt tree order to metrics """
    with open(corrupt_tree) as newickfile:
        newickdata = newickfile.readline()
        assert newickfile.readline() == ''

    tree = Tree(newickdata, format=1)

    leaves = [node.name for node in tree.traverse("levelorder")]
    leaves = [val[len('cell_'):] for val in leaves if val.startswith("cell_")]

    ordering = {val: i for i, val in enumerate(leaves)}

    metrics = csvutils.read_csv_and_yaml(metrics)

    cells = metrics.cell_id

    for cellid in cells:
        order = ordering.get(cellid, float('nan'))
        metrics.loc[metrics["cell_id"] == cellid, "order_corrupt_tree"] = order

    csvutils.write_dataframe_to_csv_and_yaml(
        metrics, output, write_header=True)

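# Minimal sketch of the level-order extraction above, on an inline newick
# string instead of a corrupt-tree file; the tree here is hypothetical.
# Internal nodes have empty names, so only 'cell_'-prefixed leaves survive
# the filter, and their level-order rank becomes the recorded ordering.
from ete3 import Tree

_demo_tree = Tree("((cell_A:1,cell_B:1):1,cell_C:2);", format=1)
_names = [node.name for node in _demo_tree.traverse("levelorder")]
_cells = [name[len("cell_"):] for name in _names if name.startswith("cell_")]
assert {name: i for i, name in enumerate(_cells)} == {"C": 0, "A": 1, "B": 2}
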
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, genome_labels):
    helpers.makedirs(tempdir)

    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier',
        'train-classify',
        hmmcopy_reads,
        hmmcopy_metrics,
        alignment_metrics,
        temp_output
    ]

    pypeliner.commandline.execute(*cmd)

    cell_cycle_df = pd.read_csv(temp_output)

    cols_cell_cycle = cell_cycle_df.columns.values

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    hmm_metrics_df = hmm_metrics_df.merge(
        cell_cycle_df, on=['cell_id'], how='outer')

    out_dtypes = dtypes(genome_labels)
    for colname in cols_cell_cycle:
        hmm_metrics_df[colname] = hmm_metrics_df[colname].astype(
            out_dtypes[colname])

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output, out_dtypes)

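# Hedged illustration of the merge-then-cast step above, on toy frames; the
# column names are placeholders, not the real classifier output schema. An
# outer merge leaves NaN in classifier columns for cells the classifier never
# saw, so the per-column astype() only works if the target dtype tolerates
# missing values (e.g. pandas' nullable 'boolean' rather than plain 'bool').
import pandas as pd

_hmm = pd.DataFrame({"cell_id": ["c1", "c2"], "quality": [0.9, 0.4]})
_cycle = pd.DataFrame({"cell_id": ["c1"], "is_s_phase": [True]})
_merged = _hmm.merge(_cycle, on=["cell_id"], how="outer")
_merged["is_s_phase"] = _merged["is_s_phase"].astype("boolean")  # not 'bool'
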
def get_tri_nucelotide_context(ref_genome_fasta_file, vcf_file, out_file, table_name):
    vcf_reader = vcf.Reader(filename=vcf_file)

    fasta_reader = pysam.Fastafile(ref_genome_fasta_file)

    data = []

    for record in vcf_reader:
        chrom = record.CHROM
        coord = record.POS

        tri_nucleotide_context = fasta_reader.fetch(chrom, coord - 2, coord + 1)

        data.append({
            'chrom': record.CHROM,
            'coord': record.POS,
            'tri_nucleotide_context': tri_nucleotide_context
        })

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(
        data, out_file, data.dtypes.to_dict(), write_header=True)

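# The coordinate arithmetic above, isolated: VCF POS is 1-based while pysam's
# fetch() is 0-based half-open, so (POS - 2, POS + 1) returns the variant base
# plus one flanking base on each side. The fasta path is a placeholder for
# any faidx-indexed reference.
import pysam

def _tri_context(fasta_path, chrom, pos):
    fasta = pysam.Fastafile(fasta_path)
    context = fasta.fetch(chrom, pos - 2, pos + 1)
    assert len(context) == 3
    return context
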
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, docker_image=None):
    helpers.makedirs(tempdir)

    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier',
        'train-classify',
        hmmcopy_reads,
        hmmcopy_metrics,
        alignment_metrics,
        temp_output
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    cell_cycle_df = pd.read_csv(temp_output)

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    hmm_metrics_df = hmm_metrics_df.merge(
        cell_cycle_df, on=['cell_id'], how='outer')

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output)

def add_contamination_status(
        infile, outfile, config, reference='grch37', threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    organisms = [genome['name'] for genome in config['genomes']]

    if reference not in organisms:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in organisms if col != reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(data, outfile, dtypes()['metrics'])

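# Toy run of the thresholding rule above: a cell is flagged when reads
# assigned to any non-reference genome exceed `threshold` of total reads.
# Column names mirror the fastqscreen count columns, but the numbers are
# made up.
import pandas as pd

_df = pd.DataFrame({
    "cell_id": ["c1", "c2"],
    "fastqscreen_mm10": [10, 400],
    "total_reads": [1000, 1000],
})
_flag = (_df["fastqscreen_mm10"] / _df["total_reads"]) > 0.05
assert _flag.tolist() == [False, True]
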
def convert_hdf_to_csv(h5_input, outputs):
    with pd.HDFStore(h5_input) as h5_data:
        for tablename, outfile in outputs.items():
            df = h5_data[tablename]
            csvutils.write_dataframe_to_csv_and_yaml(df, outfile, write_header=True)

def annotate_db_status(db_vcf_file, target_vcf_file, out_file):
    db_reader = vcf.Reader(filename=db_vcf_file)

    reader = vcf.Reader(filename=target_vcf_file)

    data = []

    for record in reader:
        chrom = record.CHROM
        coord = record.POS

        try:
            db_position_records = [
                x for x in db_reader.fetch(chrom, coord - 1, coord)
            ]
        except ValueError:
            db_position_records = []

        for db_record in db_position_records:
            if (db_record.CHROM != chrom) or (db_record.POS != coord):
                continue

            if db_record.is_indel:
                indel = 1
            else:
                indel = 0

            for alt in record.ALT:
                if (record.REF == db_record.REF) and (alt in db_record.ALT):
                    exact_match = 1
                else:
                    exact_match = 0

                out_row = {
                    'chrom': chrom,
                    'coord': coord,
                    'ref': record.REF,
                    'alt': str(alt),
                    'db_id': db_record.ID,
                    'exact_match': exact_match,
                    'indel': indel
                }

                data.append(out_row)

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(
        data, out_file, data.dtypes.to_dict(), write_header=True)

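# The match rules above, pulled out for clarity: the fetch window
# (coord - 1, coord) is 0-based half-open, so it covers exactly the 1-based
# VCF position, and a database hit counts as exact only when REF agrees and
# the alt allele appears in the database record's ALT list.
def _exact_match(ref, alt, db_ref, db_alts):
    return 1 if (ref == db_ref and alt in db_alts) else 0

assert _exact_match("A", "T", "A", ["T", "G"]) == 1
assert _exact_match("A", "C", "A", ["T"]) == 0
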
def convert_vcf_to_table(in_file, out_file):
    data = []

    parser = ClassicSnpEffParser(in_file)

    for row in parser:
        data.append(row)

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes())

def write(self, input_df, transpose=False):
    ''' write the dataframe to output file '''
    if transpose:
        del input_df["gc"]
        input_df = input_df.T
        input_df["cell_id"] = input_df.index
        input_df.columns = input_df.columns.astype(str)

    csvutils.write_dataframe_to_csv_and_yaml(input_df, self.output, self.dtypes)

def get_mappability_col(reads, annotated_reads):
    reads = csvutils.read_csv_and_yaml(reads, chunksize=100)

    alldata = []
    for read_data in reads:
        read_data['is_low_mappability'] = (read_data['map'] <= 0.9)
        alldata.append(read_data)

    alldata = pd.concat(alldata)

    csvutils.write_dataframe_to_csv_and_yaml(
        alldata, annotated_reads,
        dtypes()['reads'],
        write_header=True
    )

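# The same chunked pattern with plain pandas, for reference; the path and
# the 0.9 cutoff are placeholders. Processing 100 rows at a time bounds
# memory on the large per-bin reads tables.
import pandas as pd

def _annotate_low_mappability(path, cutoff=0.9):
    chunks = []
    for chunk in pd.read_csv(path, chunksize=100):
        chunk["is_low_mappability"] = chunk["map"] <= cutoff
        chunks.append(chunk)
    return pd.concat(chunks)
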
def write_dfs(self, tmpdir, dfs, dtypes, write_heads=True):
    n_dfs = len(dfs)
    names = [
        os.path.join(tmpdir, str(i) + ".csv.gz") for i in range(n_dfs)
    ]

    assert n_dfs == len(dtypes)

    for i in range(n_dfs):
        csvutils.write_dataframe_to_csv_and_yaml(
            dfs[i], names[i], dtypes[i], write_heads)

    return names

def get_mappability(mappability_file, vcf_file, out_file, region=None, append_chr=True):
    map_reader = BigWigFile(open(mappability_file, 'rb'))

    vcf_reader = vcf.Reader(filename=vcf_file)

    if region is not None:
        chrom, beg, end = parse_region_for_vcf(region)
        try:
            vcf_reader = vcf_reader.fetch(chrom, start=beg, end=end)
        except ValueError:
            print("no data for region {} in vcf".format(region))
            vcf_reader = []

    data = []

    for record in vcf_reader:
        if append_chr:
            chrom = 'chr{0}'.format(record.CHROM)
        else:
            chrom = record.CHROM

        coord = record.POS
        beg = max(coord - 100, 0)
        end = coord + 100

        result = map_reader.query(chrom, beg, end, 1)

        if result is None:
            mappability = 0
        else:
            mappability = result[0]['mean']

        data.append({
            'chrom': record.CHROM,
            'coord': record.POS,
            'mappability': mappability
        })

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes())

def merge_fastq_screen_counts(
        all_detailed_counts, all_summary_counts,
        merged_detailed_counts, merged_summary_counts):
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    if len(detailed_data) > 0:
        df = pd.concat(detailed_data)
    else:
        df = pd.DataFrame(
            columns=["cell_id", "readend", "human", "mouse", "count"])

    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_detailed_counts, write_header=True, dtypes=dtypes())

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
    ]

    if len(summary_counts) > 0:
        df = pd.concat(summary_counts)
    else:
        df = pd.DataFrame(columns=["cell_id", "fastqscreen_nohit"])

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_summary_counts, write_header=True, dtypes=dtypes())

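# The sum-then-dedup idiom used twice above, on a toy detailed-counts frame:
# groupby().transform('sum') writes the group total onto every row, and
# drop_duplicates then keeps one row per key. The values are invented.
import pandas as pd

_df = pd.DataFrame({
    "cell_id": ["c1", "c1", "c2"],
    "readend": ["R1", "R1", "R1"],
    "count": [5, 7, 3],
})
_keys = [c for c in _df.columns if c != "count"]
_df["count"] = _df.groupby(_keys)["count"].transform("sum")
_df = _df.drop_duplicates(subset=_keys)
assert _df["count"].tolist() == [12, 3]
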
def test_write_to_csv_yaml_empty(self, tmpdir):
    """ write empty df """
    dtypes = {v: "int" for v in 'ABCD'}
    df = pd.DataFrame()

    filename = os.path.join(tmpdir, "df.csv.gz")
    yaml_filename = filename + ".yaml"

    csvutils.write_dataframe_to_csv_and_yaml(df, filename, dtypes)

    assert os.path.exists(filename)
    assert os.path.exists(yaml_filename)

def genotype(input_bam, reference, input_vcf, output_vcf,
             output_csv, tempdir, cell_id, docker_image=None):
    """ calls svtyper-sso on input bam and vcf to perform genotyping

    :param input_bam: bam for the cell being genotyped
    :param reference: reference genome fasta
    :param input_vcf: vcf of candidate breakpoints to genotype
    :param output_vcf: genotyped vcf written by svtyper-sso
    :param output_csv: csv + yaml table of genotyping results
    :param tempdir: scratch directory
    :param cell_id: cell id recorded in the output table
    :param docker_image: optional docker image to run svtyper-sso in
    """
    helpers.makedirs(tempdir)

    cmd = [
        'svtyper-sso',
        '--input_vcf', input_vcf,
        '--bam', input_bam,
        '--ref_fasta', reference,
        '-o', output_vcf
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    base_data = parse_vcf(output_vcf, None, return_pandas=True)

    svtype_annotations = extract_svtyper_info(base_data)

    base_data = base_data.iloc[:, :-2]  # assumes svtyper info in last 2 cols

    output = pd.concat([base_data, svtype_annotations], axis=1)
    output['cell_id'] = cell_id

    csvutils.write_dataframe_to_csv_and_yaml(output, output_csv, write_header=True)

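# For running the genotyping step outside pypeliner, the same command can be
# issued with subprocess; all paths are placeholders, and the flags match the
# cmd list above.
import subprocess

def _run_svtyper_sso(input_vcf, input_bam, reference, output_vcf):
    subprocess.check_call([
        'svtyper-sso',
        '--input_vcf', input_vcf,
        '--bam', input_bam,
        '--ref_fasta', reference,
        '-o', output_vcf,
    ])
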
def annotate_metrics(metrics, output, sample_info, cells):
    """ adds per-cell sample information to the metrics table
    and writes the annotated result """
    metrics = csvutils.read_csv_and_yaml(metrics)

    for cellid in cells:
        cellinfo = sample_info[cellid]
        for colname, value in cellinfo.items():
            metrics.loc[metrics["cell_id"] == cellid, colname] = value

    csvutils.write_dataframe_to_csv_and_yaml(metrics, output)

def test_contamination(tmpdir):
    data = {}

    cols = [
        'fastqscreen_nohit',
        'fastqscreen_grch37', 'fastqscreen_grch37_multihit',
        'fastqscreen_mm10', 'fastqscreen_mm10_multihit',
        'fastqscreen_salmon', 'fastqscreen_salmon_multihit'
    ]

    # first five cells carry an mm10 signal (cell 0 has all-zero counts,
    # so it stays clean)
    for i in range(5):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = i * 10
        data[i]['fastqscreen_grch37'] = i * 1000
        data[i]['fastqscreen_mm10'] = i * 100

    # remaining cells map to grch37 only
    for i in range(5, 10):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = i * 10
        data[i]['fastqscreen_grch37'] = i * 1000

    data = pd.DataFrame.from_dict(data, orient='index')
    data['total_reads'] = data[cols].sum(axis=1)

    dtypes = {col: 'int' for col in cols}
    dtypes['cell_id'] = 'str'
    dtypes['total_reads'] = 'int'

    infile = os.path.join(tmpdir, 'input.csv.gz')
    outfile = os.path.join(tmpdir, 'output.csv.gz')

    csvutils.write_dataframe_to_csv_and_yaml(data, infile, dtypes)

    config = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}, {'name': 'salmon'}]}

    tasks.add_contamination_status(infile, outfile, config)

    output = csvutils.read_csv_and_yaml(outfile)

    assert output['is_contaminated'].tolist() == [False] + [True] * 4 + [False] * 5

def merge_fastq_screen_counts(
        all_detailed_counts, all_summary_counts,
        merged_detailed_counts, merged_summary_counts,
        fastqscreen_config):
    genome_labels = [
        genome['name'] for genome in fastqscreen_config['genomes']
    ]

    all_detailed_counts = helpers.flatten(all_detailed_counts)
    all_detailed_counts = [
        pd.read_csv(file) for file in all_detailed_counts
        if not helpers.is_empty(file)
    ]

    df = pd.concat(all_detailed_counts)

    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_detailed_counts,
        fastqscreen_dtypes(genome_labels)['fastqscreen_detailed'],
        write_header=True)

    all_summary_counts = helpers.flatten(all_summary_counts)
    all_summary_counts = [
        pd.read_csv(file) for file in all_summary_counts
        if not helpers.is_empty(file)
    ]

    df = pd.concat(all_summary_counts)

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_summary_counts,
        fastqscreen_dtypes(genome_labels)['metrics'],
        write_header=True)

def merge_fastq_screen_counts(
        all_detailed_counts, all_summary_counts,
        merged_detailed_counts, merged_summary_counts):
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    df = pd.concat(detailed_data)

    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_detailed_counts,
        dtypes()['fastqscreen_detailed'],
        write_header=True)

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
    ]

    df = pd.concat(summary_counts)

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_summary_counts,
        dtypes()['metrics'],
        write_header=True)

def base_write_to_csv_yaml_test(self, temp, dtypes, length, write_header=True):
    """ base test for write csv yaml """
    df = self.make_test_dfs([dtypes], length)
    csv = self.write_dfs(temp, df, [dtypes], write_header)

    filename = csv[0]
    yaml_filename = filename + ".yaml"

    os.remove(yaml_filename)
    assert not os.path.exists(yaml_filename)

    csvutils.write_dataframe_to_csv_and_yaml(
        df[0], filename, dtypes, write_header=write_header)

    return df[0], filename, yaml_filename

def add_contamination_status(
        infile, outfile,
        reference='grch37', ref_threshold=0.6, alt_threshold=0.2,
        strict_validation=True):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    fastqscreen_cols = [
        col for col in data.columns.values if col.startswith('fastqscreen_')
    ]

    reference = "fastqscreen_{}".format(reference)
    if reference not in fastqscreen_cols:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in fastqscreen_cols if col != reference]

    data['is_contaminated'] = False

    perc_ref = data[reference] / data['total_reads']
    data.loc[perc_ref <= ref_threshold, 'is_contaminated'] = True

    for altcol in alts:
        perc_alt = data[altcol] / data['total_reads']
        data.loc[perc_alt > alt_threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(
        data, outfile, write_header=True, dtypes=dtypes()['metrics'])

    # get cells that are flagged contaminated despite having enough
    # reference reads (i.e. flagged via the alt threshold)
    check_df = data.loc[data['is_contaminated'] == True].copy()
    check_df['perc_ref'] = data[reference] / data['total_reads']
    check_df = check_df[check_df['perc_ref'] > ref_threshold]

    if strict_validation and (len(check_df) / len(data) > 0.2):
        logging.error("over 20% of cells are contaminated")

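# The two-sided rule above, on made-up numbers: a cell is contaminated when
# its reference fraction falls to ref_threshold or below, or when any single
# alternate genome's fraction rises above alt_threshold.
def _is_contaminated(ref_reads, alt_reads, total,
                     ref_threshold=0.6, alt_threshold=0.2):
    return (ref_reads / total <= ref_threshold) or \
           (alt_reads / total > alt_threshold)

assert _is_contaminated(700, 250, 1000)      # alt fraction 0.25 > 0.2
assert _is_contaminated(550, 100, 1000)      # ref fraction 0.55 <= 0.6
assert not _is_contaminated(900, 50, 1000)
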
def convert_vcf_to_table(in_file, out_file, table_name, classic_mode=True):
    data = []

    if classic_mode:
        parser = biowrappers.components.variant_calling.snpeff.parser.ClassicSnpEffParser(
            in_file)
    else:
        parser = biowrappers.components.variant_calling.snpeff.parser.SnpEffParser(
            in_file)

    for row in parser:
        data.append(row)

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(
        data, out_file, data.dtypes.to_dict(), write_header=True)

def test_concat_csv_with_nans(self, tmpdir, n_rows):
    """ concat two csvs with NaNs """
    dtypes = {v: "float" for v in 'ABCD'}

    concatenated = os.path.join(tmpdir, 'concat.csv.gz')

    dfs = self.make_test_dfs([dtypes, dtypes], n_rows)
    csvs = [os.path.join(tmpdir, "0.csv.gz"),
            os.path.join(tmpdir, "1.csv.gz")]

    dfs[0].iloc[2, dfs[0].columns.get_loc("A")] = np.NaN
    dfs[1].iloc[2, dfs[1].columns.get_loc("A")] = np.NaN

    csvutils.write_dataframe_to_csv_and_yaml(dfs[0], csvs[0], dtypes)
    csvutils.write_dataframe_to_csv_and_yaml(dfs[1], csvs[1], dtypes)

    ref = pd.concat(dfs, ignore_index=True)

    csvutils.concatenate_csv(csvs, concatenated)

    assert self.dfs_exact_match(ref, concatenated)

def classify_fastqscreen(training_data_path, metrics_path, metrics_output, dtypes):
    df = csvutils.read_csv_and_yaml(metrics_path)

    features_train, feature_transformer, model = train(training_data_path)

    features = ["fastqscreen_nohit_ratio", "fastqscreen_grch37_ratio",
                "fastqscreen_mm10_ratio", "fastqscreen_salmon_ratio"]
    label_to_species = {0: "grch37", 1: "mm10", 2: "salmon"}

    # check if all the feature columns exist: if yes, make predictions,
    # else create an empty species column
    exist = all([feature[:-6] in df for feature in features])
    if exist:
        # make the feature columns
        for feature in features:
            df[feature] = df[feature[:-6]].divide(df["total_reads"])

        # replace any missing or infinite values with the training means
        feature_test = df[features]
        feature_test = feature_test.replace([np.inf, -np.inf], np.nan)
        feature_test.fillna(features_train.mean(), inplace=True)

        # scale the features
        scaled_features = feature_transformer.transform(feature_test)

        df["species"] = model.predict(scaled_features)
        df["species"].replace(label_to_species, inplace=True)
    else:
        df["species"] = None

    csvutils.write_dataframe_to_csv_and_yaml(df, metrics_output, dtypes)

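# The feature construction above, in isolation: each count column is divided
# by total_reads, then inf/NaN are imputed before scaling (a zero total_reads
# row yields NaN for 0/0 and inf otherwise). The stand-in mean of 0.1 plays
# the role of features_train.mean().
import numpy as np
import pandas as pd

_df = pd.DataFrame({"fastqscreen_mm10": [200, 0], "total_reads": [1000, 0]})
_df["fastqscreen_mm10_ratio"] = _df["fastqscreen_mm10"].divide(_df["total_reads"])
_df = _df.replace([np.inf, -np.inf], np.nan)
_df["fastqscreen_mm10_ratio"] = _df["fastqscreen_mm10_ratio"].fillna(0.1)
assert _df["fastqscreen_mm10_ratio"].tolist() == [0.2, 0.1]
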
def add_contamination_status(
        infile, outfile, genome_labels,
        reference='grch37', threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    if reference not in genome_labels:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in genome_labels if col != reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    data['is_contaminated'] = data['is_contaminated'].astype('bool')

    csvutils.write_dataframe_to_csv_and_yaml(data, outfile, dtypes(genome_labels))

def get_snv_allele_counts_for_vcf_targets(
        bam_file, vcf_file, out_file,
        count_duplicates=False, min_bqual=0, min_mqual=0,
        region=None, vcf_to_bam_chrom_map=None,
        report_zero_count_positions=False, dtypes=None,
        **extra_columns):
    bam = pysam.AlignmentFile(bam_file, 'rb')

    vcf_reader = vcf.Reader(filename=vcf_file)

    if region is not None:
        chrom, beg, end = utils.parse_region_for_vcf(region)
        try:
            vcf_reader = vcf_reader.fetch(chrom, start=beg, end=end)
        except ValueError:
            vcf_reader = ()

    data = []

    for record in vcf_reader:
        if vcf_to_bam_chrom_map is not None:
            bam_chrom = vcf_to_bam_chrom_map[record.CHROM]
        else:
            bam_chrom = record.CHROM

        df = _get_counts_df(
            bam,
            bam_chrom,
            record.POS,
            record.POS + 1,
            count_duplicates=count_duplicates,
            min_bqual=min_bqual,
            min_mqual=min_mqual,
            strand='both',
            report_zero_count_positions=report_zero_count_positions,
        )

        if df is None:
            continue

        counts = df.iloc[0]

        ref_base = record.REF

        # Skip record with reference base == N
        if ref_base not in nucleotides:
            continue

        for alt_base in record.ALT:
            alt_base = str(alt_base)

            if (len(ref_base) != 1) or (len(alt_base) != 1):
                continue

            # Skip record with alt base == N
            if alt_base not in nucleotides:
                continue

            if not report_zero_count_positions and \
                    counts[ref_base] == 0 and counts[alt_base] == 0:
                continue

            # Format output record
            out_row = {
                'chrom': record.CHROM,
                'coord': record.POS,
                'ref': ref_base,
                'alt': alt_base,
                'ref_counts': counts[ref_base],
                'alt_counts': counts[alt_base]
            }

            data.append(out_row)

    data = pd.DataFrame(
        data,
        columns=['chrom', 'coord', 'ref', 'alt', 'ref_counts', 'alt_counts'])

    for col, value in extra_columns.items():
        data[col] = value

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes)

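# The per-allele filters above, isolated: only single-base REF/ALT pairs
# drawn from A/C/G/T survive, so indels and N calls are dropped before
# counting. `nucleotides` stands in for the module-level set the function uses.
nucleotides = ("A", "C", "G", "T")

def _keep_pair(ref_base, alt_base):
    return (len(ref_base) == 1 and len(alt_base) == 1
            and ref_base in nucleotides and alt_base in nucleotides)

assert _keep_pair("A", "T")
assert not _keep_pair("A", "TT")  # indel
assert not _keep_pair("N", "T")   # ambiguous reference base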