def read_metrics_csv(self, cndata):
    """Read per-cell metrics from the ``self.metrics`` file.

    A row is kept only when its ``cell_id`` is present in ``cndata.index``
    and, if ``self.cellcalls`` is non-empty, when its ``self.color_by_col``
    value is one of ``self.cellcalls``.

    :param cndata: object exposing an ``index`` of cell ids to keep
        (presumably a pandas DataFrame -- confirm against the caller)
    :returns: tuple ``(data, sepdata, colordata, numread_data,
        reads_per_bin_data)`` where

        * ``data`` maps cell id -> ``mad_neutral_state`` (``"NA"`` -> NaN)
        * ``sepdata`` maps the ``self.plot_by_col`` value (or ``'all'``)
          -> list of cell ids
        * ``colordata`` maps cell id -> ``self.color_by_col`` value
        * ``numread_data`` maps cell id -> ``total_mapped_reads_hmmcopy``
        * ``reads_per_bin_data`` maps cell id ->
          ``median_hmmcopy_reads_per_bin`` (``"NA"`` -> 0)
    :raises AssertionError: if the file's header line does not match the
        columns reported by ``csvutils.get_metadata``
    """
    samples = cndata.index

    data = {}
    numread_data = {}
    reads_per_bin_data = {}
    sepdata = defaultdict(list)
    colordata = {}

    header, _, columns = csvutils.get_metadata(self.metrics)
    idxs = self.build_label_indices(columns)

    color_col = self.color_by_col
    sep_col = self.plot_by_col

    with helpers.getFileHandle(self.metrics) as freader:
        if header:
            # Read the header outside the assert: under ``python -O`` assert
            # statements are removed entirely, which previously meant the
            # header line was never consumed and was parsed as a data row.
            # NOTE(review): the header is split on ',' while data rows are
            # split on self.sep -- confirm the two always agree.
            header_fields = freader.readline().strip().split(',')
            assert header_fields == columns

        for raw_line in freader:
            fields = raw_line.strip().split(self.sep)

            sample_id = fields[idxs['cell_id']]

            # skip samples that are just na or inf
            if sample_id not in samples:
                continue

            val = fields[idxs["mad_neutral_state"]]
            val = float('nan') if val == "NA" else float(val)

            ec = 'all' if sep_col == 'all' else fields[idxs[sep_col]]
            cc = fields[idxs[color_col]]

            numreads = float(fields[idxs['total_mapped_reads_hmmcopy']])

            reads_per_bin = fields[idxs['median_hmmcopy_reads_per_bin']]
            reads_per_bin = (
                0 if reads_per_bin == "NA" else float(reads_per_bin)
            )

            # Restrict to the requested cell calls, when any were given.
            if self.cellcalls and cc not in self.cellcalls:
                continue

            numread_data[sample_id] = numreads
            data[sample_id] = val
            reads_per_bin_data[sample_id] = reads_per_bin
            colordata[sample_id] = cc
            sepdata[ec].append(sample_id)

    return data, sepdata, colordata, numread_data, reads_per_bin_data
def parse_segs(self, segs, metrics):
    """Parse hmmcopy segments data.

    :param segs: path to hmmcopy segs file
    :param metrics: mapping indexed by cell id; rows whose
        ``metrics[cell_id]`` is greater than ``self.quality_threshold``
        are skipped
    :returns: dict mapping cell id ->
        ``[cell_id, chrom, start, end, segment_length, state]``; ``chrom``,
        ``start``, ``end`` and ``state`` are the raw strings from the file
        and ``segment_length`` is ``end - start + 1`` as an int
    :raises AssertionError: if the file's header line does not match the
        columns reported by ``csvutils.get_metadata``
    """
    header_flag, _, columns = csvutils.get_metadata(segs)
    header = {name: position for position, name in enumerate(columns)}

    segs_data = {}

    with helpers.getFileHandle(segs) as segfile:
        if header_flag:
            # Read the header outside the assert: under ``python -O`` assert
            # statements are removed entirely, which previously left the
            # header line in the stream to be parsed as a segment row.
            header_fields = segfile.readline().strip().split(',')
            assert header_fields == columns

        for raw_row in segfile:
            row = raw_row.strip().split(',')

            chrom = row[header["chr"]]
            start = row[header["start"]]
            end = row[header["end"]]
            cell_id = row[header["cell_id"]]
            state = row[header["state"]]

            # float to handle scientific notation
            segment_length = int(float(end)) - int(float(start)) + 1

            if metrics[cell_id] > self.quality_threshold:
                continue

            # NOTE(review): keyed by cell_id only, so each later row for the
            # same cell replaces the earlier one -- confirm this is intended.
            segs_data[cell_id] = [
                cell_id, chrom, start, end, segment_length, state
            ]

    return segs_data
def read_segs_csv(self):
    """Read ``self.input`` and build a per-cell matrix of one column's values.

    For every row the value of ``self.column_name`` is stored per cell id
    and ``(chr, start, end)`` segment; ``"NA"`` becomes NaN. When
    ``self.mappability_threshold`` is truthy, values on rows whose ``map``
    column is at or below the threshold are replaced with NaN (a threshold
    of 0 therefore disables masking).

    :returns: the result of ``self.get_pandas_dataframe`` applied to the
        matrix built by ``self.conv_to_matrix`` over the sorted cell ids
        and the bins ordered by ``self.sort_bins_csv``
    :raises AssertionError: if the file's header line does not match the
        columns reported by ``csvutils.get_metadata``
    :raises Exception: if the same segment appears twice for one cell
    """
    data = {}
    bins = {}

    header, _, columns = csvutils.get_metadata(self.input)

    with helpers.getFileHandle(self.input, 'rt') as freader:
        idxs = self.build_label_indices(columns)

        if header:
            # Read the header outside the assert: under ``python -O`` assert
            # statements are removed entirely, which previously left the
            # header line in the stream to be parsed as a data row.
            # NOTE(review): the header is split on ',' while data rows are
            # split on self.sep -- confirm the two always agree.
            header_fields = freader.readline().strip().split(',')
            assert header_fields == columns

        for raw_line in freader:
            fields = raw_line.strip().split(self.sep)

            sample_id = fields[idxs['cell_id']]

            val = fields[idxs[self.column_name]]
            val = float('nan') if val == "NA" else float(val)

            chrom = fields[idxs['chr']]
            start = int(fields[idxs['start']])
            end = int(fields[idxs['end']])
            seg = (chrom, start, end)

            # Mask values from low-mappability bins.
            if (self.mappability_threshold
                    and float(fields[idxs["map"]])
                    <= self.mappability_threshold):
                val = float("nan")

            bins.setdefault(chrom, set()).add((start, end))

            cell_values = data.setdefault(sample_id, {})

            # just a sanity check, not required
            if seg in cell_values:
                raise Exception("repeated val")

            cell_values[seg] = val

    samples = sorted(data)
    bins = self.sort_bins_csv(bins)

    data = self.conv_to_matrix(data, bins, samples)
    data = self.get_pandas_dataframe(data, bins)

    return data