import numpy as np
import pandas as pd
from scipy.spatial import distance_matrix

# assumption: `h` is this repo's helper module, providing h.showcount(i, total)
# to print a simple progress counter
from genepy.utils import helper as h


def findClosestMatching(repprofiles, goodprofile, closest=False, returncorr=False):
    """
    finds, for each replicate, the known profile it matches best using numpy's corrcoef

    Args:
    -----
      repprofiles: dataframe of the new expression profiles to test; dfs should be SAMPLESxGENE
      goodprofile: dataframe of the known expression profiles
      closest: bool whether to return the closest match, or only the one that matches perfectly, if any
      returncorr: bool whether to also return the full correlation matrix

    Returns:
    --------
      match: dict(id:id) mapping each sample to its closest matching sample
      corr: dataframe of correlations, if requested
    """
    match = {}
    a = list(set(repprofiles.columns) & set(goodprofile.columns))
    ind = goodprofile.index.tolist()
    corr = []
    for i, (k, v) in enumerate(repprofiles[a].iterrows()):
        h.showcount(i, len(repprofiles))
        res = np.array([np.corrcoef(v, w)[0, 1] for _, w in goodprofile[a].iterrows()])
        if max(res) == 1 or closest:
            match[k] = ind[np.argmax(res)]
        if returncorr:
            corr.append(res)
    if returncorr:
        corr = pd.DataFrame(data=corr, index=repprofiles.index.tolist(),
                            columns=goodprofile.index.tolist())
        return match, corr
    else:
        return match
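
# Usage sketch (hypothetical frames; assumes both matrices are SAMPLESxGENE
# with shared gene columns, per the docstring above):
#
#   match, corr = findClosestMatching(new_tpm, known_tpm, closest=True,
#                                     returncorr=True)
#   # match: {new_sample_id: closest_known_sample_id}
#   # corr:  len(new_tpm) x len(known_tpm) dataframe of Pearson correlations
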
def mergeSplicingVariants(df, defined='.'):
    """
    merges splicing-variant rows (e.g. ENSG01.1, ENSG01.2) into one gene-level row by summing

    Args:
    -----
      df: dataframe indexed by variant names
      defined: str separator between the gene name and the variant suffix

    Returns:
    --------
      the dataframe with variant rows merged into gene-level rows
    """
    df = df.sort_index()
    foundpoint = None
    tomerge = []
    torename = {}
    todrop = []

    def flush():
        # merge (or rename) the group of variants collected in tomerge
        if foundpoint not in df.index:
            if len(tomerge) > 1:
                df.loc[foundpoint] = df.loc[tomerge].sum()
                todrop.extend(tomerge)
            else:
                # a single variant: just rename it to the gene name
                torename.update({tomerge[0]: foundpoint})
        else:
            # the bare gene name already exists as a row: sum it in as well
            todrop.extend(tomerge)
            df.loc[foundpoint] = df.loc[tomerge + [foundpoint]].sum()

    for i, v in enumerate(df.index.tolist()):
        h.showcount(i, len(df))
        if foundpoint is not None:
            if foundpoint in v:
                tomerge.append(v)
                continue
            flush()
            foundpoint = None
        if defined in v:
            # start a new group at this row (a new variant can also immediately
            # follow a finished group)
            foundpoint = v.split(defined)[0]
            tomerge = [v]
    if foundpoint is not None:
        # flush the trailing group at the end of the index
        flush()
    if len(torename) > 0:
        df = df.rename(index=torename)
    df = df.drop(index=todrop)
    return df
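
# Worked example (hypothetical counts matrix; a sketch of the merge behavior):
#
#   counts = pd.DataFrame({"s1": [1, 2, 4]},
#                         index=["ENSG01.1", "ENSG01.2", "ENSG02.1"])
#   mergeSplicingVariants(counts)
#   # -> "ENSG01" holds 1 + 2 = 3, and lone "ENSG02.1" is renamed to "ENSG02"
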
def findLikelyDup(tracker, name='stripped_cell_line_name',
                  signs=['-', '_', '.', ' '], arxspid='arxspan_id',
                  looksub=True):
    """
    finds cell lines that are likely to be duplicates of one another

    Args:
    -----
      tracker: dataframe of the sample tracker
      name: str colname containing the cell line names
      signs: list[str] characters ignored when comparing names
      arxspid: str colname containing the arxspan ids
      looksub: bool whether to also flag a name contained within another name
        (can flag many derivatives)

    Returns:
    --------
      a list[tuple(str,str)] of likely duplicate names, as tuples (rh13, RH-13)
      a list[tuple(str,str)] of the associated arxspan ids
      a dict[str:set(str)] of names that have multiple arxspan ids associated
    """
    names = set(tracker[name])
    simi = []
    arxsp = []
    issues = {}
    for i, name1 in enumerate(names):
        h.showcount(i, len(names))
        n1 = name1
        for s in signs:
            name1 = name1.replace(s, '')
        name1 = name1.upper()
        for name2 in names - set([n1]):
            n2 = name2
            for s in signs:
                name2 = name2.replace(s, '')
            name2 = name2.upper()
            # once separators are stripped, an exact match is a likely dup;
            # with looksub, substring matches at most 1 char apart also count
            if name1 == name2 or (looksub and (name1 in name2 or name2 in name1)
                                  and abs(len(name1) - len(name2)) < 2):
                if (n1, n2) not in simi and (n2, n1) not in simi:
                    simi.append((n1, n2))
                    arxsp.append((tracker[tracker[name] == n1][arxspid].values[0],
                                  tracker[tracker[name] == n2][arxspid].values[0]))
    for val in set(tracker[name]):
        v = set(tracker[tracker[name] == val][arxspid])
        if len(v) > 1:
            issues.update({val: v})
    return simi, arxsp, issues
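
# Usage sketch (hypothetical tracker and ids; output shapes only):
#
#   simi, arxsp, issues = findLikelyDup(tracker)
#   # simi:   [("rh13", "RH-13"), ...]              likely-duplicate name pairs
#   # arxsp:  [("ACH-000001", "ACH-000002"), ...]   their arxspan ids
#   # issues: {"RH-13": {"ACH-000001", "ACH-000002"}}  names with several ids
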
def manageGapsInSegments(segtocp, Chromosome='Chromosome', End="End",
                         Start="Start", cyto=None):
    """
    extends the ends of segments in a segment file from GATK so as to remove
    all gaps over the genome (works with multi-sample files)

    Args:
    ----
      segtocp: dataframe of segments from the GATK CN pipeline
      Chromosome: str name of the Chromosome column
      End: str name of the End column
      Start: str name of the Start column
      cyto: dataframe with 'chrom' and 'end' columns giving the size of each
        chromosome (else the last segment is extended to 1000000000)

    Returns:
    --------
      the dataframe with all gaps between segments removed
    """
    prevchr = ''
    prevend = 0
    count = 0
    newsegs = []
    segments = segtocp.copy()
    le = len(segments)
    for k, val in segments.iterrows():
        h.showcount(count, le)
        count += 1
        if val[Chromosome] != prevchr:  # we changed chromosome
            # we extend the previous segment (the last of the previous
            # chromosome) to the end of that chromosome
            if len(newsegs) > 0:
                newsegs[-1][2] = 1000000000 if cyto is None else cyto[
                    cyto['chrom'] == prevchr]['end'].values[-1]
            # we extend the first segment to 0
            newsegs.append([val[Chromosome], 0, val[End]])
        else:
            if val[Start] > prevend + 1:  # we have a gap in the same chrom
                sizeofgap = val[Start] - prevend
                # we add half of the gap to the previous segment...
                newsegs[-1][2] += int(sizeofgap / 2) if sizeofgap % 2 == 0 else int(
                    sizeofgap / 2) + 1
                # ...and the rest to the current one
                newsegs.append([val[Chromosome], val[Start] - int(sizeofgap / 2),
                                val[End]])
            elif val[Start] < prevend:  # this should never happen
                raise ValueError(
                    "segments overlap: a segment starts before the previous one ends")
            else:
                newsegs.append([val[Chromosome], val[Start], val[End]])
        prevchr = val[Chromosome]
        prevend = val[End]
    # we extend the last segment to the end of its chromosome
    newsegs[-1][2] = 1000000000 if cyto is None else cyto[
        cyto['chrom'] == prevchr]['end'].values[-1]
    segments[[Chromosome, Start, End]] = newsegs
    return segments.reset_index(drop=True)
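
# Worked example (hypothetical single-sample file, no cyto):
#
#   segs = pd.DataFrame({"Chromosome": ["1", "1"],
#                        "Start": [100, 201],
#                        "End": [150, 300]})
#   manageGapsInSegments(segs)
#   # -> ("1", 0, 176) and ("1", 176, 1000000000): the gap between 150 and 201
#   #    is split between the two segments, the first start drops to 0 and the
#   #    last end is pushed to 1000000000 since no cyto dataframe was given
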
def computeDistsFromClass(dots, seconddots, conds=['DMSO', 'VHL'],
                          groupcol="group", sclass='green',
                          signal="mean_green", area="area"):
    """
    computes, for each experiment and condition, the distance of every dot of a
    given class to the center of each matching object in seconddots

    Args:
    -----
      dots: dataframe of dots with exp, treat, class, <groupcol>, x, y, z,
        <signal>, <area> and m_id columns
      seconddots: dataframe of reference objects with exp, treat, <groupcol>,
        x_mean, y_mean and z_mean columns
      conds: list[str] the treatment conditions to process
      groupcol: str colname used to match dots to their reference object
      sclass: str the dot class to compute distances for
      signal: str colname of the signal used as weight
      area: str colname of the dot area

    Returns:
    --------
      twodists: dict[str:dataframe] per exp+cond, the dots recentered on their
        reference object (x, y, z, signal, area, m_id)
      dists: dict[str:list] per exp+cond, [distances, weights] as flat arrays
    """
    dists = {}
    twodists = {}
    for val in set(dots.exp):
        for e in conds:
            d = dots[(dots.exp == val) & (dots.treat == e)]
            dist = []
            weight = []
            newdist = []
            ind = []
            m = seconddots[(seconddots.exp == val) & (seconddots.treat == e)]
            print(val, e)
            for i, (k, v) in enumerate(m.iterrows()):
                h.showcount(i, len(m))
                # distance of each dot in the group to its reference center
                dist.append(distance_matrix(
                    d[(d['class'] == sclass) & (d[groupcol] == v[groupcol])][
                        ['x', "y", "z"]].values,
                    np.array([v[['x_mean', "y_mean", "z_mean"]]])).T[0].astype(float))
                weight.append(d[(d['class'] == sclass) &
                                (d[groupcol] == v[groupcol])][signal])
                dat = d[(d['class'] == sclass) & (d[groupcol] == v[groupcol])][[
                    'x', "y", "z", signal, area, "m_id"]]
                # recenter the dots on the reference object's mean position
                a = dat.values
                a[:, :3] = a[:, :3] - v[['x_mean', "y_mean", "z_mean"]].values
                newdist.append(a)
                ind.extend(dat.index.tolist())
            twodists[val + e] = pd.DataFrame(data=np.vstack(newdist),
                                             columns=['x', 'y', 'z', signal,
                                                      area, "m_id"],
                                             index=ind)
            dists[val + e] = [np.hstack(dist), np.hstack(weight)]
    return twodists, dists
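
# Usage sketch (hypothetical frames matching the columns listed above):
#
#   twodists, dists = computeDistsFromClass(dots, centers, conds=['DMSO', 'VHL'])
#   # twodists["exp1DMSO"]: green dots recentered on their group's center
#   # dists["exp1DMSO"]:    [distances, weights], e.g. for a weighted histogram
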
def fromGTF2BED(gtfname, bedname, gtftype='geneAnnot'):
    """
    transforms a gtf file into a bed file

    Args:
    ----
      gtfname: filepath to the gtf file
      bedname: filepath to the bed file (written without its .bed extension)
      gtftype: only 'geneAnnot' for now

    Returns:
    --------
      newbed: the bed file as a pandas.df
    """
    if gtftype == 'geneAnnot':
        gtf = pd.read_csv(gtfname, sep='\t', header=0, names=[
            "chr", "val", "type", "start", 'stop', 'dot', 'strand', 'loc', 'name'
        ])
        gtf['name'] = [
            i.split('gene_id "')[-1].split('"; trans')[0] for i in gtf['name']
        ]
        prevname = ''
        newbed = {'chr': [], 'start': [], 'end': [], 'gene': []}
        for i, val in gtf.iterrows():
            h.showcount(i, len(gtf))
            if val['name'] == prevname:
                # same gene as the previous line: just extend the interval
                newbed['end'][-1] = val['stop']
            else:
                newbed['chr'].append(val['chr'])
                newbed['start'].append(val['start'])
                newbed['end'].append(val['stop'])
                newbed['gene'].append(val['name'])
            prevname = val['name']
        newbed = pd.DataFrame(newbed)
        newbed = newbed[~newbed.chr.str.contains('_fix')]
        newbed.to_csv(bedname + ".bed", sep='\t', index=None)
        newbed.to_csv(bedname + "_genes.bed", sep='\t', index=None)
        return newbed
    else:
        raise ValueError("only gtftype='geneAnnot' is supported for now")
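
# Usage sketch ('annot.gtf' is a hypothetical gene-annotation gtf):
#
#   bed = fromGTF2BED("annot.gtf", "annot")
#   # writes annot.bed and annot_genes.bed, and returns the chr/start/end/gene df
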
def mafToMat(maf, mode="bool", freqcol='tumor_f', samplesCol="DepMap_ID",
             mutNameCol="Hugo_Symbol", minfreqtocall=0.2):
    """
    turns a maf file into a mutation x sample matrix (works with multi-sample files)

    Args:
    -----
      maf: dataframe of the maf file
      mode: flag
        "bool" to convert the matrix into a boolean (mut/no mut)
        "float" to keep the allele frequencies as is (0.x)
        "genotype" to get 0/1/2/3 calls from the summed allele frequencies
      freqcol: str colname where ref/alt frequencies are stored
      samplesCol: str colname for samples
      mutNameCol: str colname where mutation names are stored; things will be
        merged over that column
      minfreqtocall: float minimum allele frequency for a mutation to be kept

    Returns:
    --------
      the dataframe matrix
    """
    samples = set(maf[samplesCol])
    maf = maf[maf[freqcol] >= minfreqtocall]
    maf = maf.sort_values(by=mutNameCol)
    # start from a throwaway 'fake' column so every mutation name is present
    mut = pd.DataFrame(data=np.zeros((len(set(maf[mutNameCol])), 1)),
                       columns=['fake'],
                       index=sorted(set(maf[mutNameCol]))).astype(float)
    for i, val in enumerate(samples):
        h.showcount(i, len(samples))
        if mode == "genotype":
            mut = mut.join(maf[maf[samplesCol] == val].set_index(mutNameCol)[
                freqcol].groupby(mutNameCol).agg('sum').rename(val))
        else:
            mut = mut.join(maf[maf[samplesCol] == val].drop_duplicates(
                mutNameCol).set_index(mutNameCol)[freqcol].rename(val))
    mut = mut.fillna(0).astype(
        bool if mode == "bool" else float).drop(columns=['fake'])
    if mode == "genotype":
        # bin the summed frequencies into calls
        mut[(mut > 1.3)] = 3
        mut[(mut >= 0.7) & (mut <= 1.3)] = 2
        mut[(mut > .3) & (mut < .7)] = 1
        mut[mut <= .3] = 0
    return mut
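
# Usage sketch (hypothetical maf; one call per mode):
#
#   mafToMat(maf, mode="bool")      # mutations x samples, True if mutated
#   mafToMat(maf, mode="float")     # the allele frequency of each mutation
#   mafToMat(maf, mode="genotype")  # 0/1/2/3 calls from summed frequencies
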
def substractPeaksTo(peaks, loci, bp=50):
    """
    removes all peaks that are not within a bp distance to a set of loci

    both dataframes must be sorted by chrom, then position

    Args:
    ----
      peaks: a bed file df with at least chrom, start, end columns
      loci: a df with a chrom & loci column
      bp: the max allowed distance to the loci

    Returns:
    -------
      all the peaks that are within this distance
    """
    i = 0
    j = 0
    keep = []
    while j < len(peaks) and i < len(loci):
        h.showcount(j, len(peaks))
        if peaks.iloc[j].chrom > loci.iloc[i].chrom:
            i += 1
            continue
        if peaks.iloc[j].chrom < loci.iloc[i].chrom:
            j += 1
            continue
        if peaks.iloc[j].start - bp > loci.iloc[i].loci:
            i += 1
            continue
        if peaks.iloc[j].end + bp < loci.iloc[i].loci:
            j += 1
            continue
        # the locus falls within [start - bp, end + bp]: keep this peak
        keep.append(j)
        j += 1
    return peaks.iloc[sorted(set(keep))]
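
# Usage sketch (hypothetical frames; both must be sorted the same way):
#
#   peaks = peaks.sort_values(["chrom", "start"]).reset_index(drop=True)
#   loci = loci.sort_values(["chrom", "loci"]).reset_index(drop=True)
#   nearby = substractPeaksTo(peaks, loci, bp=100)
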
def toGeneMatrix(segments, gene_mapping, style='weighted', missingchrom=['Y'],
                 gene_names_col='gene_name'):
    """
    makes a gene x sample matrix from segment-level copy number (works with
    multi-sample files)

    Args:
    ----
      segments: dataframe of segments containing [Chromosome, Start, End,
        Segment_Mean] columns, sorted by position
      gene_mapping: dataframe with Chromosome, start, end and a gene name
        column for each gene, sorted by position
      style: str one of "weighted", "mean", "closest"
      missingchrom: list[str] chromosomes not to look into
      gene_names_col: str colname of gene_mapping holding the gene names

    Returns:
    -------
      pd.dataframe: the matrix
    """
    samples = list(set(segments.DepMap_ID))
    data = np.zeros((len(samples), len(gene_mapping)))
    for i, sample in enumerate(samples):
        segs = segments[segments.DepMap_ID == sample][[
            'Chromosome', 'Start', 'End', "Segment_Mean"]].values
        hasmissing = set(missingchrom) - set(segs[:, 0])
        j = 0
        h.showcount(i, len(samples))
        for k, gene in enumerate(gene_mapping[['Chromosome', 'start', 'end']].values):
            if gene[0] in hasmissing:
                data[i, k] = np.nan
                continue
            try:
                # advance to the first segment that can contain the gene
                while gene[0] != segs[j][0] or gene[1] >= segs[j][2]:
                    j += 1
            except IndexError:
                raise ValueError('forgot to sort one of the DFs?')
            # some genes are within other genes: we then need to go back in the
            # list of segments
            while gene[1] < segs[j][1]:
                j -= 1
            c = 1
            if gene[2] <= segs[j][2]:
                # the gene lies entirely within the segment
                data[i, k] = segs[j][3]
            else:
                # how much of the gene is covered by the segment
                coef = (segs[j][2] - gene[1]) / (gene[2] - gene[1])
                val = segs[j][3] * coef if style == "weighted" else segs[j][3]
                end = segs[j][2]
                # walk over segments until one reaches past the end of the gene
                # (there may be several segments within the gene)
                while end < gene[2]:
                    j += 1
                    c += 1
                    nextend = segs[j][2] if segs[j][2] < gene[2] else gene[2]
                    # here, end (of the previous segment) is this segment's start
                    ncoef = (nextend - end) / (gene[2] - gene[1])
                    if style == "closest":
                        if ncoef > coef:
                            val = segs[j][3]
                        else:
                            # we switch it back so coef keeps the best coverage seen
                            ncoef = coef
                    else:
                        val += (segs[j][3] * ncoef) if style == "weighted" else segs[j][3]
                    end = segs[j][2]
                    coef = ncoef
                # "closest" keeps a single segment's value, so only "mean"
                # divides by the number of segments
                data[i, k] = val if style in ("weighted", "closest") else val / c
    return pd.DataFrame(data=data, index=samples,
                        columns=gene_mapping[gene_names_col])
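
# Usage sketch (hypothetical inputs; one call per style):
#
#   toGeneMatrix(segments, gene_mapping, style="weighted")  # coverage-weighted mean
#   toGeneMatrix(segments, gene_mapping, style="mean")      # plain mean over segments
#   toGeneMatrix(segments, gene_mapping, style="closest")   # widest-covering segment's value
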