Exemple #1
0
def findClosestMatching(repprofiles, goodprofile, closest=False, returncorr=False):
  """
  finds, for each replicate, the known profile it correlates best with (numpy's corrcoef)

  Only the columns present in both dataframes are used for the correlation.

  Args:
  -----
    repprofiles: dataframe the new expression profiles to test: should be SAMPLESxGENE
    goodprofile: dataframe the known expression profiles: should be SAMPLESxGENE
    closest: bool whether to return the closest matching profile, or only a profile
      that matches perfectly (correlation exactly equal to 1), if any
    returncorr: bool to also return the full correlation matrix

  Returns:
  --------
    match: dict(id:id) for each repprofiles sample, the goodprofile sample it matches
      (samples without a perfect match are absent when closest is False)
    corr: dataframe of correlations (repprofiles samples x goodprofile samples),
      only returned if returncorr is True
  """
  match = {}
  # an Index (not a set) of the shared columns: pandas does not accept sets as indexers
  shared = repprofiles.columns.intersection(goodprofile.columns)
  ind = goodprofile.index.tolist()
  # the known profiles do not change between replicates: slice and iterate them once
  knownrows = [w for _, w in goodprofile[shared].iterrows()]
  corr = []
  for i, (k, v) in enumerate(repprofiles[shared].iterrows()):
    h.showcount(i, len(repprofiles))
    res = np.array([np.corrcoef(v, w)[0, 1] for w in knownrows])
    if max(res) == 1 or closest:
      match[k] = ind[np.argmax(res)]
    if returncorr:
      corr.append(res)
  if returncorr:
    corr = pd.DataFrame(data=corr, index=repprofiles.index.tolist(),
                        columns=goodprofile.index.tolist())
    return match, corr
  return match
Exemple #2
0
def mergeSplicingVariants(df, defined='.'):
  """
  merges the rows of splicing variants (e.g. GENE.1, GENE.2) into one row per gene

  A row is a variant when its index name contains `defined`; its gene is the part
  of the name before the first `defined`.

  Args:
  -----
    df: dataframe with variant/gene names as index and summable values
    defined: str the separator between the gene name and the variant suffix

  Returns:
  --------
    a new, index-sorted dataframe where, for each gene:
      - if the gene already has its own row, the variants are summed into it and dropped
      - else if it has several variants, they are replaced by one summed row
        (appended at the end of the dataframe)
      - else its single variant is renamed to the gene name
  """
  df = df.sort_index()
  names = df.index.tolist()
  present = set(names)
  # group the variants by exact gene name. Comparing the extracted prefix (rather
  # than testing whether one name contains another) keeps unrelated genes apart and
  # makes sure no group is skipped, including the last one.
  variants = {}
  for i, v in enumerate(names):
    h.showcount(i, len(names))
    if defined in v:
      gene = v.split(defined)[0]
      if gene:  # a name starting with the separator has no gene part: left untouched
        variants.setdefault(gene, []).append(v)
  torename = {}
  todrop = []
  for gene, tomerge in variants.items():
    if gene in present:
      df.loc[gene] = df.loc[tomerge + [gene]].sum()
      todrop.extend(tomerge)
    elif len(tomerge) > 1:
      df.loc[gene] = df.loc[tomerge].sum()
      todrop.extend(tomerge)
    else:
      torename[tomerge[0]] = gene
  if len(torename) > 0:
    df = df.rename(index=torename)
  df = df.drop(index=todrop)
  return df
Exemple #3
0
def findLikelyDup(tracker,
                  name='stripped_cell_line_name',
                  signs=('-', '_', '.', ' '),
                  arxspid='arxspan_id',
                  looksub=True):
    """
    find cell lines that are likely to be duplicates

    Two names are flagged when they differ but become identical once every
    string in `signs` is removed and the result is upper-cased.

    Args:
    -----
      tracker: dataframe of the sample tracker
      name: str the column holding the cell line names
      signs: iterable[str] the substrings removed from names before comparing them
      arxspid: str the column holding the arxspan ids
      looksub: bool, kept for backward compatibility.
        NOTE(review): it never changed the result, as the substring test was only
        evaluated for names that were already identical after normalization.

    Returns:
    --------
      a list[tuples(str,str)] of likely duplicate names as tuples (rh13, RH-13)
      a list[tuples(str,str)] of associated arxspan ids (first row found for each name)
      a dict[str:set(str)] of names that have multiple arxspan ids associated
    """
    names = set(tracker[name])
    # normalize each name once and bucket the names by normalized form, instead of
    # re-normalizing every name against every other one
    groups = {}
    for i, raw in enumerate(names):
        h.showcount(i, len(names))
        key = raw
        for s in signs:
            key = key.replace(s, '')
        groups.setdefault(key.upper(), []).append(raw)

    def firstid(raw):
        # positional access: the tracker's index labels are arbitrary
        return tracker[tracker[name] == raw][arxspid].iloc[0]

    simi = []
    arxsp = []
    for group in groups.values():
        # every unordered pair of distinct names sharing a normalized form, once
        for pos, n1 in enumerate(group):
            for n2 in group[pos + 1:]:
                simi.append((n1, n2))
                arxsp.append((firstid(n1), firstid(n2)))
    issues = {}
    for val in names:
        v = set(tracker[tracker[name] == val][arxspid])
        if len(v) > 1:
            issues[val] = v
    return simi, arxsp, issues
Exemple #4
0
def manageGapsInSegments(segtocp, Chromosome='Chromosome', End="End", Start="Start", cyto=None):
  """
  extends the ends of segments in a segment file from GATK so as to remove all gaps over the genome (works with multiple sample file)

  Rows are processed in order: they are expected to be sorted by chromosome and
  position, with a change of chromosome between two consecutive samples.

  Args:
  ----
    segtocp: dataframe of segments from GATK CN pipeline
    Chromosome: str the value for the Chromosome columns
    End: str the value for the End columns
    Start: str the value for the Start columns
    cyto: dataframe with chrom;end; columns giving the size of each chromosome (else puts last segment to 1000000000)

  Returns:
  -------
    a copy of the dataframe, with a reset index, where the first segment of each
    chromosome starts at 0, the last one ends at the chromosome's end, and each gap
    between two segments is split between them

  Raises:
  ------
    ValueError: if a segment starts before the end of the previous segment of the same chromosome
  """
  segments = segtocp.copy()
  le = len(segments)
  if le == 0:
    # nothing to extend
    return segments.reset_index(drop=True)

  def chromend(chrom):
    # where the last segment of a chromosome is extended to
    if cyto is None:
      return 1000000000
    return cyto[cyto['chrom'] == chrom]['end'].values[-1]

  prevchr = ''
  prevend = 0
  l = []
  for count, (_, val) in enumerate(segments.iterrows()):
    h.showcount(count, le)
    if val[Chromosome] != prevchr:  # we changed chromosome
      # we extend the previous segment (last of the prev chrom) to the chromosome's end
      if len(l) > 0:
        l[-1][2] = chromend(prevchr)
      # we extend the first segment to 0
      l.append([val[Chromosome], 0, val[End]])
    elif val[Start] > prevend + 1:  # we have a gap in the same chrom
      sizeofgap = val[Start] - prevend
      half = int(sizeofgap // 2)
      # we add to the previous one half of the gap (rounded up)
      l[-1][2] += half if sizeofgap % 2 == 0 else half + 1
      # the rest to the other
      l.append([val[Chromosome], val[Start] - half, val[End]])
    elif val[Start] < prevend:  # this should never happen
      raise ValueError(
          "segment starting at {} on chromosome {} starts before the end of the "
          "previous segment ({}): are the segments sorted?".format(
              val[Start], val[Chromosome], prevend))
    else:
      l.append([val[Chromosome], val[Start], val[End]])
    prevchr = val[Chromosome]
    prevend = val[End]
  # we extend the last one
  l[-1][2] = chromend(prevchr)
  segments[[Chromosome, Start, End]] = l
  return segments.reset_index(drop=True)
Exemple #5
0
def computeDistsFromClass(dots,
                          seconddots,
                          conds=('DMSO', 'VHL'),
                          groupcol="group",
                          sclass='green',
                          signal="mean_green",
                          area="area"):
    """
    computes, for each experiment and condition, the positions and distances of
    the dots of one class relative to the center of their group

    Args:
    -----
      dots: dataframe with exp, treat, class, x, y, z, m_id columns as well as the
        `groupcol`, `signal` and `area` columns
      seconddots: dataframe with exp, treat, x_mean, y_mean, z_mean columns and the
        `groupcol` column; each row is used as the center of one group
      conds: iterable[str] the values of the treat column to go through
      groupcol: str the column linking a row of dots to a row of seconddots
      sclass: str the value of the class column selecting the dots to use
      signal: str the column of dots used as weight
      area: str the column of dots holding the area

    Returns:
    --------
      twodists: dict[str: dataframe] keyed by exp+treat, the selected dots with their
        x, y, z expressed relative to their group's center, plus signal, area, m_id
      dists: dict[str: list] keyed by exp+treat, [distances to the center, weights]
        (both are empty when there is no matching row in seconddots)
    """
    coords = ['x', "y", "z"]
    centers = ['x_mean', "y_mean", "z_mean"]
    outcols = coords + [signal, area, "m_id"]
    dists = {}
    twodists = {}
    for val in set(dots.exp):
        for e in conds:
            d = dots[(dots.exp == val) & (dots.treat == e)]
            # the class restriction is the same for every group: applied once
            ofclass = d[d['class'] == sclass]
            dist = []
            weight = []
            newdist = []
            ind = []
            m = seconddots[(seconddots.exp == val) & (seconddots.treat == e)]
            print(val, e)
            for i, (k, v) in enumerate(m.iterrows()):
                h.showcount(i, len(m))
                ingroup = ofclass[ofclass[groupcol] == v[groupcol]]
                dist.append(
                    distance_matrix(ingroup[coords].values,
                                    np.array([v[centers]])).T[0].astype(float))
                weight.append(ingroup[signal])
                # explicit copy: the array is modified below and `.values` can be
                # a read-only view of the dataframe
                a = ingroup[outcols].values.copy()
                a[:, :3] = a[:, :3] - v[centers].values
                newdist.append(a)
                ind.extend(ingroup.index.tolist())
            if newdist:
                stacked = np.vstack(newdist)
                alldist = np.hstack(dist)
                allweight = np.hstack(weight)
            else:
                # no group for this experiment/condition: empty results rather
                # than a failure when stacking empty lists
                stacked = np.empty((0, len(outcols)))
                alldist = np.array([], dtype=float)
                allweight = np.array([], dtype=float)
            twodists[val + e] = pd.DataFrame(data=stacked,
                                             columns=outcols,
                                             index=ind)
            dists[val + e] = [alldist, allweight]
    return twodists, dists
Exemple #6
0
def fromGTF2BED(gtfname, bedname, gtftype='geneAnnot'):
    """
    converts a gtf file into a bed file with one line per run of consecutive rows of the same gene

    The file is read as tab separated; its first line is consumed as a header row
    and replaced by the column names used here. The gene name is what lies between
    'gene_id "' and '"; trans' in the last column. A bed line starts at the start
    of the first row of a run and ends at the stop of its last row. Chromosomes
    whose name contains '_fix' are removed. The same table is written, tab
    separated and without index, to bedname + ".bed" and bedname + "_genes.bed".

    Args:
    ----
      gtfname: filepath to gtf file
      bedname: filepath to bedfile, without extension
      gtftype: only geneAnnot for now (nothing is done and None is returned otherwise)

    Returns:
    --------
      the bedfile as a pandas.df with chr, start, end, gene columns

    """
    if gtftype != 'geneAnnot':
        return None

    fields = [
        "chr", "val", "type", "start", 'stop', 'dot', 'strand', 'loc', 'name'
    ]
    gtf = pd.read_csv(gtfname, sep='\t', header=0, names=fields)

    def geneid(attributes):
        # keeps what is between 'gene_id "' and '"; trans'
        return attributes.split('gene_id "')[-1].split('"; trans')[0]

    gtf['name'] = [geneid(attributes) for attributes in gtf['name']]

    chroms, starts, ends, genes = [], [], [], []
    nrows = len(gtf)
    lastgene = ''
    for idx, row in gtf.iterrows():
        h.showcount(idx, nrows)
        gene = row['name']
        if gene != lastgene:
            # a new gene starts: open a new bed line
            chroms.append(row['chr'])
            starts.append(row['start'])
            ends.append(row['stop'])
            genes.append(gene)
        else:
            # same gene as the previous row: push the end of its bed line
            ends[-1] = row['stop']
        lastgene = gene

    bed = pd.DataFrame({
        'chr': chroms,
        'start': starts,
        'end': ends,
        'gene': genes
    })
    bed = bed[~bed.chr.str.contains('_fix')]
    for suffix in (".bed", "_genes.bed"):
        bed.to_csv(bedname + suffix, sep='\t', index=None)
    return bed
Exemple #7
0
def mafToMat(maf, mode="bool", freqcol='tumor_f',
             samplesCol="DepMap_ID", mutNameCol="Hugo_Symbol",
             minfreqtocall=0.2):
  """
  turns a maf file into a matrix of mutations x samples (works with multiple sample file)

  Args:
  -----
    maf: dataframe of the maf file
    mode: flag  "bool" to convert the matrix into a boolean (mut/no mut)
                "float" to keep the allele frequencies as is (0.x); if a sample has
                  several rows for a name, the first one (by sorted name) is kept
                "genotype" to sum the frequencies of a name within a sample and
                  bin the sum into 0 (<=0.3), 1 (0.3-0.7), 2 (0.7-1.3) or 3 (>1.3)
    freqcol: str colname where ref/alt frequencies are stored
    samplesCol: str colname for samples
    mutNameCol: str colname where mutation names are stored, will merge things over that column name
    minfreqtocall: float rows with a frequency below this value are ignored

  Returns:
  --------
    the dataframe matrix: one row per mutation name (sorted), one column per sample
    of the input maf (including samples with no mutation left after filtering)
  """
  # taken before filtering so that samples without any called mutation keep a column
  samples = set(maf[samplesCol])
  maf = maf[maf[freqcol] >= minfreqtocall]
  maf = maf.sort_values(by=mutNameCol)
  # unique() keeps the sorted order, which a set would lose
  names = pd.Index(maf[mutNameCol].unique())
  columns = []
  for i, val in enumerate(samples):
    h.showcount(i, len(samples))
    insample = maf[maf[samplesCol] == val]
    if mode == "genotype":
      freqs = insample.groupby(mutNameCol)[freqcol].sum()
    else:
      freqs = insample.drop_duplicates(mutNameCol).set_index(mutNameCol)[freqcol]
    columns.append(freqs.reindex(names).rename(val))
  # one concatenation instead of one join (and one copy of the matrix) per sample
  mut = pd.concat(columns, axis=1) if columns else pd.DataFrame(index=names)
  mut = mut.fillna(0).astype(bool if mode == "bool" else float)
  if mode == "genotype":
    mut[(mut > 1.3)] = 3
    mut[(mut >= 0.7) & (mut <= 1.3)] = 2
    mut[(mut > .3) & (mut < .7)] = 1
    mut[mut <= .3] = 0
  return mut
Exemple #8
0
def substractPeaksTo(peaks, loci, bp=50):
    """
    removes all peaks that are not within a bp distance to a set of loci

    Both dataframes are walked in parallel, in row order: they are expected to be
    sorted by chrom, then by position. Rows are addressed by position, so the
    index of the dataframes does not matter.

    Args:
    ----
      peaks: a bed file df with a chrom, start, end column at least
      loci: a df with a chrom & loci column
      bp: the max allowed distance to the loci

    Returns:
    -------
      all the peaks that are within this distance, in their original order
    """
    pchrom = peaks['chrom'].values
    pstart = peaks['start'].values
    pend = peaks['end'].values
    lchrom = loci['chrom'].values
    lpos = loci['loci'].values
    npeaks = len(peaks)
    nloci = len(loci)
    i = 0
    j = 0
    keep = []
    while j < npeaks and i < nloci:
        h.showcount(j, npeaks)
        if pchrom[j] > lchrom[i]:
            i += 1
        elif pchrom[j] < lchrom[i]:
            j += 1
        elif pstart[j] - bp > lpos[i]:
            # the locus is before the peak: try the next locus
            i += 1
        else:
            # the locus is not before the peak: either the peak reaches it, or the
            # peak is before it. Either way we move on to the next peak, which
            # also guarantees that the loop always progresses.
            if pend[j] + bp >= lpos[i]:
                keep.append(j)
            j += 1
    return peaks.iloc[keep]
Exemple #9
0
def toGeneMatrix(segments, gene_mapping, style='weighted', missingchrom=('Y',), gene_names_col='gene_name'):
  """
  makes a sampleXgene matrix from segment level copy number (works with multiple sample file)

  Segments (per sample) and genes are walked in parallel: both are expected to be
  sorted by chromosome and position, in the same chromosome order.

  Args:
  ----
    style: str one of "weighted","mean","closest": for a gene spanning several segments,
      "weighted": the segment means, weighted by the fraction of the gene they cover
      "mean": the plain mean of the segment means (also used for any other value)
      "closest": the mean of the segment covering the largest fraction of the gene
    segments: dataframe of segments containing: [DepMap_ID, Chromosome, Start, End, Segment_Mean] columns
    gene_mapping: dataframe with Chromosome, start, end columns and the gene_names_col column for each gene
    missingchrom: list[str] chromosomes that can be missing from a sample: their genes
      are set to NaN for the samples that have no segment on them
    gene_names_col: str the column of gene_mapping used to name the matrix' columns

  Returns:
  -------
    pd.dataframe: the matrix, with samples as index and genes as columns

  Raises:
  ------
    ValueError: if the segments are exhausted before a gene is found (unsorted input)
  """
  samples = list(set(segments.DepMap_ID))
  data = np.zeros((len(samples), len(gene_mapping)))
  # the genes are the same for every sample
  genes = gene_mapping[['Chromosome', 'start', 'end']].values
  for i, sample in enumerate(samples):
    segs = segments[segments.DepMap_ID == sample][[
        'Chromosome', 'Start', 'End', "Segment_Mean"]].values
    hasmissing = set(missingchrom) - set(segs[:, 0])
    j = 0
    h.showcount(i, len(samples))
    for k, gene in enumerate(genes):
      if gene[0] in hasmissing:
        data[i, k] = np.nan
        continue
      try:
        # move forward to the first segment ending after the gene's start
        while gene[0] != segs[j][0] or gene[1] >= segs[j][2]:
          j += 1
      except IndexError as err:
        # only running out of segments means unsorted input; anything else
        # (e.g. an interruption) must not be disguised
        raise ValueError('forgot to sort one of the DF?') from err
      # some genes are within other genes, we need to go back in the list of segment in that case
      while gene[1] < segs[j][1]:
        j -= 1
      c = 1  # number of segments the gene spans
      if gene[2] <= segs[j][2]:
        # we are entirely within the segment
        data[i, k] = segs[j][3]
      else:
        # how much of the gene is covered by the segment
        coef = (segs[j][2] - gene[1]) / (gene[2] - gene[1])
        val = segs[j][3] * coef if style == "weighted" else segs[j][3]
        end = segs[j][2]
        # until the end of a segments goes beyond the end of the gene (say if we have X segments within the gene)
        while end < gene[2]:
          j += 1
          c += 1
          nextend = segs[j][2] if segs[j][2] < gene[2] else gene[2]
          # here, end (of prevsegment) is the next segment's start
          ncoef = (nextend - end) / (gene[2] - gene[1])
          if style == "closest":
            if ncoef > coef:
              val = segs[j][3]
            else:
              # this segment covers less: keep the best coverage seen so far
              ncoef = coef
          else:
            val += segs[j][3] * ncoef if style == "weighted" else segs[j][3]
          end = segs[j][2]
          coef = ncoef
        # only the plain sum of segment means has to be turned into a mean: the
        # weighted sum and the value of the most covering segment are final
        data[i, k] = val if style in ("weighted", "closest") else val / c
  return pd.DataFrame(data=data, index=samples, columns=gene_mapping[gene_names_col])