Example No. 1
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):

  # Parameters
  window = 50
  defaultKmerValue = 1.0

  # Initialization
  fastaFile = Fastafile(genome_file_name)
  k_nb = len(fBiasDict.keys()[0])
  p1 = start; p2 = end
  p1_w = p1 - (window/2); p2_w = p2 + (window/2)
  p1_wk = p1_w - (k_nb/2); p2_wk = p2_w + (k_nb/2)

  # Raw counts
  nf = [0.0] * (p2_w-p1_w); nr = [0.0] * (p2_w-p1_w)
  for r in bam.fetch(chrName, p1_w, p2_w):
    if((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos-p1_w] += 1.0
    if((r.is_reverse) and ((r.aend-1) < p2_w)): nr[r.aend-1-p1_w] += 1.0

  # Smoothed counts
  Nf = []; Nr = [];
  fSum = sum(nf[:window]); rSum = sum(nr[:window]);
  fLast = nf[0]; rLast = nr[0]
  for i in range((window/2),len(nf)-(window/2)):
    Nf.append(fSum)
    Nr.append(rSum)
    fSum -= fLast; fSum += nf[i+(window/2)]; fLast = nf[i-(window/2)+1]
    rSum -= rLast; rSum += nr[i+(window/2)]; rLast = nr[i-(window/2)+1]

  # Fetching sequence
  currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
  currRevComp = revcomp(str(fastaFile.fetch(chrName,p1_wk+2, p2_wk+1)).upper())

  # Iterating on sequence to create signal
  af = []; ar = []
  for i in range((k_nb/2),len(currStr)-(k_nb/2)+1):
    fseq = currStr[i-(k_nb/2):i+(k_nb/2)]
    rseq = currRevComp[len(currStr)-(k_nb/2)-i:len(currStr)+(k_nb/2)-i]
    try: af.append(fBiasDict[fseq])
    except Exception: af.append(defaultKmerValue)
    try: ar.append(rBiasDict[rseq])
    except Exception: ar.append(defaultKmerValue)

  # Calculating bias and writing to wig file
  fSum = sum(af[:window]); rSum = sum(ar[:window]);
  fLast = af[0]; rLast = ar[0]
  bias_corrected_signal = []
  for i in range((window/2),len(af)-(window/2)):
    nhatf = Nf[i-(window/2)]*(af[i]/fSum)
    nhatr = Nr[i-(window/2)]*(ar[i]/rSum)
    zf = log(nf[i]+1)-log(nhatf+1)
    zr = log(nr[i]+1)-log(nhatr+1)
    bias_corrected_signal.append(zf+zr)
    fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1]
    rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1]

  # Termination
  fastaFile.close()
  return bias_corrected_signal
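
The snippets in this listing are excerpts from a larger module, so they assume their surrounding imports (for instance pysam's Fastafile/Samfile, math's log/floor/ceil, and itertools.product). The Nf/Nr loop above relies on a sliding-window sum: rather than recomputing sum() at every position, the running total is updated in constant time as the window moves. A minimal, self-contained sketch of that pattern (function name and defaults are illustrative, not part of the original code):

def rolling_window_sums(values, window=50):
    # Running sum over a length-`window` slice, slid one position at a time,
    # mirroring the fSum/rSum bookkeeping above (Python 3 style with explicit
    # floor division).
    half = window // 2
    current = sum(values[:window])
    sums = []
    for i in range(half, len(values) - half):
        sums.append(current)            # covers values[i - half : i + half]
        current += values[i + half]     # element entering the window
        current -= values[i - half]     # element leaving the window
    return sums

# Example: a flat signal of ones gives a constant window sum of 50.
assert rolling_window_sums([1.0] * 100)[0] == 50.0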
Example No. 2
    outputFile.write("fixedStep chrom="+chrName+" start="+str(p1+1)+" step=1\n")
    fSum = sum(af[:window]); rSum = sum(ar[:window]);
    fLast = af[0]; rLast = ar[0]
    for i in range((window/2),len(af)-(window/2)):
      nhatf = Nf[i-(window/2)]*(af[i]/fSum)
      nhatr = Nr[i-(window/2)]*(ar[i]/rSum)
      zf = log(nf[i]+1)-log(nhatf+1)
      zr = log(nr[i]+1)-log(nhatr+1)
      outputFile.write(str(round(zf+zr,4))+"\n")
      #print i+p1+1-(window/2), af[i], ar[i], fSum, rSum, Nf[i-(window/2)], Nr[i-(window/2)]
      fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1]
      rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1]

    #for i in range(p1, p2):
    #  print i+1, z[i-p1]

  except Exception: continue

# Closing files
bamFile.close()
fastaFile.close()
coordFile.close()
outputFile.close()

# Converting to bigwig
os.system(" ".join(["wigToBigWig",outputFileName,csFileName,outputFileName[:-3]+"bw"]))
os.system(" ".join(["wigToBigWig",outputFileNameRaw,csFileName,outputFileNameRaw[:-3]+"bw"]))
#os.system(" ".join(["rm",outputFileName]))
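
The writer above emits a fixedStep wiggle track: one declaration line per region, then one value per base starting at the 1-based position start+1 with step 1, which wigToBigWig then converts to bigWig. A compact sketch of that output pattern (names are illustrative):

def write_fixed_step(out, chrom, start, values):
    # fixedStep declaration with a 1-based start, then one value per line.
    out.write("fixedStep chrom=%s start=%d step=1\n" % (chrom, start + 1))
    for v in values:
        out.write(str(round(v, 4)) + "\n")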


Example No. 3
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(fBiasDict.keys()[0])
    p1 = start
    p2 = end
    p1_w = p1 - (window / 2)
    p2_w = p2 + (window / 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range((window / 2), len(nf) - (window / 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + (window / 2)]
        f_last = nf[i - (window / 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + (window / 2)]
        r_last = nr[i - (window / 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range((window / 2), len(af) - (window / 2)):
        nhatf = Nf[i - (window / 2)] * (af[i] / f_sum)
        nhatr = Nr[i - (window / 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + (window / 2)]
        f_last = af[i - (window / 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + (window / 2)]
        r_last = ar[i - (window / 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
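
In this version each read is reduced to a single shifted cut site before counting. A hedged sketch of that mapping, assuming pysam-style attributes (read.pos is the 0-based leftmost aligned position, read.aend is one past the rightmost aligned base):

def cut_site_of(read, forward_shift, reverse_shift):
    # Forward reads are anchored at their 5' end, reverse reads at their 3' end.
    if not read.is_reverse:
        return read.pos + forward_shift
    return read.aend + reverse_shift - 1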
Example No. 4
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name,
                    chrName, start, end):

    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    k_nb = len(fBiasDict.keys()[0])
    p1 = start
    p2 = end
    p1_w = p1 - (window / 2)
    p2_w = p2 + (window / 2)
    p1_wk = p1_w - (k_nb / 2)
    p2_wk = p2_w + (k_nb / 2)

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if ((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos - p1_w] += 1.0
        if ((r.is_reverse) and ((r.aend - 1) < p2_w)):
            nr[r.aend - 1 - p1_w] += 1.0

    # Smoothed counts
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range((window / 2), len(nf) - (window / 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window / 2)]
        fLast = nf[i - (window / 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window / 2)]
        rLast = nr[i - (window / 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Iterating on sequence to create signal
    af = []
    ar = []
    for i in range((k_nb / 2), len(currStr) - (k_nb / 2) + 1):
        fseq = currStr[i - (k_nb / 2):i + (k_nb / 2)]
        rseq = currRevComp[len(currStr) - (k_nb / 2) - i:len(currStr) +
                           (k_nb / 2) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range((window / 2), len(af) - (window / 2)):
        nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window / 2)]
        fLast = af[i - (window / 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window / 2)]
        rLast = ar[i - (window / 2) + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
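
The corrected value appended in the final loop compares observed cleavage counts with the bias-expected counts nhat on a log scale, one term per strand. As a standalone restatement (illustrative, no new behavior):

from math import log

def corrected_value(n_f, n_r, nhat_f, nhat_r):
    # log(observed + 1) - log(expected + 1), summed over both strands.
    zf = log(n_f + 1) - log(nhat_f + 1)
    zr = log(n_r + 1) - log(nhat_r + 1)
    return zf + zr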
Example No. 5
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
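
The per-k-mer bias written out above is an observed-over-expected frequency ratio with a pseudocount, computed separately for each strand. A toy restatement of that ratio (names are illustrative):

def kmer_bias(obs_count, exp_count, n_reads, n_kmers, pseudocount=1.0):
    # Relative frequency at cut sites divided by relative frequency in the
    # background sequence; unseen k-mers fall back to the pseudocount.
    if n_reads == 0:
        return 1.0
    observed = (obs_count + pseudocount) / float(n_reads)
    expected = (exp_count + pseudocount) / float(n_kmers)
    return round(observed / expected, 6)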
Example No. 6
    def bias_correction(self, signal, bias_table, genome_file_name, chrName,
                        start, end, forward_shift, reverse_shift,
                        strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.

        Return:
        bias_corrected_signal -- Bias-corrected signal.
        """

        if (not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0): return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    nf[cut_site - p1_w] += 1.0
                    # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
                    #    nf[i - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    nr[cut_site - p1_w] += 1.0
                    # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
                    #    nr[i - start] += 1.0

                    # if ((not read.is_reverse) and (read.pos > p1_w)): nf[read.pos - p1_w] += 1.0
                    # if ((read.is_reverse) and ((read.aend - 1) < p2_w)): nr[read.aend - 1 - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(
            str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())
        #currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        #currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
        #                                                            p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)),
                       len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) -
                               i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Fixing the negative number in bias corrected signal
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [
            e + min_value for e in bias_corrected_signal_forward
        ]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [
            e + min_value for e in bias_corrected_signal_reverse
        ]

        min_value = abs(min(bias_corrected_signal))
        bias_fixed_signal = [e + min_value for e in bias_corrected_signal]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        else:
            return bias_fixed_signal_forward, bias_fixed_signal_reverse
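
The "fixing the negative number" step shifts each strand-specific track upward by the magnitude of its own minimum, so a negative minimum is lifted to zero. Equivalently (illustrative helper):

def shift_non_negative(values):
    # Mirror of the step above: every value is raised by |min(values)|.
    offset = abs(min(values))
    return [v + offset for v in values]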
Example No. 7
    def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                             forward_shift, reverse_shift):

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))

        if (p1 <= 0 or p1_w <= 0 or p2_wk <= 0):
            # Return raw counts
            nf = [0.0] * (p2 - p1)
            nr = [0.0] * (p2 - p1)
            for read in self.bam.fetch(chrName, p1, p2):
                if not read.is_reverse:
                    cut_site = read.pos + forward_shift
                    if p1 <= cut_site < p2:
                        nf[cut_site - p1] += 1.0
                else:
                    cut_site = read.aend + reverse_shift - 1
                    if p1 <= cut_site < p2:
                        nr[cut_site - p1] += 1.0

            return nf, nr

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1_w <= cut_site < p2_w:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1_w <= cut_site < p2_w:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
                                                                     p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            bias_corrected_signal_forward.append(nhatf)
            bias_corrected_signal_reverse.append(nhatr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Termination
        fastaFile.close()
        return bias_corrected_signal_forward, bias_corrected_signal_reverse
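
Here the returned forward/reverse signals are the bias-expected counts themselves: the smoothed window total Nf (or Nr) is redistributed across the 50 bp window in proportion to the k-mer bias weights. In isolation (illustrative):

def bias_expected_count(window_total, window_bias_weights, center):
    # window_total: smoothed read count for this window (Nf/Nr above).
    # window_bias_weights: the bias values (af/ar) covering the same window.
    # The window's reads are shared out in proportion to the bias weights.
    return window_total * (window_bias_weights[center] / sum(window_bias_weights))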
Example No. 8
    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                        forward_shift, reverse_shift, strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal.
        bias_table -- Bias table.

        Return:
        bias_corrected_signal -- Bias-corrected signal.
        """

        if (not bias_table): return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(fBiasDict.keys()[0])
        p1 = start
        p2 = end
        p1_w = p1 - (window / 2)
        p2_w = p2 + (window / 2)
        p1_wk = p1_w - int(floor(k_nb / 2.))
        p2_wk = p2_w + int(ceil(k_nb / 2.))
        if (p1 <= 0 or p1_w <= 0 or p1_wk <= 0): return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if (not read.is_reverse):
                cut_site = read.pos + forward_shift
                if cut_site >= start and cut_site < end:
                    nf[cut_site - p1_w] += 1.0
                    # for i in range(max(read.pos + forward_shift, start), min(read.pos + forward_shift + 1, end - 1)):
                    #    nf[i - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if cut_site >= start and cut_site < end:
                    nr[cut_site - p1_w] += 1.0
                    # for i in range(max(read.aend + reverse_shift - 1, start), min(read.aend + reverse_shift, end - 1)):
                    #    nr[i - start] += 1.0

                    # if ((not read.is_reverse) and (read.pos > p1_w)): nf[read.pos - p1_w] += 1.0
                    # if ((read.is_reverse) and ((read.aend - 1) < p2_w)): nr[read.aend - 1 - p1_w] += 1.0

        # Smoothed counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range((window / 2), len(nf) - (window / 2)):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + (window / 2)]
            fLast = nf[i - (window / 2) + 1]
            rSum -= rLast
            rSum += nr[i + (window / 2)]
            rLast = nr[i - (window / 2) + 1]

        # Fetching sequence
        currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
        currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName,p1_wk+2, p2_wk+1)).upper())
        #currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
        #currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1,
         #                                                            p2_wk)).upper())

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
            fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
            rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
            try:
                af.append(fBiasDict[fseq])
            except Exception:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except Exception:
                ar.append(defaultKmerValue)

        # Calculating bias and writing to wig file
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range((window / 2), len(af) - (window / 2)):
            nhatf = Nf[i - (window / 2)] * (af[i] / fSum)
            nhatr = Nr[i - (window / 2)] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + (window / 2)]
            fLast = af[i - (window / 2) + 1]
            rSum -= rLast
            rSum += ar[i + (window / 2)]
            rLast = ar[i - (window / 2) + 1]

        # Fixing the negative number in bias corrected signal
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]

        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

        min_value = abs(min(bias_corrected_signal))
        bias_fixed_signal = [e + min_value for e in bias_corrected_signal]

        # Termination
        fastaFile.close()
        if not strands_specific:
            return bias_corrected_signal
        else:
            return bias_fixed_signal_forward, bias_fixed_signal_reverse
Example No. 9
    def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name,
                           k_nb, forward_shift, reverse_shift):
        """
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.

        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Initializing bam and fasta
        if (dnase_file_name.split(".")[-1].upper() != "BAM"):
            return None  # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        obsSeqsF = []
        obsSeqsR = []
        expSeqsF = []
        expSeqsR = []

        # Iterating on HS regions
        for region in regions:
            # Evaluating observed frequencies
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                # if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                # else: p1 = r.aend - (k_nb/2) + 1 - shift
                if (not r.is_reverse):
                    cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - int(floor(k_nb / 2))
                else:
                    cut_site = r.aend + reverse_shift + 1
                    p1 = cut_site - int(floor(k_nb / 2))
                p2 = p1 + k_nb

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1,
                                                  p2)).upper()
                except Exception:
                    continue
                if (r.is_reverse):
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if 'N' not in currStr:
                    if (not r.is_reverse):
                        obsSeqsF.append(Seq(currStr))
                    else:
                        obsSeqsR.append(Seq(currStr))

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(
                    fastaFile.fetch(region.chrom, region.initial,
                                    region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                s = currStr[i:i + k_nb]
                if 'N' not in s:
                    # Counting k-mer in dictionary
                    expSeqsF.append(Seq(s))

                    # Counting k-mer in dictionary for reverse complement
                    s = currRevComp[i:i + k_nb]
                    expSeqsR.append(Seq(s))

        # Closing files
        bamFile.close()
        fastaFile.close()

        obsMotifsF = motifs.create(obsSeqsF)
        obsMotifsR = motifs.create(obsSeqsR)
        expMotifsF = motifs.create(expSeqsF)
        expMotifsR = motifs.create(expSeqsR)

        obsPwmF = obsMotifsF.pwm
        obsPwmR = obsMotifsR.pwm
        expPwmF = expMotifsF.pwm
        expPwmR = expMotifsR.pwm

        # Output logos
        logo_obs_f = os.path.join(
            self.output_loc, "Bias", "logo",
            "obs_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
        logo_obs_r = os.path.join(
            self.output_loc, "Bias", "logo",
            "obs_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
        logo_exp_f = os.path.join(
            self.output_loc, "Bias", "logo",
            "exp_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
        logo_exp_r = os.path.join(
            self.output_loc, "Bias", "logo",
            "exp_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
        obsMotifsF.weblogo(logo_obs_f,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.2,
                           yaxis_tic_interval=0.1)
        obsMotifsR.weblogo(logo_obs_r,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.2,
                           yaxis_tic_interval=0.1)
        expMotifsF.weblogo(logo_exp_f,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.02,
                           yaxis_tic_interval=0.01)
        expMotifsR.weblogo(logo_exp_r,
                           format="pdf",
                           stack_width="large",
                           color_scheme="color_classic",
                           yaxis_scale=0.02,
                           yaxis_tic_interval=0.01)

        # Output pwms
        pwm_data_list = [obsPwmF, obsPwmR, expPwmF, expPwmR]
        pwm_file_list = []
        pwm_obs_f = os.path.join(
            self.output_loc, "Bias", "pwm",
            "obs_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
        pwm_obs_r = os.path.join(
            self.output_loc, "Bias", "pwm",
            "obs_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
        pwm_exp_f = os.path.join(
            self.output_loc, "Bias", "pwm",
            "exp_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
        pwm_exp_r = os.path.join(
            self.output_loc, "Bias", "pwm",
            "exp_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))

        pwm_file_list.append(pwm_obs_f)
        pwm_file_list.append(pwm_obs_r)
        pwm_file_list.append(pwm_exp_f)
        pwm_file_list.append(pwm_exp_r)

        for i in range(len(pwm_data_list)):
            with open(pwm_file_list[i], "w") as f:
                f.write(str(pwm_data_list[i]))

        # Creating bias dictionary
        alphabet = ["A", "C", "G", "T"]
        k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
        bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
        for k_mer in k_mer_comb:
            obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
            expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
            bias_table_F[k_mer] = round(obsF / expF, 6)
            obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
            expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
            bias_table_R[k_mer] = round(obsR / expR, 6)

        # Return
        return [bias_table_F, bias_table_R]
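
get_pwm_score is not shown in this excerpt. One plausible reading, given that obsPwmF and the other matrices are Biopython position weight matrices indexed as pwm[base][position], is the product of per-position probabilities of the k-mer. The helper below is a hypothetical sketch under that assumption, not the module's actual implementation:

def get_pwm_score_sketch(k_mer, pwm, k_nb):
    # Hypothetical: probability of the k-mer under the position weight matrix,
    # taken as the product of per-position base probabilities.
    score = 1.0
    for position in range(k_nb):
        score *= pwm[k_mer[position]][position]
    return score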
Example No. 10
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb,
                       forward_shift, reverse_shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if (dnase_file_name.split(".")[-1].upper() != "BAM"):
            return None  # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict()
        obsDictR = dict()
        expDictF = dict()
        expDictR = dict()

        ct_reads_r = 0
        ct_reads_f = 0
        ct_kmers = 0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if (not r.is_reverse):
                    cut_site = r.pos + forward_shift - 1
                    p1 = cut_site - int(floor(k_nb / 2))
                else:
                    cut_site = r.aend + reverse_shift + 1
                    p1 = cut_site - int(floor(k_nb / 2))
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if (p1 == prevPos):
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if (trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1,
                                                  p2)).upper()
                except Exception:
                    continue
                if (r.is_reverse):
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if (not r.is_reverse):
                    ct_reads_f += 1
                    try:
                        obsDictF[currStr] += 1
                    except Exception:
                        obsDictF[currStr] = 1
                else:
                    ct_reads_r += 1
                    try:
                        obsDictR[currStr] += 1
                    except Exception:
                        obsDictR[currStr] = 1

            # Evaluating expected frequencies ####################################
            # Fetching whole sequence
            try:
                currStr = str(
                    fastaFile.fetch(region.chrom, region.initial,
                                    region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                ct_kmers += 1
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                try:
                    expDictF[s] += 1
                except Exception:
                    expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                try:
                    expDictR[s] += 1
                except Exception:
                    expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A", "C", "G", "T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e, 0.0) for e in kmerComb])
        bias_table_R = dict([(e, 0.0) for e in kmerComb])
        for kmer in kmerComb:
            try:
                obsF = obsDictF[kmer] + pseudocount
            except Exception:
                obsF = pseudocount
            try:
                expF = expDictF[kmer] + pseudocount
            except Exception:
                expF = pseudocount
            if ct_reads_f == 0:
                bias_table_F[kmer] = 1
            else:
                bias_table_F[kmer] = round(
                    float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
            try:
                obsR = obsDictR[kmer] + pseudocount
            except Exception:
                obsR = pseudocount
            try:
                expR = expDictR[kmer] + pseudocount
            except Exception:
                expR = pseudocount
            if ct_reads_r == 0:
                bias_table_R[kmer] = 1
            else:
                bias_table_R[kmer] = round(
                    float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

        # Return
        return [bias_table_F, bias_table_R]
Example No. 11
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r=0
        ct_reads_f=0
        ct_kmers=0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                else: p1 = r.aend - (k_nb/2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if(not r.is_reverse):
                    ct_reads_f+=1
                    try: obsDictF[currStr] += 1
                    except Exception: obsDictF[currStr] = 1
                else:
                    ct_reads_r+=1
                    try: obsDictR[currStr] += 1
                    except Exception: obsDictR[currStr] = 1


            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers+=1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                try: expDictF[s] += 1
                except Exception: expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                try: expDictR[s] += 1
                except Exception: expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb]) 
        bias_table_R = dict([(e,0.0) for e in kmerComb]) 
        for kmer in kmerComb:
            try: obsF = obsDictF[kmer] + pseudocount
            except Exception: obsF = pseudocount
            try: expF = expDictF[kmer] + pseudocount
            except Exception: expF = pseudocount
            bias_table_F[kmer] = round(float(obsF/ct_reads_f)/float(expF/ct_kmers),6)
            try: obsR = obsDictR[kmer] + pseudocount
            except Exception: obsR = pseudocount
            try: expR = expDictR[kmer] + pseudocount
            except Exception: expR = pseudocount
            bias_table_R[kmer] = round(float(obsR/ct_reads_r)/float(expR/ct_kmers),6)

        # Return
        return [bias_table_F, bias_table_R]
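
These listings are Python 2: window/2 and k_nb/2 rely on integer division and are used directly as list indices, and fBiasDict.keys()[0] indexes the list returned by dict.keys(). Under Python 3 the equivalent expressions would be (a sketch, assuming the same variable names):

k_nb = len(next(iter(fBiasDict)))   # dict.keys() is a view in Python 3
half_window = window // 2           # explicit floor division for indices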
Example No. 12
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example No. 13
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example No. 14
outputFile4.write("\t".join(header4) + "\n")
for i in range(0, maxV):
    vec = []
    for j in range(0, len(vectorTable4)):
        try:
            vec.append(vectorTable4[j][i])
        except Exception:
            vec.append("NA")
        try:
            vec.append(vectorTable5[j][i])
        except Exception:
            vec.append("NA")
    outputFile4.write("\t".join(vec) + "\n")

stagFile.close()
outputFile1.close()
outputFile2.close()
outputFile3.close()
outputFile4.close()
genomeFile.close()
regionsFile.close()
#chrommHmmFile.close()
enhancersFile.close()
[e.close() for e in signalFileList]
[e.close() for e in controlFileList]
[e.close() for e in motifFileList]

# Removing all files
command = "rm -rf " + tempLocation
os.system(command)
Example No. 15
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    with open(pwm_obs_f) as pwm_handle:
        motif_obs_f = motifs.read(pwm_handle, "pfm")
    with open(pwm_obs_r) as pwm_handle:
        motif_obs_r = motifs.read(pwm_handle, "pfm")
    with open(pwm_exp_f) as pwm_handle:
        motif_exp_f = motifs.read(pwm_handle, "pfm")
    with open(pwm_exp_r) as pwm_handle:
        motif_exp_r = motifs.read(pwm_handle, "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
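# --- Hedged usage sketch, not part of the original example: estimate_bias_pwm takes an
# argparse-style namespace. The attribute names mirror those used in the function body; the
# file paths, organism, k-mer size and shift values below are illustrative placeholders only.
if __name__ == "__main__":
    from argparse import Namespace
    args = Namespace(reads_file="reads.bam",           # coordinate-sorted, indexed BAM
                     regions_file="open_regions.bed",  # accessible (HS) regions
                     organism="hg38",
                     k_nb=8,
                     forward_shift=5, reverse_shift=-4,
                     output_location="bias_output",
                     output_prefix="bias")
    estimate_bias_pwm(args)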
Example #16
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name,
                    forward_shift, reverse_shift):
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(list(fBiasDict.keys())[0])
    p1 = start
    p2 = end
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Return raw counts
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0

        return bc_signal

    # Raw counts
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue

        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(int(window / 2), len(nf) - int(window / 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + int(window / 2)]
        f_last = nf[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + int(window / 2)]
        r_last = nr[i - int(window / 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)),
                   len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) -
                           i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias-corrected signal
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(int(window / 2), len(af) - int(window / 2)):
        nhatf = Nf[i - int(window / 2)] * (af[i] / f_sum)
        nhatr = Nr[i - int(window / 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + int(window / 2)]
        f_last = af[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + int(window / 2)]
        r_last = ar[i - int(window / 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
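# --- Hedged usage sketch, not part of the original example: bias_correction expects an open
# pysam alignment file plus a bias table given as a pair of {k-mer: bias} dictionaries
# (forward, reverse), e.g. as written by write_table above and re-read with a load_table
# helper like the one used in the next example. File names and shift values are placeholders.
from pysam import Samfile

bam = Samfile("reads.bam", "rb")
bias_table = load_table("bias_table_F.txt", "bias_table_R.txt")  # (fBiasDict, rBiasDict)
bc = bias_correction("chr1", 1000000, 1001000, bam, bias_table,
                     "genome.fa", forward_shift=5, reverse_shift=-4)
bam.close()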
Example #17
def bias_correction_dnase(signal_class, signal, chrName, start, end,
                          forward_shift, reverse_shift):
  table_file_name_F = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_F.txt')
  table_file_name_R = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_R.txt')
  bias_table = load_table(table_file_name_F, table_file_name_R)
  if not bias_table: return signal
  # Parameters
  window = 50
  defaultKmerValue = 1.0
  genome_file_name = signal_class.fastaFile
  # Initialization
  fastaFile = Fastafile(genome_file_name)
  fBiasDict = bias_table[0]
  rBiasDict = bias_table[1]
  k_nb = len(list(fBiasDict.keys())[0])
  p1 = start
  p2 = end
  p1_w = int(p1 - (window / 2))
  p2_w = int(p2 + (window / 2))
  p1_wk = p1_w - int(floor(k_nb / 2.))
  p2_wk = p2_w + int(ceil(k_nb / 2.))
  if p1 <= 0 or p1_w <= 0 or p1_wk <= 0: return signal

  # Raw counts
  nf = [0.0] * int(p2_w - p1_w)
  nr = [0.0] * int(p2_w - p1_w)
  for read in signal_class.bam.fetch(chrName, p1_w, p2_w):
    # check if the read is unmapped, according to issue #112
    if read.is_unmapped:
      continue

    if not read.is_reverse:
      cut_site = read.pos + forward_shift
      if p1_w <= cut_site < p2_w:
        nf[cut_site - p1_w] += 1.0
    else:
      cut_site = read.aend + reverse_shift - 1
      if p1_w <= cut_site < p2_w:
        nr[cut_site - p1_w] += 1.0

  # Smoothed counts
  Nf = []
  Nr = []
  fSum = sum(nf[:window])
  rSum = sum(nr[:window])
  fLast = nf[0]
  rLast = nr[0]
  for i in range(int(window / 2), int(len(nf) - (window / 2))):
    Nf.append(fSum)
    Nr.append(rSum)
    fSum -= fLast
    fSum += nf[i + int(window / 2)]
    fLast = nf[i - int(window / 2) + 1]
    rSum -= rLast
    rSum += nr[i + int(window / 2)]
    rLast = nr[i - int(window / 2) + 1]

  # Fetching sequence
  currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
  currRevComp = revcomp(str(fastaFile.fetch(chrName, p1_wk + 1, p2_wk)).upper())

  # Iterating on sequence to create signal
  af = []
  ar = []
  for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
    fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
    rseq = currRevComp[
           len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(
             floor(k_nb / 2.)) - i]
    try:
      af.append(fBiasDict[fseq])
    except Exception:
      af.append(defaultKmerValue)
    try:
      ar.append(rBiasDict[rseq])
    except Exception:
      ar.append(defaultKmerValue)

  # Calculating bias-corrected signal (log-ratio of observed vs. expected cuts)
  fSum = sum(af[:window])
  rSum = sum(ar[:window])
  fLast = af[0]
  rLast = ar[0]
  bias_corrected_signal = []
  for i in range(int(window / 2), int(len(af) - (window / 2))):
    nhatf = Nf[i - int(window / 2)] * (af[i] / fSum)
    nhatr = Nr[i - int(window / 2)] * (ar[i] / rSum)
    zf = log(nf[i] + 1) - log(nhatf + 1)
    zr = log(nr[i] + 1) - log(nhatr + 1)
    bias_corrected_signal.append(zf + zr)
    fSum -= fLast
    fSum += af[i + int(window / 2)]
    fLast = af[i - int(window / 2) + 1]
    rSum -= rLast
    rSum += ar[i + int(window / 2)]
    rLast = ar[i - int(window / 2) + 1]

  # Termination
  fastaFile.close()
  return bias_corrected_signal
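# --- Hedged worked example, not part of the original code: the DNase correction above scores
# each position as z = log(n + 1) - log(nhat + 1), where n is the observed cut count and
# nhat = N * a / window_sum(a) redistributes the smoothed window total N according to the
# position's k-mer bias a. With N = 60 cuts in the window, a position bias a = 2.0 against a
# window bias sum of 50.0, and n = 4 observed cuts at the position:
from math import log

N, a, a_sum, n = 60.0, 2.0, 50.0, 4.0
nhat = N * (a / a_sum)            # expected cuts from the bias model alone: 2.4
z = log(n + 1) - log(nhat + 1)    # ~0.386: more cuts observed than the bias model predicts
print(round(z, 3))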