Example No. 1
def __init__(self, location, verbose=True):
    if os.path.isdir(location):
        # directory of FASTA files: one genome entry per file, keyed by file name
        self.genome = {}
        filenames = os.listdir(location)
        count = 0
        for f in filenames:
            count += 1
            self.genome.update(
                {f.split('.')[0]: self.readFasta(os.path.join(location, f))})
            if verbose:
                functions.showProgress(count, len(filenames),
                                       'Loading genome')
        if verbose: print('')
    elif os.path.isfile(location):
        # single multi-FASTA file: one genome entry per '>' header
        self.genome = {}
        fc = open(location, 'r')
        line = fc.readline()
        header, *_ = line.split()
        tmp_name = header.lstrip('>')
        seq = ''
        line = fc.readline()
        while line:
            if line[0] == '>':
                self.genome[tmp_name] = seq
                seq = ''
                header, *_ = line.split()
                tmp_name = header.lstrip('>')
            else:
                line = line.replace('\n', '')
                seq = seq + line.upper()
            line = fc.readline()
        self.genome[tmp_name] = seq
        fc.close()
    else:
        sys.stderr.write('No file or directory. Exit\n')
        sys.exit(1)  # the message announces an exit, so exit with an error code
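
For quick testing of the single-file branch above, the same header/sequence accumulation logic can be expressed as a small standalone helper; the name read_fasta is hypothetical and not part of the original class.

def read_fasta(path):
    # Minimal sketch of the parsing loop above: '>' lines start a new record,
    # all other lines are concatenated and upper-cased.
    sequences = {}
    name = None
    chunks = []
    with open(path) as handle:
        for line in handle:
            line = line.rstrip('\n')
            if not line:
                continue
            if line.startswith('>'):
                if name is not None:
                    sequences[name] = ''.join(chunks)
                name = line[1:].split()[0]
                chunks = []
            else:
                chunks.append(line.upper())
    if name is not None:
        sequences[name] = ''.join(chunks)
    return sequences
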
Example No. 2
def main(parclipfile,
         gfffile,
         upstream,
         downstream,
         sense,
         minSize,
         maxSize,
         verbose,
         xbins,
         ybins,
         vstring=''):
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    totalsize = upstream + maxSize + 1 + downstream
    anno.sort2size()
    pc = ParclipSiteContainer.from_file(parclipfile)
    mat = []
    annosize = []
    for g in range(anno.size()):
        tmp = [-1] * totalsize
        if verbose:
            functions.showProgress(g, (anno.size() - 1), vstring)
        if anno.strand[g] == '+':
            values = pc.getValues(anno.chr[g], anno.start[g], anno.strand[g],
                                  sense, upstream,
                                  (anno.stop[g] - anno.start[g]) + downstream)
        else:
            values = pc.getValues(anno.chr[g], anno.stop[g], anno.strand[g],
                                  sense, upstream,
                                  (anno.stop[g] - anno.start[g]) + downstream)
        if values is not None:
            tmp[0:len(values)] = values  # fill the leading slots, keep -1 padding at the end
        mat.append(functions.shrinkValues(tmp, xbins))
        annosize.append(anno.stop[g] - anno.start[g])
    smat = []
    sannosize = []
    if ybins >= anno.size():
        print('Warning: --ybins >= entries in ' + gfffile)
        ybins = anno.size()
    ystep = round(anno.size() / ybins)
    ystart = 0
    ystop = ystep
    while ystop < anno.size():
        tmp = [0] * xbins
        for i in range(xbins):
            count = 0
            tmpanno = 0
            for j in range(ystart, ystop):
                tmp[i] += mat[j][i]  # [row][col]
                tmpanno += annosize[j]
                count += 1
            tmp[i] = tmp[i] / count
            tmpanno = tmpanno / count
        smat.append(tmp)
        sannosize.append(tmpanno)
        ystart = ystop
        ystop += ystep
    if verbose:
        print()
    return smat, sannosize
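
The y-axis binning at the end of this function averages blocks of consecutive rows; a compact, self-contained sketch of that step (hypothetical helper name, assuming mat is a rectangular list of lists):

def bin_rows(mat, ybins):
    # Average consecutive blocks of rows so the matrix has at most ybins rows,
    # mirroring the ystart/ystop loop above (the incomplete tail block is dropped).
    ystep = max(1, round(len(mat) / ybins))
    binned = []
    for start in range(0, len(mat) - ystep + 1, ystep):
        block = mat[start:start + ystep]
        binned.append([sum(row[i] for row in block) / len(block)
                       for i in range(len(block[0]))])
    return binned
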
Example No. 3
def precalculateDistributions(minN, maxN, par, verbose):
    # For every read depth n in [minN, maxN], precompute the background
    # probability of observing 0..n mutations so later lookups are constant time.
    distributions = [0] * (maxN + 1)
    for i in range(minN, maxN + 1):
        tmp_probabilities = []
        for j in range(i + 1):
            tmp_probabilities.append(prob_bg(j, i, par))
        distributions[i] = tmp_probabilities
        if verbose:
            functions.showProgress(i + 1, maxN + 1,
                                   'precalculating probability distributions')
    if verbose: print('')
    return distributions
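
prob_bg is defined elsewhere in the package; purely for illustration, a binomial error model could serve as a stand-in background distribution when experimenting with this precalculation (requires Python 3.8+ for math.comb):

import math

def prob_bg_binomial(k, n, p):
    # P(k mutations out of n reads) under a simple binomial background model
    # (stand-in only; the real prob_bg may differ)
    return math.comb(n, k) * (p ** k) * ((1 - p) ** (n - k))

# distributions[n][k] then holds P(k | n) for minN <= n <= maxN, e.g.
# dist = precalculateDistributions(5, 50, 0.01, verbose=False)
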
Example No. 4
def processPileup(file_pileup, verbose, reference='T', mutation='C'):
    pseudocount = 1
    maxr = 1000  #upper limit for sequencing depth which is used for parameter estimation
    reference_reverse = functions.makeReverseComplement(reference)
    mutation_reverse = functions.makeReverseComplement(mutation)

    lines = 0
    with open(file_pileup) as infile:
        for line in infile:
            lines += 1

    mr_list_neg = [[0]]  #initializing empty matrices for parameter estimation
    mr_list_signal = [[0]]
    for i in range(1, maxr):
        mr_list_neg.append([pseudocount] * (i + 1))
        mr_list_signal.append([pseudocount] * (i + 1))

    file_pileup = open(file_pileup, 'r')
    line = file_pileup.readline()
    count = percent_old = percent_new = 0
    if verbose: functions.showProgress(count, lines, 'Processing Pileup')
    while line:
        count += 1
        split = line.split('\t')
        if split[2] == reference:
            tmp_counts = functions.getCounts(split[4], forward=True)
            counts = [tmp_counts[0], tmp_counts[1][mutation]]
            if counts[0] < maxr:
                mr_list_signal[counts[0]][counts[1]] += 1
        if split[2] == reference_reverse:
            tmp_counts = functions.getCounts(split[4], forward=False)
            counts = [tmp_counts[0], tmp_counts[1][mutation_reverse]]
            if counts[0] < maxr:
                mr_list_signal[counts[0]][counts[1]] += 1
        if split[2] != reference and split[2] != reference_reverse:
            tmp_counts = functions.getCounts(split[4], forward=True)
            counts = [tmp_counts[0], max(tmp_counts[1].values())]
            if counts[0] < maxr:
                mr_list_neg[counts[0]][counts[1]] += 1
        percent_new = math.trunc((count / lines) * 100)
        if (percent_new > percent_old):
            if verbose:
                functions.showProgress(count, lines, 'Processing Pileup')
            percent_old = percent_new
        line = file_pileup.readline()
    if verbose: print('')
    file_pileup.close()
    return [mr_list_neg, mr_list_signal]
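
As a rough guide to how these matrices are read (this helper is illustrative only and not part of the original module): mr_list_signal[n][k] counts positions with coverage n and k reference-to-mutation conversions, so a per-coverage conversion rate can be derived like this:

def conversion_rates(mr_list):
    # Average fraction of converted reads at each coverage n (illustrative only)
    rates = {}
    for n, row in enumerate(mr_list[1:], start=1):
        converted = sum(k * c for k, c in enumerate(row))
        total_reads = n * sum(row)
        rates[n] = converted / total_reads if total_reads else 0.0
    return rates
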
Example No. 5
def getCountMat(file_pileup, minCoverage, verbose):
    alphabet = ['A', 'C', 'G', 'T']
    translate = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    if verbose:
        lines = 0
        with open(file_pileup) as infile:
            for line in infile:
                lines += 1

    # 4x4 substitution-count matrix: rows index the reference base, columns the observed base
    mat = [[0] * 4 for _ in range(4)]
    with open(file_pileup) as file_pileup:
        count = 0
        percent_old = 0
        percent_new = 0

        if verbose:
            functions.showProgress(count, lines, 'Processing Pileup')

        for line in file_pileup:
            count += 1
            split = line.split('\t')
            nuc = split[2].upper()
            if nuc != 'N':
                tmp_counts = functions.getCounts(split[4], forward=True)
                if tmp_counts[0] >= minCoverage:
                    for c in alphabet:
                        if c == nuc:
                            mat[translate[nuc]][translate[c]] += tmp_counts[0] - tmp_counts[2]
                        else:
                            mat[translate[nuc]][translate[c]] += tmp_counts[1][c]
            if verbose:
                percent_new = math.trunc(count / lines * 100)
                if percent_new > percent_old:
                    functions.showProgress(count, lines, 'Processing Pileup')
                    percent_old = percent_new

        if verbose:
            print()

        return mat
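
A natural follow-up, not part of the original function, is to normalise the count matrix row-wise into substitution frequencies:

def count_mat_to_freq(mat):
    # Row-normalise the 4x4 count matrix: rows are reference bases,
    # columns observed bases; all-zero rows stay zero.
    freqs = []
    for row in mat:
        total = sum(row)
        freqs.append([c / total if total else 0.0 for c in row])
    return freqs
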
Example No. 6
def main(parclipfile, outputfile, gfffile, downstream, upstream, gene, sense, minSize,
         maxSize, verbose, vstring=''):
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    pc = ParclipSiteContainer()
    pc.loadFromFile(parclipfile)
    with open(outputfile, 'w') as fc_out:
        for g in range(anno.size()):
            if verbose:
                functions.showProgress(g, (anno.size() - 1), vstring)
            if anno.strand[g] == '+':
                values_upstream = pc.getValues(anno.chr[g], anno.start[g], anno.strand[g],
                                               sense, upstream, gene)
                values_downstream = pc.getValues(anno.chr[g], anno.stop[g], anno.strand[g],
                                                 sense, gene, downstream)
            else:
                values_upstream = pc.getValues(anno.chr[g], anno.stop[g], anno.strand[g],
                                               sense, upstream, gene)
                values_downstream = pc.getValues(anno.chr[g], anno.start[g], anno.strand[g],
                                                 sense, gene, downstream)
            if values_upstream is not None and values_downstream is not None:
                print(*chain(values_upstream, values_downstream), sep='\t', file=fc_out)
        if verbose:
            print()
Example No. 7
def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerLogOdds.R')
    pc = ParclipSiteContainer.from_file(parclip)

    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)

    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)

    with EfficientGenome(genomepath) as genomeseq:
        allfreqs = []
        fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)
        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            count = 1
            stop = 1000
            for q in quantiles:
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                start = functions.getQuantileIndex(len(pc), q) - 500
                stop = functions.getQuantileIndex(len(pc), q) + 500
                if start < 0:
                    start = 0
                if stop > len(pc) - 2:
                    break
                count = count + 1
                if (stop - 500) < old_stop:
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    # TODO 2x quantiles[count - 2] is probably a bug
                    msg = msg_pat % (quantiles[count - 2],
                                     quantiles[count - 2], old_stop -
                                     (stop - 500))
                    print(msg, file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            run = True
            while run:
                if stop > len(pc) - 2 or stop > maxsize:
                    print()
                    print('STOP at: %s' % stop)
                    run = False
                    break
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                start = stop
                stop = stop + stepsize

    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)

    cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)

    if not args.keep_tmp_files:
        os.remove(table_file)
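
functions.getQuantileIndex is defined elsewhere in the package; a minimal stand-in consistent with how it is used above (hypothetical implementation, shown only to make the bin placement concrete):

def get_quantile_index(n, q):
    # Index of the q-quantile in a container of n sites sorted by the chosen key
    return int(round(q * (n - 1)))

# Each quantile bin above then spans roughly 1000 sites centred on that index:
# start = get_quantile_index(len(pc), q) - 500, stop = get_quantile_index(len(pc), q) + 500
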
Example No. 8
def findPvalueParclipInPileup(pileup,
                              output_file,
                              mincov,
                              maxcov,
                              probabilities,
                              verbose,
                              reference='T',
                              mutation='C',
                              maxPvalue=0.001,
                              SNPlikely=False):
    reference_reverse = functions.makeReverseComplement(reference)
    mutation_reverse = functions.makeReverseComplement(mutation)

    found_sites = 0
    lines = 0
    with open(pileup) as handle:
        for line in handle:
            lines += 1

    with open(pileup) as file_pileup, open(output_file, 'w') as file_table:
        header = list(PC_MANDATORY_FIELDS) + ['p_value']
        print(*header, sep='\t', file=file_table)

        line = file_pileup.readline()
        linecount = 0
        percent_old = 0
        percent_new = 0

        if verbose:
            functions.showProgress(linecount, lines, 'Processing Pileup')
        while line:
            linecount += 1
            split = line.split('\t')
            counts = [0, 0]
            pvalue = 1
            if split[2] == reference:
                tmp_counts = functions.getCounts(split[4], forward=True)
                counts = [tmp_counts[0], tmp_counts[1][mutation]]
                if counts[0] > mincov and counts[1] > 0:
                    if counts[0] > 500:
                        pvalue = getPvalue(
                            round((counts[1] / counts[0]) * 500), 500,
                            probabilities, SNPlikely)
                    else:
                        pvalue = getPvalue(counts[1], counts[0], probabilities,
                                           SNPlikely)
                    if pvalue <= maxPvalue:
                        print(split[0],
                              split[1],
                              counts[1],
                              counts[0],
                              1 - pvalue,
                              '+',
                              0,
                              pvalue,
                              sep='\t',
                              file=file_table)
                        found_sites += 1
            if split[2] == reference_reverse:
                tmp_counts = functions.getCounts(split[4], forward=False)
                counts = [tmp_counts[0], tmp_counts[1][mutation_reverse]]
                if counts[0] > mincov and counts[1] > 0:
                    if counts[0] > 500:
                        pvalue = getPvalue(
                            round((counts[1] / counts[0]) * 500), 500,
                            probabilities, SNPlikely)
                    else:
                        pvalue = getPvalue(counts[1], counts[0], probabilities,
                                           SNPlikely)
                    if pvalue <= maxPvalue:
                        print(split[0],
                              split[1],
                              counts[1],
                              counts[0],
                              1 - pvalue,
                              '-',
                              0,
                              pvalue,
                              sep='\t',
                              file=file_table)
                        found_sites += 1
            percent_new = math.trunc((linecount / lines) * 100)
            if (percent_new > percent_old):
                if verbose:
                    functions.showProgress(linecount, lines,
                                           'Processing Pileup')
                percent_old = percent_new
            line = file_pileup.readline()
        print('Found %s PAR-CLIP sites.' % found_sites)
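
getPvalue is implemented elsewhere in the package; conceptually it reads a right-tail probability off the distributions precalculated in Example No. 3. A hedged sketch of that lookup (assumed behaviour, ignoring the SNPlikely flag):

def pvalue_from_distributions(k, n, distributions):
    # P(X >= k) for coverage n under the precalculated background model;
    # assumes minN <= n <= maxN so distributions[n] is a list of length n + 1
    probs = distributions[n]  # probabilities for 0..n mutations
    return sum(probs[k:])
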