Ejemplo n.º 1
0
def main(parclipA, parclipB, outfile, width, verbose):
    """Write a matrix of Jaccard indices between occupancy-quantile bins.

    Both PAR-CLIP tables are split into ten occupancy bins (deciles) with
    ``getEntries``.  For every pair (bin of A, bin of B) the number of sites
    of B for which ``exactSearch`` on the A bin reports a hit is taken as the
    intersection, and ``intersection / (|A| + |B| - intersection)`` is
    written to ``outfile``: one tab-separated row per bin of A, one column
    per bin of B, each value rounded to four decimals.

    Args:
        parclipA: path of the first PAR-CLIP table.
        parclipB: path of the second PAR-CLIP table.
        outfile: path of the text file to write.
        width: forwarded to ``exactSearch`` as ``width``.
        verbose: if true, report progress via ``functions.showProgress``.
    """
    quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    n_bins = len(quantiles) - 1
    total = n_bins * n_bins
    total_count = 0
    if verbose:
        functions.showProgress(total_count, total, 'Calculating Jaccard-Index')

    # Load each table once; getEntries builds a new container per bin, so
    # the loaded tables can be reused for every quantile range.
    sites_a = ParclipSiteContainer()
    sites_a.loadFromFile(parclipA)
    sites_b = ParclipSiteContainer()
    sites_b.loadFromFile(parclipB)
    bins_b = [getEntries(sites_b, quantiles[k], quantiles[k + 1])
              for k in range(n_bins)]

    # The context manager guarantees the file is closed even on errors.
    with open(outfile, 'w') as fc:
        for q1 in range(n_bins):
            aq = getEntries(sites_a, quantiles[q1], quantiles[q1 + 1])
            for bq in bins_b:
                intersect = 0
                for j in range(bq.size()):
                    if aq.exactSearch(bq.chrs[j], bq.pos[j], bq.strand[j],
                                      width=width)[1]:
                        intersect += 1
                union = aq.size() + bq.size() - intersect
                # Two empty bins have an empty union; report 0 instead of
                # failing with ZeroDivisionError.
                jaccard = intersect / union if union else 0.0
                fc.write(str(round(jaccard, 4)) + '\t')
                total_count += 1
                if verbose:
                    functions.showProgress(total_count, total,
                                           'Calculating Jaccard-Index')
            fc.write('\n')
    print('')
Ejemplo n.º 2
0
def main(parclipfile,
         gfffile,
         upstream,
         downstream,
         sense,
         minSize,
         maxSize,
         verbose,
         xbins,
         ybins,
         vstring=''):
    """Build a binned matrix of PAR-CLIP values over size-sorted annotations.

    For every annotation kept by ``filterSize(minSize, maxSize)`` one row of
    values is fetched with ``pc.getValues`` (anchored at ``start`` on the
    '+' strand and at ``stop`` otherwise), padded with -1 and reduced to
    ``xbins`` columns by ``functions.shrinkValues``.  Consecutive rows are
    then averaged in groups of ``round(anno.size() / ybins)`` rows.

    Args:
        parclipfile: path of the PAR-CLIP table.
        gfffile: path of the GFF annotation file.
        upstream: passed to ``getValues`` and used for the row width.
        downstream: added to the annotation length in the ``getValues`` call.
        sense: forwarded to ``getValues``.
        minSize, maxSize: size filter applied to the annotations.
        verbose: if true, report progress via ``functions.showProgress``.
        xbins: number of columns of the returned matrix.
        ybins: requested number of row groups; capped at the number of
            annotations.
        vstring: label shown by the progress indicator.

    Returns:
        ``(smat, sannosize)``: the averaged rows and, per averaged row, the
        mean annotation length (``stop - start``) of the rows in the group.

    Raises:
        ValueError: if annotations exist but ``ybins`` is smaller than 1.
    """
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    totalsize = upstream + maxSize + 1 + downstream
    anno.sort2size()
    pc = ParclipSiteContainer.from_file(parclipfile)
    n_anno = anno.size()
    mat = []
    annosize = []
    for g in range(n_anno):
        tmp = [-1] * totalsize
        if verbose:
            functions.showProgress(g, (n_anno - 1), vstring)
        length = anno.stop[g] - anno.start[g]
        anchor = anno.start[g] if anno.strand[g] == '+' else anno.stop[g]
        values = pc.getValues(anno.chr[g], anchor, anno.strand[g],
                              sense, upstream, length + downstream)
        if values is not None:
            # NOTE(review): this slice is one element shorter than `values`,
            # so the row grows by one element - confirm this is intended
            # before changing it (it affects shrinkValues' input length).
            tmp[0:(len(values) - 1)] = values
        mat.append(functions.shrinkValues(tmp, xbins))
        annosize.append(length)

    # Finish the progress line; originally this sat after the return
    # statement and could never run.
    if verbose:
        print()

    # Nothing to aggregate: avoid dividing by a zero bin count below.
    if n_anno == 0:
        return [], []
    if ybins < 1:
        raise ValueError('ybins must be at least 1 - got %s' % ybins)

    smat = []
    sannosize = []
    if ybins >= n_anno:
        print('Warning: --ybins >= entries in ' + gfffile)
        ybins = n_anno
    ystep = round(n_anno / ybins)
    ystart = 0
    ystop = ystep
    while ystop < n_anno:
        count = ystop - ystart
        # Column-wise mean of the rows ystart..ystop-1.
        row = [sum(mat[j][i] for j in range(ystart, ystop)) / count
               for i in range(xbins)]
        smat.append(row)
        # The mean annotation size is per row group, independent of xbins.
        sannosize.append(sum(annosize[ystart:ystop]) / count)
        ystart = ystop
        ystop += ystep
    return smat, sannosize
Ejemplo n.º 3
0
def main(inputfile, outputfile):
    """Recompute every site's occupancy as ``m / r`` and save the table.

    Exits with status -1 (after printing a message) when ``inputfile`` is
    not an existing regular file.

    Args:
        inputfile: path of the PAR-CLIP table to read.
        outputfile: path the updated table is written to.
    """
    input_present = os.path.isfile(inputfile)
    if not input_present:
        print('Inputfile: '+inputfile+' does not exist')
        sys.exit(-1)

    container = ParclipSiteContainer()
    container.loadFromFile(inputfile)

    n_sites = container.size()
    for idx in range(n_sites):
        # Occupancy is the ratio of the m and r columns of the same site.
        container.occ[idx] = container.m[idx] / container.r[idx]

    container.save2File(outputfile)
Ejemplo n.º 4
0
def main(input_file, output_file, q):
    """Cap site occupancies at the ``q`` quantile and save the result.

    Every record whose ``occupancy`` exceeds the quantile value returned by
    ``functions.getQuantile`` gets that value instead; all other records
    are copied unchanged.

    Args:
        input_file: path of the PAR-CLIP table to read.
        output_file: path the capped table is written to.
        q: quantile in the half-open interval [0, 1); any other value
            prints a message and exits with status 1.
    """
    if not (q >= 0 and q < 1):
        print('q must lie between 0 and 1 - got %s' % q)
        sys.exit(1)

    sites = ParclipSiteContainer.from_file(input_file)

    occupancies = [site.occupancy for site in sites]
    # The quantile is only computed for non-empty input; with no sites the
    # cap is never looked up because the comprehension below has nothing
    # to iterate over.
    if occupancies:
        max_occ = functions.getQuantile(occupancies, q)

    capped = [
        site._replace(occupancy=max_occ) if site.occupancy > max_occ else site
        for site in sites
    ]

    ParclipSiteContainer(capped).save2File(output_file)
Ejemplo n.º 5
0
def getEntries(sites, q1, q2):
    """Return the sites whose occupancy lies in the quantile range (q1, q2].

    The bounds are the values ``functions.getQuantile`` returns for
    ``sites.occ`` at ``q1`` and ``q2``.  Matching sites are copied into a
    new ``ParclipSiteContainer``; ``sites`` itself is only read here.

    Args:
        sites: container providing ``occ``, ``chrs``, ``pos``, ``m``, ``r``,
            ``result`` and ``strand`` columns plus ``size()``.
        q1: quantile giving the exclusive lower occupancy bound.
        q2: quantile giving the inclusive upper occupancy bound.

    Returns:
        A new ``ParclipSiteContainer`` on which
        ``getChromosomePositions()`` has already been called.
    """
    lower = functions.getQuantile(sites.occ, q1)
    upper = functions.getQuantile(sites.occ, q2)
    pc = ParclipSiteContainer()
    for i in range(sites.size()):
        # NOTE(review): the lower bound is exclusive, so sites equal to the
        # q1 quantile value fall into no bin starting at q1 - confirm this
        # is intended for q1 == 0.
        if lower < sites.occ[i] <= upper:
            pc.addSite(sites.chrs[i], sites.pos[i], sites.m[i], sites.r[i],
                       sites.result[i], sites.strand[i], sites.occ[i])
    pc.getChromosomePositions()
    return pc
Ejemplo n.º 6
0
def run():
    """Command-line entry point: count k-mers per position and plot them.

    Loads the PAR-CLIP sites, optionally removes sites via
    ``remove_gff_sites``, sorts them by ``args.key`` (descending), takes the
    slice ``[args.start:args.stop]``, extracts the sequences around the
    selected sites, writes the k-mer table with ``getKmerOccurences`` and
    finally runs the ``plotKmerPerPosition.R`` script located next to this
    file.  The table is deleted afterwards when ``args.remove`` is set.
    """
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerPerPosition.R')

    parser = create_parser()
    args = parser.parse_args()

    sites = ParclipSiteContainer.from_file(args.inputfile)

    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)

    sites.sort(by=args.key, ascending=False)

    with EfficientGenome(args.genome) as genome:
        sites = sites[args.start:args.stop]
        seqs = sites.get_all_sequences(genome, args.width)

    prefix_fmt = '%s_kmerPerPosition_kmer%s_start%s_stop%s_width%s_sort_%s'

    prefix = prefix_fmt % (args.prefix, args.kmer, args.start, args.stop,
                           args.width, args.key)
    outfile_table = os.path.join(args.outdir, prefix + '.table')
    outfile_pdf = os.path.join(args.outdir, prefix + '.pdf')
    seq_len = 2 * args.width + 1
    getKmerOccurences(seqs,
                      seq_len,
                      outfile_table,
                      kmer=(args.kmer - 1),
                      verbose=args.verbose)

    # Every element of a command line must be a string; the numeric plot
    # arguments are converted explicitly instead of relying on `execute`.
    cmd = [
        'R', '-q', '--slave',
        '-f %s' % plot_script, '--args', outfile_table, outfile_pdf,
        str(args.width), '0', str(args.width + 1),
    ]
    execute(cmd)
    if args.remove:
        os.remove(outfile_table)
Ejemplo n.º 7
0
def main(parclipfile, outputfile, gfffile, downstream, upstream, gene, sense, minSize,
         maxSize, verbose, vstring=''):
    """Write PAR-CLIP values around both ends of every annotation.

    For each annotation that passes ``filterSize(minSize, maxSize)`` two
    ``getValues`` windows are requested: one with ``(upstream, gene)`` and
    one with ``(gene, downstream)`` as the trailing arguments.  On the '+'
    strand the first window is anchored at ``start`` and the second at
    ``stop``; on any other strand the anchors are swapped.  When both
    windows are available they are written as one tab-separated line.

    Args:
        parclipfile: path of the PAR-CLIP table.
        outputfile: path of the tab-separated output file.
        gfffile: path of the GFF annotation file.
        downstream, upstream, gene: window sizes forwarded to ``getValues``.
        sense: forwarded to ``getValues``.
        minSize, maxSize: size filter applied to the annotations.
        verbose: if true, report progress via ``functions.showProgress``.
        vstring: label shown by the progress indicator.
    """
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    pc = ParclipSiteContainer()
    pc.loadFromFile(parclipfile)
    with open(outputfile, 'w') as fc_out:
        for idx in range(anno.size()):
            if verbose:
                functions.showProgress(idx, (anno.size() - 1), vstring)

            chrom = anno.chr[idx]
            strand = anno.strand[idx]
            # Pick the anchor of each window depending on the strand.
            if strand == '+':
                first_anchor = anno.start[idx]
                second_anchor = anno.stop[idx]
            else:
                first_anchor = anno.stop[idx]
                second_anchor = anno.start[idx]

            head = pc.getValues(chrom, first_anchor, strand, sense,
                                upstream, gene)
            tail = pc.getValues(chrom, second_anchor, strand, sense,
                                gene, downstream)

            # Annotations with a missing window are skipped entirely.
            if head is None or tail is None:
                continue
            print(*chain(head, tail), sep='\t', file=fc_out)
        if verbose:
            print()
Ejemplo n.º 8
0
def run():
    """Command-line entry point: run XXmotif on top PAR-CLIP sites.

    Loads and optionally filters the sites, sorts them by ``args.key``
    (descending), keeps the slice ``[args.start:args.stop]`` and writes
    their sequences to a FASTA file in ``args.outdir``.  XXmotif is then
    executed on that file, followed by the ``plotDistribution.R`` script
    expected in ``<outdir>/tmp`` and, when ``args.plotPWM > 0``, the
    ``weblogo.R`` plot script.  Unless ``args.keep_tmp_files`` is set the
    ``tmp`` directory is removed at the end.
    """
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    parser = create_parser()
    args = parser.parse_args()

    prefix_pat = '%s_xxmotif_start%s_stop%s_width%s_sort_%s'
    file_prefix = prefix_pat % (args.prefix, args.start, args.stop, args.width,
                                args.key)

    sites = ParclipSiteContainer.from_file(args.inputfile)

    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)

    sites.sort(by=args.key, ascending=False)
    sites = sites[args.start:args.stop]
    gen_file = os.path.join(args.outdir, file_prefix + '.fa')
    with EfficientGenome(args.genome) as genome:
        sites.save2Fasta(genome, gen_file, width=args.width)

    cmd = [
        'XXmotif',
        args.outdir,
        gen_file,
        '--zoops',
        '--merge-motif-threshold LOW',
        '--max-match-positions 10',
    ]
    if args.negSet:
        cmd.append('--negSet %s' % args.negSet)
    execute(cmd)

    tmp_dir = os.path.join(args.outdir, 'tmp')
    mini_plot_script = os.path.join(tmp_dir, 'plotDistribution.R')

    mini_plot_cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % mini_plot_script,
        '--args',
        '%r' % args.outdir,
    ]
    execute(mini_plot_cmd)

    if args.plotPWM > 0:
        plot_script = os.path.join(scriptPath, '..', 'plots', 'weblogo.R')
        pwm_file = os.path.join(args.outdir, file_prefix + '.pwm')
        plot_cmd = [
            'R',
            '-q',
            '--slave',
            '-f %s' % plot_script,
            '--args',
            pwm_file,
            args.outdir,
            file_prefix,
            # plotPWM is numeric (it is compared with 0 above); command
            # elements must be strings.
            str(args.plotPWM),
        ]
        execute(plot_cmd)

    if not args.keep_tmp_files:
        shutil.rmtree(tmp_dir, ignore_errors=True)
Ejemplo n.º 9
0
def main(parclipA, parclipB, start, stop, width, anno=None, annowidth=100,
         logRatio=False, verbose=False):
    """Measure how strongly sites of table A co-localize with table B.

    Table A is sorted by occupancy and up to ``stop - start`` sites are
    selected beginning at index ``start``; when ``anno`` is given only
    sites for which ``anno.isInside(...)[1]`` is truthy are taken.  For
    every selected site the values of table B in a ``+/- width`` window
    are fetched and their maximum is accumulated.

    Args:
        parclipA: path of the PAR-CLIP table the sites are selected from.
        parclipB: path of the PAR-CLIP table providing the values.
        start, stop: index range within the sorted table A.
        width: window size passed twice to ``dataB.getValues``.
        anno: optional annotation object offering ``isInside``.
        annowidth: passed twice to ``anno.isInside``.
        logRatio: if true, return log2 of the mean divided by the median
            occupancy of table B instead of the mean itself.
        verbose: if true, report progress via ``functions.showProgress``.

    Returns:
        The accumulated maxima divided by the number of windows found;
        both counters start at 1.  With ``logRatio`` the log2 ratio
        described above.

    Exits with status 1 when the index range is invalid.
    """
    tmpA = ParclipSiteContainer()
    dataB = ParclipSiteContainer()
    tmpA.loadFromFile(parclipA)
    tmpA.sort(key='occ')
    dataB.loadFromFile(parclipB)
    if start < 0 or stop < start or stop >= tmpA.size():
        print('Bullshit start and stop indices. Come on! Concentrate!')
        # Invalid input is an error: signal it with a non-zero status.
        sys.exit(1)
    dataA = parclipsites.ParclipSites('')
    total = stop - start
    count = 0
    i = start
    while count < total and i < (tmpA.size() - 1):
        if verbose:
            functions.showProgress(count, total - 1, 'Selecting PAR-CLIP sites')
        # Without an annotation every site qualifies; otherwise only
        # sites reported as inside an annotation are taken.
        if anno is None or anno.isInside(tmpA.chrs[i], tmpA.pos[i],
                                         tmpA.strand[i], annowidth,
                                         annowidth)[1]:
            dataA.addSite(tmpA.chrs[i], tmpA.pos[i], tmpA.m[i], tmpA.r[i],
                          tmpA.result[i], tmpA.strand[i], tmpA.occ[i])
            count += 1
        i += 1
    # NOTE(review): both counters start at 1, which looks like a
    # pseudo-count that avoids dividing by zero - confirm the intent.
    coloc = 1
    count_coloc = 1
    if verbose:
        print('\n')
    for i in range(dataA.size()):
        values = dataB.getValues(dataA.chrs[i], dataA.pos[i], dataA.strand[i],
                                 True, width, width)
        # Identity check: `!= None` may be overloaded by container types.
        if values is not None:
            count_coloc += 1
            coloc += max(values)
        if verbose:
            functions.showProgress(i, (dataA.size() - 1),
                                   'Collecting colocolization data')
    coloc = coloc / count_coloc
    if verbose:
        print('')
    if logRatio:
        return math.log(coloc / functions.getQuantile(dataB.occ, 0.5), 2)
    return coloc
Ejemplo n.º 10
0
 def _get_container(self):
     """Return a ParclipSiteContainer loaded from ``TABLE_DIR``."""
     return ParclipSiteContainer.from_file(TABLE_DIR)
Ejemplo n.º 11
0
def main(inputfile, outputfile, gfffile, gffmin, gffmax, takeStop, upstream,
         downstream, verbose):
    """Keep only the PAR-CLIP sites located around annotation boundaries.

    A site is kept when ``anno.isAround(...)[1]`` is truthy for it.  The
    kept sites are written to ``outputfile``.

    Args:
        inputfile: path of the PAR-CLIP table to filter.
        outputfile: path the filtered table is written to.
        gfffile: path of the GFF annotation file.
        gffmin, gffmax: size filter applied to the annotations.
        takeStop: if true, ``isAround`` is called with ``takeStart=False``.
        upstream, downstream: forwarded to ``isAround``.
        verbose: if true, report progress via ``functions.showProgress``.
    """
    takeStart = not takeStop
    sites = ParclipSiteContainer()
    sites.loadFromFile(inputfile)
    anno = gff.GFF(gfffile)
    anno.filterSize(gffmin, gffmax)
    anno.getChromosomePositions()
    if anno.size() < 10:
        print('Warning: Low number of annotation enries! ' + str(anno.size()))
    fsites = ParclipSiteContainer()
    n_sites = sites.size()
    percent_old = 0
    for i in range(n_sites):
        if anno.isAround(sites.chrs[i], sites.pos[i], sites.strand[i],
                         takeStart, upstream, downstream)[1]:
            fsites.addSite(sites.chrs[i], sites.pos[i], sites.m[i], sites.r[i],
                           sites.result[i], sites.strand[i], sites.occ[i])
        # Only refresh the progress display when the percentage changes.
        percent_new = round(i / n_sites * 100)
        if percent_new > percent_old:
            if verbose:
                # The loop runs over the sites, so the total must be the
                # number of sites (it used to be the number of annotations).
                functions.showProgress(i, n_sites, 'selecting sites')
            percent_old = percent_new
    fsites.save2File(outputfile)
Ejemplo n.º 12
0
def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    """Compute k-mer log-odds for bins of sorted PAR-CLIP sites and plot them.

    The sites are optionally filtered with ``remove_gff_sites``, sorted by
    ``key`` (descending) and processed in bins by ``getkmerLogs``:

    * with ``useQuantiles`` the first 1000 sites are followed by windows of
      +/- 500 sites around a fixed list of quantile indices;
    * otherwise consecutive bins of 1000 sites are used until the end of
      the table or 50000 sites are reached.

    The results are written with ``sortAndSave`` and plotted by the
    ``plotKmerLogOdds.R`` script located next to this file.

    Args:
        parclip: path of the PAR-CLIP table.
        outdir: directory receiving the table and the PDF.
        prefix: file name prefix of the outputs.
        genomepath: path handed to ``EfficientGenome``.
        negset: path handed to ``loadNegTable``.
        gfffile: optional GFF path; ``None`` disables the filtering.
        kmer: k-mer length.
        key: sort key of the sites; also part of the output file name.
        useQuantiles: selects the binning scheme described above.
        verbose: if true, report progress via ``functions.showProgress``.
        args: parsed command-line arguments; only ``keep_tmp_files`` is
            read here.
    """
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerLogOdds.R')
    pc = ParclipSiteContainer.from_file(parclip)

    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)

    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)

    with EfficientGenome(genomepath) as genomeseq:
        allfreqs = []
        fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)
        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            # NOTE(review): 0.6 is absent between 0.55 and 0.65 - confirm
            # whether that gap is intentional.
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            stop = 1000
            # Label of the previously stored bin for the overlap warning;
            # the leading bin (sites 0-1000) is reported as quantile 0.
            previous_q = 0
            for count, q in enumerate(quantiles, start=1):
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                center = functions.getQuantileIndex(len(pc), q)
                start = max(center - 500, 0)
                stop = center + 500
                if stop > len(pc) - 2:
                    break
                if (stop - 500) < old_stop:
                    # Name both bins involved: the previous and the
                    # current quantile (the same one used to be printed
                    # twice).
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    msg = msg_pat % (previous_q, q, old_stop - (stop - 500))
                    print(msg, file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                previous_q = q
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            while True:
                if stop > len(pc) - 2 or stop > maxsize:
                    print()
                    print('STOP at: %s' % stop)
                    break
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                start = stop
                stop = stop + stepsize

    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)

    cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)

    if not args.keep_tmp_files:
        os.remove(table_file)
Ejemplo n.º 13
0
from mockinbird.utils import ParclipSiteContainer

if __name__ == '__main__':
    # Script entry point: select PAR-CLIP sites and save the genomic
    # sequences around them as a FASTA file.
    parser = argparse.ArgumentParser(description='Takes PAR-CLIP sites and a genome and saves genomic sequences as fasta file around PAR-CLIP sites according to the given parameters.', epilog="contact: [email protected]")
    parser.add_argument('sites',      help='PAR-CLIP file *.table')
    parser.add_argument('genome',     help='path to genome')
    parser.add_argument('fafile',     help='output filename')
    # argparse ignores `default` on a positional argument unless it is
    # declared optional with nargs='?'; without it the documented defaults
    # below could never take effect.  Passing all values still works.
    parser.add_argument('filterGFF',  nargs='?', help="set path to GFF if sites should be removed that overlap with the GFF [default = '']", default='')
    parser.add_argument('start',      nargs='?', help='start index of PAR-CLIP sites [default=0]', type=int, default=0)
    parser.add_argument('stop',       nargs='?', help='stop index of PAR-CLIP sites [default=1500]', type=int, default=1500)
    parser.add_argument('width',      nargs='?', help='number of nt +/- the crosslink site [default=15]', type=int, default=15)
    parser.add_argument('additionalFilterWidth', nargs='?', help='number of nt that are added to the start/stop indices of the GFF annotations', type=int, default=20)
    parser.add_argument('key',        nargs='?', help='set key that is used for PAR-CLIP site ordering [default = \'occ\'], options: [\'occ\', \'m\', \'r\', \'mr\', \'pvalue\']', default='occ')
    parser.add_argument('-v', '--verbose', dest='verbose', action="store_true", default=False, help='verbose output')
    args = parser.parse_args()

    yeast = genome.Genome(args.genome, False)
    sites = ParclipSiteContainer()
    sites.loadFromFile(args.sites)

    if args.verbose:
        print('#sites              : '+str(sites.size()))
    if args.filterGFF != '':
        # Drop the sites that removeSitesLocatedInGFF filters out for this
        # annotation and filter width.
        anno = gff.GFF(args.filterGFF)
        sites = sites.removeSitesLocatedInGFF(anno, args.additionalFilterWidth)
        print('#sites after removal: '+str(sites.size()))

    sites.sort(args.key)
    sites.save2Fasta(yeast, args.fafile, args.start, args.stop, args.width)