Ejemplo n.º 1
0
def main(inputfile, outputfile, gfffile, gffmin, gffmax, takeStop, upstream,
         downstream, verbose):
    """Filter PAR-CLIP sites by their position relative to GFF annotations.

    Loads the sites from ``inputfile`` and the annotation from ``gfffile``
    (restricted with ``anno.filterSize(gffmin, gffmax)``), then writes to
    ``outputfile`` every site for which ``anno.isAround(...)[1]`` is truthy.

    Args:
        inputfile: Path of the PAR-CLIP site file to read.
        outputfile: Path the selected sites are saved to.
        gfffile: Path of the GFF annotation file.
        gffmin: First argument forwarded to ``anno.filterSize``.
        gffmax: Second argument forwarded to ``anno.filterSize``.
        takeStop: If truthy, ``isAround`` is called with ``takeStart=False``;
            otherwise with ``takeStart=True`` (presumably selecting the stop
            instead of the start of an annotation -- confirm in ``gff``).
        upstream: Forwarded unchanged to ``anno.isAround``.
        downstream: Forwarded unchanged to ``anno.isAround``.
        verbose: If truthy, report progress through ``functions.showProgress``.
    """
    takeStart = not takeStop
    sites = ParclipSiteContainer()
    sites.loadFromFile(inputfile)
    anno = gff.GFF(gfffile)
    anno.filterSize(gffmin, gffmax)
    anno.getChromosomePositions()
    if anno.size() < 10:
        print('Warning: Low number of annotation enries! ' + str(anno.size()))
    fsites = ParclipSiteContainer()
    n_sites = sites.size()
    percent_old = 0
    for i in range(n_sites):
        if anno.isAround(sites.chrs[i], sites.pos[i], sites.strand[i],
                         takeStart, upstream, downstream)[1]:
            fsites.addSite(sites.chrs[i], sites.pos[i], sites.m[i], sites.r[i],
                           sites.result[i], sites.strand[i], sites.occ[i])
        # Only refresh the progress display when the whole-number percentage
        # advances, so the display is not redrawn for every single site.
        percent_new = round(i / n_sites * 100)
        if percent_new > percent_old:
            if verbose:
                # The loop runs over the sites, so the total handed to the
                # progress display must be the number of sites (it used to
                # be the number of annotation entries).
                functions.showProgress(i, n_sites, 'selecting sites')
            percent_old = percent_new
    fsites.save2File(outputfile)
Ejemplo n.º 2
0
def main(parclipA, parclipB, outfile, width, verbose):
    """Write a table of Jaccard indices between quantile bins of two files.

    Ten bins are formed from the quantile boundaries 0, 0.1, ..., 1.0. For
    every pair (bin of ``parclipA``, bin of ``parclipB``) the entries returned
    by ``getEntries`` are compared: an entry of B counts as shared when
    ``aq.exactSearch(..., width=width)[1]`` is truthy. The index
    ``shared / (len(A) + len(B) - shared)`` is rounded to four digits and
    written to ``outfile``, one tab-terminated value per pair and one line
    per bin of A.

    Args:
        parclipA: Path of the first PAR-CLIP site file.
        parclipB: Path of the second PAR-CLIP site file.
        outfile: Path of the text file the table is written to.
        width: Forwarded as ``width`` to ``exactSearch``.
        verbose: If truthy, report progress through ``functions.showProgress``.
    """
    quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    n_bins = len(quantiles) - 1
    total = n_bins * n_bins
    total_count = 0
    if verbose:
        functions.showProgress(total_count, total, 'Calculating Jaccard-Index')
    # The context manager closes the output file even if loading or
    # searching raises part-way through the table.
    with open(outfile, 'w') as fc:
        for q1 in range(n_bins):
            a = ParclipSiteContainer()
            a.loadFromFile(parclipA)
            aq = getEntries(a, quantiles[q1], quantiles[q1 + 1])
            for q2 in range(n_bins):
                # NOTE(review): the container is reloaded for every pair as in
                # the original; whether getEntries modifies its argument is
                # not visible here, so the reload is kept.
                b = ParclipSiteContainer()
                b.loadFromFile(parclipB)
                bq = getEntries(b, quantiles[q2], quantiles[q2 + 1])
                intersect = sum(
                    1 for j in range(bq.size())
                    if aq.exactSearch(bq.chrs[j], bq.pos[j], bq.strand[j],
                                      width=width)[1]
                )
                union = aq.size() + bq.size() - intersect
                # Two empty bins have an empty union; report 0.0 instead of
                # failing with ZeroDivisionError.
                jaccard = intersect / union if union else 0.0
                fc.write(str(round(jaccard, 4)) + '\t')
                total_count += 1
                if verbose:
                    functions.showProgress(total_count, total,
                                           'Calculating Jaccard-Index')
            fc.write('\n')
    print('')
Ejemplo n.º 3
0
def main(inputfile, outputfile):
    """Replace every site's ``occ`` value by ``m / r`` and save the sites.

    Args:
        inputfile: Path of the PAR-CLIP site file to read. If it is not an
            existing file, a message is printed and the process exits with
            status -1.
        outputfile: Path the updated sites are saved to.
    """
    if not os.path.isfile(inputfile):
        print('Inputfile: '+inputfile+' does not exist')
        sys.exit(-1)

    container = ParclipSiteContainer()
    container.loadFromFile(inputfile)

    n_entries = container.size()
    for idx in range(n_entries):
        ratio = container.m[idx] / container.r[idx]
        container.occ[idx] = ratio

    container.save2File(outputfile)
Ejemplo n.º 4
0
def main(parclipA, parclipB, start, stop, width, anno=None, annowidth=100,
         logRatio=False, verbose=False):
    """Score how strongly sites of file A co-localise with values of file B.

    The sites of ``parclipA`` are sorted with ``sort(key='occ')``. Starting
    at index ``start``, up to ``stop - start`` sites are selected (only those
    for which ``anno.isInside(...)[1]`` is truthy when ``anno`` is given).
    For each selected site ``dataB.getValues(...)`` is queried; whenever it
    returns something other than ``None`` the maximum of the returned values
    is accumulated. Both the accumulated sum and the hit counter start at 1,
    and the result is their quotient.

    Args:
        parclipA: Path of the PAR-CLIP file the sites are selected from.
        parclipB: Path of the PAR-CLIP file the values are read from.
        start: Index of the first candidate site in the sorted file A.
        stop: ``stop - start`` is the number of sites to select.
        width: Passed twice to ``dataB.getValues`` (presumably the window to
            either side of the site -- confirm in the container class).
        anno: Optional annotation object providing ``isInside``; ``None``
            disables the annotation filter.
        annowidth: Passed twice to ``anno.isInside``.
        logRatio: If truthy, return the base-2 logarithm of the score divided
            by ``functions.getQuantile(dataB.occ, 0.5)``.
        verbose: If truthy, report progress through ``functions.showProgress``.

    Returns:
        The co-localisation score, or its log2 ratio when ``logRatio`` is set.

    Raises:
        SystemExit: If ``start``/``stop`` are not valid for file A.
    """
    tmpA = ParclipSiteContainer()
    dataB = ParclipSiteContainer()
    tmpA.loadFromFile(parclipA)
    tmpA.sort(key='occ')
    dataB.loadFromFile(parclipB)
    if start < 0 or stop < start or stop >= tmpA.size():
        print('Bullshit start and stop indices. Come on! Concentrate!')
        # Exit with a non-zero status so that an invalid index range is not
        # reported to the calling shell as success.
        sys.exit(1)
    dataA = parclipsites.ParclipSites('')
    total = stop - start
    count = 0
    i = start
    # NOTE(review): the bound ``size() - 1`` means the last site of file A is
    # never considered; kept as in the original -- confirm whether intended.
    while count < total and i < (tmpA.size() - 1):
        if verbose:
            functions.showProgress(count, total - 1, 'Selecting PAR-CLIP sites')
        # Without an annotation every site is taken; otherwise only sites
        # the annotation reports as inside.
        if anno is None or anno.isInside(tmpA.chrs[i], tmpA.pos[i],
                                         tmpA.strand[i], annowidth,
                                         annowidth)[1]:
            dataA.addSite(tmpA.chrs[i], tmpA.pos[i], tmpA.m[i], tmpA.r[i],
                          tmpA.result[i], tmpA.strand[i], tmpA.occ[i])
            count += 1
        i += 1
    # Sum and counter both start at 1, so the final division is always defined.
    coloc = 1
    count_coloc = 1
    if verbose:
        print('\n')
    for i in range(dataA.size()):
        values = dataB.getValues(dataA.chrs[i], dataA.pos[i], dataA.strand[i],
                                 True, width, width)
        # Identity check: ``!= None`` would be evaluated element-wise if
        # getValues returned an array-like object.
        if values is not None:
            count_coloc += 1
            coloc += max(values)
        if verbose:
            functions.showProgress(i, (dataA.size() - 1),
                                   'Collecting colocolization data')
    coloc = coloc / count_coloc
    if verbose:
        print('')
    if logRatio:
        return math.log(coloc / functions.getQuantile(dataB.occ, 0.5), 2)
    return coloc
Ejemplo n.º 5
0
def main(parclipfile, outputfile, gfffile, downstream, upstream, gene, sense, minSize,
         maxSize, verbose, vstring=''):
    """Write PAR-CLIP values around both ends of every annotation entry.

    The annotation in ``gfffile`` is restricted with
    ``filterSize(minSize, maxSize)``. For each remaining entry two
    ``getValues`` queries are made on the sites of ``parclipfile``: one
    anchored at ``start`` ('+' strand) or ``stop`` (any other strand) with
    ``(upstream, gene)``, and one anchored at the opposite end with
    ``(gene, downstream)``. When both queries return something other than
    ``None``, their values are written to ``outputfile`` as one
    tab-separated line.

    Args:
        parclipfile: Path of the PAR-CLIP site file.
        outputfile: Path of the text file to write.
        gfffile: Path of the GFF annotation file.
        downstream: Last argument of the second ``getValues`` query.
        upstream: Fifth argument of the first ``getValues`` query.
        gene: Last argument of the first and fifth argument of the second
            ``getValues`` query.
        sense: Forwarded unchanged to both ``getValues`` queries.
        minSize: First argument forwarded to ``anno.filterSize``.
        maxSize: Second argument forwarded to ``anno.filterSize``.
        verbose: If truthy, report progress through ``functions.showProgress``.
        vstring: Label handed to ``functions.showProgress``.
    """
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    pc = ParclipSiteContainer()
    pc.loadFromFile(parclipfile)
    with open(outputfile, 'w') as out_handle:
        for idx in range(anno.size()):
            if verbose:
                functions.showProgress(idx, (anno.size() - 1), vstring)
            # Pick the anchor coordinates by strand, then run the two queries
            # in the same order for either orientation.
            if anno.strand[idx] == '+':
                first_anchor, second_anchor = anno.start[idx], anno.stop[idx]
            else:
                first_anchor, second_anchor = anno.stop[idx], anno.start[idx]
            before = pc.getValues(anno.chr[idx], first_anchor, anno.strand[idx],
                                  sense, upstream, gene)
            after = pc.getValues(anno.chr[idx], second_anchor, anno.strand[idx],
                                 sense, gene, downstream)
            if before is None or after is None:
                continue
            line = '\t'.join(map(str, chain(before, after)))
            out_handle.write(line + '\n')
        if verbose:
            print()
Ejemplo n.º 6
0
from mockinbird.utils import ParclipSiteContainer

if __name__ == '__main__':
    # Command-line entry point: load PAR-CLIP sites, optionally drop the
    # sites located in a GFF annotation, sort them by the chosen key and
    # save sequences around them as a fasta file.
    parser = argparse.ArgumentParser(description='Takes PAR-CLIP sites and a genome and saves genomic sequences as fasta file around PAR-CLIP sites according to the given parameters.', epilog="contact: [email protected]")
    parser.add_argument('sites',      help='PAR-CLIP file *.table')
    parser.add_argument('genome',     help='path to genome')
    parser.add_argument('fafile',     help='output filename')
    # argparse only applies ``default`` to a positional argument that is
    # declared optional, so the positionals below use nargs='?'. Without it
    # they were all required and the advertised defaults were never used.
    parser.add_argument('filterGFF',  nargs='?', help="set path to GFF if sites should be removed that overlap with the GFF [default = '']", default='')
    parser.add_argument('start',      nargs='?', help='start index of PAR-CLIP sites [default=0]', type=int, default = 0)
    parser.add_argument('stop',       nargs='?', help='stop index of PAR-CLIP sites [default=1500]', type=int, default = 1500)
    parser.add_argument('width',      nargs='?', help='number of nt +/- the crosslink site [default=15]', type=int, default = 15)
    parser.add_argument('additionalFilterWidth', nargs='?', help='number of nt that are added to the start/stop indices of the GFF annotations', type=int, default = 20)
    parser.add_argument('key',  nargs='?', help='set key that is used for PAR-CLIP site ordering [default = \'occ\'], options: [\'occ\', \'m\', \'r\', \'mr\', \'pvalue\']', default='occ')
    parser.add_argument('-v','--verbose', dest='verbose', action="store_true", default=False, help='verbose output')
    args = parser.parse_args()

    yeast   = genome.Genome(args.genome, False)
    sites   = ParclipSiteContainer()
    sites.loadFromFile(args.sites)

    if args.verbose:
        print('#sites              : '+str(sites.size()))
    # An empty filterGFF path (the default) disables the annotation filter.
    if args.filterGFF != '':
        anno     = gff.GFF(args.filterGFF)
        sites   = sites.removeSitesLocatedInGFF(anno, args.additionalFilterWidth)
        print('#sites after removal: '+str(sites.size()))

    sites.sort(args.key)
    sites.save2Fasta(yeast, args.fafile, args.start, args.stop, args.width)