def getSeeds(fafile,sense=True,start_at=2,stop_at=8):
    '''
    Generate (id, seed) pairs for every record of a Fasta file.
    Parameters:
        fafile: string
            Sequence file, read with IO.BioReader in 'fasta' format.
        sense: bool
            If True the seed is cut from the sequence as given, otherwise
            from its reverse complement (Utils.rc).
        start_at: int
            1-based position of the first base of the seed.
        stop_at: int
            1-based end position. The slice taken is
            [start_at-1:stop_at-1], so the base at stop_at itself is
            not part of the seed.
    Yields:
        tuple of (record id, seed sequence).
    '''
    first = start_at - 1
    last = stop_at - 1
    for record in IO.BioReader(fafile, 'fasta'):
        name = record.id
        bases = record.seq.seq
        if not sense:
            bases = Utils.rc(bases)
        yield name, bases[first:last]
def calPvalues(siRNASeeds,seedfile,outfile,method='RRA',N=10000):
    '''
    Score every gene of a seed dump against the siRNA seeds and write the
    result table.
    Parameters:
        siRNASeeds:
            Passed unchanged as the first argument of the scoring function.
        seedfile: string
            Gzip-compressed dump. It starts with a header understood by
            SeedUtils.parseHeader, followed by 2**16 uint16 values
            (2**17 bytes) per gene.
        outfile: string
            Tab-separated output file. A plot is written to outfile+".pdf"
            by SeedUtils.evdplot.
        method: string
            Name of the scoring method. Only 'RRA' is registered; any other
            name falls back to SeedUtils.aRRA as well.
        N: int
            Number of genes read and scored per chunk. [Default=10000]
    Returns:
        pandas.DataFrame with the columns gid, length, escore and pvalue,
        sorted by pvalue in descending order.
    '''
    log = sys.stderr.write
    # read headers
    log("{0} reading header\n".format(Utils.touchtime()))
    fh = gzip.open(seedfile, 'rb')
    try:
        genes, lengths = SeedUtils.parseHeader(fh)
        num_genes = len(genes)
        # aRRA
        log("{0} start analyzing data.\n".format(Utils.touchtime()))
        score = {'RRA': SeedUtils.aRRA}.get(method, SeedUtils.aRRA)
        escores = numpy.zeros(num_genes)
        for i in range(0, num_genes, N):
            # Size of this chunk. Using num_genes % N here would give 0 for
            # the last chunk whenever num_genes is an exact multiple of N,
            # silently leaving those genes with a score of 0.
            n = min(N, num_genes - i)
            raw = fh.read(2**17 * n)
            counts = numpy.frombuffer(raw, dtype=numpy.uint16, count=2**16 * n)
            # Each gene holds 4 rows of 2**14 counts; collapse the 4 rows.
            geneSeeds = numpy.sum(counts.reshape((n, 4, 2**14)), axis=1)
            escores[i:i + n] = score(siRNASeeds, geneSeeds)
            sys.stdout.write("{0} analyzed {1}\n".format(Utils.touchtime(), i + n))
    finally:
        # Release the file handle even if parsing or scoring fails.
        fh.close()
    # Writing result to file
    log("{0} writing result to file.\n".format(Utils.touchtime()))
    df = pandas.DataFrame({'gid': genes, 'length': lengths, 'escore': escores})
    rv = SeedUtils.evdplot(df, outfile + ".pdf")
    # NOTE(review): the 'pvalue' column is rv.pdf(escore), i.e. a density of
    # the object returned by evdplot - confirm whether a tail probability
    # was intended instead.
    df.loc[:, 'pvalue'] = rv.pdf(df.escore)
    df = df.sort_values(by='pvalue', ascending=False)
    df.to_csv(outfile, index=False, sep='\t',
              columns=['gid', 'length', 'escore', 'pvalue'])
    log("{0} finished.\n".format(Utils.touchtime()))
    return df
def parseGeneSeeds(gfile,dumpfile,N=10):
    '''
    Parse seeds on gene sequences and dump them to a gzip file.
    The dump consists of a header of three uint64 values (0x19840405,
    number of genes, length of the id string), the "id:length" pairs of
    all genes joined by ';', and one uint16 block of shape (4, 2**14) per
    gene in input order.
    Parameters:
        gfile: string
            Gene sequence file in Fasta format.
        dumpfile: string
            Output file, written gzip-compressed.
        N: int
            Parse N sequence at a time instead all to reduce memory usage.
            [Default=10]
            With N = 10000, memory usage for the two huge matrix is up
            to 1.83G.
    '''
    # create seed tables
    SeedUtils.createTables()
    ofh = gzip.open(dumpfile, 'wb')
    try:
        # read genes
        genes = list(IO.BioReader(gfile, 'fasta'))
        # write headers
        gids = ';'.join(["{0}:{1}".format(fa.id, len(fa)) for fa in genes])
        header = numpy.array([0x19840405, len(genes), len(gids)], dtype=numpy.uint64)
        ofh.write(header.data)
        ofh.write(gids)
        # parse genes
        mat = numpy.zeros((N, 2**15), dtype=numpy.uint16)
        seeds = numpy.zeros((N, 4, 2**14), dtype=numpy.uint16)  # N x type x seeds
        for idx, fa in enumerate(genes):
            SeedUtils.findSeeds(fa.seq.seq.upper(), mat[idx % N])
            cnt = idx + 1
            if cnt % N == 0:
                # calculate seeds for the N genes
                SeedUtils.parseSeeds(mat, seeds)
                ofh.write(seeds.data)
                # Utils.touchtime, as used for every other log line here;
                # a bare touchtime() is not known to be in scope.
                sys.stderr.write("{0} Parsed {1} genes .\n".format(Utils.touchtime(), cnt))
                seeds.fill(0)
                mat.fill(0)
        # calculate seeds for the rest of genes
        cnt = len(genes)
        rest = cnt % N
        if rest:
            SeedUtils.parseSeeds(mat[:rest], seeds[:rest])
            ofh.write(seeds[:rest].data)
            sys.stderr.write("{0} Parsed {1} genes. \n".format(Utils.touchtime(), cnt))
    finally:
        # Always close the dump so a failure does not leak the handle.
        ofh.close()
p._optionals.title = "Options" p.add_argument("-i","--input",dest='ifname',type=str,metavar="input.bed",required=True,help="Input file. Can be stdin.") p.add_argument("-f","--format",dest="ftype",type=str,metavar="bed",default="bed",help="Format of input file. Default is 'bed'. Can be 'bed3', 'bedgraph', 'bed','peak','wig', 'sam2bed' or 'genepred'.") p.add_argument("-g","--genome",dest='genome',type=str,metavar="Genome",default=None,help="Genome version (hg19, mm10 .etc) or genome size file with chrom and size in each line.") p.add_argument("-u","--up",dest="up",type=int,metavar="upstream",default=0,help="bps extended to upstream. If minus, trim the 5' end.") p.add_argument("-d","--down",dest="down",type=int,metavar="downstream",default=0,help="bps extended to downstream. If minus, trim the 3' end.") p.add_argument("-o","--output",dest="ofname",type=str,metavar="output.bed",default="stdout",help="Output file. Default is stdout.") if len(sys.argv)==1: sys.exit(p.print_help()) args = p.parse_args() return args # ------------------------------------ # Classes # ------------------------------------ # ------------------------------------ # Main # ------------------------------------ if __name__=="__main__": # Get parameters args=argParser() fh = IO.mopen(args.ofname, 'w') if args.genome: genome=Utils.genomeSize(args.genome) for item in IO.BioReader(args.ifname,args.ftype): tbed=item.extend(args.up,args.down, args.genome) print >> fh, tbed IO.mclose(fh)
if __name__=="__main__":
    # Both the annotation file and the genome size file are required;
    # anything less prints the usage instead of failing on sys.argv[2].
    if len(sys.argv) < 3:
        print("Usage: "+sys.argv[0]+" annotation.tab/bed genomesize *.bed")
        print(" Find the nearest annotation for given bed.")
    else:
        # check file: a '.tab' in the annotation file name selects the
        # 'gene' reader, everything else is read as 'bed'.
        ftype = 'gene' if '.tab' in sys.argv[1] else 'bed'
        # initiation annotations: one empty list per chromosome of the
        # genome size file.
        listtype = BedList if ftype == 'bed' else GeneBedList
        annos = {chrom: listtype() for chrom in Utils.genomeSize(sys.argv[2])}
        # read annotations; records on chromosomes missing from the
        # genome size file are dropped.
        for anno in IO.BioReader(sys.argv[1], ftype=ftype):
            if anno.chrom in annos:
                annos[anno.chrom].append(anno)
        # sort
        for chrom in annos:
            annos[chrom].sort()
        # Find nearest annotations
def findSeeds(seq,sary):
    '''
    Count the 8-mers of the reverse complement of a sequence.
    Parameters:
        seq: string
            Sequence to scan. It is reverse complemented with Utils.rc
            before counting.
        sary: array
            Counter array, modified in place. It is reset to 0 first, then
            the slot twoBytesTable[8-mer] is incremented once for every
            8-character window.
    '''
    sary.fill(0)
    revcomp = Utils.rc(seq)
    width = 8
    windows = (revcomp[pos:pos + width] for pos in range(len(revcomp) - width + 1))
    for kmer in windows:
        slot = twoBytesTable[kmer]
        sary[slot] += 1
"--output", dest="ofname", type=str, metavar="output.bed", default="stdout", help="Output file. Default is stdout.") if len(sys.argv) == 1: sys.exit(p.print_help()) args = p.parse_args() return args # ------------------------------------ # Classes # ------------------------------------ # ------------------------------------ # Main # ------------------------------------ if __name__ == "__main__": # Get parameters args = argParser() fh = IO.mopen(args.ofname, 'w') if args.genome: genome = Utils.genomeSize(args.genome) for item in IO.BioReader(args.ifname, args.ftype): tbed = item.extend(args.up, args.down, args.genome) print >> fh, tbed IO.mclose(fh)