def load_genomes(UTRfilestring, twobitfile):
    """Load the UTR annotation table and the 2bit genome a single time.

    UTRfilestring -- path to the UTR table consumed by rph.readindict
    twobitfile    -- path to a .2bit genome file

    Returns (UTRdict, genome).
    """
    # Close the UTR file handle instead of leaking it; rph.readindict returns a
    # plain dict (it is indexed like one elsewhere in this file), so nothing
    # needs the handle after the read completes.
    with open(UTRfilestring, "rU") as utr_file:
        UTRdict = rph.readindict(utr_file)
    genome = twobitreader.TwoBitFile(twobitfile)  # do we actually need to load this in here?
    return UTRdict, genome
def main(): ## using argparser to load arguments from workflow parser = argparse.ArgumentParser() parser.add_argument('--trspdictfilestring', help='input transcript density files') parser.add_argument('--UTRfilestring', help='UTRs file') parser.add_argument('--cdsDenThresh', help='boolean value- should density filter be used?') parser.add_argument( '--norm_type', help='type of normalizaion, should be raw reads or rpm') parser.add_argument('--raw_dense_thresh', help='threshold for CDS density for raw normalization') parser.add_argument('--rpm_dense_thresh', help='threshold for CDS density for rpm normalization') parser.add_argument( '--inset_choice', help='inset values to be used to avoid start and stop codon peaks') parser.add_argument('--outfilestring', help='output file name') # parser.add_argument args = parser.parse_args() args.cdsDenThresh = args.cdsDenThresh == 'True' print "ARGS: ", args.cdsDenThresh # print bool(args.cdsDenThresh) trspdict = rph.readcountsf(args.trspdictfilestring) UTRdict = rph.readindict(open(args.UTRfilestring, "rU")) countsIDlist, countsOutdict = build_count_tables( trspdict, UTRdict, args.inset_choice, args.cdsDenThresh, args.norm_type, float(args.raw_dense_thresh), float(args.rpm_dense_thresh), args.outfilestring) write_countTable_to_csv(countsIDlist, countsOutdict, args.outfilestring)
def load_genomes(UTRfilestring, firstStopsCSV, twobitfile):
    """Load the UTR table, the first-stop-codon table, and the 2bit genome once.

    UTRfilestring -- path to the UTR table consumed by rph.readindict
    firstStopsCSV -- csv of first stop codon positions, indexed by column 0
    twobitfile    -- path to a .2bit genome file

    Returns (UTRdict, utr3adj, genome).
    """
    # Close the UTR file handle instead of leaking it; rph.readindict returns a
    # plain dict, so the handle is not needed after the read.
    with open(UTRfilestring, "rU") as utr_file:
        UTRdict = rph.readindict(utr_file)
    utr3adj = pd.read_csv(firstStopsCSV, index_col=0)
    genome = twobitreader.TwoBitFile(twobitfile)  # do we actually need to load this in here?
    return UTRdict, utr3adj, genome
def main():
    """Parse workflow arguments, build 3'UTR-adjusted count tables, write csv."""
    ## using argparser to load arguments from workflow
    parser = argparse.ArgumentParser()
    parser.add_argument('--trspdictfilestring', help='input transcript density files')
    parser.add_argument('--UTRfilestring', help='UTRs file')
    parser.add_argument('--cdsDenThresh', help='boolean value- should density filter be used?')
    parser.add_argument('--norm_type', help='type of normalizaion, should be raw reads or rpm')
    parser.add_argument('--raw_dense_thresh', help='threshold for CDS density for raw normalization')
    parser.add_argument('--rpm_dense_thresh', help='threshold for CDS density for rpm normalization')
    parser.add_argument('--inset_choice', help='inset values to be used to avoid start and stop codon peaks')
    parser.add_argument('--outfilestring', help='output file name')
    parser.add_argument('--totreads', help='total number of reads used after raw densebuilder run')
    parser.add_argument('--stopcodons', help='csv file with positions of all stopcodons, riboseq_stopcodon_finder.py')
    args = parser.parse_args()

    utr3adj = pd.read_csv(args.stopcodons, index_col=0)

    trspdict = rph.readcountsf(args.trspdictfilestring)
    # Use a with-block so the UTR file handle is closed rather than leaked.
    with open(args.UTRfilestring, "rU") as utr_file:
        UTRdict = rph.readindict(utr_file)

    # BUG FIX: bool(args.cdsDenThresh) was True for ANY non-empty string --
    # including the literal 'False' -- so the density filter could never be
    # disabled from the command line. Compare against 'True' instead, the same
    # conversion the other main() in this file uses.
    cdsDenThresh = args.cdsDenThresh == 'True'

    # minUtr3len is expected to be a module-level constant -- TODO confirm it
    # is defined wherever this script is assembled.
    countsIDlist, countsOutdict = build_count_tables(
        trspdict, UTRdict, utr3adj, args.inset_choice, cdsDenThresh,
        args.norm_type, float(args.raw_dense_thresh),
        float(args.rpm_dense_thresh), args.outfilestring,
        int(args.totreads), minUtr3len)
    write_countTable_to_csv(countsIDlist, countsOutdict, args.outfilestring)
def codonaverage(self): outlist, headers, motiffilelist = [], [], [] headers.append("motif") for motif in self.motifs: motiffile = args.motiffilerootpath + motif + "_1.csv" motiffilelist.append(motiffile) headers.append(motif) outlist.append(headers) codon_occu = [] codon_occu.append(self.sample_name) f_output = open(args.outfileparams, "w") f_output.write("Density file is " + str(self.sample_name) + "\n") f_output.write("cds5trim is " + str(args.cds5trim) + "\n") f_output.write("cds3trim is " + str(args.cds3trim) + "\n") f_output.write("Seqwin is " + str(args.seqwin) + "\n") f_output.write("Motiflist is " + str(motiffilelist) + "\n") readcountsdict = rph.readcountsf(args.trspdictfilestring) exclusionmodule = exclusionfiles[0] if exclusionmodule != '0': exclusiondict = self.readindict(open(exclusionmodule, "rU")) else: exclusiondict = '0' print "Exclusion file is " + str(exclusionmodule) UTRdict = rph.readindict(open(args.UTRfilestring, "rU")) occupancy = self.occupancy(readcountsdict, motiffilelist, exclusiondict, codon_occu, UTRdict, f_output) outlist.append(codon_occu) f_output.close() co = np.asarray(outlist) # convert outlist to a np.array output = co.T # print "output: ", output # print "self.outlistfile: ", self.outlistfile self.writerows(output, self.outlistfile) # write these rows to a csv
# Footer lines for the human-readable parameter summary.
comments += "Threshold signifies minimal rpkm needed in coding region for gene to be in the average.\n"
comments += "alignpos =1 anchors average around the start codon and only includes 5'UTRs. alignpos =2 is the same for stop codon."
# Record every avggene run parameter in a sidecar text file next to the output.
fc = open(args.outfilebase + "_" + str(args.alignpos) + "_output.txt", "w")
fc.write(comments)
fc.write("\n")
fc.write("Avggene was called with parameters:\n")
fc.write("transcripts= " + str(args.trspdictfilestring) + "\n")
fc.write("filtermodule= " + str(args.filtermodule) + "\n")
fc.write("exclusionmodule= " + str(args.exclusionmodule) + "\n")
fc.write("threshold= " + str(args.threshold) + "\n")
fc.write("regionlength5= " + str(args.regionlength5) + "\n")
fc.write("regionlength3= " + str(args.regionlength3) + "\n")
fc.write("equalweight= " + str(args.equalweight) + "\n")
fc.write("alignpos= " + str(args.alignpos) + "\n")
fc.close()
# The string '0' is the sentinel for "no filter/exclusion file supplied";
# downstream code receives either a dict or the literal '0'.
if args.filtermodule != '0':
    filterdict = rph.readindict(open(args.filtermodule, "rU"))
else:
    filterdict = '0'
if args.exclusionmodule != '0':
    exclusiondict = rph.readindict(open(args.exclusionmodule, "rU"))
else:
    exclusiondict = '0'
# Load the transcript densities and UTR annotations, then run the metagene
# average over all transcripts.
trspdict = rph.readcountsf(args.trspdictfilestring)
UTRdict = rph.readindict(open(args.UTRfilestring, "rU"))
metagene = Avggene(args.regionlength5, args.regionlength3, trspdict, UTRdict,
                   filterdict, exclusiondict, args.threshold, args.alignpos,
                   args.equalweight, args.outfilebase)
metagene.totalavg()
for attr in dir(libset): if not attr.startswith("_"): globals()[attr] = getattr(libset, attr) threadNumb = str(args.threadNumb) import rphelper as rph ### function inputs inset_choice = 'zero' # samplelist = samplelist ### load UTRdict to be used for all samples UTRdict= rph.readindict(open(UTRfilestring, "rU")) # countsIDlist, countsOutdict = build_count_tables(trspdict, UTRdict, utr3adj, args.inset_choice, # bool(args.cdsDenThresh), args.norm_type, float(args.raw_dense_thresh), # float(args.rpm_dense_thresh), args.outfilestring, int(args.totreads), # minUtr3len) ### sample inputs: # def build_count_table_single(sample, UTRdict = UTRdict, inset_choice = 'default'): def build_count_table_single(sample, UTRdict = UTRdict,): """ This is the function that returns total counts within the following regions: