def main():
    """Command-line entry point: build region count tables and write them to csv.

    Reads the transcript density files and the UTR annotation named on the
    command line, passes them together with the filtering/normalization
    options to ``build_count_tables`` and hands the result to
    ``write_countTable_to_csv``.

    All options arrive as strings from argparse; ``--cdsDenThresh`` is turned
    into a boolean (only the literal text ``True`` enables the filter) and the
    two density thresholds are converted to floats.
    """
    ## using argparser to load arguments from workflow
    parser = argparse.ArgumentParser()
    parser.add_argument('--trspdictfilestring',
                        help='input transcript density files')
    parser.add_argument('--UTRfilestring', help='UTRs file')
    parser.add_argument('--cdsDenThresh',
                        help='boolean value- should density filter be used?')
    parser.add_argument(
        '--norm_type', help='type of normalizaion, should be raw reads or rpm')
    parser.add_argument('--raw_dense_thresh',
                        help='threshold for CDS density for raw normalization')
    parser.add_argument('--rpm_dense_thresh',
                        help='threshold for CDS density for rpm normalization')
    parser.add_argument(
        '--inset_choice',
        help='inset values to be used to avoid start and stop codon peaks')
    parser.add_argument('--outfilestring', help='output file name')
    args = parser.parse_args()

    # bool() of any non-empty string is True, so compare against the literal
    args.cdsDenThresh = args.cdsDenThresh == 'True'

    # single-argument form prints the same text as `print "ARGS: ", value`
    print("ARGS:  %s" % args.cdsDenThresh)

    trspdict = rph.readcountsf(args.trspdictfilestring)
    # close the annotation file as soon as it has been parsed
    with open(args.UTRfilestring, "rU") as utr_file:
        UTRdict = rph.readindict(utr_file)

    countsIDlist, countsOutdict = build_count_tables(
        trspdict, UTRdict, args.inset_choice, args.cdsDenThresh,
        args.norm_type, float(args.raw_dense_thresh),
        float(args.rpm_dense_thresh), args.outfilestring)
    write_countTable_to_csv(countsIDlist, countsOutdict, args.outfilestring)
Beispiel #2
0
def main():
    """Command-line entry point: build count tables that use adjusted 3'UTR data.

    Loads the stop-codon table (``--stopcodons``, read with pandas using the
    first column as index), the transcript density files and the UTR
    annotation, then calls ``build_count_tables`` and writes the result with
    ``write_countTable_to_csv``.

    All options arrive as strings from argparse and are converted here:
    ``--cdsDenThresh`` becomes a boolean (only the literal text ``True``
    enables the filter), the density thresholds become floats and
    ``--totreads`` becomes an int.
    """
    ## using argparser to load arguments from workflow
    parser = argparse.ArgumentParser()
    parser.add_argument('--trspdictfilestring',
                        help='input transcript density files')
    parser.add_argument('--UTRfilestring', help='UTRs file')
    parser.add_argument('--cdsDenThresh',
                        help='boolean value- should density filter be used?')
    parser.add_argument(
        '--norm_type', help='type of normalizaion, should be raw reads or rpm')
    parser.add_argument('--raw_dense_thresh',
                        help='threshold for CDS density for raw normalization')
    parser.add_argument('--rpm_dense_thresh',
                        help='threshold for CDS density for rpm normalization')
    parser.add_argument(
        '--inset_choice',
        help='inset values to be used to avoid start and stop codon peaks')
    parser.add_argument('--outfilestring', help='output file name')
    parser.add_argument(
        '--totreads',
        help='total number of reads used after raw densebuilder run')
    parser.add_argument(
        '--stopcodons',
        help=
        'csv file with positions of all stopcodons, riboseq_stopcodon_finder.py'
    )
    args = parser.parse_args()

    # bool("False") is True: every non-empty string enabled the filter.
    # Compare against the literal text instead.
    cds_den_thresh = args.cdsDenThresh == 'True'

    utr3adj = pd.read_csv(args.stopcodons, index_col=0)

    trspdict = rph.readcountsf(args.trspdictfilestring)
    # close the annotation file as soon as it has been parsed
    with open(args.UTRfilestring, "rU") as utr_file:
        UTRdict = rph.readindict(utr_file)

    # NOTE(review): minUtr3len is not defined in this function; it has to be
    # provided at module level -- confirm.
    countsIDlist, countsOutdict = build_count_tables(
        trspdict, UTRdict, utr3adj, args.inset_choice, cds_den_thresh,
        args.norm_type, float(args.raw_dense_thresh),
        float(args.rpm_dense_thresh), args.outfilestring, int(args.totreads),
        minUtr3len)
    write_countTable_to_csv(countsIDlist, countsOutdict, args.outfilestring)
Beispiel #3
0
    def codonaverage(self):
        """Compute the occupancy of every motif for this sample and write it out.

        Builds one motif file path per entry of ``self.motifs``, records the
        run parameters in ``args.outfileparams``, loads the density, exclusion
        and UTR data and delegates the calculation to ``self.occupancy``,
        which receives ``codon_occu`` and the open parameter file.  The header
        row and the sample row are then transposed and passed to
        ``self.writerows`` together with ``self.outlistfile``.

        Relies on the module-level names ``args``, ``exclusionfiles`` and
        ``rph``.  An exclusion module of ``'0'`` means no exclusion file is
        read and the string ``'0'`` is passed on instead of a dictionary.
        """
        headers = ["motif"]
        motiffilelist = []
        for motif in self.motifs:
            motiffilelist.append(args.motiffilerootpath + motif + "_1.csv")
            headers.append(motif)
        outlist = [headers]

        # first cell of the sample row is the sample name
        codon_occu = [self.sample_name]

        # the context manager closes the parameter file even if the occupancy
        # calculation raises
        with open(args.outfileparams, "w") as f_output:
            f_output.write("Density file is " + str(self.sample_name) + "\n")
            f_output.write("cds5trim is " + str(args.cds5trim) + "\n")
            f_output.write("cds3trim is " + str(args.cds3trim) + "\n")
            f_output.write("Seqwin is " + str(args.seqwin) + "\n")
            f_output.write("Motiflist is " + str(motiffilelist) + "\n")

            readcountsdict = rph.readcountsf(args.trspdictfilestring)
            exclusionmodule = exclusionfiles[0]

            if exclusionmodule != '0':
                with open(exclusionmodule, "rU") as exclusion_file:
                    exclusiondict = self.readindict(exclusion_file)
            else:
                exclusiondict = '0'
            print("Exclusion file is " + str(exclusionmodule))

            with open(args.UTRfilestring, "rU") as utr_file:
                UTRdict = rph.readindict(utr_file)

            # the return value was never used; codon_occu is what ends up in
            # the output row
            self.occupancy(readcountsdict, motiffilelist, exclusiondict,
                           codon_occu, UTRdict, f_output)

        outlist.append(codon_occu)

        co = np.asarray(outlist)  # convert outlist to a np.array
        output = co.T  # one row per motif: [motif, value]
        self.writerows(output, self.outlistfile)  # write these rows to a csv
Beispiel #4
0
    # NOTE(review): this is the tail of a driver routine whose header and the
    # initialisation of `comments` are not visible here.
    comments += "Threshold signifies minimal rpkm needed in coding region for gene to be in the average.\n"
    comments += "alignpos =1 anchors average around the start codon and only includes 5'UTRs. alignpos =2 is the same for stop codon."
    # Record the explanatory text and every run parameter in
    # <outfilebase>_<alignpos>_output.txt
    fc = open(args.outfilebase + "_" + str(args.alignpos) + "_output.txt", "w")
    fc.write(comments)
    fc.write("\n")
    fc.write("Avggene was called with parameters:\n")
    fc.write("transcripts= " + str(args.trspdictfilestring) + "\n")
    fc.write("filtermodule= " + str(args.filtermodule) + "\n")
    fc.write("exclusionmodule= " + str(args.exclusionmodule) + "\n")
    fc.write("threshold= " + str(args.threshold) + "\n")
    fc.write("regionlength5= " + str(args.regionlength5) + "\n")
    fc.write("regionlength3= " + str(args.regionlength3) + "\n")
    fc.write("equalweight= " + str(args.equalweight) + "\n")
    fc.write("alignpos= " + str(args.alignpos) + "\n")
    fc.close()

    # The string '0' is the sentinel for "no filter file": it is passed on
    # unchanged instead of a dictionary.
    if args.filtermodule != '0':
        filterdict = rph.readindict(open(args.filtermodule, "rU"))
    else:
        filterdict = '0'
    # Same '0' sentinel convention for the exclusion file.
    if args.exclusionmodule != '0':
        exclusiondict = rph.readindict(open(args.exclusionmodule, "rU"))
    else:
        exclusiondict = '0'
    # Load the transcript densities and the UTR annotation, then run the
    # metagene calculation.
    trspdict = rph.readcountsf(args.trspdictfilestring)
    UTRdict = rph.readindict(open(args.UTRfilestring, "rU"))
    metagene = Avggene(args.regionlength5, args.regionlength3, trspdict,
                       UTRdict, filterdict, exclusiondict, args.threshold,
                       args.alignpos, args.equalweight, args.outfilebase)
    metagene.totalavg()
def build_count_table_single(sample, UTRdict=UTRdict):
    """
    Return the total raw read counts of one sample within the regions
    mRNA, cds, utr5 and utr3 for every annotated transcript.

    *** No transcripts are filtered out of the output, apart from those that
    have no entry in UTRdict ***

    The rpm densities of the sample are loaded from its ``Density_rpm``
    folder.  Each region is shortened by the selected insets to avoid the
    biases around start and stop codons; the module-level ``inset_choice``
    ("default", "zero" or "custom") selects the inset set.  The summed rpm
    values are scaled by ``totreads / 1E6`` to get back to unnormalized,
    integer read counts without mapping RAW A site reads separately.

    Parameters:
        sample:  sample name, used to build all input paths.
        UTRdict: transcript id -> annotation row; positions 5 and 6 hold the
                 5'UTR and 3'UTR lengths.

    Returns:
        Four pandas Series indexed by transcript id, in this order:
        ``<sample>_mRNACounts``, ``<sample>_cdsCounts``,
        ``<sample>_utr5Counts``, ``<sample>_utr3Counts``.

    Exits the interpreter (``sys.exit``) when ``inset_choice`` is not
    recognised or when a transcript has a coding region of length zero.
    """
    densitystring = "Density_rpm"
    fp_assign_path = '%s/FPassignment/%s/%s/%s' % (
        rootpath, genome_name, experiment, sample)

    totreads_countfile = "%s/%s_FPassigned_counts.txt" % (fp_assign_path, sample)
    with open(totreads_countfile, "r") as totreadcountf:
        totreads = int(totreadcountf.read())

    trspdictfilestring = '%s/%s/density5p_rnaseq/%s/%sf_' % (
        fp_assign_path, densitystring, sample, sample)

    # make a new folder to store count tables
    outfolder = "%s/countTables" % (fp_assign_path)
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    ### load the counts for all transcripts into a dictionary
    trspdict = rph.readcountsf(trspdictfilestring)

    ### define the insets, and select which insets to use when assigning footprint counts
    ### purpose of insets is to avoid counting ribosomes around the start and stop codons
    inset_table = {
        "default": {'utr5Inset3': 6, 'cdsInset5': 18, 'cdsInset3': 15, 'utr3Inset5': 6},
        "zero": {'utr5Inset3': 0, 'cdsInset5': 0, 'cdsInset3': 0, 'utr3Inset5': 0},
        "custom": {'utr5Inset3': 15, 'cdsInset5': 24, 'cdsInset3': 15, 'utr3Inset5': 15},
    }
    if inset_choice not in inset_table:
        print("Insets were not set")
        sys.exit()
    insets = inset_table[inset_choice]

    countsOutdict = {}

    noUTRentry = 0
    zeroUtrlen = 0
    zeroCdsdense = 0  # never incremented here; reported for log compatibility
    totalCountedTranscripts = 0

    # factor that turns reads-per-million back into read numbers
    rpm_to_raw = totreads / 1E6

    for trsp in trspdict:
        # make sure the density file has an annotation in the UTR csv
        if trsp not in UTRdict:
            noUTRentry += 1
            continue

        ### Schema of a UTRdict row:
        # from csv: #transcript,chrom,featnum,strand,mrna_len,cds_len,5utr_len,3utr_len,gene_name,stopcodon,stop4nt
        # position in list: key, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
        utr5len = int(UTRdict[trsp][5])
        utr3len = int(UTRdict[trsp][6])

        ## count (but keep) transcripts with zero lengths for utr's
        if utr5len == 0 or utr3len == 0:
            zeroUtrlen += 1

        ### get counts from density file
        exonsplicedcounts = trspdict[trsp]

        ### set starts and ends of the coding sequence
        cdsstart = utr5len
        cdsend = len(exonsplicedcounts) - utr3len
        if cdsstart == cdsend:
            print("Error, gene length is 0 for transcript %s" % trsp)
            sys.exit()

        ### region boundaries after applying the insets.
        # A negative slice bound would be read by Python as "from the end":
        # a 5'UTR shorter than its inset used to pull in nearly the whole
        # transcript.  Clamp end bounds at 0 so such regions count nothing.
        utr5_stop = max(cdsstart - insets['utr5Inset3'], 0)
        cds_from = cdsstart + insets['cdsInset5']
        cds_to = max(cdsend - insets['cdsInset3'], 0)
        utr3_from = cdsend + insets['utr3Inset5']

        utr5Counts = sum(exonsplicedcounts[:utr5_stop])
        cdsCounts = sum(exonsplicedcounts[cds_from:cds_to])
        utr3Counts = sum(exonsplicedcounts[utr3_from:])

        ### 181208 - raw counts: use totreads to scale rpm back to reads.
        # round before int(): the float product of a whole number of reads can
        # land just below the integer and plain truncation would lose a read
        RAWutr5Counts = int(round(utr5Counts * rpm_to_raw))
        RAWutr3Counts = int(round(utr3Counts * rpm_to_raw))
        RAWcdsCounts = int(round(cdsCounts * rpm_to_raw))
        RAWmrnaCounts = RAWutr5Counts + RAWcdsCounts + RAWutr3Counts

        countsOutdict[trsp] = [RAWmrnaCounts, RAWcdsCounts,
                               RAWutr5Counts, RAWutr3Counts]
        totalCountedTranscripts += 1

    print("***********************************************************")
    print("Count table for complete for %s" % sample)
    print("Number of transcripts absent in UTRfile: %s" % noUTRentry)
    print("Number of transcripts with zero UTR lengths: %s" % zeroUtrlen)
    print("Number of transcripts with zero CDS density: %s" % zeroCdsdense)
    print("Total transcripts included in count table: %s" % totalCountedTranscripts)
    print("***********************************************************")

    cols = [sample + '_mRNACounts', sample + '_cdsCounts',
            sample + '_utr5Counts', sample + '_utr3Counts']
    if countsOutdict:
        # upgrade to pandas v 0.23 to use the columns keyword of from_dict
        dfout = pd.DataFrame.from_dict(countsOutdict, orient='index')
        dfout.columns = cols
    else:
        # an empty table has no columns to rename; build them explicitly
        dfout = pd.DataFrame(columns=cols)

    df_mRNA = dfout.iloc[:, 0]
    df_cds = dfout.iloc[:, 1]
    df_utr5 = dfout.iloc[:, 2]
    df_utr3 = dfout.iloc[:, 3]

    return df_mRNA, df_cds, df_utr5, df_utr3
Beispiel #6
0
def build_avggene_firstStop(UTRdict, threshold, samp, region_choice,
                            inset_choice):
    """
    Build an equally weighted metagene ("average gene") around the start or
    the stop codon for one sample and write it to a csv file.

    Every transcript is normalized by its own CDS density (counts inside the
    inset-trimmed coding region divided by the trimmed CDS length) before the
    window of ``upstreamNTs`` + 3 codon nts + ``downstreamNTs`` is added to
    the running total, so each transcript contributes with the same weight.

    Parameters:
        UTRdict:       transcript id -> annotation row; positions 3..6 hold
                       mrna_len, cds_len, 5utr_len and 3utr_len.
        threshold:     minimal CDS density (density * 1000, "rpkm") needed
                       for a transcript to be included.
        samp:          sample name, used to build input and output paths.
        region_choice: 'start' or 'stop' -- the codon the window is anchored on.
        inset_choice:  'default', 'zero' or 'custom' inset set for the CDS.

    Transcripts are skipped when they have no entry in UTRdict, have no
    countable CDS positions or zero CDS density, fall below the threshold, or
    are too short to cover the whole window.

    Returns None; the table (columns ``position`` and ``avg``) is printed and
    written to csv.  Exits the interpreter when ``region_choice`` or
    ``inset_choice`` is not recognised, or when a transcript has a coding
    region of length zero.  Path components and window sizes come from
    module-level settings (rootpath, genome_name, upstreamNTs, ...).
    """
    ### validate the choices before the slow density load
    if region_choice == 'start':
        alignpos = "1"
    elif region_choice == 'stop':
        alignpos = "2"
    else:
        # this used to print and then fail later on an undefined name
        print("alignpos not set!!!")
        sys.exit()

    inset_table = {
        "default": {'utr5Inset3': 6, 'cdsInset5': 18, 'cdsInset3': 15, 'utr3Inset5': 6},
        "zero": {'utr5Inset3': 0, 'cdsInset5': 0, 'cdsInset3': 0, 'utr3Inset5': 0},
        "custom": {'utr5Inset3': 15, 'cdsInset5': 24, 'cdsInset3': 15, 'utr3Inset5': 15},
    }
    if inset_choice not in inset_table:
        print("Insets were not set")
        sys.exit()
    insets = inset_table[inset_choice]

    fp_assign_path = '%s/FPassignment/%s/%s/%s' % (rootpath, genome_name,
                                                   experiment, samp)
    trspfilestring = '%s/%s/density%sp_%sshift_%sto%s/%s_%sto%sf/%s_%sto%sf_' % (
        fp_assign_path, densitystring, assignment, ribosome_shift, minlen,
        maxlen, samp, minlen, maxlen, samp, minlen, maxlen)
    totreads_countfile = "%s/%s_FPassigned_counts.txt" % (fp_assign_path, samp)
    # totreads is not used below; the read is retained so that a missing or
    # malformed counts file still aborts the run as it did before
    with open(totreads_countfile, "r") as totreadcountf:
        totreads = int(totreadcountf.read())

    ### This is where all the count files are loaded into a dictionary
    trspdict = rph.readcountsf(trspfilestring)  ### this takes a minute

    ## running total over the region of interest; 3 nts for the codon itself
    window = upstreamNTs + 3 + downstreamNTs
    averagegene = [0] * window

    ## counters
    noUTRentry = 0  ### discard transcripts not in UTRdict
    zeroCdsdense = 0  ### discard transcripts with zero reads in CDS
    lowCdsdense = 0  ## CDS density thresholding, in RPKM
    regionTooShort = 0  ### transcript does not cover the whole window
    totalCountedTranscripts = 0  ## number included in final output

    ### Iterate through transcripts one at a time, retrieving counts in region of interest:
    for trsp in trspdict:
        # make sure the density file has an annotation in the UTR csv
        if trsp not in UTRdict:
            noUTRentry += 1
            continue

        exonsplicedcounts = trspdict[trsp]

        mrnalen = int(UTRdict[trsp][3])
        cdslen = int(UTRdict[trsp][4])
        utr5len = int(UTRdict[trsp][5])
        utr3len = int(UTRdict[trsp][6])
        assert mrnalen == cdslen + utr5len + utr3len  ## check that this is true

        ### define Coding sequence here
        cdsstart = utr5len
        cdsend = len(exonsplicedcounts) - utr3len  # first position of utr3
        if cdsstart == cdsend:
            print("Error, gene length is 0 for transcript " + trsp)
            sys.exit()

        ### CDS density inside the insets
        cdslenMod = cdslen - insets['cdsInset5'] - insets['cdsInset3']
        if cdslenMod <= 0:
            # CDS no longer than its insets: nothing to count and the
            # density would be a division by zero
            zeroCdsdense += 1
            continue
        cdsCounts = sum(exonsplicedcounts[cdsstart + insets['cdsInset5']:
                                          cdsend - insets['cdsInset3']])
        # float() keeps this a true division for integer counts on Python 2
        cdsDensity = cdsCounts / float(cdslenMod)

        ### throw out zero's
        if cdsDensity == 0:
            zeroCdsdense += 1
            continue

        # Threshold on cds density: (thresholding on "rpkm")
        if cdsDensity * float(1000) < int(threshold):
            lowCdsdense += 1
            continue

        ### window around the chosen codon
        if region_choice == 'start':
            window_start = cdsstart - upstreamNTs
        else:
            window_start = cdsend - 3 - upstreamNTs
        raw_window = exonsplicedcounts[window_start:window_start + window]

        # a negative start would wrap around to the transcript end; such a
        # window is incomplete, as is one running past the transcript end
        if window_start < 0 or len(raw_window) < window:
            regionTooShort += 1
            continue

        totalCountedTranscripts += 1
        ### normalize by cdsDensity and add to the running total
        for i, rpf in enumerate(raw_window):
            averagegene[i] += rpf / cdsDensity

    ### divide by total number of valid transcripts
    if totalCountedTranscripts:
        averagegene_equal = [rpf / float(totalCountedTranscripts)
                             for rpf in averagegene]
    else:
        # nothing passed the filters: report an all-zero profile instead of
        # dividing by zero
        averagegene_equal = [0.0] * window

    # start or stop codon is [-1,0,1]
    positions = list(range(-upstreamNTs - 1, downstreamNTs + 2))

    df = pd.DataFrame({'position': positions, 'avg': averagegene_equal})
    df = df[['position', 'avg']]

    print("Avggene run compolete for sample %s" % samp)
    print("Transcripts inculded %s" % totalCountedTranscripts)
    print("Number of transcripts absent in UTRfile: %s" % noUTRentry)
    print("Number of transcripts with zero CDS density: %s" % zeroCdsdense)
    print("Number of transcripts below CDS density threshold: %s" % lowCdsdense)
    print("Number of transcripts too short avggene region: %s" % regionTooShort)
    print("- - - - - - - -")
    print(df)
    print("- - - - - - - -")

    ### write csv file
    avggene_csv_path = "%s/avggene%s_ORF%s_%sshift_%s%s150" % (
        fp_assign_path, alignpos, norm_type, ribosome_shift, assignment, norm)
    if not os.path.exists(avggene_csv_path):
        os.makedirs(avggene_csv_path)

    # cdsNorm in the file name indicates normalized densities
    csv_outfile = "%s/%s_%s_rpkmThresh%s_%sto%sf_avg_%s_cdsNorm.csv" % (
        avggene_csv_path, samp, pop, threshold, minlen, maxlen, alignpos)

    df.to_csv(csv_outfile, index=False)
def retrieve_stop_density(din, samp):
    """
    Measure CDS-normalized ribosome density around two stop codons per
    transcript: the normal termination codon (NTC) and the first in-frame
    stop codon of the 3'UTR (TC3).

    For every transcript in ``din`` the counts are divided by the transcript's
    CDS density (counts inside the inset-trimmed CDS over the trimmed CDS
    length).  A window of ``upstreamNTs`` + 3 + ``downstreamNTs`` is taken
    around each stop codon; summed counts and densities upstream and
    downstream of the codon (leaving out the codon directly before and after
    it) are stored in the ``NTC_*`` / ``TC3_*`` columns.

    Parameters:
        din:  DataFrame indexed by transcript id with the columns mrna_len,
              cds_len, 5utr_len, 3utr_len and frameZeroUtr3LenAdj.  It is not
              modified; a copy is used.
        samp: sample name, used to build the input paths.

    Returns:
        (avgGeneEqualNTC, avgGeneEqualTC3, dout): the two equally weighted
        average profiles (lists with one value per window position) and the
        annotated copy of ``din`` without the transcripts that have no usable
        CDS density or fall below the module-level ``threshold``.

    Window sizes, ``inset_choice``, ``threshold`` and the path components are
    module-level settings.  Exits the interpreter when ``inset_choice`` is not
    recognised.
    """
    d = din.copy()

    dropList = []
    window = 3 + upstreamNTs + downstreamNTs
    averagegeneNTC = [0] * window
    averagegeneTC3 = [0] * window

    for column in ('NTC_up_counts', 'NTC_up_dense',
                   'NTC_down_counts', 'NTC_down_dense',
                   'TC3_up_counts', 'TC3_up_dense',
                   'TC3_down_counts', 'TC3_down_dense'):
        d[column] = np.zeros(len(d))

    lowCdsdense = 0
    totalCountedTranscripts = 0

    fp_assign_path = '%s/FPassignment/%s/%s/%s' % (rootpath, genome_name,
                                                   experiment, samp)
    trspfilestring = '%s/%s/density%sp_%sshift_%sto%s/%s_%sto%sf/%s_%sto%sf_' % (
        fp_assign_path, densitystring, assignment, ribosome_shift, minlen,
        maxlen, samp, minlen, maxlen, samp, minlen, maxlen)
    totreads_countfile = "%s/%s_FPassigned_counts.txt" % (fp_assign_path, samp)
    # totreads is not used below; the read is retained so that a missing or
    # malformed counts file still aborts the run as it did before
    with open(totreads_countfile, "r") as totreadcountf:
        totreads = int(totreadcountf.read())

    inset_table = {
        "default": {'utr5Inset3': 6, 'cdsInset5': 18, 'cdsInset3': 15, 'utr3Inset5': 6},
        "zero": {'utr5Inset3': 0, 'cdsInset5': 0, 'cdsInset3': 0, 'utr3Inset5': 0},
        "custom": {'utr5Inset3': 15, 'cdsInset5': 24, 'cdsInset3': 15, 'utr3Inset5': 15},
    }
    if inset_choice not in inset_table:
        print("Insets were not set")
        sys.exit()
    insets = inset_table[inset_choice]

    ### This is where all the count files are loaded into a dictionary
    trspdict = rph.readcountsf(trspfilestring)  ### this takes a minute

    ### Window layout (positions inside ntcCounts / tc3Counts):
    ###   [0 : upstreamNTs-3]              upstream region
    ###   [upstreamNTs-3 : upstreamNTs]    codon before the stop (skipped)
    ###   [upstreamNTs : upstreamNTs+3]    stop codon
    ###   [upstreamNTs+3 : upstreamNTs+6]  codon after the stop (skipped)
    ###   [upstreamNTs+6 : ]               downstream region, downstreamNTs-3 long
    up_stop = upstreamNTs - 3
    # The downstream region starts relative to the *upstream* flank.  The old
    # index downstreamNTs+6 was only right when both flanks were equal.
    down_from = upstreamNTs + 6
    up_len = upstreamNTs - 3
    down_len = downstreamNTs - 3

    for tr in d.index:
        mrnalen = int(d.loc[tr, 'mrna_len'])
        cdslen = int(d.loc[tr, 'cds_len'])
        utr5len = int(d.loc[tr, '5utr_len'])
        utr3len = int(d.loc[tr, '3utr_len'])
        assert mrnalen == cdslen + utr5len + utr3len

        exonsplicedcounts = trspdict[tr]
        ## adding +3 here to now include length of stop codon
        utr3LenAdj = int(d.loc[tr, 'frameZeroUtr3LenAdj']) + 3

        ### calculate normalized counts to cds
        cdsstart = utr5len
        cdsend = len(exonsplicedcounts) - utr3len  # first position of utr3

        cdslenMod = cdslen - insets['cdsInset5'] - insets['cdsInset3']
        cdsCounts = sum(exonsplicedcounts[cdsstart + insets['cdsInset5']:
                                          cdsend - insets['cdsInset3']])
        # float() keeps this a true division for integer counts on Python 2;
        # a CDS no longer than its insets has no density at all
        cdsDensity = cdsCounts / float(cdslenMod) if cdslenMod > 0 else 0

        # Threshold on cds density: (thresholding on "rpkm").  Zero density is
        # dropped as well: with a threshold of 0 it would otherwise slip
        # through and the normalization below would divide by zero.
        if cdsDensity == 0 or cdsDensity * float(1000) < int(threshold):
            lowCdsdense += 1
            dropList.append(tr)
            continue

        ## counts normalized by cds density, using only region within insets
        exonNormCounts = [rpf / cdsDensity for rpf in exonsplicedcounts]
        utr3CountList = exonNormCounts[cdsend:]

        # NOTE(review): if a window start below is negative, Python slices
        # from the end of the list.  Presumably the caller only passes
        # transcripts long enough for both windows -- confirm.
        ntcCounts = exonNormCounts[(cdsend - upstreamNTs - 3):
                                   (cdsend + downstreamNTs)]
        tc3Counts = utr3CountList[(utr3LenAdj - upstreamNTs - 3):
                                  (utr3LenAdj + downstreamNTs)]

        ntc_up = sum(ntcCounts[:up_stop])
        ntc_down = sum(ntcCounts[down_from:])
        tc3_up = sum(tc3Counts[:up_stop])
        tc3_down = sum(tc3Counts[down_from:])

        d.at[tr, 'NTC_up_counts'] = ntc_up
        d.at[tr, 'NTC_up_dense'] = ntc_up / up_len  ## avoiding codon before stop
        d.at[tr, 'NTC_down_counts'] = ntc_down
        d.at[tr, 'NTC_down_dense'] = ntc_down / down_len  ## avoiding codon after stop
        d.at[tr, 'TC3_up_counts'] = tc3_up
        d.at[tr, 'TC3_up_dense'] = tc3_up / up_len
        d.at[tr, 'TC3_down_counts'] = tc3_down
        d.at[tr, 'TC3_down_dense'] = tc3_down / down_len

        for i, rpf in enumerate(ntcCounts):
            averagegeneNTC[i] += rpf
        for i, rpf in enumerate(tc3Counts):
            averagegeneTC3[i] += rpf
        totalCountedTranscripts += 1

    ### equally weighted averages; all-zero profiles when nothing was counted
    if totalCountedTranscripts:
        n_counted = float(totalCountedTranscripts)
        avgGeneEqualNTC = [rpf / n_counted for rpf in averagegeneNTC]
        avgGeneEqualTC3 = [rpf / n_counted for rpf in averagegeneTC3]
    else:
        avgGeneEqualNTC = [0.0] * window
        avgGeneEqualTC3 = [0.0] * window

    dout = d.drop(dropList, axis=0)

    print(len(d))
    print(len(dropList))
    print(len(dout))

    return avgGeneEqualNTC, avgGeneEqualTC3, dout
def region_size_dist_ftsize(readsize, sample):
    """
    Sum the footprint counts of one read length over the 5'UTR, CDS and 3'UTR
    of all usable transcripts of a sample.

    The unnormalized density files of the given read length are loaded, the
    regions are trimmed by the module-level ``insets`` and the counts of every
    transcript that is annotated in the module-level ``UTRdict`` and has
    countable positions in both UTRs are added up.  The total read number of
    the sample and of the matching bam file are printed for reference.

    Parameters:
        readsize: read length to analyse (converted to a string for the paths).
        sample:   sample name, used to build all input paths.

    Returns:
        ``[int(readsize), totUtr5Counts, totCdsCounts, totUtr3Counts,
        totMrnaCounts]``.

    Exits the interpreter when a transcript has a coding region of length zero.
    """
    fp_assign_path = '%s/FPassignment/%s/%s/%s' % (
        rootpath, genome_name, experiment, sample)
    totreads_countfile = "%s/%s_FPassigned_counts.txt" % (fp_assign_path, sample)
    with open(totreads_countfile, "r") as totreadcountf:
        totreads = int(totreadcountf.read())
    print("total reads for sample %s = %s" % (sample, totreads))

    readsize = str(readsize)  # convert to string
    trspdictfilestring = '%s/DensityUnnormalized/density5p_0shift_%s/%s_%sf/%s_%sf_' % (
        fp_assign_path, readsize, sample, readsize, sample, readsize)

    ## check total number of reads in the bamfile of this read length
    bamfilepath_readsize = '%s/%s_star_default/%s_%s_match.sorted.bam' % (
        fp_assign_path, sample, sample, readsize)
    bamfile = pysam.AlignmentFile(bamfilepath_readsize, 'rb')
    try:
        read_count_bam = bamfile.count()
    finally:
        # the handle was never released before
        bamfile.close()
    print("total reads in bamfile for sample: %s, read length: %s, equals == %s" % (
        sample, readsize, read_count_bam))

    ## build the trspdict now for a given readlength:
    trspdict = rph.readcountsf(trspdictfilestring)

    ## counters for skipped transcripts
    noUTRentry = 0
    zeroUtrlen = 0
    zeroUtrlenInsets = 0

    ### counters for output
    totUtr5Counts = 0
    totCdsCounts = 0
    totUtr3Counts = 0
    totMrnaCounts = 0

    ## iterate through every transcript of the density files
    for trsp in trspdict:
        # make sure the density file has an annotation in the UTR csv
        if trsp not in UTRdict:
            noUTRentry += 1
            continue

        # from csv: #transcript,chrom,featnum,strand,mrna_len,cds_len,5utr_len,3utr_len,gene_name,stopcodon,stop4nt
        # position in list: key, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
        utr5len = int(UTRdict[trsp][5])
        utr3len = int(UTRdict[trsp][6])

        ### skip transcripts without one of the UTRs
        if utr5len == 0 or utr3len == 0:
            zeroUtrlen += 1
            continue

        # get counts from density file
        exonsplicedcounts = trspdict[trsp]

        # set starts and ends
        cdsstart = utr5len
        cdsend = len(exonsplicedcounts) - utr3len
        if cdsstart == cdsend:
            print("Error, gene length is 0 for transcript %s" % trsp)
            sys.exit()

        # modify region lengths using insets
        utr5len = utr5len - insets['utr5Inset3']
        utr3len = utr3len - insets['utr3Inset5']

        # A UTR that is shorter than its inset has a *negative* remaining
        # length.  Testing only for == 0 let those through, and the negative
        # slice bound below then wrapped around and counted most of the
        # transcript as 5'UTR.
        if utr5len <= 0 or utr3len <= 0:
            zeroUtrlenInsets += 1
            continue

        # clamp so a very short transcript cannot produce a negative bound
        cds_to = max(cdsend - insets['cdsInset3'], 0)

        utr5Counts = sum(exonsplicedcounts[:cdsstart - insets['utr5Inset3']])
        utr3Counts = sum(exonsplicedcounts[cdsend + insets['utr3Inset5']:])
        cdsCounts = sum(exonsplicedcounts[cdsstart + insets['cdsInset5']:cds_to])
        mrnaCounts = utr5Counts + cdsCounts + utr3Counts

        totUtr5Counts += utr5Counts
        totCdsCounts += cdsCounts
        totUtr3Counts += utr3Counts
        totMrnaCounts += mrnaCounts

    print("UTR5total = %s, CDStotal = %s, UTR3total = %s, mRNAtotal = %s " % (
        totUtr5Counts, totCdsCounts, totUtr3Counts, totMrnaCounts))
    return [int(readsize), totUtr5Counts, totCdsCounts, totUtr3Counts, totMrnaCounts]
def build_avggene_firstStop(UTRdict, utr3adj, threshold, file,
                            frame_selection):
    """
    Sum per-position read counts around the first in-frame 3'UTR stop codon.

    For every transcript in the density files of sample `file`, a window of
    `upstreamNTs` nt + the 3 nt stop codon + `downstreamNTs` nt is cut out of
    the 3'UTR counts and added position-wise into one list. Counts are summed
    as they are: transcripts are NOT equally weighted (no per-transcript
    normalisation is applied here).

    Reads the module-level settings rootpath, genome_name, experiment,
    densitystring, assignment, ribosome_shift, minlen, maxlen, upstreamNTs
    and downstreamNTs, and uses rph.readcountsf to load the densities.

    Parameters:
        UTRdict -- mapping of transcript ID to its annotation row; indices
            3, 4, 5, 6 are read as mrna_len, cds_len, 5utr_len, 3utr_len.
        utr3adj -- table accessed as utr3adj.loc[trsp].<column>
            (presumably a pandas DataFrame indexed by transcript ID --
            TODO confirm); must provide '<frame>Utr3LenAdj' and
            '<frame>StopCount' columns for the selected frame.
        threshold -- minimum CDS density; compared as
            cdsdensity * 1000 < int(threshold).
        file -- sample name used to build the input paths.
        frame_selection -- one of 'frameMinusOne', 'frameZero',
            'framePlusOne'.

    Returns:
        list of length 3 + upstreamNTs + downstreamNTs holding the summed
        counts for each position of the window.

    Exits (sys.exit) if frame_selection is not a recognised frame or if a
    transcript has a zero-length CDS. Raises AssertionError if the annotated
    region lengths do not add up to the mRNA length.
    """
    # The frame does not change between transcripts, so validate it and
    # derive the utr3adj column names once, before the slow density load.
    # NOTE(review): sys.exit() without an argument exits with status 0,
    # exactly as the original in-loop check did.
    valid_frames = ('frameMinusOne', 'frameZero', 'framePlusOne')
    if frame_selection not in valid_frames:
        print("frame not set!")
        sys.exit()
    utr3_len_adj_col = frame_selection + 'Utr3LenAdj'
    stop_count_col = frame_selection + 'StopCount'

    fp_assign_path = '%s/FPassignment/%s/%s/%s' % (rootpath, genome_name,
                                                   experiment, file)
    trspfilestring = '%s/%s/density%sp_%sshift_%sto%s/%s_%sto%sf/%s_%sto%sf_' % (
        fp_assign_path, densitystring, assignment, ribosome_shift, minlen,
        maxlen, file, minlen, maxlen, file, minlen, maxlen)
    totreads_countfile = "%s/%s_FPassigned_counts.txt" % (fp_assign_path, file)

    # `totreads` is not used further in this function; the read is kept so a
    # missing or malformed counts file still fails here as it did before.
    # The context manager closes the handle even if int() raises.
    with open(totreads_countfile, "r") as totreadcountf:
        totreads = int(totreadcountf.read())

    ### This is where all the count files are loaded into a dictionary
    trspdict = rph.readcountsf(trspfilestring)  ### this takes a minute

    ## accumulator for the region of interest:
    ## upstreamNTs + 3 nt of the in-frame stop codon + downstreamNTs
    averagegene = [0] * (3 + upstreamNTs + downstreamNTs)

    ## counters for discarded / included transcripts
    noUTRentry = 0  ### discard transcripts not in UTRdict
    zeroUtrlen = 0  ### discard transcripts with zero 3'UTR length
    zeroCdsdense = 0  ### discard transcripts with zero reads in CDS
    lowCdsdense = 0  ## Optional CDS density thresholding, in RPKM
    noInframeStops = 0  ### discard transcripts without any inframe stop codons
    tooCloseToCDS = 0  ### not enough space between first inframe stop and Coding sequence
    utr3tooShort = 0  ### not enough 3'UTR in region of interest past first stop codon
    totalCountedTranscripts = 0  ## number included in final output

    ### Iterate through transcripts one at a time, retrieving counts in region of interest:
    for trsp in trspdict:
        # density file must have an annotation in the UTR csv
        if trsp not in UTRdict:
            noUTRentry += 1
            continue

        utr_entry = UTRdict[trsp]
        mrnalen = int(utr_entry[3])
        cdslen = int(utr_entry[4])
        utr5len = int(utr_entry[5])
        utr3len = int(utr_entry[6])
        assert mrnalen == cdslen + utr5len + utr3len, (
            "region lengths do not add up to mRNA length for transcript %s"
            % trsp)

        if utr3len == 0:  # only check the 3'UTR here
            zeroUtrlen += 1
            continue

        ### stop selection for the chosen frame (single .loc lookup per transcript)
        adj_row = utr3adj.loc[trsp]
        # +3 includes the stop codon itself: utr3LenAdj is the 0-based
        # position one nucleotide past the first in-frame stop codon,
        # relative to the 3'UTR; the stop codon is
        # utr3CountList[utr3LenAdj-3:utr3LenAdj]
        utr3LenAdj = int(getattr(adj_row, utr3_len_adj_col)) + 3
        inframeStopCount = int(getattr(adj_row, stop_count_col))

        ### filter out trsp's with no inframe stops, or stops too close to CDS or end of transcript
        if inframeStopCount == 0:
            noInframeStops += 1
            continue

        if utr3LenAdj - upstreamNTs - 3 < 0:  # window would start before the first nt of utr3
            tooCloseToCDS += 1
            continue

        if utr3LenAdj + downstreamNTs > utr3len:
            utr3tooShort += 1
            continue

        ### Load in counts for the transcript here
        exonsplicedcounts = trspdict[trsp]

        ### define Coding sequence here
        cdsstart = utr5len
        cdsend = len(exonsplicedcounts) - utr3len  # cdsend == first position of utr3
        if cdsstart == cdsend:
            print("Error, gene length is 0 for transcript " + trsp)
            sys.exit()

        cdscounts = exonsplicedcounts[cdsstart:cdsend]
        utr3CountList = exonsplicedcounts[cdsend:]

        # float() forces true division: under Python 2, int / int would
        # floor and send every density below 1 into the zero-density bin.
        cdsdensity = float(sum(cdscounts)) / len(cdscounts)

        if cdsdensity == 0:
            zeroCdsdense += 1
            continue

        # Threshold on cds density: (thresholding on "rpkm")
        if cdsdensity * float(1000) < int(threshold):
            lowCdsdense += 1
            continue

        ### counts for the region of interest of this transcript
        avgGeneCounts = utr3CountList[(utr3LenAdj - upstreamNTs -
                                       3):(utr3LenAdj + downstreamNTs)]
        totalCountedTranscripts += 1

        for position, count in enumerate(avgGeneCounts):
            averagegene[position] += count

    # Single-argument print(...) prints identically under Python 2 and
    # also parses under Python 3.
    print("Avggene run compolete for sample %s" % file)
    print("Frame selection is: %s" % frame_selection)
    print("Genes inculded %s" % totalCountedTranscripts)
    print("Number of transcripts absent in UTRfile: %s" % noUTRentry)
    print("Number of transcripts with zero UTR lengths: %s" % zeroUtrlen)
    print("Number of transcripts with zero CDS density: %s" % zeroCdsdense)
    print("Number of transcripts below CDS density threshold: %s" % lowCdsdense)
    print("Number of transcripts with no In-frame stops: %s" % noInframeStops)
    print("Number of transcripts too close to normal stop codon: %s" % tooCloseToCDS)
    print("Number of transcripts with insufficient 3'UTR: %s" % utr3tooShort)
    print("- - - - - - - -")
    print("RPM: %s" % averagegene)
    print("- - - - - - - -")

    return averagegene