Python Utility_extended.separate_by_chrom_sort Examples

Programming Language: Python

Class/Type: Utility_extended

Method/Function: separate_by_chrom_sort

Examples at hotexamples.com: 7

Python Utility_extended.separate_by_chrom_sort - 7 examples found. These are the top rated real world Python examples of Utility_extended.separate_by_chrom_sort extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

fileExists(28)

get_subset_ids_from_dic(10)

is_list_sorted(9)

chrom_files_exist(9)

separate_by_chrom_sort(6)

intersect(3)

is_listT_sorted(3)

union(2)

get_read_counts_on_regions(2)

overlap(1)

shared(1)

separate_by_strand(1)

rescale_a_column(1)

associate_tags_with_regions(1)

output_list(1)

output_dic(1)

is_tuplelist_sorted(1)

intersection(1)

find_islands_overlapping_with_regions(1)

find_islands_overlapping_with_region(1)

find_coverage_by_islands_on_regions(1)

union_with_trace(1)

Example #1

Show file

def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,  fragment_size, totalcount, out_file):
	lib_name = (bedfile).split('/')[-1] # remove directory
	suffix = lib_name.split('.')[-1] # txt
	lib_name = lib_name.split('.')[0] 
	extension = "-" + lib_name +'.' + suffix +"1"
	if Utility_extended.fileExists(bedfile):
		if Utility_extended.chrom_files_exist(chroms, extension) != 1:
			# Separate by chrom and sort by start
			print chroms, extension, " files do not exist, separate by chroms. "
			Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension, [column_index])
	else:
		print bedfile, " is not found";
		sys.exit(1)
	
	all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]}
	all_summary = {}
	
	for chrom in chroms:
		chrombed = chrom + extension
		entrez_genes_by_chrom =  Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
		(reads_on_shared_exons, reads_on_shared_introns, reads_on_merged_transcripts, summary) =  calculateExonIntrons_by_chrom (entrez_genes_by_chrom, chrombed, fragment_size, totalcount, out_file)
		#if chrom == chroms[0]:
			#myid = reads_on_shared_exons.keys()[0]
			#test(entrez_genes_by_chrom, reads_on_shared_introns, myid)
		all_reads_on_shared_exons.update(reads_on_shared_exons)
		all_reads_on_shared_introns.update(reads_on_shared_introns)
		all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
		all_summary.update(summary)
		
	SeparateByChrom.cleanup(chroms, extension)
	return (all_reads_on_shared_exons, all_reads_on_shared_introns, all_reads_on_merged_transcripts, summary)

Example #2

Show file

def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold,
                       PAfile, extension, index):
    """
	entrez genes are made sure to be on one strand, 
	the bed file are reads for that strand

	entrez_genes is a KnownEntrezGenes class object
	The raw read file needs to conform to bed format

	column_index: column in bed file for sorting

	"""
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                    rawreadsextension1,
                                                    str(index))
    else:
        print bedfile, " is not found"
        sys.exit(1)

    #This part is to access the polyadenylation sites
    PA1 = open(PAfile, 'r')

    PAsiteslist = []
    PA2 = 'i'
    while PA2 != '':
        PA2 = PA1.readline()
        if PA2 != '':
            PA3 = PA2.strip('\n')
            PA4 = PA3.split('\t')
            PAsiteslist.append((PA4[0], PA4[1]))

    PA1.close()

    # Here the output is 'a', i.e. the output is appended to an existing file instead of creating one
    outf = open(outfile, 'a')
    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            # Get the read locations
            if Utility_extended.fileExists(chrom + rawreadsextension1):
                f = open(chrom + rawreadsextension1, 'r')
                tag_positions = []
                for line in f:
                    line = line.strip()
                    sline = line.split()
                    #make sure the extension is always 0, otherwise the rest of the program might not work as intended
                    tag_positions.append(
                        associate_tags_with_regions.tag_position(sline, 0))

                f.close()
                if not Utility_extended.is_list_sorted(tag_positions):
                    tag_positions.sort()
                #By this point tag_positions is a sorted list of all the reads located on the strand and chromosome the code is currently dealing with

                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[
                        entrez_id]  # an EntrezGene class object
                    # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally give the beginning of the 3'UTR and a site very close to the most distal polyadenylation site
                    three_UTRs = gene.get_3UTRs()
                    # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites given to create the true data for the 3'UTR needed for CUTR_vs_AUTR to work
                    true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end, UTRbeginning = Mastertuplemaker(
                        three_UTRs, PAsiteslist, chrom, gene.strand, extension)
                    #value should always be 1 as only 3'UTR with more than 1 polyA site need be considered
                    if len(true3UTRends) > 1:
                        #find all reads inside the 3'UTR
                        inside_reads = associate_tags_with_3UTR(
                            tag_positions, UTRregion_start, UTRregion_end)
                        #finds reads in each region of the 3'UTR and calculates aUTR/cUTR for each of them
                        #PolyAsites potentially useful for output
                        RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(
                            true3UTRstarts, true3UTRends, inside_reads,
                            gene.strand, threshold)

                        #important if one wants to output gene_symbol information
                        gene_symbol = []
                        for mytranscript in gene.transcripts:
                            if mytranscript.additional_annotations[
                                    0] not in gene_symbol:
                                gene_symbol.append(
                                    mytranscript.additional_annotations[0])

                        #outline to use to output RUDs
                        outline = str(
                            entrez_id
                        ) + "\t" + chrom + "\t" + gene.strand + "\t" + str(
                            basic_RUD) + "\t" + ",".join(map(str, RUDs)) + "\n"

                        #outline to use to output polyA information for a species
                        #outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"

                        outf.write(outline)
    outf.close()

Example #3

Show file

File: get_strand_specific_read_count_on_genes.py Project: PrinnyJungle/Bioinformatics

def getReadCount(KnownGenes, bedfile, chroms, fragment_size, region_type,
                 upstream_extension, downstream_extension, totalcount,
                 out_file):
    """
	Known genes are made sure to be on one strand, and the bed file are reads for that strand
	The raw read file needs to conform to bed format
	"""
    ReadCount = {}  # keyed by name, valued by (rc, length, rpkm)

    # Separate by chrom reads
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                    rawreadsextension1, [2])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    # dictionary has chrom as key and ucsc_lite object (name, chrom, strand, txStart, txEnd) as values
    if region_type == 'Promoter':
        region_dic = KnownGenes.getPromoters(upstream_extension,
                                             downstream_extension)
    elif region_type == 'GeneBody':
        region_dic = KnownGenes.getGenebodys(downstream_extension)
    elif region_type == 'ExtendedGeneBody':
        region_dic = KnownGenes.getExtendedGenebodys(upstream_extension,
                                                     downstream_extension)
    elif region_type == 'PromoterGenebody':
        region_dic = KnownGenes.getPromotergenebodys(upstream_extension)
    elif region_type == 'GeneEnd':
        region_dic = KnownGenes.getGeneEnds(upstream_extension,
                                            downstream_extension)
    elif region_type == 'ExonicRegion':
        region_dic = KnownGenes.getExons()
    elif region_type == 'IntronicRegion':
        region_dic = KnownGenes.getIntrons()
    elif region_type == '5UTR':
        region_dic = KnownGenes.get5UTRs(upstream_extension,
                                         downstream_extension)
    elif region_type == '3UTR':
        region_dic = KnownGenes.get3UTRs(upstream_extension,
                                         downstream_extension)
    else:
        print region_type, "is not recognized"
        exit(1)

    outf = open(out_file, 'a')

    for chrom in chroms:
        chrombed = chrom + rawreadsextension1
        if Utility_extended.fileExists(chrombed) and (chrom
                                                      in KnownGenes.keys()):
            tag_position_list = []
            inf = open(chrombed, 'r')
            for line in inf:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    tag_position_list.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
            inf.close()
            if Utility_extended.is_list_sorted(tag_position_list) != 1:
                tag_position_list.sort()

            if len(region_dic[chrom]) > 0:
                for region in region_dic[chrom]:
                    thisregion = [(region.txStart, region.txEnd)]
                    (total_length,
                     rc) = get_read_count_on_regions(thisregion,
                                                     tag_position_list)
                    if total_length > 0:
                        RPKM = rc * (1000.0 / total_length) * (
                            1000000 / float(totalcount))
                    else:
                        assert rc < 0.01
                        RPKM = 0
                    outline = str(region.name) + '\t' + str(rc) + '\t' + str(
                        total_length) + '\t' + str(RPKM) + '\n'
                    outf.write(outline)
                    ReadCount[region.name] = (rc, total_length, RPKM)
    outf.close()

    #SeparateByChrom.cleanup(chroms, rawreadsextension1)

    return ReadCount

Example #4

Show file

File: get_strand_specific_read_count_on_genes.py Project: PrinnyJungle/Bioinformatics

def get_read_count_on_onic_transcript(KnownGenes, bedfile, chroms,
                                      fragment_size, region_type, totalcount,
                                      out_file):
    """
	Return: a dictionary keyed by geneName valued by TotalReadCount,TotalLength, RPKM
	"""

    ReadCount = {}  # keyed by name, valued by (rc, length, rpkm)

    # Separate by chrom reads
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                    rawreadsextension1,
                                                    [2])  # sort by start
    else:
        print bedfile, " is not found"
        sys.exit(1)

    outf = open(out_file, 'a')
    for chrom in chroms:
        chrombed = chrom + rawreadsextension1
        if Utility_extended.fileExists(chrombed) and (chrom
                                                      in KnownGenes.keys()):
            tag_position_list = []
            inf = open(chrombed, 'r')
            for line in inf:
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    tag_position_list.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
            inf.close()
            if Utility_extended.is_list_sorted(tag_position_list) != 1:
                tag_position_list.sort()

            for gene in KnownGenes[chrom]:
                if region_type == "ExonicTranscript":
                    ons = gene.getExons()
                elif region_type == "IntronicTranscript":
                    ons = gene.getIntrons()
                else:
                    print region_type, "is not recognized."
                    exit(1)
                if len(ons > 0):
                    (total_length,
                     rc) = get_read_count_on_regions(ons, tag_position_list)
                    RPKM = rc * (1000.0 / total_length) * (1000000 /
                                                           float(totalcount))
                else:
                    total_length = 0
                    rc = 0
                    RPKM = 0
                outline = str(gene.name) + '\t' + str(rc) + '\t' + str(
                    total_length) + '\t' + str(RPKM) + '\n'
                outf.write(outline)
                ReadCount[region.name] = (rc, total_length, RPKM)
    outf.close()

    #SeparateByChrom.cleanup(chroms, rawreadsextension1)

    return ReadCount

Example #5

Show file

File: Calculate3UTRUsageUsingStrandSpecificRNASeq.py Project: PrinnyJungle/Bioinformatics

def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms,
                       fragment_size, downstream_extension, outfile):
    """
	entrez genes are made sure to be on one strand, 
	the bed file are reads for that strand
	
	entrez_genes is a KnownEntrezGenes class object
	The raw read file needs to conform to bed format
	
	column_index: column in bed file for sorting
	
	"""
    # Separate reads by chrom
    rawreadslibName1 = (bedfile).split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
            # Separate by chrom and sort by start
            print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                    rawreadsextension1,
                                                    [column_index])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    # Here the output is 'a'
    outf = open(outfile, 'a')
    for chrom in chroms:
        if chrom in entrez_genes.chroms:
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            # this_chrom_length = chrom_lengths[chrom]
            # Get the read locations
            if Utility_extended.fileExists(chrom + rawreadsextension1):
                f = open(chrom + rawreadsextension1, 'r')
                tag_positions = []
                for line in f:
                    line = line.strip()
                    sline = line.split()
                    tag_positions.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
                if not Utility_extended.is_list_sorted(tag_positions):
                    tag_positions.sort()
                f.close()

                for entrez_id in entrez_genes_by_chrom.entrez_ids:
                    gene = entrez_genes_by_chrom.entrez_genes[
                        entrez_id]  # an EntrezGene class object
                    three_UTRs = gene.get_3UTRs(downstream_extension)
                    print three_UTRs
                    union = Utility_extended.union(
                        three_UTRs
                    )  # Find the union of 3UTRs [(start, end)], returns a [(start,end)]
                    if len(union) > 1:
                        print "There are disjoint 3UTRs in %s" % (
                            str(entrez_id))
                    else:
                        # returns [((start, end), [tag_positions])], [tag_positions] = return[0][1]
                        inside_reads = (Utility_extended.
                                        associate_simple_tags_with_regions(
                                            tag_positions, union))[0][1]
                        total_read_count = len(inside_reads)
                        RUD = CUTR_vs_AUTR(three_UTRs, inside_reads,
                                           gene.strand)

                        ## For the set of genes, use the distal 3UTR at the designated representative 3UTR
                        #myindex = Calculate3UTRUsageIndexFromCuratedGenes.find_distal_3UTR(genes)
                        #gene = genes[myindex]
                        #results = ThreeUTRCharacteristics(gene, inside_reads)

                        gene_symbol = []
                        for mytranscript in gene.transcripts:
                            if mytranscript.additional_annotations[
                                    0] not in gene_symbol:
                                gene_symbol.append(
                                    mytranscript.additional_annotations[0])

                        union_length = union[0][1] - union[0][0] + 1
                        outline = str(entrez_id) + "\t" + str(
                            union_length) + "\t" + str(RUD) + "\t" + str(
                                total_read_count) + "\t" + ','.join([
                                    transcript.name
                                    for transcript in gene.transcripts
                                ]) + "\t" + ','.join(gene_symbol) + "\n"

                    outf.write(outline)
    outf.close()

Example #6

Show file

def calculateExonIntrons(entrez_genes,
                         bedfile,
                         column_index,
                         chroms,
                         fragment_size,
                         totalcount,
                         out_file=None):
    """
	entrez_genes is a EntrezGenes class object. The core is a entrez_genes.entrez_genes is a dic (keyed by entrez_id) of lists of EntrezGene object
	
	return:
	all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
	all_reads_on_merged_transcripts = {} #{entrezID:[((start, end), read_count)]}
	all_summary = {} # {entrezID:{attribute:value}}
		(summary[entrez_id])["merged_exons_rc"] = merged_exons_rc
		(summary[entrez_id])["merged_exon_RPKM"] = merged_exon_RPKM
		(summary[entrez_id])["merged_exons_total_length"] = merged_exons_total_length
		(summary[entrez_id])["shared_exons_rc"] = shared_exons_rc
		(summary[entrez_id])["shared_exon_RPKM"] = shared_exon_RPKM
		(summary[entrez_id])["shared_exons_total_length"] = shared_exons_total_length
		(summary[entrez_id])["shared_introns_rc"] = shared_introns_rc
		(summary[entrez_id])["shared_intron_RPKM"] = shared_intron_RPKM
		(summary[entrez_id])["shared_introns_total_length"] = shared_introns_total_length
		(summary[entrez_id])["merged_transcript_rc"] = merged_transcript_rc
		(summary[entrez_id])["merged_transcript_RPKM"] = merged_transcript_RPKM
		(summary[entrez_id])["merged_transcript_length"] = merged_transcript_length
	"""
    lib_name = (bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            # Separate by chrom and sort by start
            print chroms, extension, " files do not exist, separate by chroms. "
            Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                    [column_index])
    else:
        print bedfile, " is not found"
        sys.exit(1)

    all_reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {
    }  #{entrezID:[((start, end), read_count)]}
    all_summary = {}  # {entrezID:{attributes}}

    for chrom in chroms:
        chrombed = chrom + extension
        if chrom in entrez_genes.chroms:
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            (reads_on_shared_exons, reads_on_shared_introns,
             reads_on_merged_transcripts,
             summary) = calculateExonIntrons_by_chrom(entrez_genes_by_chrom,
                                                      chrombed, fragment_size,
                                                      totalcount, out_file)
            #if chrom == chroms[0]:
            #myid = reads_on_shared_exons.keys()[0]
            #test(entrez_genes_by_chrom, reads_on_shared_introns, myid)
            all_reads_on_shared_exons.update(reads_on_shared_exons)
            all_reads_on_shared_introns.update(reads_on_shared_introns)
            all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
            all_summary.update(summary)
            print len(all_summary.keys())

    SeparateByChrom.cleanup(chroms, extension)
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)

Example #7

Show file

File: RUD_calculator.py Project: Antion19/UndergradPythonCode

def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold, PAfile, extension, index):
	"""
	entrez genes are made sure to be on one strand, 
	the bed file are reads for that strand

	entrez_genes is a KnownEntrezGenes class object
	The raw read file needs to conform to bed format

	column_index: column in bed file for sorting

	"""
	# Separate reads by chrom 
	rawreadslibName1 = (bedfile).split('/')[-1]
	rawreadssuffix1 = rawreadslibName1.split('.')[-1] 
	rawreadslibName1 = rawreadslibName1.split('.')[0]
	rawreadsextension1 = "-" + rawreadslibName1 +'.' + rawreadssuffix1 + "1"
	if Utility_extended.fileExists(bedfile):
		if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
			# Separate by chrom and sort by start
			print chroms, rawreadsextension1, " files do not exist, separate by chroms and sort each file according to the second column. "
			Utility_extended.separate_by_chrom_sort(chroms, bedfile, rawreadsextension1, str(index))
	else:
		print bedfile, " is not found"
		sys.exit(1)

	#This part is to access the polyadenylation sites
	PA1 = open(PAfile, 'r')
	
	PAsiteslist = []
	PA2 = 'i'
	while PA2 != '':
		PA2 = PA1.readline()
		if PA2 != '':
			PA3 = PA2.strip('\n')
			PA4 = PA3.split('\t')
			PAsiteslist.append((PA4[0],PA4[1]))

	PA1.close()

	# Here the output is 'a', i.e. the output is appended to an existing file instead of creating one
	outf = open(outfile, 'a')	
	for chrom in chroms: 
		if chrom in entrez_genes.chroms:
			# a KnownEntrezGenes object
			entrez_genes_by_chrom =  Entrez.KnownEntrezGenes([chrom], entrez_genes.subset_by_chrom(chrom))
			# Get the read locations
			if Utility_extended.fileExists(chrom + rawreadsextension1):
				f = open(chrom + rawreadsextension1, 'r')
				tag_positions = []
				for line in f:
					line = line.strip()
					sline = line.split()
					#make sure the extension is always 0, otherwise the rest of the program might not work as intended
					tag_positions.append(associate_tags_with_regions.tag_position(sline, 0))
				
				f.close()
				if not Utility_extended.is_list_sorted(tag_positions):
					tag_positions.sort()					
				#By this point tag_positions is a sorted list of all the reads located on the strand and chromosome the code is currently dealing with

				for entrez_id in entrez_genes_by_chrom.entrez_ids:
					gene = entrez_genes_by_chrom.entrez_genes[entrez_id] # an EntrezGene class object
					# get_3UTRs gets the ENTREZ 3'UTR, which appears to generally give the beginning of the 3'UTR and a site very close to the most distal polyadenylation site
					three_UTRs = gene.get_3UTRs()
					# Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites given to create the true data for the 3'UTR needed for CUTR_vs_AUTR to work
					true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end, UTRbeginning = Mastertuplemaker(three_UTRs,PAsiteslist,chrom,gene.strand, extension)
					#value should always be 1 as only 3'UTR with more than 1 polyA site need be considered
					if len(true3UTRends) > 1:
						#find all reads inside the 3'UTR
						inside_reads = associate_tags_with_3UTR(tag_positions, UTRregion_start, UTRregion_end)
						#finds reads in each region of the 3'UTR and calculates aUTR/cUTR for each of them
						#PolyAsites potentially useful for output
						RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(true3UTRstarts, true3UTRends, inside_reads, gene.strand, threshold)
						
						#important if one wants to output gene_symbol information
						gene_symbol = []
						for mytranscript in gene.transcripts:
							if mytranscript.additional_annotations[0] not in gene_symbol:
								gene_symbol.append(mytranscript.additional_annotations[0])


						#outline to use to output RUDs
						outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(basic_RUD) + "\t" + ",".join(map(str, RUDs)) + "\n"
						
						#outline to use to output polyA information for a species
						#outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"
					
						outf.write(outline)
	outf.close()