def getMergedExonicRegions(transcripts):
    """Return the merged exons of *transcripts* as a list of (start, end).

    Every transcript contributes the exons returned by its ``getExons()``
    method. The pooled exons are ordered by their first element and then
    handed to ``Utility_extended.union``, whose result is returned as is.
    """
    pooled_exons = [
        exon
        for transcript in transcripts
        for exon in transcript.getExons()
    ]
    # Order by the first element of each exon before merging.
    pooled_exons.sort(key=itemgetter(0))
    return Utility_extended.union(pooled_exons)
def _load_sorted_tag_positions(read_file, fragment_size):
    """Return the tag positions parsed from *read_file*, sorted.

    Each line is split on whitespace and converted with
    ``associate_tags_with_regions.tag_position(fields, fragment_size)``.
    The file is closed even if parsing a line raises.
    """
    with open(read_file, 'r') as reads:
        tag_positions = [
            associate_tags_with_regions.tag_position(
                line.strip().split(), fragment_size)
            for line in reads
        ]
    # The per-chromosome files are expected to be sorted already; only pay
    # for a sort when they are not.
    if not Utility_extended.is_list_sorted(tag_positions):
        tag_positions.sort()
    return tag_positions


def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms,
                       fragment_size, downstream_extension, outfile):
    """Append one line of 3'UTR usage statistics per gene to *outfile*.

    entrez genes are made sure to be on one strand, the bed file are reads
    for that strand.

    Parameters:
        entrez_genes: a KnownEntrezGenes class object.
        bedfile: path of the raw read file; it needs to conform to bed
            format. If it does not exist, a message is printed and the
            process exits with status 1.
        column_index: column in the bed file used for sorting when the reads
            are separated by chromosome.
        chroms: chromosomes to process.
        fragment_size: passed to
            ``associate_tags_with_regions.tag_position`` for every read.
        downstream_extension: passed to ``gene.get_3UTRs``.
        outfile: path of the output file, opened in append mode.

    Output columns (tab separated): entrez id, length of the 3'UTR union,
    the value returned by ``CUTR_vs_AUTR``, number of reads inside the union,
    comma-joined transcript names, comma-joined gene symbols.

    Genes whose 3'UTRs are disjoint (union has more than one region) or that
    have no 3'UTR at all are reported on stdout and skipped. Chromosomes
    without a per-chromosome read file are skipped.
    """
    # Per-chromosome read files are named <chrom>-<libname>.<suffix>1, where
    # libname / suffix are the first / last dot-separated parts of the bed
    # file's base name.
    name_parts = bedfile.split('/')[-1].split('.')
    rawreadsextension1 = "-" + name_parts[0] + '.' + name_parts[-1] + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s  is not found" % (bedfile,))
        sys.exit(1)

    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s  files do not exist, separate by chroms and sort each "
              "file according to the second column. "
              % (chroms, rawreadsextension1))
        Utility_extended.separate_by_chrom_sort(
            chroms, bedfile, rawreadsextension1, [column_index])

    # Here the output is 'a': results are appended, never truncated.
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue

            # a KnownEntrezGenes object restricted to this chromosome
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))

            # Without reads for this chromosome there is nothing to count;
            # skipping also avoids reusing reads from a previous chromosome.
            read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(read_file):
                continue
            tag_positions = _load_sorted_tag_positions(
                read_file, fragment_size)

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                three_UTRs = gene.get_3UTRs(downstream_extension)
                print(three_UTRs)

                # Find the union of 3UTRs [(start, end)], returns a
                # [(start, end)]
                union = Utility_extended.union(three_UTRs)
                if not union:
                    # Indexing union[0] below would raise on an empty union.
                    print("There are no 3UTRs in %s" % (str(entrez_id)))
                    continue
                if len(union) > 1:
                    print("There are disjoint 3UTRs in %s" % (str(entrez_id)))
                    continue

                # returns [((start, end), [tag_positions])]; the reads of the
                # single region are at [0][1]
                inside_reads = (
                    Utility_extended.associate_simple_tags_with_regions(
                        tag_positions, union))[0][1]
                total_read_count = len(inside_reads)
                RUD = CUTR_vs_AUTR(three_UTRs, inside_reads, gene.strand)

                # Unique gene symbols, in order of first appearance.
                gene_symbol = []
                for mytranscript in gene.transcripts:
                    symbol = mytranscript.additional_annotations[0]
                    if symbol not in gene_symbol:
                        gene_symbol.append(symbol)

                # Coordinates are treated as inclusive, hence the + 1.
                union_length = union[0][1] - union[0][0] + 1
                fields = [
                    str(entrez_id),
                    str(union_length),
                    str(RUD),
                    str(total_read_count),
                    ','.join([
                        transcript.name for transcript in gene.transcripts
                    ]),
                    ','.join(gene_symbol),
                ]
                outf.write("\t".join(fields) + "\n")