Ejemplo n.º 1
0
def getMergedExonicRegions(transcripts):
    """Return the merged exons of all transcripts as a list of (start, end).

    The exons of every transcript (as given by ``getExons()``) are pooled,
    ordered by their first element, and handed to ``Utility_extended.union``,
    whose result is returned unchanged.
    """
    pooled_exons = [
        exon
        for transcript in transcripts
        for exon in transcript.getExons()
    ]
    # list.sort is stable, so exons sharing a first element keep their
    # pooled order.
    pooled_exons.sort(key=itemgetter(0))
    return Utility_extended.union(pooled_exons)
def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms,
                       fragment_size, downstream_extension, outfile):
    """Compute per-gene 3'UTR usage and append one row per gene to outfile.

    The genes in entrez_genes are expected to all be on one strand, and
    bedfile is expected to hold the reads for that strand.

    Parameters:
        entrez_genes: a KnownEntrezGenes class object.
        bedfile: path of the raw read file; needs to conform to bed format.
            If the per-chromosome read files derived from it do not exist
            yet, they are created via Utility_extended.separate_by_chrom_sort.
        column_index: column in the bed file used for sorting.
        chroms: chromosomes to process; only those also present in
            entrez_genes.chroms are used.
        fragment_size: forwarded to associate_tags_with_regions.tag_position
            when converting each read line to a tag position.
        downstream_extension: forwarded to gene.get_3UTRs.
        outfile: path of the output file; it is opened in append mode, so
            existing content is kept.

    Output row (tab separated): entrez id, length of the merged 3'UTR, RUD
    (the value returned by CUTR_vs_AUTR), number of reads inside the merged
    3'UTR, comma-joined transcript names, comma-joined distinct first
    additional annotations of the transcripts (presumably gene symbols).

    Genes whose 3'UTRs do not merge into exactly one region are reported on
    stdout and produce no output row.

    Exits the process with status 1 when bedfile is not found.
    """
    # Per-chromosome read files are named
    # <chrom>-<first dot-field of bed name>.<last dot-field of bed name>1
    bed_basename = bedfile.split('/')[-1]
    name_fields = bed_basename.split('.')
    rawreadsextension1 = "-" + name_fields[0] + '.' + name_fields[-1] + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s  is not found" % (bedfile,))
        sys.exit(1)

    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (
            chroms, rawreadsextension1,
            " files do not exist, separate by chroms and sort each file according to the second column. "
        ))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1,
                                                [column_index])

    # Append mode: rows are added after whatever outfile already contains.
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue

            # a KnownEntrezGenes object restricted to this chromosome
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))

            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                continue

            # Get the read locations
            tag_positions = []
            with open(chrom_read_file, 'r') as reads:
                for line in reads:
                    sline = line.split()
                    if not sline:
                        # Blank line: nothing to convert into a position.
                        continue
                    tag_positions.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                three_UTRs = gene.get_3UTRs(downstream_extension)
                print(three_UTRs)
                # Union of 3UTRs [(start, end)], returns a [(start, end)]
                union = Utility_extended.union(three_UTRs)

                if not union:
                    # No region to measure; indexing union[0] would fail.
                    print("No 3UTRs found for %s" % str(entrez_id))
                    continue
                if len(union) > 1:
                    # No row is written for this gene; writing here would
                    # either fail or repeat the previous gene's row.
                    print("There are disjoint 3UTRs in %s" % str(entrez_id))
                    continue

                # returns [((start, end), [tag_positions])];
                # [tag_positions] = return[0][1]
                inside_reads = (
                    Utility_extended.associate_simple_tags_with_regions(
                        tag_positions, union))[0][1]
                total_read_count = len(inside_reads)
                RUD = CUTR_vs_AUTR(three_UTRs, inside_reads, gene.strand)

                # Distinct first annotations, in first-seen transcript order.
                gene_symbol = []
                for mytranscript in gene.transcripts:
                    annotation = mytranscript.additional_annotations[0]
                    if annotation not in gene_symbol:
                        gene_symbol.append(annotation)

                region_start, region_end = union[0][0], union[0][1]
                union_length = region_end - region_start + 1
                transcript_names = ','.join(
                    [transcript.name for transcript in gene.transcripts])
                outline = "\t".join([
                    str(entrez_id),
                    str(union_length),
                    str(RUD),
                    str(total_read_count),
                    transcript_names,
                    ','.join(gene_symbol),
                ]) + "\n"
                outf.write(outline)