Esempio n. 1
0
def extend_UTRs(old_fn, new_fn, UTR_fn, genome_dir):
    """Write a copy of a GFF file with gene boundaries moved to called UTR ends.

    For every gene named in the UTR file, the outermost exons, the single
    mRNA, and the top-level feature are stretched (or shrunk) so that they
    start/end at the 5' and 3' positions read from ``UTR_fn``. The leading
    ``#`` comment lines of ``old_fn`` are copied to ``new_fn``, two comment
    lines describing the modifications are appended to that header, and then
    every feature is written out, one per line.

    Args:
        old_fn: Path of the annotation file to read.
        new_fn: Path of the annotation file to write (overwritten).
        UTR_fn: Path of the UTR boundary file, parsed by
            ``call_UTRs.read_UTR_file``.
        genome_dir: Passed through to ``mark_nearby``.

    Raises:
        KeyError: A gene name from the UTR file is not among the parsed genes.
        ValueError: A gene named in the UTR file does not have exactly one
            mRNA.
    """
    UTR_boundaries = call_UTRs.read_UTR_file(UTR_fn)
    all_features = get_all_features(old_fn)

    # NOTE(review): '/dev/null' sits where other callers pass genome_dir;
    # presumably no genome is needed just to group features - confirm.
    genes = transcript.get_gff_transcripts(all_features, '/dev/null')
    genes = {g.name: g for g in genes}

    for name in UTR_boundaries:
        gene = genes[name]
        if len(gene.mRNAs) != 1:
            raise ValueError('not exactly one mRNA')

        # Only the last two fields of each boundary record are used here.
        _, _, five_pos, three_pos = UTR_boundaries[name]

        # NOTE(review): the gene objects presumably reference the same feature
        # objects held in all_features, so these in-place edits show up in the
        # output written below - confirm against transcript.get_gff_transcripts.
        if gene.strand == '+':
            # On the forward strand the 5' end is the smallest coordinate.
            five_prime_exon = min(gene.exons, key=lambda exon: exon.start)
            five_prime_exon.start = five_pos
            gene.mRNAs[0].start = five_pos
            gene.top_level_feature.start = five_pos

            three_prime_exon = max(gene.exons, key=lambda exon: exon.end)
            three_prime_exon.end = three_pos
            gene.mRNAs[0].end = three_pos
            gene.top_level_feature.end = three_pos
        elif gene.strand == '-':
            # On the reverse strand the 5' end is the largest coordinate.
            five_prime_exon = max(gene.exons, key=lambda exon: exon.end)
            five_prime_exon.end = five_pos
            gene.mRNAs[0].end = five_pos
            gene.top_level_feature.end = five_pos

            three_prime_exon = min(gene.exons, key=lambda exon: exon.start)
            three_prime_exon.start = three_pos
            gene.mRNAs[0].start = three_pos
            gene.top_level_feature.start = three_pos

    with open(new_fn, 'w') as new_fh:
        # Copy the leading comment header; the context manager guarantees the
        # input handle is closed even though we stop reading early.
        with open(old_fn) as original_lines:
            for line in original_lines:
                if not line.startswith('#'):
                    break
                new_fh.write(line)

        new_fh.write('# UTRs have been extended.\n')
        new_fh.write('# Top-level features have had distances to the closest other top-level features annotated. \n')

        mark_nearby(all_features, genome_dir)

        for feature in all_features:
            new_fh.write(str(feature) + '\n')
Esempio n. 2
0
def get_CDSs(gff_fn, genome_dir, annotate_nearby=False):
    """Return the transcripts from an annotation file that carry CDSs.

    Args:
        gff_fn: Path of the annotation file, read via ``get_all_features``.
        genome_dir: Passed to ``transcript.get_gff_transcripts`` and, when
            requested, to ``mark_nearby``.
        annotate_nearby: If true, run ``mark_nearby`` on the features before
            the transcripts are built.

    Returns:
        A list of the transcript objects whose ``CDSs`` attribute is truthy.
    """
    features = get_all_features(gff_fn)

    # Annotation is opt-in and happens before transcripts are assembled.
    if annotate_nearby:
        mark_nearby(features, genome_dir)

    coding = []
    for candidate in transcript.get_gff_transcripts(features, genome_dir):
        if candidate.CDSs:
            coding.append(candidate)

    return coding
Esempio n. 3
0
def get_noncoding_RNA_transcripts(gff_fn):
    """Split the transcripts of an annotation file into RNA categories.

    Each transcript is classified by the ``feature`` string of its top-level
    feature: exactly ``'rRNA'``, exactly ``'tRNA'``, or any other string that
    contains ``'RNA'``. Transcripts matching none of these are dropped.

    Args:
        gff_fn: Path of the annotation file, read via ``get_all_features``.

    Returns:
        A tuple ``(rRNA_transcripts, tRNA_transcripts, other_ncRNA_transcripts)``
        of three lists.
    """
    features = get_all_features(gff_fn)

    ribosomal = []
    transfer = []
    remaining = []

    for candidate in transcript.get_gff_transcripts(features, '/dev/null'):
        kind = candidate.top_level_feature.feature

        if kind == 'rRNA':
            ribosomal.append(candidate)
            continue

        if kind == 'tRNA':
            transfer.append(candidate)
            continue

        # Substring match: catches every remaining type with 'RNA' in its name.
        if 'RNA' in kind:
            remaining.append(candidate)

    return ribosomal, transfer, remaining