Example #1
0
    def get_expressed_transcript(self):
        """Write the expressed novel transcripts to ``exp_novel_transcript.gtf``.

        Expression values are first loaded from every file in
        ``self.quant_file_list`` via ``self.store_exp``.  A transcript is
        considered expressed when, for at least one key of
        ``self.transcript_exp_dict[tr_id]`` (presumably one replicate group --
        confirm against ``store_exp``), the minimum of the stored values
        reaches the cutoff: 2 for single-exon transcripts, 0.5 otherwise.
        Transcripts absent from ``self.transcript_exp_dict`` are dropped.

        Every GTF record of an expressed transcript is written; every record
        of an unexpressed transcript is skipped.

        Side effects: sets ``self.novel_transcript_gtf`` and
        ``self.exp_novel_transcript_gtf`` and writes the latter file.
        """
        self.novel_transcript_gtf = os.path.join(self.out_dir,
                                                 'novel_transcript.gtf')
        sample_to_replicate_dict = sample_to_replicate(args.replicate_info)
        for each_quant_file in self.quant_file_list:
            self.store_exp(each_quant_file, sample_to_replicate_dict)
        self.exp_novel_transcript_gtf = os.path.join(
            self.out_dir, 'exp_novel_transcript.gtf')
        # tr_id -> True/False; the decision is computed once per transcript
        # and reused for all of its records.
        exp_flag_dict = {}
        with open(self.exp_novel_transcript_gtf, 'w') as output:
            for eachline in GFF_Reader(self.novel_transcript_gtf):
                tr_id = eachline.attr['transcript_id']
                if tr_id not in self.transcript_exp_dict:
                    continue
                if tr_id not in exp_flag_dict:
                    rep_exp_dict = self.transcript_exp_dict[tr_id]
                    # Single-exon transcripts must pass the stricter cutoff.
                    if self.assemlby_dict[tr_id]['exon_num'] == 1:
                        exp_cutoff = 2
                    else:
                        exp_cutoff = 0.5
                    exp_flag_dict[tr_id] = any(
                        min(rep_exp_dict[each_rep]) >= exp_cutoff
                        for each_rep in rep_exp_dict)
                if exp_flag_dict[tr_id]:
                    output.write(eachline.get_gff_line())
Example #2
0
def get_gene_number(gtf):
    """Return the number of distinct ``gene_id`` values among the records of *gtf*."""
    unique_gene_ids = {record.attr['gene_id'] for record in GFF_Reader(gtf)}
    return len(unique_gene_ids)
Example #3
0
def convert_blast(gtf, kegg_blast, out_dir, species, diff_type):
    """Keep the best blast hit per query and dump the result as JSON.

    Each line of *kegg_blast* is a tab-separated record whose first column is
    a transcript id.  When *diff_type* is ``'gene'`` the transcript id is
    translated to its gene id using *gtf*; for any other value the transcript
    id itself is the key (and *gtf* is not read).

    For each key the stored hit is the record without its first column.  The
    second-to-last column is compared numerically with lower values winning,
    and ties are broken by the last column with higher values winning
    (assumed to be e-value and bit score as in BLAST tabular output -- TODO
    confirm against the producer of *kegg_blast*).

    Returns the path of the written file,
    ``<out_dir>/<species>.blasttab.json``.

    Raises ``KeyError`` when *diff_type* is ``'gene'`` and a transcript id is
    missing from *gtf*, and ``ValueError`` when the two compared columns are
    not numeric.
    """
    tr_gene_dict = {}
    if diff_type == 'gene':
        for eachline in GFF_Reader(gtf):
            gene_id = eachline.attr['gene_id']
            transcript_id = eachline.attr['transcript_id']
            tr_gene_dict[transcript_id] = gene_id
    diff_blast_dict = {}
    with open(kegg_blast, 'r') as kegg_blast_info:
        for eachline in kegg_blast_info:
            each_tr_blast = eachline.strip().split('\t')
            query_id = each_tr_blast[0]
            if diff_type == 'gene':
                query_id = tr_gene_dict[query_id]
            hit = each_tr_blast[1:]
            best_hit = diff_blast_dict.get(query_id)
            if best_hit is None:
                diff_blast_dict[query_id] = hit
                continue
            # Compare as numbers: string comparison would rank e.g. "2e-50"
            # above "1e-5".
            best_evalue, best_score = float(best_hit[-2]), float(best_hit[-1])
            evalue, score = float(hit[-2]), float(hit[-1])
            if evalue < best_evalue or (evalue == best_evalue
                                        and score > best_score):
                diff_blast_dict[query_id] = hit
    diff_blast = os.path.join(out_dir, '%s.blasttab.json' % species)
    with open(diff_blast, 'w') as diff_blast_file:
        json.dump(diff_blast_dict, diff_blast_file)
    return diff_blast
    def get_transcript_dict(self):
        """Collect per-transcript feature intervals from ``self.gtf``.

        Only records whose lower-cased feature type is in ``self.TYPE`` are
        used.  For every transcript, ``self.transcript_dict[transcript_id]``
        receives ``'strand'``, ``'chrom'`` and ``'gene_id'`` plus, per feature
        type, a list of ``(start, end)`` tuples taken from the record's
        interval.  The first interval of a feature type is always stored; a
        later one is appended only when ``intersect_exon_in_same_tr`` returns
        a truthy value for it (the helper's exact meaning is not visible here
        -- confirm before relying on it).

        ``self.cds_flag`` / ``self.utr_flag`` are switched from 0 to 1 once a
        ``cds`` feature / a feature whose type contains ``utr`` is seen.
        """
        for eachline in GFF_Reader(self.gtf):
            genomic_featrue = eachline.type.lower()
            if genomic_featrue not in self.TYPE:
                continue
            gene_id = eachline.attr['gene_id']
            transcript_id = eachline.attr['transcript_id']
            interval = (eachline.iv.start, eachline.iv.end)
            tr_info = self.transcript_dict.setdefault(transcript_id, {})
            tr_info['strand'] = eachline.iv.strand
            tr_info['chrom'] = eachline.iv.chrom
            tr_info['gene_id'] = gene_id
            if genomic_featrue in tr_info:
                if intersect_exon_in_same_tr(tr_info[genomic_featrue],
                                             interval):
                    tr_info[genomic_featrue].append(interval)
            else:
                tr_info[genomic_featrue] = [interval]

            if self.cds_flag == 0 and genomic_featrue == 'cds':
                self.cds_flag = 1
            # The feature type is already lower-cased, so test against the
            # lower-case marker (the upper-cased copy could never match).
            if self.utr_flag == 0 and 'utr' in genomic_featrue:
                self.utr_flag = 1
Example #5
0
def ReadGff(GFF):
    """Index the records of *GFF* by gene name and by un-versioned gene id.

    Returns ``(dict_gene, dict_ens)``: ``dict_gene`` maps the ``gene_name``
    attribute to a record, ``dict_ens`` maps the part of ``gene_id`` before
    the first ``.`` to a record.  When several records share a key, the last
    one read wins.
    """
    dict_gene = {}
    dict_ens = {}
    for line in GFF_Reader(GFF):
        # Plain str.split avoids the invalid '\.' escape of the regex form.
        dict_ens[line.attr['gene_id'].split('.')[0]] = line
        dict_gene[line.attr['gene_name']] = line
    return dict_gene, dict_ens
def read_transcriptdata(gtffn):
    """Load coding transcripts from a GTF file.

    Records are grouped with ``itertools.groupby`` by (chromosome, gene_id)
    and then by transcript_id, so records of one transcript must be
    consecutive in the file.  Transcripts lacking an ``exon`` or a ``CDS``
    record, or rejected by ``validate_and_extract_cds`` with ``ValueError``,
    are skipped.

    Returns ``(a, tidtc, tid2gid)``:
      * ``a`` -- stranded ``GenomicArrayOfSets`` with the transcript id added
        on every exon interval,
      * ``tidtc`` -- transcript id -> ``(tc, (cdsstart, cdsend))`` as returned
        by ``validate_and_extract_cds``,
      * ``tid2gid`` -- transcript id -> gene id.
    """
    def gid_key(f):
        return (f.iv.chrom, f.attr.get("gene_id"))

    def tid_key(f):
        return f.attr.get("transcript_id")

    def sort_key(f):
        return f.iv.start

    tidtc = dict()
    tid2gid = dict()

    a = GenomicArrayOfSets("auto", stranded=True)

    gtf = GFF_Reader(gtffn)

    for (chrom, gid), ggfiter in groupby(gtf, gid_key):
        for tid, tgfiter in groupby(ggfiter, tid_key):
            fs = sorted(tgfiter, key=sort_key)

            # Real lists, not filter() iterators: on Python 3 an emptiness
            # test on an iterator would consume the first exon.
            exonfs = [f for f in fs if f.type == "exon"]
            cdsfs = [f for f in fs if f.type == "CDS"]
            if not exonfs or not cdsfs:
                continue

            try:
                tc, (cdsstart, cdsend) = validate_and_extract_cds(fs)
            except ValueError:
                continue

            tidtc[tid] = tc, (cdsstart, cdsend)
            for efs in exonfs:
                a[efs.iv] += tid

            tid2gid[tid] = gid

    return a, tidtc, tid2gid
Example #7
0
def oms_lncRNA_classify(feelnc_prd, lnc_gtf, method='Luo'):
    '''
    Add a lncRNA class to FEElnc_classifier.pl output.

    Reads the table `feelnc_prd` (indexed by its third column), keeps the
    rows with ``isBest == 1`` and drops their first remaining column.  Then,
    for each record of `lnc_gtf` that has a ``transcript_id`` attribute, one
    row per transcript is added to the result:

    * transcripts present in the best-hit table keep that row and get the
      class returned by ``get_luo_code`` (only ``method='Luo'`` is
      supported; anything else terminates via ``sys.exit``);
    * all other transcripts get a row of ``'--'`` with the gene id in the
      first column and the class ``'lincRNA'``.

    Returns the resulting DataFrame with an extra ``classification`` column.
    '''
    feelnc_df = pd.read_table(feelnc_prd, index_col=2)
    feelnc_best_df = feelnc_df[feelnc_df.isBest == 1]
    # Drop the first column of the filtered table.
    feelnc_best_df = feelnc_best_df.loc[:, feelnc_best_df.columns[1:]]
    lnc_class_df = pd.DataFrame([], columns=feelnc_best_df.columns)
    # Classes are collected in row-insertion order and attached at the end.
    lnc_class_list = list()
    for eachline in GFF_Reader(lnc_gtf):
        if 'transcript_id' not in eachline.attr:
            continue
        tr_id = eachline.attr['transcript_id']
        gene_id = eachline.attr['gene_id']
        # Each transcript is handled once, on its first record.
        if tr_id in lnc_class_df.index:
            continue
        if tr_id in feelnc_best_df.index:
            if method == 'Luo':
                # Unpacks everything from position 3 onward, so the row must
                # have exactly five fields there.
                # NOTE(review): presumably one best row per transcript;
                # a duplicated index would break this unpacking -- confirm.
                dirt, ltype, dis, subtype, loc = feelnc_best_df.loc[tr_id][3:]
                lnc_class = get_luo_code(dirt, subtype, loc)
                lnc_class_list.append(lnc_class)
                lnc_class_df.loc[tr_id] = feelnc_best_df.loc[tr_id]
            else:
                sys.exit('undefined classification method.')
        else:
            # Placeholder row: gene id first, '--' everywhere else.
            class_detail = ['--' for each in lnc_class_df.columns]
            class_detail[0] = gene_id
            lnc_class_list.append('lincRNA')
            lnc_class_df.loc[tr_id] = class_detail
    lnc_class_df.loc[:, 'classification'] = lnc_class_list
    lnc_class_df.index.name = feelnc_best_df.index.name
    return lnc_class_df
Example #8
0
 def get_novel_transcript(self):
     """Annotate assembled transcripts and write the novel ones to a GTF.

     Every file in ``self.combined_gtf_list`` is passed to
     ``self.add_compare_info``; ``self.assemlby_dict`` is then stored as
     ``assembly_tr_info.json`` in ``self.out_dir``.  Finally the records of
     ``self.assembly_gtf`` are copied to ``novel_transcript.gtf`` when their
     transcript is longer than ``self.length`` and has a ``status`` that is
     listed in ``self.lncRNA_type``.

     Sets ``self.assembly_dict_json`` and ``self.novel_transcript_gtf``.
     """
     ## annotate mRNA related transcript
     for each_gtf in self.combined_gtf_list:
         self.add_compare_info(each_gtf)
     ## filter transcripts
     self.assembly_dict_json = os.path.join(self.out_dir,
                                            'assembly_tr_info.json')
     store_into_json(self.assemlby_dict, self.assembly_dict_json)
     ## output filtered transcript gtf
     self.novel_transcript_gtf = os.path.join(self.out_dir,
                                              'novel_transcript.gtf')
     # The context manager closes the file even if a record raises.
     with open(self.novel_transcript_gtf, 'w') as output:
         for eachline in GFF_Reader(self.assembly_gtf):
             tr_info = self.assemlby_dict[eachline.attr['transcript_id']]
             ## filter length
             if tr_info['tr_length'] <= self.length:
                 continue
             ## annotation filter
             if 'status' not in tr_info:
                 continue
             if tr_info['status'] not in self.lncRNA_type:
                 continue
             output.write(eachline.get_gff_line())
Example #9
0
def ReadGff(GFF):
    """Index the records of *GFF* by transcript name and by transcript id.

    Returns ``(by_name, by_id)``; when several records share a key the last
    one read is kept.
    """
    by_name = {}
    by_id = {}
    for record in GFF_Reader(GFF):
        attributes = record.attr
        by_id[attributes['transcript_id']] = record
        by_name[attributes['transcript_name']] = record
    return by_name, by_id
Example #10
0
def get_transcript_info(gtf, genename_dict=None):
    """Collect per-transcript exon information from the exon records of *gtf*.

    *genename_dict* optionally maps a gene id to a sequence whose item 0 is
    used as gene name (when the record has no ``gene_name`` attribute) and
    whose item 1 is used as gene description.

    Returns a dict keyed by transcript id.  Each value holds ``chrom``,
    ``strand``, ``gene_id``, ``gene_name``, ``gene_description``,
    ``gene_type`` (taken from the transcript's first exon record, ``'--'``
    when unavailable) and the lists ``exon_start`` / ``exon_end`` (1-based,
    inclusive, each kept sorted) and ``exon_len`` (in file order).
    """
    if genename_dict is None:
        genename_dict = {}
    transcript_info_dict = {}
    for eachline in GFF_Reader(gtf):
        if eachline.type != 'exon':
            continue
        attr = eachline.attr
        gene_id = attr['gene_id']
        transcript_id = attr['transcript_id']
        ## get gene_type info
        if 'gene_type' in attr:
            gene_type = attr['gene_type']
        elif 'gene_biotype' in attr:
            gene_type = attr['gene_biotype']
        else:
            gene_type = '--'
        ## get gene name info
        if 'gene_name' in attr:
            genename = attr['gene_name']
        elif gene_id in genename_dict:
            genename = genename_dict[gene_id][0]
        else:
            genename = '--'
        ## get location info (shift the interval start to 1-based)
        start = eachline.iv.start + 1
        end = eachline.iv.end
        length = end - start + 1
        tr_info = transcript_info_dict.get(transcript_id)
        if tr_info is not None:
            tr_info['exon_start'].append(start)
            tr_info['exon_start'].sort()
            tr_info['exon_end'].append(end)
            tr_info['exon_end'].sort()
            tr_info['exon_len'].append(length)
            continue
        if gene_id in genename_dict:
            gene_description = genename_dict[gene_id][1]
        else:
            gene_description = '--'
        transcript_info_dict[transcript_id] = {
            'chrom': eachline.iv.chrom,
            'exon_start': [start],
            'exon_end': [end],
            'strand': eachline.iv.strand,
            'gene_id': gene_id,
            'gene_name': genename,
            'gene_description': gene_description,
            'gene_type': gene_type,
            'exon_len': [length],
        }
    return transcript_info_dict
def get_transcript_info(gtf, genename_dict=None):
    """Summarise every transcript of *gtf* as span, total length and count.

    *genename_dict* optionally maps a gene id to a sequence whose item 0 is
    used as gene name (when the record has no ``gene_name`` attribute) and
    whose item 1 is used as gene description.

    Returns a dict keyed by transcript id.  Each value holds ``chrom``,
    ``start`` / ``end`` (1-based, inclusive, smallest start and largest end
    seen), ``strand``, ``gene_id``, ``gene_name``, ``gene_description``,
    ``gene_type`` (from the transcript's first record), ``length`` (sum of
    record lengths) and ``exon_num`` (number of records).

    NOTE(review): records are not filtered by feature type, so any
    non-exon record carrying the same transcript_id is counted in
    ``length`` and ``exon_num`` as well -- confirm the input only contains
    exon records.
    """
    if genename_dict is None:
        genename_dict = {}
    transcript_info_dict = {}
    for eachline in GFF_Reader(gtf):
        attr = eachline.attr
        gene_id = attr['gene_id']
        transcript_id = attr['transcript_id']
        # get gene_type info
        if 'gene_type' in attr:
            gene_type = attr['gene_type']
        elif 'gene_biotype' in attr:
            gene_type = attr['gene_biotype']
        else:
            gene_type = '--'
        # get gene name info
        if 'gene_name' in attr:
            genename = attr['gene_name']
        elif gene_id in genename_dict:
            genename = genename_dict[gene_id][0]
        else:
            genename = '--'
        # get location info (shift the interval start to 1-based)
        start = eachline.iv.start + 1
        end = eachline.iv.end
        length = end - start + 1
        tr_info = transcript_info_dict.get(transcript_id)
        if tr_info is not None:
            if start < tr_info['start']:
                tr_info['start'] = start
            if end > tr_info['end']:
                tr_info['end'] = end
            tr_info['length'] += length
            tr_info['exon_num'] += 1
            continue
        if gene_id in genename_dict:
            gene_description = genename_dict[gene_id][1]
        else:
            gene_description = '--'
        transcript_info_dict[transcript_id] = {
            'chrom': eachline.iv.chrom,
            'start': start,
            'end': end,
            'strand': eachline.iv.strand,
            'gene_id': gene_id,
            'gene_name': genename,
            'gene_description': gene_description,
            'gene_type': gene_type,
            'length': length,
            'exon_num': 1,
        }
    return transcript_info_dict
Example #12
0
def _get_genomic_reader(filename):
    """regions from a BED_Reader or GFF_Reader.
    """
    if isinstance(filename, str) and filename.endswith('.bed'):
        regions_ = BED_Reader(filename)
    elif isinstance(filename, str) and (filename.endswith('.gff')
                                        or filename.endswith('.gtf')):
        regions_ = GFF_Reader(filename)
    else:
        raise Exception('Regions must be a bed, gff or gtf-file.')

    return regions_
Example #13
0
def get_transcript_length(gtf):
    """Sum the lengths of all records of each transcript in *gtf*.

    Returns a nested dict ``{gene_id: {transcript_id: total_length}}``.
    Records are not filtered by feature type, so every record carrying the
    transcript id contributes to its total.
    """
    tr_length_dict = {}
    for eachline in GFF_Reader(gtf):
        gene_id = eachline.attr['gene_id']
        transcript_id = eachline.attr['transcript_id']
        # HTSeq intervals are 0-based and half-open, so the length is simply
        # end - start (adding 1 would overcount every record by one base).
        length = eachline.iv.end - eachline.iv.start
        gene_lengths = tr_length_dict.setdefault(gene_id, {})
        gene_lengths[transcript_id] = gene_lengths.get(transcript_id,
                                                       0) + length
    return tr_length_dict
Example #14
0
    def add_compare_info(self, combined_gtf):
        """Update ``self.assemlby_dict`` from one comparison GTF.

        The kind of comparison is taken from the file name:

        * name contains ``mRNA`` -- store ``nearest_ref`` and derive
          ``status`` from the class code of every record;
        * name contains ``other_ncRNA`` / ``lncRNA`` -- transcripts that
          already have a ``status`` and whose class code is in
          ``self.overlap_flags`` are relabelled ``ncRNA_host`` /
          ``Annotated_lncRNA``.

        Any other file name leaves the dict untouched.
        """
        if 'mRNA' in combined_gtf:
            for record in GFF_Reader(combined_gtf):
                tr_id, class_code, nearest_ref = get_class_code(record)
                tr_info = self.assemlby_dict[tr_id]
                tr_info['nearest_ref'] = nearest_ref
                if class_code in ['u', 'p']:
                    status = 'lincRNA'
                elif class_code == 'x':
                    status = 'antisense'
                elif class_code != 'i':
                    status = 'protein_coding'
                elif tr_info['strand'] != self.ref_dict[nearest_ref]['strand']:
                    status = 'antisense_intronic'
                elif tr_info['exon_num'] > 1:
                    status = 'sense_intronic'
                else:
                    status = 'backgroud'
                tr_info['status'] = status
            return

        # Both remaining cases share one loop and differ only in the label.
        if 'other_ncRNA' in combined_gtf:
            overlap_status = 'ncRNA_host'
        elif 'lncRNA' in combined_gtf:
            overlap_status = 'Annotated_lncRNA'
        else:
            return

        for record in GFF_Reader(combined_gtf):
            tr_id, class_code, nearest_ref = get_class_code(record)
            if class_code not in self.overlap_flags:
                continue
            if 'status' in self.assemlby_dict[tr_id]:
                self.assemlby_dict[tr_id]['status'] = overlap_status
def novel_gtf(compare_gtf, outfile):
    """Write the records of novel transcripts from *compare_gtf* to *outfile*.

    A record that carries a ``class_code`` attribute is kept when that code
    is in ``NOVEL_TR_CODE``; its transcript id is remembered and its
    attributes are reduced to the keys listed in ``OUT_ATTR``.  A record
    without ``class_code`` is kept when its transcript id was remembered
    from an earlier record.  Each kept record is written as its GFF line
    followed by ``;``.
    """
    novel_tr_ids = set()
    # The context manager closes the file even if a record raises.
    with open(outfile, 'w') as outfile_inf:
        for record in GFF_Reader(compare_gtf):
            if 'class_code' in record.attr:
                if record.attr['class_code'] not in NOVEL_TR_CODE:
                    continue
                novel_tr_ids.add(record.attr['transcript_id'])
                record.attr = {
                    key: val
                    for key, val in record.attr.items() if key in OUT_ATTR
                }
            elif record.attr['transcript_id'] not in novel_tr_ids:
                continue
            outline = record.get_gff_line().strip()
            outfile_inf.write(f'{outline};\n')
Example #16
0
def gff2dict(gff, fix_id_flag=False):
    """Group the records of a GFF file by gene and by transcript.

    Returns ``(by_gene_dict, by_tr_dict, gene_entry_dict, tr2gene)``:
    ordered dicts of record lists keyed by gene id and by transcript id, a
    dict of one gene-level entry per gene id, and a transcript id -> gene id
    mapping.  ``fix_id_flag`` is forwarded to ``fix_id``.

    Child records look their gene up through ``tr2gene``, so a transcript /
    mRNA record must precede its children in the file (``KeyError``
    otherwise).
    """
    by_gene_dict = OrderedDict()
    by_tr_dict = OrderedDict()
    gene_entry_dict = dict()
    tr2gene = dict()
    for eachline in GFF_Reader(gff):
        if eachline.type == 'gene':
            # Gene records are only stored as the gene entry, not grouped.
            gene_id = eachline.attr['ID']
            eachline.attr['gene_id'] = gene_id
            gene_entry_dict[gene_id] = eachline
            gene_entry_dict[gene_id].attr['fixed'] = True
            continue
        # 'geneID', when present, takes precedence over 'Parent'.
        if 'geneID' in eachline.attr:
            parent = eachline.attr['geneID']
            eachline.attr['Parent'] = parent
        else:
            parent = eachline.attr['Parent']
        if eachline.type in ["transcript", "mRNA"]:
            # Transcript level: the parent is the gene.
            tr_id = fix_id(eachline.attr['ID'], eachline.type, fix_id_flag)
            eachline.attr['ID'] = tr_id
            gene_id = parent
            tr2gene[tr_id] = parent
        else:
            # Any other feature: the parent is treated as a transcript id
            # and passed through fix_id with type 'mRNA'.
            if 'ID' in eachline.attr:
                eachline.attr['ID'] = fix_id(eachline.attr.get('ID'),
                                             eachline.type, fix_id_flag)
            tr_id = fix_id(parent, 'mRNA', fix_id_flag)
            eachline.attr['Parent'] = tr_id
            gene_id = tr2gene[tr_id]
        eachline.attr['tr_id'] = tr_id
        eachline.attr['gene_id'] = gene_id
        by_gene_dict.setdefault(gene_id, []).append(eachline)
        by_tr_dict.setdefault(tr_id, []).append(eachline)
        # update_gene_inf receives the current entry (None if the gene has
        # none yet) and this record; its result replaces the entry.
        gene_entry_dict[gene_id] = update_gene_inf(
            gene_entry_dict.get(gene_id), eachline)
    return by_gene_dict, by_tr_dict, gene_entry_dict, tr2gene
def main(gtf, feelnc_classify, bed_intersect, out_dir):
    """Classify the transcripts of *gtf* and write ``lncRNA.classify.txt``.

    *feelnc_classify* is read as a table indexed by its third column and
    *bed_intersect* as a header-less table indexed by columns 3 and 15.
    For each ``transcript`` record of *gtf* one output row is produced:
    transcripts missing from the FEELnc table get ``INTERGENIC_INF``, the
    others are classified by ``lnc_classify``.  The rows are written
    tab-separated to ``<out_dir>/lncRNA.classify.txt``.
    """
    feelnc_df = pd.read_table(feelnc_classify, index_col=2)
    intersect_df = pd.read_table(bed_intersect, index_col=[3, 15], header=None)
    lnc_class_list = []
    out_header = list(feelnc_df.columns[1:])
    out_header.insert(0, 'lncRNA_transcript')
    out_header.append('lncRNA_class')

    def get_class(fee_rd, intersect_df):
        """Return ``(class, overlap1, overlap2)`` for one FEELnc row."""
        if fee_rd.type == 'intergenic':
            if fee_rd.subtype == 'divergent':
                return 'divergent', 0, 0
            else:
                return 'intergenic', 0, 0
        else:
            # Genic rows need the matching (lncRNA, partner) intersect row.
            inter_index = (fee_rd.name, fee_rd.partnerRNA_transcript)
            inter_rd = intersect_df.loc[inter_index]
            overlap1, overlap2 = overlap_portion(inter_rd)
            if fee_rd.direction == 'sense':
                if fee_rd.subtype == 'containing':
                    return 'other_sense_overlap', overlap1, overlap2
                elif fee_rd.subtype == 'nested':
                    return 'sense_intronic', overlap1, overlap2
                elif fee_rd.subtype == 'overlapping':
                    if overlap1 >= OVERLAP_CUTOFF:
                        introns = iterintrons(inter_rd[13], inter_rd[22],
                                              inter_rd[23])
                        lnc_can_start = inter_rd[1]
                        if tss_in_interval(lnc_can_start, introns):
                            return 'sense_intronic', overlap1, overlap2
                    return 'other_sense_overlap', overlap1, overlap2
                else:
                    sys.exit('unkown type [{t.subtype}]'.format(t=fee_rd))
            else:
                if fee_rd.subtype == 'nested':
                    return 'antisense', overlap1, overlap2
                else:
                    if overlap1 >= OVERLAP_CUTOFF:
                        return 'antisense', overlap1, overlap2
                    else:
                        return 'intergenic', overlap1, overlap2

    def lnc_classify(tr_id, feelnc_df, intersect_df):
        """Build the output row for a transcript present in the FEELnc table.

        When several FEELnc rows exist, the row preferred by
        ``compare_class`` is used.
        """
        tr_detail_df = feelnc_df.loc[tr_id]
        out_inf = []
        class_inf = []
        # Several rows for tr_id -> DataFrame whose index repeats tr_id;
        # a single row -> Series indexed by column name.
        if tr_detail_df.index[0] == tr_id:
            for n in range(len(tr_detail_df)):
                # Positional row access; `.ix` no longer exists in pandas.
                each_rd = tr_detail_df.iloc[n]
                class_value = list(get_class(each_rd, intersect_df))
                dis = each_rd.distance
                tmp_class_inf = class_value[:]
                tmp_class_inf.insert(1, dis)
                tmp_out_inf = list(each_rd[1:])
                if not out_inf:
                    out_inf = tmp_out_inf
                    class_inf = tmp_class_inf
                else:
                    if compare_class(tmp_class_inf, class_inf):
                        out_inf = tmp_out_inf
                        class_inf = tmp_class_inf

        else:
            class_value = list(get_class(tr_detail_df, intersect_df))
            out_inf = list(tr_detail_df[1:])
            class_inf = class_value
        out_inf.insert(0, tr_id)
        out_inf.append(class_inf[0])
        return out_inf

    for eachline in GFF_Reader(gtf):
        if eachline.type == 'transcript':
            tr_id = eachline.attr['transcript_id']
            gene_id = eachline.attr['gene_id']
            if tr_id not in feelnc_df.index:
                out_inf = [tr_id, gene_id]
                out_inf.extend(INTERGENIC_INF)
            else:
                out_inf = lnc_classify(tr_id, feelnc_df, intersect_df)
            out_inf_series = pd.Series(out_inf, index=out_header)
            lnc_class_list.append(out_inf_series)

    out_df = pd.concat(lnc_class_list, axis=1).T
    out_file = os.path.join(out_dir, 'lncRNA.classify.txt')
    out_df.to_csv(out_file, sep='\t', index=False)
Example #18
0
def main(gtf, output_dir):
    """Summarise the exon records of *gtf* into two text files.

    Written to *output_dir* (created when missing):

    * ``transcriptome.summary.txt`` -- one ``name<TAB>value`` line per
      statistic (gene / transcript counts, single-exon transcripts, mean
      transcript and exon length, mean exons per transcript);
    * ``transcripts.detail.txt`` -- one row per transcript with chromosome,
      start (1-based), end, gene id, exon count, summed exon length and,
      when available, gene biotype.

    NOTE(review): ``gene_biotype`` is only appended for records that carry
    it, so a file where just some exon records have the attribute yields
    columns of unequal length and DataFrame construction fails.
    """
    # read gtf file
    gtf_dict = dict()
    for eachline in GFF_Reader(gtf):
        if eachline.type == 'exon':
            gene_id = eachline.attr['gene_id']
            chrom = eachline.iv.chrom
            start = eachline.iv.start + 1
            end = eachline.iv.end
            transcript_id = eachline.attr['transcript_id']
            exon_len = eachline.iv.end - eachline.iv.start
            if 'gene_biotype' in eachline.attr:
                gtf_dict.setdefault('gene_biotype',
                                    []).append(eachline.attr['gene_biotype'])
            gtf_dict.setdefault('gene_id', []).append(gene_id)
            gtf_dict.setdefault('transcript_id', []).append(transcript_id)
            gtf_dict.setdefault('exon', []).append(exon_len)
            gtf_dict.setdefault('chr', []).append(chrom)
            gtf_dict.setdefault('start', []).append(start)
            gtf_dict.setdefault('end', []).append(end)
    gtf_df = pd.DataFrame(gtf_dict)

    # generate gtf summary
    gene_stat_df = gtf_df.loc[:, ['gene_id', 'transcript_id']]
    gene_stat_df = gene_stat_df.drop_duplicates()
    tr_count_per_gene = gene_stat_df.groupby(['gene_id'
                                              ])['transcript_id'].count()
    gene_num = len(tr_count_per_gene)
    tr_per_gene = tr_count_per_gene.mean()
    tr_num = len(gene_stat_df)
    tr_stat = gtf_df.groupby(['transcript_id'])['exon']
    exon_count_per_tr = tr_stat.count()
    # Counting matches directly yields 0 (not a KeyError) when no
    # transcript has exactly one exon.
    single_exon = int((exon_count_per_tr == 1).sum())
    tr_length = tr_stat.sum().mean()
    exon_length = gtf_df.loc[:, 'exon'].mean()
    exon_num = exon_count_per_tr.mean()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    gtf_summary_file = os.path.join(output_dir, 'transcriptome.summary.txt')
    summary_lines = [
        'Gene number\t{gn}'.format(gn=gene_num),
        'Transcript per gene\t{tg:.2f}'.format(tg=tr_per_gene),
        'Transcript number\t{tn}'.format(tn=tr_num),
        'Single exon transcripts\t{st}'.format(st=single_exon),
        'Mean transcript length\t{tl:.2f}'.format(tl=tr_length),
        'Mean exon length\t{el:.2f}'.format(el=exon_length),
        'Exon number per transcript\t{et:.2f}'.format(et=exon_num),
    ]
    with open(gtf_summary_file, 'w') as gtf_summary_inf:
        # One statistic per line; each entry gets its own newline.
        for each_line in summary_lines:
            gtf_summary_inf.write(each_line + '\n')

    # generate transcript information
    by_tr = gtf_df.groupby(['transcript_id'])
    tr_list = [
        by_tr['chr'].unique(),
        by_tr['start'].min(),
        by_tr['end'].max(),
        by_tr['gene_id'].unique(),
        by_tr['exon'].count(),
        by_tr['exon'].sum(),
    ]
    if 'gene_biotype' in gtf_df.columns:
        tr_list.append(by_tr['gene_biotype'].unique())
    tr_df = pd.concat(tr_list, axis=1)
    # unique() yields arrays; join them into plain strings.  A real list is
    # required here, a lazy map object cannot be assigned as a column.
    joined_cols = ['chr', 'gene_id']
    if 'gene_biotype' in tr_df.columns:
        joined_cols.append('gene_biotype')
    for each_col in joined_cols:
        tr_df.loc[:, each_col] = [
            ','.join(each) for each in tr_df.loc[:, each_col]
        ]
    tr_file = os.path.join(output_dir, 'transcripts.detail.txt')
    tr_df.to_csv(tr_file, sep='\t')
Example #19
0
# Script fragment: map GTF records onto the sub-regions listed in a BED file.
# Command-line arguments: <bed> <gtf>.
bed = sys.argv[1]
gtf = sys.argv[2]

# chromosome -> {(start, end): name from BED column 4}
chr_dict = dict()

with open(bed) as bed_inf:
    for eachline in bed_inf:
        eachline_inf = eachline.strip().split()
        chrom = eachline_inf[0]
        # BED start is shifted by one so both ends are 1-based, inclusive.
        start = int(eachline_inf[1]) + 1
        end = int(eachline_inf[2])
        split_chr = eachline_inf[3]
        chr_dict.setdefault(chrom, {})[(start, end)] = split_chr

for eachline in GFF_Reader(gtf):
    chrom = eachline.iv.chrom
    start = eachline.iv.start + 1
    end = eachline.iv.end
    if chrom in chr_dict:
        # Find the BED interval that fully contains the record and express
        # the record's coordinates relative to that interval.
        # NOTE(review): if no interval contains the record, new_chr /
        # new_start / new_end keep the values of an earlier record (or are
        # undefined for the first one).
        for each_inter in chr_dict[chrom]:
            if start >= each_inter[0] and end <= each_inter[1]:
                new_chr = chr_dict[chrom][each_inter]
                new_start = start - each_inter[0] + 1
                new_end = end - each_inter[0] + 1
    else:
        # Chromosome not listed in the BED file: keep coordinates as they are.
        new_chr = chrom
        new_start = start
        new_end = end
    output_line = eachline.get_gff_line().strip().split('\t')
    output_line[0] = new_chr
# Script fragment: drop sense-overlapping exonic transcripts from a GTF,
# except those overlapping a listed miRNA gene, which are tagged instead.
# NOTE(review): miRNA_gene_file and lncRNA_classify_file are defined outside
# this fragment (presumably from sys.argv[1] and sys.argv[2] -- confirm).
input_gtf = sys.argv[3]
output_gtf = sys.argv[4]

miRNA_gene_list = [each.strip() for each in open(miRNA_gene_file)]

# transcript id -> 'so' (sense overlapping) or 'mi' (overlaps a miRNA gene)
sense_overlapping_tr_dict = {}
with open(lncRNA_classify_file) as lncRNA_classify_file_info:
    for n, eachline in enumerate(lncRNA_classify_file_info):
        # The first line is skipped (treated as a header).
        if n != 0:
            eachline_info = eachline.strip().split('\t')
            each_tr_id = eachline_info[2]            
            if eachline_info[5] == 'sense' and eachline_info[6] == 'genic' and eachline_info[9] == 'exonic':
                sense_overlapping_tr_dict[each_tr_id] = 'so'
                # Column 4 is matched against the miRNA gene list.
                if eachline_info[3] in miRNA_gene_list:
                    sense_overlapping_tr_dict[each_tr_id] = 'mi'

output_gtf_info = open(output_gtf, 'w')
for eachline in GFF_Reader(input_gtf):
    tr_id = eachline.attr['transcript_id']
    output_line = '%s;' % eachline.get_gff_line().strip()
    if tr_id in sense_overlapping_tr_dict:
        if sense_overlapping_tr_dict[tr_id] == 'mi':
            # Keep miRNA host transcripts, marked with an extra attribute.
            output_line = '%s transcript_type "miRNA_host";' % output_line
        else:
            # Other sense-overlapping transcripts are removed from the output.
            continue
    output_gtf_info.write('%s\n' % output_line)
output_gtf_info.close()



    endPoint = 2000000
    plusStrand = GenomicInterval(chromo, 0, endPoint, '+')
    minusStrand = GenomicInterval(chromo, 0, endPoint, '-')
    bothStrands = GenomicInterval(chromo, 0, endPoint, '.')

    pyplot.plot(list(hitMap[plusStrand]))
    pyplot.plot(list(hitMap[minusStrand]))
    pyplot.show()

    print('\n Using HTSeq to access GFF genome features\n')

    remoteFileName = '/Bacteria/Escherichia_coli_536_uid58531/NC_008253.gff'
    gffFile = 'examples/EcoliGenomeFeatures.gff'
    downloadFile(FTP_ROOT + remoteFileName, gffFile)

    fileObj = GFF_Reader(gffFile)

    for genomeFeature in fileObj:

        genomeRegion = genomeFeature.iv

        data = (genomeRegion.chrom, genomeRegion.start, genomeRegion.end,
                genomeRegion.strand)

        print('%s %s - %s (%s)' % data)

        data = (genomeFeature.name, genomeFeature.type, genomeFeature.source)

        print('%s %s (%s)' % data)

        print(genomeFeature.attr)
'''
Usage:
extract_ncRNA_from_assemblyline.py <all.gtf> <ncRNA.gtf>

Extract ncRNA gtf from assemblyline assembled gtf

'''

from docopt import docopt
from HTSeq import GFF_Reader

if __name__ == '__main__':
    arguments = docopt(__doc__, version="v1")
    all_gtf = arguments['<all.gtf>']
    nc_gtf = arguments['<ncRNA.gtf>']
    # Copy every record whose transcript_category is ncRNA or lncRNA.  The
    # context manager makes sure the output is flushed and closed.
    with open(nc_gtf, 'w') as nc_gtf_inf:
        for eachline in GFF_Reader(all_gtf):
            if 'transcript_category' not in eachline.attr:
                continue
            if eachline.attr['transcript_category'] in ('ncRNA', 'lncRNA'):
                nc_gtf_inf.write(eachline.get_gff_line())
Example #23
0
# Script fragment: keep or drop the GTF records whose gene / transcript id
# is listed in --id_file.  `parser` (and its --gtf option) are defined
# before this fragment.
parser.add_argument('--id_file',
                    help='file with one gene or transcript id per line.',
                    required=True)
parser.add_argument('--flag',
                    help='extract "ex" or delete "de" gtf record in id_file.',
                    choices=['ex', 'de'],
                    default='ex')
parser.add_argument('--id_type',
                    help='id type.',
                    choices=['gene', 'transcript'],
                    default='transcript')
parser.add_argument('--output', help='output gtf file.', required=True)
args = parser.parse_args()

# Ids read from the file; blank lines are ignored.
id_dict = {}
with open(args.id_file, 'r') as id_file_info:
    for eachline in id_file_info:
        eachline = eachline.strip()
        if eachline:
            id_dict[eachline] = 1

attr_flag = '{i}_id'.format(i=args.id_type)
# 'ex' keeps the listed ids, 'de' keeps everything else.
keep_listed = args.flag == 'ex'
with open(args.output, 'w') as output_info:
    for eachline in GFF_Reader(args.gtf):
        if (eachline.attr[attr_flag] in id_dict) == keep_listed:
            output_info.write("%s;\n" % eachline.get_gff_line().strip())
    sys.exit(1)

# Script fragment: re-label cuffcompare records by class code.
# Class code -> transcript type; records with any other code are skipped.
class_code_dict = {
    'u':'lincRNA',
    'p':'lincRNA',
    'x':'antisense_lncRNA',
    'i':'intronic_lncRNA',
}

# Command-line arguments: <ref_gtf> <cuffcompare_gtf>.
ref_gtf = sys.argv[1]
cuffcompare_gtf = sys.argv[2]

ref_tr_dict = RNAseq_tools.get_transcript_info(ref_gtf)
intronic_tr_dict = {}

for eachline in GFF_Reader(cuffcompare_gtf):
    each_tr_id = eachline.attr['transcript_id']
    each_gene_id = eachline.attr['gene_id']
    chrom  = eachline.iv.chrom
    # Shift the interval start by one for the 1-based output line.
    start  = eachline.iv.start + 1
    end    = eachline.iv.end
    strand = eachline.iv.strand
    source = eachline.source
    track_type = eachline.type
    # Every record is expected to carry class_code (KeyError otherwise).
    class_code = eachline.attr['class_code']
    if class_code not in class_code_dict:
        continue
    tr_type = class_code_dict[class_code]
    # The template is filled from the local variables assigned above.
    each_track_out = '{chrom}\t{source}\t{track_type}\t{start}\t{end}\t.\t{strand}\t.\tgene_id "{each_gene_id}"; transcript_id "{each_tr_id}"; transcript_type "{tr_type}";'.format(**locals())
    if 'nearest_ref' in eachline.attr:
        nearest_ref_tr = eachline.attr['nearest_ref']
import sys
import os
from HTSeq import GFF_Reader

# Positional arguments: StringTie GTF to read, GTF path to write.
stringtie_gtf = sys.argv[1]
output = sys.argv[2]

# Create the output directory if needed. A bare file name has an empty
# directory part, which must be skipped rather than handed to makedirs.
# os.makedirs replaces the former shell call, so unusual characters in the
# path can no longer be interpreted by a shell.
out_dir = os.path.split(output)[0]
if out_dir and not os.path.exists(out_dir):
    os.makedirs(out_dir)

# FPKM of each transcript, recorded when its 'transcript' record is read and
# then appended to every later non-transcript record with the same
# transcript_id. A non-transcript record seen before its transcript record
# raises KeyError.
tr_fpkm_dict = {}
# The context manager closes the output even if parsing fails midway.
with open(output, 'w') as output_info:
    for eachline in GFF_Reader(stringtie_gtf):
        tr_id = eachline.attr['transcript_id']
        if eachline.type == 'transcript':
            tr_fpkm = eachline.attr['FPKM']
            tr_fpkm_dict[tr_id] = tr_fpkm
            out_line = "%s;\n" % eachline.get_gff_line().strip()
        else:
            tr_fpkm = tr_fpkm_dict[tr_id]
            out_line = '%s; FPKM "%s";\n' % (eachline.get_gff_line().strip(),
                                             tr_fpkm)
        output_info.write(out_line)
Example #26
0
    sys.exit(0)

# Positional arguments: lncRNA feature table, novel GTF, annotated GTF to write.
lncRNA_feature = sys.argv[1]
novel_gtf = sys.argv[2]
add_gtf = sys.argv[3]

# Biotype lookups keyed by transcript id and by gene id. Both are filled from
# the tab-separated feature table: column 5 is the transcript id, column 6 the
# gene id, and the last column the biotype.
lncRNA_tr_dict = {}
lncRNA_gene_dict = {}
with open(lncRNA_feature) as lncRNA_feature_inf:
    # The first line is a header and carries no record.
    next(lncRNA_feature_inf, None)
    for record in lncRNA_feature_inf:
        fields = record.strip().split('\t')
        tr_id, gene_id, tr_type = fields[4], fields[5], fields[-1]
        lncRNA_tr_dict[tr_id] = tr_type
        lncRNA_gene_dict[gene_id] = tr_type

# Every record of the novel GTF gets gene/transcript biotype attributes;
# ids missing from the feature table fall back to 'TUCP'.
out_list = []
for feature in GFF_Reader(novel_gtf):
    gene_type = lncRNA_gene_dict.get(feature.attr['gene_id'], 'TUCP')
    tr_type = lncRNA_tr_dict.get(feature.attr['transcript_id'], 'TUCP')
    annotated = '%s; gene_biotype "%s"; transcript_biotype "%s";' % (
        feature.get_gff_line().strip(), gene_type, tr_type)
    out_list.append(annotated)
python_tools.write_obj_to_file(out_list, add_gtf)
Example #27
0
from HTSeq import GFF_Reader
import argparse

# Command-line interface: the annotation to read (GTF or GFF), the output
# directory, and the prefix used to name the output file.
parser = argparse.ArgumentParser()
parser.add_argument('--gff', help='assembled GTF file or gff', required=True)
parser.add_argument('--out_dir', help='Output directory.', required=True)
parser.add_argument('--name', help='Output file prefix.', required=True)
args = parser.parse_args()

# The map is written to <out_dir>/<name>.gene_trans_map.txt.
gene_trans_map_file = os.path.join(
    args.out_dir, '{n}.gene_trans_map.txt'.format(n=args.name))
gene_trans_map_file_info = open(gene_trans_map_file, 'w')
# Checked per transcript id when the input is a GTF file.
tr_dict = {}
if args.gff.endswith('gff') or args.gff.endswith('gff3'):
    for eachline in GFF_Reader(args.gff):
        if eachline.type == "transcript":
            transcript_id = eachline.attr['ID']
            gene_id = eachline.attr['Parent']
            gene_trans_map_file_info.write('%s\t%s\n' % (gene_id,
                                                         transcript_id))
    gene_trans_map_file_info.close()
elif args.gff.endswith('gtf'):
    for eachline in GFF_Reader(args.gff):
        if 'transcript_id' not in eachline.attr:
            continue
        transcript_id = eachline.attr['transcript_id']
        gene_id = eachline.attr['gene_id']
        if 'ref_gene_id' in eachline.attr:
            gene_id = eachline.attr['ref_gene_id']
        if transcript_id not in tr_dict:
def main():
    """Extract candidate sequences for the Terminitor neural network model.

    Command-line entry point. The steps are:

    1. Parse the command line (annotations, alignment BAM, indexed genome,
       output path, upstream/downstream lengths).
    2. Read chromosome lengths from ``<genome>.fai``; print a message and
       exit with status 1 when that index file is missing.
    3. Build, per chromosome and transcript, the exon, CDS, intron and
       5'/3' UTR coordinates from the full annotation.
    4. Intersect the alignment with the transcript annotation and keep, for
       every contig end, the best matching transcript (3' UTR hits are
       preferred, then the smallest distance to the annotated end).
    5. Scan the BAM file for alignments with flag 0 or 16 and collect the
       upstream sequence of each candidate.
    6. Fetch the downstream genomic sequence and write the non-redundant
       upstream + downstream sequences to the output FASTA file.
    """
    parser = argparse.ArgumentParser(
        description=dedent('''
        Terminitor pipeline extract candidate sequence
        -----------------------------------------------------------
        This script is the last step of Terminitor RNA-seq pipeline.
        It extracts candidate sequence from alignment bam file for
        testing by the pre-trained neural network model.
        '''),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='Terminitor ' + VERSION)
    parser.add_argument(
        '-t',
        '--annot_trans',
        help='Transcript annotation file, GTF format. This file contains '
        'features whose type is transcript, can be downloaded from the '
        'auxiliary files or created by users',
        required=True)
    parser.add_argument(
        '-a',
        '--annot_all',
        help='Annotation file, GTF format. For example annotations from Ensembl '
        'ftp site',
        required=True)
    parser.add_argument(
        '-m',
        '--aln',
        help=
        'The alignment file from assembled transcript contigs to reference genome in '
        'bam format.',
        required=True)
    parser.add_argument(
        '-g',
        '--genome',
        help=
        'Indexed reference genome assembly in Fasta format, which can be downloaded from '
        'Ensembl',
        required=True)
    parser.add_argument(
        '-o',
        help=
        'Output file, fasta format containing candidate sequences to be tested',
        required=True)
    parser.add_argument('-u',
                        '--up_len',
                        help='Upstream sequence length',
                        type=int,
                        default=100)
    parser.add_argument('-d',
                        '--down_len',
                        help='Downstream sequence length',
                        type=int,
                        default=100)

    args = parser.parse_args()

    annot_trans = args.annot_trans
    annot_all = args.annot_all
    aln = args.aln
    assem = args.genome
    out_file = args.o
    up_len = args.up_len
    down_len = args.down_len

    # Read in chromosome lengths from FASTA index
    chrom_len = {}

    # os.path.exists is the correct name; the former os.path.exist raised
    # AttributeError on every run.
    if os.path.exists(assem + '.fai'):
        with open(assem + '.fai', 'r') as f:
            for line in f:
                cols = line.split()
                chrom_len[cols[0]] = int(cols[1])
    else:
        print('Cannot find FASTA index for reference genome, i.e. `' + assem +
              '.fai`')
        sys.exit(1)

    # GTF file is 1-based inclusive, bed file is 0-based half open
    trans_coord = {}
    gtf_features = GFF_Reader(annot_all)
    for feature in gtf_features:
        # Chromosome names are used exactly as given in the annotation
        # (no 'chr' prefix is added).
        chrom = feature.iv.chrom
        if chrom not in trans_coord:
            trans_coord[chrom] = {}
        if feature.type == 'transcript':
            trans = feature.attr['transcript_id']
            trans_coord[chrom][trans] = {
                'exon': [],
                'CDS': [],
                'strand': feature.iv.strand
            }
        elif feature.type in ['exon', 'CDS']:
            # Relies on the transcript record preceding its exon/CDS records.
            trans = feature.attr['transcript_id']
            coord = [feature.iv.start, feature.iv.end]
            trans_coord[chrom][trans][feature.type].append(coord)

    for chrom, transs in trans_coord.items():
        for trans, info in transs.items():
            info['exon'].sort()
            info['CDS'].sort()

            strand = info['strand']
            info['utr3'] = []
            info['utr5'] = []
            info['introns'] = []

            # Introns: the gap between the end of one exon and the start of
            # the next one.
            start = info['exon'][0][1]
            exon_p = 1
            while exon_p < len(info['exon']):
                info['introns'].append([start, info['exon'][exon_p][0]])
                start = info['exon'][exon_p][1]
                exon_p += 1

            # UTRs: walk exons and CDS segments in parallel; exon parts left
            # of the CDS are 5' UTR on '+' (3' UTR otherwise), parts right of
            # it the opposite.
            if info['CDS']:
                exon_p = 0
                CDS_p = 0
                while exon_p < len(info['exon']) and CDS_p < len(info['CDS']):
                    if info['exon'][exon_p][1] <= info['CDS'][CDS_p][0]:
                        # Exon lies entirely before the current CDS segment.
                        if strand == '+':
                            info['utr5'].append(list(info['exon'][exon_p]))
                        else:
                            info['utr3'].append(list(info['exon'][exon_p]))
                        exon_p += 1
                        continue

                    if info['exon'][exon_p][0] < info['CDS'][CDS_p][0]:
                        if strand == '+':
                            info['utr5'].append([
                                info['exon'][exon_p][0], info['CDS'][CDS_p][0]
                            ])
                        else:
                            info['utr3'].append([
                                info['exon'][exon_p][0], info['CDS'][CDS_p][0]
                            ])

                    if info['CDS'][CDS_p][1] < info['exon'][exon_p][1]:
                        if strand == '+':
                            info['utr3'].append([
                                info['CDS'][CDS_p][1], info['exon'][exon_p][1]
                            ])
                        else:
                            info['utr5'].append([
                                info['CDS'][CDS_p][1], info['exon'][exon_p][1]
                            ])
                    CDS_p += 1
                    exon_p += 1
                # Exons remaining after the last CDS segment.
                while exon_p < len(info['exon']):
                    if strand == '+':
                        info['utr3'].append(list(info['exon'][exon_p]))
                    else:
                        info['utr5'].append(list(info['exon'][exon_p]))
                    exon_p += 1
                # Trim 3 bases from the CDS-proximal end of the 3' UTR, or
                # drop that segment when it is 3 bases or shorter.
                # NOTE(review): presumably this removes the stop codon --
                # confirm against the annotation convention in use.
                if info['utr3']:
                    if strand == '+' and info['utr3'][0][1] - info['utr3'][0][
                            0] <= 3:
                        del info['utr3'][0]
                    elif strand == '-' and info['utr3'][-1][1] - info['utr3'][
                            -1][0] <= 3:
                        del info['utr3'][-1]
                    else:
                        if strand == '+':
                            info['utr3'][0][0] += 3
                        else:
                            info['utr3'][-1][1] -= 3
            else:
                # Without CDS records the exons are used as the CDS.
                info['CDS'] = deepcopy(info['exon'])

    # Process alignment
    ensembl = pybedtools.BedTool(annot_trans)
    alignment = pybedtools.BedTool(aln)
    intersect = alignment.intersect(ensembl, bed=True, wo=True, split=True)

    seq_dict = {}

    for info in intersect:
        feature = parse_GFF_attribute_string(info[20][:-1] + '\n')
        trans = feature['transcript_id']
        chrom = info[0]
        tr_info = trans_coord[chrom][trans]

        strand = info[5]
        if strand == '+':
            name = info[3] + '_' + info[0] + '_' + str(int(info[2]) -
                                                       1) + '_' + 'F'
            dis2annot = abs(int(info[2]) - tr_info['exon'][-1][1])
            # The contig end counts as a 3' UTR hit when it lies beyond the
            # start of the first 3' UTR segment.
            if tr_info['utr3']:
                utr3 = int(info[2]) > tr_info['utr3'][0][0]
            else:
                utr3 = False
        else:
            name = info[3] + '_' + info[0] + '_' + info[1] + '_' + 'R'
            dis2annot = abs(int(info[1]) - tr_info['exon'][0][0])
            if tr_info['utr3']:
                utr3 = tr_info['utr3'][-1][1] > int(info[1])
            else:
                utr3 = False
        if name not in seq_dict:
            seq_dict[name] = {
                'trans': trans,
                'dis2annot': dis2annot,
                'utr3': utr3
            }
        else:
            # Keep the better candidate: a 3' UTR hit beats a non-UTR hit;
            # among equals the smaller distance to the annotated end wins.
            if seq_dict[name]['utr3'] and utr3 and seq_dict[name][
                    'dis2annot'] > dis2annot:
                seq_dict[name] = {
                    'trans': trans,
                    'dis2annot': dis2annot,
                    'utr3': utr3
                }
            elif seq_dict[name]['utr3'] and (not utr3):
                continue
            elif (not seq_dict[name]['utr3']) and utr3:
                seq_dict[name] = {
                    'trans': trans,
                    'dis2annot': dis2annot,
                    'utr3': utr3
                }
            elif (not seq_dict[name]['utr3']) and (
                    not utr3) and seq_dict[name]['dis2annot'] > dis2annot:
                seq_dict[name] = {
                    'trans': trans,
                    'dis2annot': dis2annot,
                    'utr3': utr3
                }

    samfile = pysam.AlignmentFile(aln, 'rb')
    for read in samfile.fetch(until_eof=True):
        cigar_string = read.cigartuples
        qseq = read.query_sequence
        # Only alignments whose flag is exactly 0 (forward) or 16 (reverse)
        # are considered.
        if read.flag != 0 and read.flag != 16:
            continue

        chrom = read.reference_name

        if read.flag == 0:
            strand = '+'
            cs = read.reference_end - 1
            name = read.query_name + '_' + chrom + '_' + str(cs) + '_' + 'F'
        elif read.flag == 16:
            strand = '-'
            cs = read.reference_start
            name = read.query_name + '_' + chrom + '_' + str(cs) + '_' + 'R'

        # CIGAR operations 4 and 5 are soft and hard clipping, 0 is a match.
        if strand == '+':
            if cigar_string[-1][0] == 4 or cigar_string[-1][0] == 5:
                clipped = cigar_string[-1][1]
                clipped_seq = qseq[-clipped:]
                if not is_polya(clipped_seq, True):
                    continue
                u_seq = qseq[-clipped - up_len:-clipped]
            elif cigar_string[-1][0] == 0 and name in seq_dict and seq_dict[
                    name]['utr3']:
                u_seq = qseq[-up_len:]
            else:
                continue

        else:
            if cigar_string[0][0] == 4 or cigar_string[0][0] == 5:
                clipped = cigar_string[0][1]
                clipped_seq = qseq[:clipped]
                if not is_polya(clipped_seq, False):
                    continue
                u_seq = qseq[clipped:clipped + up_len]
            elif cigar_string[0][0] == 0 and name in seq_dict and seq_dict[
                    name]['utr3']:
                u_seq = qseq[:up_len]
            else:
                continue

        if name not in seq_dict:
            seq_dict[name] = {}
        seq_dict[name]['U'] = u_seq
    # Release the BAM handle once all reads have been scanned.
    samfile.close()

    bed_string = []
    for k, v in seq_dict.items():
        # NOTE(review): the key is split on '_', so a contig or chromosome
        # name that itself contains '_' would shift these fields.
        info = k.split('_')
        if 'U' not in v:
            continue
        if info[-1] == 'F':
            chrom = info[1]
            start = int(info[2]) + 1
            end = int(info[2]) + down_len + 1
            if start >= 0 and end < chrom_len[chrom]:
                # interval must be valid
                bed_string.append(chrom + '\t' + str(start) + '\t' + str(end) +
                                  '\t' + k + '\t-\t+')
        else:
            chrom = info[1]
            start = int(info[2]) - down_len
            end = int(info[2])
            if start >= 0 and end < chrom_len[chrom]:
                # interval must be valid
                bed_string.append(chrom + '\t' + str(start) + '\t' + str(end) +
                                  '\t' + k + '\t-\t-')

    down_bed = pybedtools.BedTool('\n'.join(bed_string), from_string=True)

    down_seqs = down_bed.sequence(fi=assem, name=True, split=True)

    # The context manager closes the output file even if a later step raises.
    with open(out_file, 'w') as out:
        # The extracted FASTA alternates header and sequence lines; the
        # record name is the header text before the first ':'.
        with open(down_seqs.seqfn) as f:
            for line in f:
                name = line[1:-1].split(':')[0]
                seq = next(f).strip()
                seq_dict[name]['D'] = seq
        nonredundant = {}
        for name, v in seq_dict.items():
            direction = name.split('_')[-1]
            # Skip candidates lacking either half or with a short upstream.
            if 'U' not in v or 'D' not in v or len(v['U']) < up_len:
                continue
            if direction == 'F':
                seq = v['U'] + v['D']
            elif direction == 'R':
                seq = v['D'] + v['U']
                seq = rev_comp(seq)
            if 'N' in seq:
                continue
            # Each distinct sequence is written once, under the first name
            # that produced it.
            if seq not in nonredundant:
                out.write('>' + name + '\n' + seq + '\n')
                nonredundant[seq] = 0
Example #29
0
def ReadGff(GFF):
    """Return all records yielded by ``GFF_Reader(GFF)`` as a list, in order."""
    return list(GFF_Reader(GFF))
Example #30
0
    target_length_info.close()
    pos_file_inf.close()
    return 'produced gene length and position file!'


if __name__ == '__main__':
    # Command-line options are parsed by docopt from the module docstring.
    arguments = docopt(__doc__, version='1.0')
    gtf_file = arguments['--gtf']
    species = arguments['--species']
    out_dir = arguments['--out_dir']

    # Per-species gene length and gene locus tables.
    gene_length_file = path.join(out_dir, '{}.gene_length.txt'.format(species))
    gene_locus_file = path.join(out_dir, '{}.gene_locus.txt'.format(species))
    get_target_length_and_pos_table(
        gtf_file, gene_length_file, gene_locus_file)

    # Gene-to-transcript map: one "gene<TAB>transcript" line for the first
    # record seen of every transcript id.
    tr_dict = {}
    gene_tr_map_file = path.join(
        out_dir, '{}.gene_trans_map.txt'.format(species))
    with open(gene_tr_map_file, 'w') as gene_tr_map_file_inf:
        for eachline in GFF_Reader(gtf_file):
            record_attr = eachline.attr
            if 'transcript_id' in record_attr:
                transcript_id = record_attr['transcript_id']
                gene_id = record_attr['gene_id']
                if transcript_id not in tr_dict:
                    tr_dict[transcript_id] = gene_id
                    map_line = '{0}\t{1}\n'.format(gene_id, transcript_id)
                    gene_tr_map_file_inf.write(map_line)