def get_expressed_transcript(self):
    self.novel_transcript_gtf = os.path.join(self.out_dir, 'novel_transcript.gtf')
    sample_to_replicate_dict = sample_to_replicate(args.replicate_info)
    for each_quant_file in self.quant_file_list:
        self.store_exp(each_quant_file, sample_to_replicate_dict)
    self.exp_novel_transcript_gtf = os.path.join(
        self.out_dir, 'exp_novel_transcript.gtf')
    output = open(self.exp_novel_transcript_gtf, 'w')
    exp_flag_dict = {}
    for eachline in GFF_Reader(self.novel_transcript_gtf):
        tr_id = eachline.attr['transcript_id']
        if tr_id in exp_flag_dict and exp_flag_dict[tr_id]:
            output.write(eachline.get_gff_line())
            continue
        elif tr_id not in self.transcript_exp_dict:
            continue
        for each_rep in self.transcript_exp_dict[tr_id]:
            exp_flag_dict[tr_id] = False
            tr_exp = min(self.transcript_exp_dict[tr_id][each_rep])
            # tr_exp = python_tools.Median(self.transcript_exp_dict[tr_id][each_rep])
            tr_exon_num = self.assemlby_dict[tr_id]['exon_num']
            if tr_exon_num == 1 and tr_exp >= 2:
                output.write(eachline.get_gff_line())
                break
            elif tr_exp >= 0.5:
                output.write(eachline.get_gff_line())
                break
            else:
                exp_flag_dict[tr_id] = True
    output.close()
def get_gene_number(gtf):
    gene_dict = {}
    for eachline in GFF_Reader(gtf):
        gene_id = eachline.attr['gene_id']
        if gene_id not in gene_dict:
            gene_dict[gene_id] = 1
    return len(gene_dict)
def convert_blast(gtf, kegg_blast, out_dir, species, diff_type):
    tr_gene_dict = {}
    if diff_type == 'gene':
        # map transcript ids to gene ids only when gene-level output is requested
        for eachline in GFF_Reader(gtf):
            gene_id = eachline.attr['gene_id']
            transcript_id = eachline.attr['transcript_id']
            tr_gene_dict[transcript_id] = gene_id
    diff_blast_dict = {}
    with open(kegg_blast, 'r') as kegg_blast_info:
        for eachline in kegg_blast_info:
            each_tr_blast = eachline.strip().split('\t')
            # query id is the transcript id by default, remapped to the gene id
            # for gene-level results
            query_id = each_tr_blast[0]
            if diff_type == 'gene':
                query_id = tr_gene_dict[each_tr_blast[0]]
            if query_id not in diff_blast_dict:
                diff_blast_dict[query_id] = each_tr_blast[1:]
            else:
                # keep the hit with the better e-value; on ties keep the higher
                # bit score (compare the last two tabular fields numerically)
                if (float(diff_blast_dict[query_id][-2]) == float(each_tr_blast[-2])
                        and float(diff_blast_dict[query_id][-1]) < float(each_tr_blast[-1])):
                    diff_blast_dict[query_id] = each_tr_blast[1:]
                elif float(diff_blast_dict[query_id][-2]) > float(each_tr_blast[-2]):
                    diff_blast_dict[query_id] = each_tr_blast[1:]
    diff_blast = os.path.join(out_dir, '%s.blasttab.json' % species)
    with open(diff_blast, 'w') as diff_blast_file:
        json.dump(diff_blast_dict, diff_blast_file)
    return diff_blast
def get_transcript_dict(self):
    for eachline in GFF_Reader(self.gtf):
        genomic_featrue = eachline.type.lower()
        if genomic_featrue not in self.TYPE:
            continue
        gene_id = eachline.attr['gene_id']
        transcript_id = eachline.attr['transcript_id']
        start = eachline.iv.start
        end = eachline.iv.end
        chrom = eachline.iv.chrom
        strand = eachline.iv.strand
        self.transcript_dict.setdefault(transcript_id, {})['strand'] = strand
        self.transcript_dict.setdefault(transcript_id, {})['chrom'] = chrom
        self.transcript_dict.setdefault(transcript_id, {})['gene_id'] = gene_id
        if (transcript_id in self.transcript_dict
                and genomic_featrue in self.transcript_dict[transcript_id]):
            if intersect_exon_in_same_tr(
                    self.transcript_dict[transcript_id][genomic_featrue],
                    (start, end)):
                self.transcript_dict[transcript_id][genomic_featrue].append(
                    (start, end))
        else:
            self.transcript_dict.setdefault(
                transcript_id, {})[genomic_featrue] = [(start, end)]
        if self.cds_flag == 0 and genomic_featrue == 'cds':
            self.cds_flag = 1
        # the feature type is already lower-cased above, so match 'utr' against it
        # directly (checking against an upper-cased copy can never match)
        if self.utr_flag == 0 and 'utr' in genomic_featrue:
            self.utr_flag = 1
def ReadGff(GFF):
    dict_gene = {}
    dict_ens = {}
    for line in GFF_Reader(GFF):
        dict_ens[re.split(r'\.', line.attr['gene_id'])[0]] = line
        dict_gene[line.attr['gene_name']] = line
    return dict_gene, dict_ens
def read_transcriptdata(gtffn):
    gid_key = lambda f: (f.iv.chrom, f.attr.get("gene_id"))
    tid_key = lambda f: f.attr.get("transcript_id")
    sort_key = lambda f: f.iv.start
    tidtc = dict()
    tid2gid = dict()
    a = GenomicArrayOfSets("auto", stranded=True)
    gtf = GFF_Reader(gtffn)
    for (chrom, gid), ggfiter in groupby(gtf, gid_key):
        for tid, tgfiter in groupby(ggfiter, tid_key):
            fs = sorted(tgfiter, key=sort_key)
            # materialise the filters as lists: under Python 3, filter() returns an
            # iterator, and testing it would exhaust it before the exon loop below
            exonfs = list(filter(lambda f: f.type == "exon", fs))
            cdsfs = list(filter(lambda f: f.type == "CDS", fs))
            if not exonfs or not cdsfs:
                continue
            try:
                tc, (cdsstart, cdsend) = validate_and_extract_cds(fs)
            except ValueError:
                continue
            tidtc[tid] = tc, (cdsstart, cdsend)
            for efs in exonfs:
                a[efs.iv] += tid
            tid2gid[tid] = gid
    return a, tidtc, tid2gid
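# Hedged usage sketch, not part of the original source: it assumes a small GTF file
# at the hypothetical path 'test.gtf' and the validate_and_extract_cds helper defined
# elsewhere in this module. It shows how the GenomicArrayOfSets built above is meant
# to be queried: slicing the array with a GenomicInterval and iterating steps() yields
# (sub-interval, set-of-transcript-ids) pairs for every exon-overlapping region.
from HTSeq import GenomicInterval

exon_array, tid_to_cds, tid_to_gene = read_transcriptdata('test.gtf')
window = GenomicInterval('chr1', 10000, 20000, '+')
for sub_iv, tids in exon_array[window].steps():
    if tids:
        print(sub_iv, sorted(tids))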
def oms_lncRNA_classify(feelnc_prd, lnc_gtf, method='Luo'):
    '''
    add lncRNA class to FEElnc_classifier.pl output
    '''
    feelnc_df = pd.read_table(feelnc_prd, index_col=2)
    feelnc_best_df = feelnc_df[feelnc_df.isBest == 1]
    feelnc_best_df = feelnc_best_df.loc[:, feelnc_best_df.columns[1:]]
    lnc_class_df = pd.DataFrame([], columns=feelnc_best_df.columns)
    lnc_class_list = list()
    for eachline in GFF_Reader(lnc_gtf):
        if 'transcript_id' not in eachline.attr:
            continue
        tr_id = eachline.attr['transcript_id']
        gene_id = eachline.attr['gene_id']
        if tr_id in lnc_class_df.index:
            continue
        if tr_id in feelnc_best_df.index:
            if method == 'Luo':
                dirt, ltype, dis, subtype, loc = feelnc_best_df.loc[tr_id][3:]
                lnc_class = get_luo_code(dirt, subtype, loc)
                lnc_class_list.append(lnc_class)
                lnc_class_df.loc[tr_id] = feelnc_best_df.loc[tr_id]
            else:
                sys.exit('undefined classification method.')
        else:
            class_detail = ['--' for each in lnc_class_df.columns]
            class_detail[0] = gene_id
            lnc_class_list.append('lincRNA')
            lnc_class_df.loc[tr_id] = class_detail
    lnc_class_df.loc[:, 'classification'] = lnc_class_list
    lnc_class_df.index.name = feelnc_best_df.index.name
    return lnc_class_df
def get_novel_transcript(self):
    ## annotate mRNA related transcript
    for each_gtf in self.combined_gtf_list:
        self.add_compare_info(each_gtf)
    ## filter transcripts
    self.assembly_dict_json = os.path.join(self.out_dir, 'assembly_tr_info.json')
    store_into_json(self.assemlby_dict, self.assembly_dict_json)
    ## output filtered transcript gtf
    self.novel_transcript_gtf = os.path.join(self.out_dir, 'novel_transcript.gtf')
    output = open(self.novel_transcript_gtf, 'w')
    for eachline in GFF_Reader(self.assembly_gtf):
        tr_id = eachline.attr['transcript_id']
        ## filter length
        if self.assemlby_dict[tr_id]['tr_length'] <= self.length:
            continue
        ## annotation filter
        elif 'status' not in self.assemlby_dict[tr_id]:
            continue
        elif self.assemlby_dict[tr_id]['status'] not in self.lncRNA_type:
            continue
        else:
            output.write(eachline.get_gff_line())
    output.close()
def ReadGff(GFF):
    dict_gene = {}
    dict_ens = {}
    for line in GFF_Reader(GFF):
        dict_ens[line.attr['transcript_id']] = line
        dict_gene[line.attr['transcript_name']] = line
    return dict_gene, dict_ens
def get_transcript_info(gtf, genename_dict={}): transcript_info_dict = {} for eachline in GFF_Reader(gtf): if eachline.type != 'exon': continue gene_id = eachline.attr['gene_id'] transcript_id = eachline.attr['transcript_id'] ## get gene_type info if 'gene_type' in eachline.attr: gene_type = eachline.attr['gene_type'] elif 'gene_biotype' in eachline.attr: gene_type = eachline.attr['gene_biotype'] else: gene_type = '--' ## get gene name info if 'gene_name' in eachline.attr: genename = eachline.attr['gene_name'] elif gene_id in genename_dict: genename = genename_dict[gene_id][0] else: genename = '--' ## get location info chrom = eachline.iv.chrom start = eachline.iv.start + 1 end = eachline.iv.end strand = eachline.iv.strand length = end - start + 1 if transcript_id in transcript_info_dict: transcript_info_dict[transcript_id]['exon_start'].append(start) transcript_info_dict[transcript_id]['exon_start'].sort() transcript_info_dict[transcript_id]['exon_end'].append(end) transcript_info_dict[transcript_id]['exon_end'].sort() transcript_info_dict[transcript_id]['exon_len'].append(length) else: transcript_info_dict.setdefault(transcript_id, {})["chrom"] = chrom transcript_info_dict.setdefault(transcript_id, {})["exon_start"] = [start] transcript_info_dict.setdefault(transcript_id, {})["exon_end"] = [end] transcript_info_dict.setdefault(transcript_id, {})["strand"] = strand transcript_info_dict.setdefault(transcript_id, {})["gene_id"] = gene_id transcript_info_dict.setdefault(transcript_id, {})["gene_name"] = genename transcript_info_dict.setdefault(transcript_id, {})["gene_description"] = "--" transcript_info_dict.setdefault(transcript_id, {})['gene_type'] = gene_type transcript_info_dict.setdefault(transcript_id, {})['exon_len'] = [length] if gene_id in genename_dict: transcript_info_dict.setdefault( transcript_id, {})["gene_description"] = genename_dict[gene_id][1] gene_description_flag = 1 return transcript_info_dict
def get_transcript_info(gtf, genename_dict={}): transcript_info_dict = {} for eachline in GFF_Reader(gtf): gene_id = eachline.attr['gene_id'] transcript_id = eachline.attr['transcript_id'] # get gene_type info if 'gene_type' in eachline.attr: gene_type = eachline.attr['gene_type'] elif 'gene_biotype' in eachline.attr: gene_type = eachline.attr['gene_biotype'] else: gene_type = '--' # get gene name info if 'gene_name' in eachline.attr: genename = eachline.attr['gene_name'] elif gene_id in genename_dict: genename = genename_dict[gene_id][0] else: genename = '--' # get location info chrom = eachline.iv.chrom start = eachline.iv.start + 1 end = eachline.iv.end strand = eachline.iv.strand length = end - start + 1 if transcript_id in transcript_info_dict: if start < transcript_info_dict[transcript_id]['start']: transcript_info_dict[transcript_id]['start'] = start if end > transcript_info_dict[transcript_id]['end']: transcript_info_dict[transcript_id]['end'] = end transcript_info_dict.setdefault(transcript_id, {})['length'] += length transcript_info_dict.setdefault(transcript_id, {})['exon_num'] += 1 else: transcript_info_dict.setdefault(transcript_id, {})["chrom"] = chrom transcript_info_dict.setdefault(transcript_id, {})["start"] = start transcript_info_dict.setdefault(transcript_id, {})["end"] = end transcript_info_dict.setdefault(transcript_id, {})["strand"] = strand transcript_info_dict.setdefault(transcript_id, {})["gene_id"] = gene_id transcript_info_dict.setdefault(transcript_id, {})["gene_name"] = genename transcript_info_dict.setdefault(transcript_id, {})["gene_description"] = "--" transcript_info_dict.setdefault(transcript_id, {})['gene_type'] = gene_type transcript_info_dict.setdefault(transcript_id, {})['length'] = length transcript_info_dict.setdefault(transcript_id, {})['exon_num'] = 1 if gene_id in genename_dict: transcript_info_dict.setdefault( transcript_id, {})["gene_description"] = genename_dict[gene_id][1] gene_description_flag = 1 return transcript_info_dict
def _get_genomic_reader(filename):
    """Return regions from a BED_Reader or GFF_Reader."""
    if isinstance(filename, str) and filename.endswith('.bed'):
        regions_ = BED_Reader(filename)
    elif isinstance(filename, str) and (filename.endswith('.gff')
                                        or filename.endswith('.gtf')):
        regions_ = GFF_Reader(filename)
    else:
        raise Exception('Regions must be a bed, gff or gtf-file.')
    return regions_
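# Hedged usage sketch, not part of the original source: it assumes a small annotation
# file at the hypothetical path 'annotation.gtf'. Both BED_Reader and GFF_Reader yield
# feature objects with a .iv GenomicInterval, so the returned reader can be iterated
# directly regardless of the input format.
regions = _get_genomic_reader('annotation.gtf')
for feature in regions:
    print(feature.type, feature.iv.chrom, feature.iv.start, feature.iv.end)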
def get_transcript_length(gtf):
    tr_length_dict = {}
    for eachline in GFF_Reader(gtf):
        gene_id = eachline.attr['gene_id']
        transcript_id = eachline.attr['transcript_id']
        start = eachline.iv.start
        end = eachline.iv.end
        # HTSeq GenomicInterval coordinates are 0-based and end-exclusive,
        # so the feature length is simply end - start
        length = end - start
        if gene_id in tr_length_dict and transcript_id in tr_length_dict[gene_id]:
            tr_length_dict[gene_id][transcript_id] += length
        else:
            tr_length_dict.setdefault(gene_id, {})[transcript_id] = length
    return tr_length_dict
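# Hedged usage sketch, not part of the original source: it assumes a hypothetical
# 'annotation.gtf' file. Because every GTF feature line of a transcript is summed,
# a simple per-gene length can be derived by taking the largest transcript total.
tr_length_dict = get_transcript_length('annotation.gtf')
gene_length = {gene: max(trs.values()) for gene, trs in tr_length_dict.items()}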
def add_compare_info(self, combined_gtf):
    if 'mRNA' in combined_gtf:
        for eachline in GFF_Reader(combined_gtf):
            tr_id, class_code, nearest_ref = get_class_code(eachline)
            self.assemlby_dict[tr_id]['nearest_ref'] = nearest_ref
            if class_code in ['u', 'p']:
                self.assemlby_dict[tr_id]['status'] = 'lincRNA'
            elif class_code == 'x':
                self.assemlby_dict[tr_id]['status'] = 'antisense'
            elif class_code == 'i':
                if (self.assemlby_dict[tr_id]['strand']
                        != self.ref_dict[nearest_ref]['strand']):
                    self.assemlby_dict[tr_id]['status'] = 'antisense_intronic'
                else:
                    if self.assemlby_dict[tr_id]['exon_num'] > 1:
                        self.assemlby_dict[tr_id]['status'] = 'sense_intronic'
                    else:
                        self.assemlby_dict[tr_id]['status'] = 'backgroud'
            else:
                self.assemlby_dict[tr_id]['status'] = 'protein_coding'
    elif 'other_ncRNA' in combined_gtf:
        for eachline in GFF_Reader(combined_gtf):
            tr_id, class_code, nearest_ref = get_class_code(eachline)
            if (class_code in self.overlap_flags
                    and 'status' in self.assemlby_dict[tr_id]):
                self.assemlby_dict[tr_id]['status'] = 'ncRNA_host'
    elif 'lncRNA' in combined_gtf:
        for eachline in GFF_Reader(combined_gtf):
            tr_id, class_code, nearest_ref = get_class_code(eachline)
            if (class_code in self.overlap_flags
                    and 'status' in self.assemlby_dict[tr_id]):
                self.assemlby_dict[tr_id]['status'] = 'Annotated_lncRNA'
def novel_gtf(compare_gtf, outfile):
    novel_gtf_dict = dict()
    outfile_inf = open(outfile, 'w')
    for record in GFF_Reader(compare_gtf):
        if 'class_code' in record.attr:
            if record.attr['class_code'] in NOVEL_TR_CODE:
                novel_gtf_dict[record.attr['transcript_id']] = 1
                record.attr = {
                    key: val
                    for key, val in record.attr.items() if key in OUT_ATTR
                }
            else:
                continue
        elif record.attr['transcript_id'] in novel_gtf_dict:
            pass
        else:
            continue
        outline = record.get_gff_line().strip()
        outfile_inf.write(f'{outline};\n')
    outfile_inf.close()
def gff2dict(gff, fix_id_flag=False):
    by_gene_dict = OrderedDict()
    by_tr_dict = OrderedDict()
    gene_entry_dict = dict()
    tr2gene = dict()
    for eachline in GFF_Reader(gff):
        if eachline.type == 'gene':
            gene_id = eachline.attr['ID']
            eachline.attr['gene_id'] = gene_id
            gene_entry_dict[gene_id] = eachline
            gene_entry_dict[gene_id].attr['fixed'] = True
            continue
        if 'geneID' in eachline.attr:
            parent = eachline.attr['geneID']
            eachline.attr['Parent'] = parent
        else:
            parent = eachline.attr['Parent']
        if eachline.type in ["transcript", "mRNA"]:
            tr_id = fix_id(eachline.attr['ID'], eachline.type, fix_id_flag)
            eachline.attr['ID'] = tr_id
            gene_id = parent
            tr2gene[tr_id] = parent
        else:
            if 'ID' in eachline.attr:
                eachline.attr['ID'] = fix_id(eachline.attr.get('ID'),
                                             eachline.type, fix_id_flag)
            tr_id = fix_id(parent, 'mRNA', fix_id_flag)
            eachline.attr['Parent'] = tr_id
            gene_id = tr2gene[tr_id]
        eachline.attr['tr_id'] = tr_id
        eachline.attr['gene_id'] = gene_id
        by_gene_dict.setdefault(gene_id, []).append(eachline)
        by_tr_dict.setdefault(tr_id, []).append(eachline)
        gene_entry_dict[gene_id] = update_gene_inf(
            gene_entry_dict.get(gene_id), eachline)
    return by_gene_dict, by_tr_dict, gene_entry_dict, tr2gene
def main(gtf, feelnc_classify, bed_intersect, out_dir): feelnc_df = pd.read_table(feelnc_classify, index_col=2) intersect_df = pd.read_table(bed_intersect, index_col=[3, 15], header=None) lnc_class_list = [] out_header = list(feelnc_df.columns[1:]) out_header.insert(0, 'lncRNA_transcript') out_header.append('lncRNA_class') def get_class(fee_rd, intersect_df): if fee_rd.type == 'intergenic': if fee_rd.subtype == 'divergent': return 'divergent', 0, 0 else: return 'intergenic', 0, 0 else: inter_index = (fee_rd.name, fee_rd.partnerRNA_transcript) inter_rd = intersect_df.loc[inter_index] overlap1, overlap2 = overlap_portion(inter_rd) if fee_rd.direction == 'sense': if fee_rd.subtype == 'containing': return 'other_sense_overlap', overlap1, overlap2 elif fee_rd.subtype == 'nested': return 'sense_intronic', overlap1, overlap2 elif fee_rd.subtype == 'overlapping': if overlap1 >= OVERLAP_CUTOFF: introns = iterintrons(inter_rd[13], inter_rd[22], inter_rd[23]) lnc_can_start = inter_rd[1] if tss_in_interval(lnc_can_start, introns): return 'sense_intronic', overlap1, overlap2 return 'other_sense_overlap', overlap1, overlap2 else: sys.exit('unkown type [{t.subtype}]'.format(t=fee_rd)) else: if fee_rd.subtype == 'nested': return 'antisense', overlap1, overlap2 else: if overlap1 >= OVERLAP_CUTOFF: return 'antisense', overlap1, overlap2 else: return 'intergenic', overlap1, overlap2 def lnc_classify(tr_id, feelnc_df, intersect_df): tr_detail_df = feelnc_df.loc[tr_id] out_inf = [] class_inf = [] if tr_detail_df.index[0] == tr_id: for n in range(len(tr_detail_df)): class_value = list(get_class(tr_detail_df.ix[n], intersect_df)) dis = tr_detail_df.ix[n].distance tmp_class_inf = class_value[:] tmp_class_inf.insert(1, dis) tmp_out_inf = list(tr_detail_df.ix[n][1:]) if not out_inf: out_inf = tmp_out_inf class_inf = tmp_class_inf else: if compare_class(tmp_class_inf, class_inf): out_inf = tmp_out_inf class_inf = tmp_class_inf else: class_value = list(get_class(tr_detail_df, intersect_df)) out_inf = list(tr_detail_df[1:]) class_inf = class_value out_inf.insert(0, tr_id) out_inf.append(class_inf[0]) return out_inf for eachline in GFF_Reader(gtf): if eachline.type == 'transcript': tr_id = eachline.attr['transcript_id'] gene_id = eachline.attr['gene_id'] if tr_id not in feelnc_df.index: out_inf = [tr_id, gene_id] out_inf.extend(INTERGENIC_INF) else: out_inf = lnc_classify(tr_id, feelnc_df, intersect_df) out_inf_series = pd.Series(out_inf, index=out_header) lnc_class_list.append(out_inf_series) out_df = pd.concat(lnc_class_list, axis=1).T out_file = os.path.join(out_dir, 'lncRNA.classify.txt') out_df.to_csv(out_file, sep='\t', index=False)
def main(gtf, output_dir):
    # read gtf file
    gtf_dict = dict()
    for eachline in GFF_Reader(gtf):
        if eachline.type == 'exon':
            gene_id = eachline.attr['gene_id']
            chrom = eachline.iv.chrom
            start = eachline.iv.start + 1
            end = eachline.iv.end
            transcript_id = eachline.attr['transcript_id']
            exon_len = eachline.iv.end - eachline.iv.start
            if 'gene_biotype' in eachline.attr:
                gtf_dict.setdefault('gene_biotype', []).append(
                    eachline.attr['gene_biotype'])
            gtf_dict.setdefault('gene_id', []).append(gene_id)
            gtf_dict.setdefault('transcript_id', []).append(transcript_id)
            gtf_dict.setdefault('exon', []).append(exon_len)
            gtf_dict.setdefault('chr', []).append(chrom)
            gtf_dict.setdefault('start', []).append(start)
            gtf_dict.setdefault('end', []).append(end)
    gtf_df = pd.DataFrame(gtf_dict)
    # generate gtf summary
    gene_stat_df = gtf_df.loc[:, ['gene_id', 'transcript_id']]
    gene_stat_df = gene_stat_df.drop_duplicates()
    gene_stat = gene_stat_df.groupby(['gene_id'])['transcript_id']
    gene_num = len(gene_stat.count())
    tr_per_gene = gene_stat.count().mean()
    tr_num = len(gene_stat_df)
    tr_stat = gtf_df.groupby(['transcript_id'])['exon']
    single_exon = tr_stat.count().value_counts()[1]
    tr_length = tr_stat.sum().mean()
    exon_length = gtf_df.loc[:, 'exon'].mean()
    exon_num = tr_stat.count().mean()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    gtf_summary_file = os.path.join(output_dir, 'transcriptome.summary.txt')
    # each summary record ends with a newline so the file has one metric per line
    with open(gtf_summary_file, 'w') as gtf_summary_inf:
        gtf_summary_inf.write('Gene number\t{gn}\n'.format(gn=gene_num))
        gtf_summary_inf.write(
            'Transcript per gene\t{tg:.2f}\n'.format(tg=tr_per_gene))
        gtf_summary_inf.write('Transcript number\t{tn}\n'.format(tn=tr_num))
        gtf_summary_inf.write(
            'Single exon transcripts\t{st}\n'.format(st=single_exon))
        gtf_summary_inf.write(
            'Mean transcript length\t{tl:.2f}\n'.format(tl=tr_length))
        gtf_summary_inf.write(
            'Mean exon length\t{el:.2f}\n'.format(el=exon_length))
        gtf_summary_inf.write(
            'Exon number per transcript\t{et:.2f}\n'.format(et=exon_num))
    # generate transcript information
    tr_list = []
    tr_list.append(gtf_df.groupby(['transcript_id'])['chr'].unique())
    tr_list.append(gtf_df.groupby(['transcript_id'])['start'].min())
    tr_list.append(gtf_df.groupby(['transcript_id'])['end'].max())
    tr_list.append(gtf_df.groupby(['transcript_id'])['gene_id'].unique())
    tr_list.append(gtf_df.groupby(['transcript_id'])['exon'].count())
    tr_list.append(gtf_df.groupby(['transcript_id'])['exon'].sum())
    if 'gene_biotype' in gtf_df.columns:
        tr_list.append(
            gtf_df.groupby(['transcript_id'])['gene_biotype'].unique())
    tr_df = pd.concat(tr_list, axis=1)
    # list(map(...)) keeps this working under Python 3, where map returns an iterator
    tr_df.loc[:, 'chr'] = list(map(','.join, tr_df.loc[:, 'chr']))
    tr_df.loc[:, 'gene_id'] = list(map(','.join, tr_df.loc[:, 'gene_id']))
    if 'gene_biotype' in tr_df.columns:
        tr_df.loc[:, 'gene_biotype'] = list(
            map(','.join, tr_df.loc[:, 'gene_biotype']))
    tr_file = os.path.join(output_dir, 'transcripts.detail.txt')
    tr_df.to_csv(tr_file, sep='\t')
bed = sys.argv[1]
gtf = sys.argv[2]

chr_dict = dict()
with open(bed) as bed_inf:
    for eachline in bed_inf:
        eachline_inf = eachline.strip().split()
        chrom = eachline_inf[0]
        start = int(eachline_inf[1]) + 1
        end = int(eachline_inf[2])
        split_chr = eachline_inf[3]
        chr_dict.setdefault(chrom, {})[(start, end)] = split_chr

for eachline in GFF_Reader(gtf):
    chrom = eachline.iv.chrom
    start = eachline.iv.start + 1
    end = eachline.iv.end
    # default to the original coordinates, then remap if the feature
    # falls completely inside one of the split intervals
    new_chr = chrom
    new_start = start
    new_end = end
    if chrom in chr_dict:
        for each_inter in chr_dict[chrom]:
            if start >= each_inter[0] and end <= each_inter[1]:
                new_chr = chr_dict[chrom][each_inter]
                new_start = start - each_inter[0] + 1
                new_end = end - each_inter[0] + 1
                break
    output_line = eachline.get_gff_line().strip().split('\t')
    output_line[0] = new_chr
input_gtf = sys.argv[3]
output_gtf = sys.argv[4]

miRNA_gene_list = [each.strip() for each in open(miRNA_gene_file)]
sense_overlapping_tr_dict = {}
with open(lncRNA_classify_file) as lncRNA_classify_file_info:
    for n, eachline in enumerate(lncRNA_classify_file_info):
        if n != 0:
            eachline_info = eachline.strip().split('\t')
            each_tr_id = eachline_info[2]
            if (eachline_info[5] == 'sense' and eachline_info[6] == 'genic'
                    and eachline_info[9] == 'exonic'):
                sense_overlapping_tr_dict[each_tr_id] = 'so'
                if eachline_info[3] in miRNA_gene_list:
                    sense_overlapping_tr_dict[each_tr_id] = 'mi'

output_gtf_info = open(output_gtf, 'w')
for eachline in GFF_Reader(input_gtf):
    tr_id = eachline.attr['transcript_id']
    output_line = '%s;' % eachline.get_gff_line().strip()
    if tr_id in sense_overlapping_tr_dict:
        if sense_overlapping_tr_dict[tr_id] == 'mi':
            output_line = '%s transcript_type "miRNA_host";' % output_line
        else:
            continue
    output_gtf_info.write('%s\n' % output_line)
output_gtf_info.close()
endPoint = 2000000
plusStrand = GenomicInterval(chromo, 0, endPoint, '+')
minusStrand = GenomicInterval(chromo, 0, endPoint, '-')
bothStrands = GenomicInterval(chromo, 0, endPoint, '.')

pyplot.plot(list(hitMap[plusStrand]))
pyplot.plot(list(hitMap[minusStrand]))
pyplot.show()

print('\n Using HTSeq to access GFF genome features\n')

remoteFileName = '/Bacteria/Escherichia_coli_536_uid58531/NC_008253.gff'
gffFile = 'examples/EcoliGenomeFeatures.gff'
downloadFile(FTP_ROOT + remoteFileName, gffFile)

fileObj = GFF_Reader(gffFile)

for genomeFeature in fileObj:
    genomeRegion = genomeFeature.iv
    data = (genomeRegion.chrom, genomeRegion.start,
            genomeRegion.end, genomeRegion.strand)
    print('%s %s - %s (%s)' % data)
    data = (genomeFeature.name, genomeFeature.type, genomeFeature.source)
    print('%s %s (%s)' % data)
    print(genomeFeature.attr)
'''
Usage:
    extract_ncRNA_from_assemblyline.py <all.gtf> <ncRNA.gtf>

Extract ncRNA gtf from assemblyline assembled gtf
'''

from docopt import docopt
from HTSeq import GFF_Reader

if __name__ == '__main__':
    arguments = docopt(__doc__, version="v1")
    all_gtf = arguments['<all.gtf>']
    nc_gtf = arguments['<ncRNA.gtf>']
    nc_gtf_inf = open(nc_gtf, 'w')
    for eachline in GFF_Reader(all_gtf):
        if 'transcript_category' in eachline.attr:
            if eachline.attr['transcript_category'] in ('ncRNA', 'lncRNA'):
                nc_gtf_inf.write(eachline.get_gff_line())
    nc_gtf_inf.close()
parser.add_argument('--id_file',
                    help='file of ids to extract or delete, one per line.',
                    required=True)
parser.add_argument('--flag',
                    help='extract "ex" or delete "de" gtf record in id_file.',
                    choices=['ex', 'de'],
                    default='ex')
parser.add_argument('--id_type',
                    help='id type.',
                    choices=['gene', 'transcript'],
                    default='transcript')
parser.add_argument('--output', help='output gtf file.', required=True)
args = parser.parse_args()

id_dict = {}
with open(args.id_file, 'r') as id_file_info:
    for eachline in id_file_info:
        eachline = eachline.strip()
        if eachline:
            id_dict[eachline] = 1

output_info = open(args.output, 'w')
attr_flag = '{i}_id'.format(i=args.id_type)
for eachline in GFF_Reader(args.gtf):
    if args.flag == 'ex':
        if eachline.attr[attr_flag] in id_dict:
            output_info.write("%s;\n" % eachline.get_gff_line().strip())
    else:
        if eachline.attr[attr_flag] not in id_dict:
            output_info.write("%s;\n" % eachline.get_gff_line().strip())
output_info.close()
    sys.exit(1)

class_code_dict = {
    'u': 'lincRNA',
    'p': 'lincRNA',
    'x': 'antisense_lncRNA',
    'i': 'intronic_lncRNA',
}

ref_gtf = sys.argv[1]
cuffcompare_gtf = sys.argv[2]
ref_tr_dict = RNAseq_tools.get_transcript_info(ref_gtf)
intronic_tr_dict = {}
for eachline in GFF_Reader(cuffcompare_gtf):
    each_tr_id = eachline.attr['transcript_id']
    each_gene_id = eachline.attr['gene_id']
    chrom = eachline.iv.chrom
    start = eachline.iv.start + 1
    end = eachline.iv.end
    strand = eachline.iv.strand
    source = eachline.source
    track_type = eachline.type
    class_code = eachline.attr['class_code']
    if class_code not in class_code_dict:
        continue
    tr_type = class_code_dict[class_code]
    each_track_out = ('{chrom}\t{source}\t{track_type}\t{start}\t{end}\t.\t'
                      '{strand}\t.\tgene_id "{each_gene_id}"; '
                      'transcript_id "{each_tr_id}"; '
                      'transcript_type "{tr_type}";').format(**locals())
    if 'nearest_ref' in eachline.attr:
        nearest_ref_tr = eachline.attr['nearest_ref']
import sys
import os

from HTSeq import GFF_Reader

stringtie_gtf = sys.argv[1]
output = sys.argv[2]
out_dir = os.path.split(output)[0]
if not os.path.exists(out_dir):
    os.system('mkdir -p %s ' % out_dir)
output_info = open(output, 'w')

tr_fpkm_dict = {}
for eachline in GFF_Reader(stringtie_gtf):
    if eachline.type == 'transcript':
        # transcript records already carry an FPKM attribute
        tr_id = eachline.attr['transcript_id']
        tr_fpkm = eachline.attr['FPKM']
        tr_fpkm_dict[tr_id] = tr_fpkm
        out_line = "%s;\n" % eachline.get_gff_line().strip()
    else:
        # exon records inherit the FPKM of their parent transcript
        tr_id = eachline.attr['transcript_id']
        tr_fpkm = tr_fpkm_dict[tr_id]
        out_line = '%s; FPKM "%s";\n' % (eachline.get_gff_line().strip(), tr_fpkm)
    output_info.write(out_line)
output_info.close()
    sys.exit(0)

lncRNA_feature = sys.argv[1]
novel_gtf = sys.argv[2]
add_gtf = sys.argv[3]

lncRNA_tr_dict = {}
lncRNA_gene_dict = {}
with open(lncRNA_feature) as lncRNA_feature_inf:
    for n, eachline in enumerate(lncRNA_feature_inf):
        if n != 0:
            eachline_inf = eachline.strip().split('\t')
            tr_id = eachline_inf[4]
            gene_id = eachline_inf[5]
            tr_type = eachline_inf[-1]
            lncRNA_tr_dict[tr_id] = tr_type
            lncRNA_gene_dict[gene_id] = tr_type

out_list = []
for eachline in GFF_Reader(novel_gtf):
    gene_id = eachline.attr['gene_id']
    transcript_id = eachline.attr['transcript_id']
    gene_type = tr_type = 'TUCP'
    if gene_id in lncRNA_gene_dict:
        gene_type = lncRNA_gene_dict[gene_id]
    if transcript_id in lncRNA_tr_dict:
        tr_type = lncRNA_tr_dict[transcript_id]
    out_list.append('%s; gene_biotype "%s"; transcript_biotype "%s";' %
                    (eachline.get_gff_line().strip(), gene_type, tr_type))
python_tools.write_obj_to_file(out_list, add_gtf)
import os
import argparse

from HTSeq import GFF_Reader

parser = argparse.ArgumentParser()
parser.add_argument('--gff', help='assembled GTF file or gff', required=True)
parser.add_argument('--out_dir', help='Output directory.', required=True)
parser.add_argument('--name', help='Output file prefix.', required=True)
args = parser.parse_args()

gene_trans_map_file = os.path.join(
    args.out_dir, '{n}.gene_trans_map.txt'.format(n=args.name))
gene_trans_map_file_info = open(gene_trans_map_file, 'w')
tr_dict = {}
if args.gff.endswith('gff') or args.gff.endswith('gff3'):
    for eachline in GFF_Reader(args.gff):
        if eachline.type == "transcript":
            transcript_id = eachline.attr['ID']
            gene_id = eachline.attr['Parent']
            gene_trans_map_file_info.write('%s\t%s\n' % (gene_id, transcript_id))
    gene_trans_map_file_info.close()
elif args.gff.endswith('gtf'):
    for eachline in GFF_Reader(args.gff):
        if 'transcript_id' not in eachline.attr:
            continue
        transcript_id = eachline.attr['transcript_id']
        gene_id = eachline.attr['gene_id']
        if 'ref_gene_id' in eachline.attr:
            gene_id = eachline.attr['ref_gene_id']
        if transcript_id not in tr_dict:
def main(): parser = argparse.ArgumentParser( description=dedent(''' Terminitor pipeline extract candidate sequence ----------------------------------------------------------- This script is the last step of Terminitor RNA-seq pipeline. It extracts candidate sequence from alignment bam file for testing by the pre-trained neural network model. '''), formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-v', '--version', action='version', version='Terminitor ' + VERSION) parser.add_argument( '-t', '--annot_trans', help='Transcript annotation file, GTF format. This file contains ' 'features whose type is transcript, can be downloaded from the ' 'auxiliary files or created by users', required=True) parser.add_argument( '-a', '--annot_all', help='Annotation file, GTF format. For example annotations from Ensembl ' 'ftp site', required=True) parser.add_argument( '-m', '--aln', help= 'The alignment file from assembled transcript contigs to reference genome in ' 'bam format.', required=True) parser.add_argument( '-g', '--genome', help= 'Indexed reference genome assembly in Fasta format, which can be downloaded from ' 'Ensembl', required=True) parser.add_argument( '-o', help= 'Output file, fasta format containing candidate sequences to be tested', required=True) parser.add_argument('-u', '--up_len', help='Upstream sequence length', type=int, default=100) parser.add_argument('-d', '--down_len', help='Downstream sequence length', type=int, default=100) args = parser.parse_args() annot_trans = args.annot_trans annot_all = args.annot_all aln = args.aln assem = args.genome out_file = args.o up_len = args.up_len down_len = args.down_len # Read in chromosome lengths from FASTA index chrom_len = {} if os.path.exist(assem + '.fai'): with open(assem + '.fai', 'r') as f: for line in f: cols = line.split() chrom_len[cols[0]] = int(cols[1]) else: print('Cannot find FASTA index for reference genome, i.e. 
`' + assem + '.fai`') sys.exit(1) # GTF file is 1-based inclusive, bed file is 0-based half open trans_coord = {} gtf_features = GFF_Reader(annot_all) for feature in gtf_features: # Sometimes, 'chr' is not in the chromesome name, add it back chrom = feature.iv.chrom #if 'chr' in feature.iv.chrom else 'chr' + feature.iv.chrom #if chrom not in CHR: # continue if chrom not in trans_coord: trans_coord[chrom] = {} if feature.type == 'transcript': trans = feature.attr['transcript_id'] trans_coord[chrom][trans] = { 'exon': [], 'CDS': [], 'strand': feature.iv.strand } elif feature.type in ['exon', 'CDS']: trans = feature.attr['transcript_id'] coord = [feature.iv.start, feature.iv.end] trans_coord[chrom][trans][feature.type].append(coord) for chrom, transs in trans_coord.items(): for trans, info in transs.items(): info['exon'].sort() info['CDS'].sort() strand = info['strand'] info['utr3'] = [] info['utr5'] = [] info['introns'] = [] # Introns start = info['exon'][0][1] exon_p = 1 while exon_p < len(info['exon']): info['introns'].append([start, info['exon'][exon_p][0]]) start = info['exon'][exon_p][1] exon_p += 1 # UTRs if info['CDS']: exon_p = 0 CDS_p = 0 while exon_p < len(info['exon']) and CDS_p < len(info['CDS']): if info['exon'][exon_p][1] <= info['CDS'][CDS_p][0]: if strand == '+': info['utr5'].append(list(info['exon'][exon_p])) else: info['utr3'].append(list(info['exon'][exon_p])) exon_p += 1 continue if info['exon'][exon_p][0] < info['CDS'][CDS_p][0]: if strand == '+': info['utr5'].append([ info['exon'][exon_p][0], info['CDS'][CDS_p][0] ]) else: info['utr3'].append([ info['exon'][exon_p][0], info['CDS'][CDS_p][0] ]) if info['CDS'][CDS_p][1] < info['exon'][exon_p][1]: if strand == '+': info['utr3'].append([ info['CDS'][CDS_p][1], info['exon'][exon_p][1] ]) else: info['utr5'].append([ info['CDS'][CDS_p][1], info['exon'][exon_p][1] ]) CDS_p += 1 exon_p += 1 while exon_p < len(info['exon']): if strand == '+': info['utr3'].append(list(info['exon'][exon_p])) else: info['utr5'].append(list(info['exon'][exon_p])) exon_p += 1 if info['utr3']: if strand == '+' and info['utr3'][0][1] - info['utr3'][0][ 0] <= 3: del info['utr3'][0] elif strand == '-' and info['utr3'][-1][1] - info['utr3'][ -1][0] <= 3: del info['utr3'][-1] else: if strand == '+': info['utr3'][0][0] += 3 else: info['utr3'][-1][1] -= 3 else: info['CDS'] = deepcopy(info['exon']) # Process alignment ensembl = pybedtools.BedTool(annot_trans) alignment = pybedtools.BedTool(aln) intersect = alignment.intersect(ensembl, bed=True, wo=True, split=True) seq_dict = {} for info in intersect: feature = parse_GFF_attribute_string(info[20][:-1] + '\n') trans = feature['transcript_id'] chrom = info[0] #if 'chr' in info[0] else 'chr' + info[0] #if chrom not in CHR: # continue strand = info[5] if strand == '+': name = info[3] + '_' + info[0] + '_' + str(int(info[2]) - 1) + '_' + 'F' dis2annot = abs( int(info[2]) - trans_coord[chrom][trans]['exon'][-1][1]) if trans_coord[chrom][trans]['utr3']: utr3 = True if int(info[2]) - trans_coord[chrom][trans][ 'utr3'][0][0] > 0 else False else: utr3 = False else: name = info[3] + '_' + info[0] + '_' + info[1] + '_' + 'R' dis2annot = abs( int(info[1]) - trans_coord[chrom][trans]['exon'][0][0]) if trans_coord[chrom][trans]['utr3']: utr3 = True if trans_coord[chrom][trans]['utr3'][-1][1] - int( info[1]) > 0 else False else: utr3 = False if name not in seq_dict: seq_dict[name] = { 'trans': trans, 'dis2annot': dis2annot, 'utr3': utr3 } else: if seq_dict[name]['utr3'] and utr3 and seq_dict[name][ 'dis2annot'] > dis2annot: 
seq_dict[name] = { 'trans': trans, 'dis2annot': dis2annot, 'utr3': utr3 } elif seq_dict[name]['utr3'] and (not utr3): continue elif (not seq_dict[name]['utr3']) and utr3: seq_dict[name] = { 'trans': trans, 'dis2annot': dis2annot, 'utr3': utr3 } elif (not seq_dict[name]['utr3']) and ( not utr3) and seq_dict[name]['dis2annot'] > dis2annot: seq_dict[name] = { 'trans': trans, 'dis2annot': dis2annot, 'utr3': utr3 } samfile = pysam.AlignmentFile(aln, 'rb') for read in samfile.fetch(until_eof=True): cigar_string = read.cigartuples qseq = read.query_sequence if read.flag != 0 and read.flag != 16: continue chrom = read.reference_name #if 'chr' in read.reference_name else 'chr' + read.reference_name #if chrom not in CHR: # continue if read.flag == 0: strand = '+' cs = read.reference_end - 1 name = read.query_name + '_' + chrom + '_' + str(cs) + '_' + 'F' elif read.flag == 16: strand = '-' cs = read.reference_start name = read.query_name + '_' + chrom + '_' + str(cs) + '_' + 'R' if strand == '+': if cigar_string[-1][0] == 4 or cigar_string[-1][0] == 5: clipped = cigar_string[-1][1] clipped_seq = qseq[-clipped:] if not is_polya(clipped_seq, True): continue u_seq = qseq[-clipped - up_len:-clipped] elif cigar_string[-1][0] == 0 and name in seq_dict and seq_dict[ name]['utr3']: u_seq = qseq[-up_len:] else: continue else: if cigar_string[0][0] == 4 or cigar_string[0][0] == 5: clipped = cigar_string[0][1] clipped_seq = qseq[:clipped] if not is_polya(clipped_seq, False): continue u_seq = qseq[clipped:clipped + up_len] elif cigar_string[0][0] == 0 and name in seq_dict and seq_dict[ name]['utr3']: u_seq = qseq[:up_len] else: continue if name not in seq_dict: seq_dict[name] = {} seq_dict[name]['U'] = u_seq bed_string = [] for k, v in seq_dict.items(): info = k.split('_') if 'U' not in v: continue if info[-1] == 'F': chrom = info[1] start = int(info[2]) + 1 end = int(info[2]) + down_len + 1 if start >= 0 and end < chrom_len[chrom]: # interval must be valid bed_string.append(chrom + '\t' + str(start) + '\t' + str(end) + '\t' + k + '\t-\t+') else: chrom = info[1] start = int(info[2]) - down_len end = int(info[2]) if start >= 0 and end < chrom_len[chrom]: # interval must be valid bed_string.append(chrom + '\t' + str(start) + '\t' + str(end) + '\t' + k + '\t-\t-') down_bed = pybedtools.BedTool('\n'.join(bed_string), from_string=True) down_seqs = down_bed.sequence(fi=assem, name=True, split=True) out = open(out_file, 'w') with open(down_seqs.seqfn) as f: for line in f: name = line[1:-1].split(':')[0] seq = next(f).strip() seq_dict[name]['D'] = seq nonredundant = {} for name, v in seq_dict.items(): direction = name.split('_')[-1] if 'U' not in v or 'D' not in v or len(v['U']) < up_len: continue if direction == 'F': seq = v['U'] + v['D'] elif direction == 'R': seq = v['D'] + v['U'] seq = rev_comp(seq) if 'N' in seq: continue if seq not in nonredundant: out.write('>' + name + '\n' + seq + '\n') nonredundant[seq] = 0 out.close()
def ReadGff(GFF):
    list_gff = []
    for line in GFF_Reader(GFF):
        list_gff.append(line)
    return list_gff
    target_length_info.close()
    pos_file_inf.close()
    return 'produced gene length and position file!'


if __name__ == '__main__':
    ## read arguments
    arguments = docopt(__doc__, version='1.0')
    gtf_file = arguments['--gtf']
    species = arguments['--species']
    out_dir = arguments['--out_dir']
    ## gene length file and gene locus file
    gene_length_file = path.join(out_dir, '{}.gene_length.txt'.format(species))
    gene_locus_file = path.join(out_dir, '{}.gene_locus.txt'.format(species))
    get_target_length_and_pos_table(gtf_file, gene_length_file, gene_locus_file)
    ## gene transcript map file
    tr_dict = {}
    gene_tr_map_file = path.join(out_dir,
                                 '{}.gene_trans_map.txt'.format(species))
    with open(gene_tr_map_file, 'w') as gene_tr_map_file_inf:
        for eachline in GFF_Reader(gtf_file):
            if 'transcript_id' in eachline.attr:
                transcript_id = eachline.attr['transcript_id']
                gene_id = eachline.attr['gene_id']
                if transcript_id not in tr_dict:
                    tr_dict[transcript_id] = gene_id
                    gene_tr_map_file_inf.write('{0}\t{1}\n'.format(
                        gene_id, transcript_id))