def writeGene(self, genomeFileName, geneFileName, pureGeneID=False, onlyChr=True, verbose=True):
    """Write a FASTA file holding the full genomic span (introns included) of every gene.

    Genes are read from ``self.gtf_container['gene']``.  For each gene the
    interval ``(start - 1, end)`` on its chromosome and strand is fetched from
    the genome via ``seq.seqClass(genomeFileName).fetch`` and written as::

        >GeneID<TAB>gene_name<TAB>len(sequence)<TAB>chr:start-end:strand
        <sequence formatted by seq.cutSeq>

    Parameters:
        genomeFileName -- genome FASTA path handed to ``seq.seqClass``.
        geneFileName   -- output path; opened with mode 'w' (overwritten).
        pureGeneID     -- if true, keep only the part of the gene ID before
                          the first '.'.
        onlyChr        -- if true, skip genes whose chromosome name does not
                          start with 'chr'.
        verbose        -- accepted but not consulted by this method; the
                          progress line is printed unconditionally.

    Returns None.  Exceptions raised while fetching a sequence are not caught
    here; the output file is closed in every case.

    Example (paths are illustrative, ``obj`` is an instance that already
    holds ``gtf_container``)::

        obj.writeGene(HOME + "/lipan/DYNAMIC/GTF/hg38.fa",
                      HOME + "/hg38.gene.fa", pureGeneID=True)

    NOTE(review): the previous docstring's examples passed a GTF container as
    a third positional argument, which does not match this signature --
    presumably left over from an earlier free-function version.

    Note: the output may differ from the files distributed by GENCODE, whose
    mRNA records tend to carry a polyA tail after the 3'UTR.
    """
    import seq as SeqFunc

    # 'with' guarantees the handle is closed even if a lookup below raises.
    with open(geneFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        genes = self.gtf_container['gene']
        total = len(genes)
        for count, gene_ID in enumerate(genes, 1):
            if count % 1000 == 0:
                # Single-argument print(...) behaves the same as the print statement.
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            gene = genes[gene_ID]

            # Strand and chromosome.
            strand = gene['strand']
            Chr = gene['chr']
            if onlyChr and not Chr.startswith('chr'):
                continue

            # Gene ID, optionally without the '.suffix' part.
            GeneID = gene_ID.split('.')[0] if pureGeneID else gene_ID

            start = int(gene['start'])
            end = int(gene['end'])
            gene_seq = Seq.fetch(Chr, start - 1, end, strand)
            gene_name = gene['gene_name']

            OUT.write('>%s\t%s\t%d\t%s:%d-%d:%s\n%s\n' % (
                GeneID, gene_name, len(gene_seq), Chr, start, end, strand,
                SeqFunc.cutSeq(gene_seq)))
def writeTranscriptome(self, genomeFileName, transcriptomeFileName, genomeChrSym='chr', pureTransID=False, verbose=True):
    """Write a transcriptome FASTA built from the GFF3 annotation and a genome FASTA.

    Transcripts are read from ``self.gff3_container['RNA']``; only those whose
    chromosome accession starts with 'NC' are used.  The exon intervals listed
    in ``self.gff3_container['exon'][rna_ID]`` are fetched from the genome as
    ``(start - 1, end)`` and concatenated in the order they are stored.  Each
    record is written as::

        >TransID
        <sequence formatted by seq.cutSeq>

    Parameters:
        genomeFileName        -- genome FASTA path handed to ``seq.seqClass``.
        transcriptomeFileName -- output path; opened with mode 'w' (overwritten).
        genomeChrSym          -- when equal to 'chr', chromosome accessions
                                 are translated through
                                 ``self.build_NC_To_chr_dict(pureNCID=False)``
                                 before the genome lookup; any other value
                                 uses the accession unchanged.
        pureTransID           -- if true, keep only the part of the transcript
                                 ID before the first '.'.
        verbose               -- if true, print a warning for every exon whose
                                 genome lookup raises KeyError.

    Returns None.  The output file is closed in every case.

    Example (paths are illustrative, ``obj`` is an instance that already
    holds ``gff3_container``)::

        obj.writeTranscriptome(HOME + "/lipan/DYNAMIC/GTF/ncbi/GRCh38.p7.fa",
                               HOME + "/hg38_ncbi.fa",
                               genomeChrSym='chr', pureTransID=True)

    NOTE(review): the previous docstring's examples passed a GFF3 container as
    a third positional argument, which does not match this signature --
    presumably left over from an earlier free-function version.

    Note: the output may differ from the files distributed by NCBI, whose
    mRNA records tend to carry a polyA tail after the 3'UTR.
    """
    import seq as SeqFunc

    useChrNames = (genomeChrSym == 'chr')
    if useChrNames:
        NCToChr = self.build_NC_To_chr_dict(pureNCID=False)

    # 'with' guarantees the handle is closed even if a lookup below raises.
    with open(transcriptomeFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        rnas = self.gff3_container['RNA']
        total = len(rnas)
        for count, rna_ID in enumerate(rnas, 1):
            if count % 1000 == 0:
                # Single-argument print(...) behaves the same as the print statement.
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            rna = rnas[rna_ID]

            # Strand and chromosome.
            strand = rna['strand']
            Chr = rna['chr']
            if not Chr.startswith('NC'):
                continue
            if useChrNames:
                Chr = NCToChr[Chr]

            # Transcript ID, optionally without the '.suffix' part.
            TransID = self.rnaid_2_transID[rna_ID]
            if pureTransID:
                TransID = TransID.split('.')[0]

            exons = self.gff3_container['exon'][rna_ID]
            exonsList = [(int(exon['start']), int(exon['end'])) for exon in exons]

            # Collect the pieces and join once instead of repeated '+='.
            pieces = []
            for exon_start, exon_end in exonsList:
                try:
                    pieces.append(Seq.fetch(Chr, exon_start - 1, exon_end, strand))
                except KeyError:
                    # NOTE(review): a failed exon is skipped but the transcript
                    # is still written with the remaining exons (possibly
                    # empty) -- confirm that this is intended.
                    if verbose:
                        print('Warning: KeyError -> %s\t%d\t%d\t%s' % (
                            Chr, exon_start - 1, exon_end, strand))

            OUT.write('>%s\n%s\n' % (TransID, SeqFunc.cutSeq(''.join(pieces))))
def writeTranscriptome(self, genomeFileName, transcriptomeFileName, pureTransID=False, onlyChr=True, verbose=True):
    """Write a transcriptome FASTA built from the GTF annotation and a genome FASTA.

    Transcripts are read from ``self.gtf_container['RNA']``.  The exon
    intervals listed in ``self.gtf_container['exon'][rna_ID]`` are fetched
    from the genome as ``(start - 1, end)`` and concatenated in the order they
    are stored.  Each record is written as::

        >TransID<TAB>gene_id|gene_name|trans_type
        <sequence formatted by seq.cutSeq>

    Parameters:
        genomeFileName        -- genome FASTA path handed to ``seq.seqClass``.
        transcriptomeFileName -- output path; opened with mode 'w' (overwritten).
        pureTransID           -- if true, keep only the part of the transcript
                                 ID before the first '.'.
        onlyChr               -- if true, skip transcripts whose chromosome
                                 name does not start with 'chr'.
        verbose               -- if true, print a warning for every exon whose
                                 genome lookup raises KeyError.

    Returns None.  The output file is closed in every case.

    Example (paths are illustrative, ``obj`` is an instance that already
    holds ``gtf_container``)::

        obj.writeTranscriptome(HOME + "/lipan/DYNAMIC/GTF/GENCODE/GRCh38.p7.fa",
                               HOME + "/hg38_GENCODE.fa", pureTransID=True)

    NOTE(review): the previous docstring's examples passed a GTF container as
    a third positional argument, which does not match this signature --
    presumably left over from an earlier free-function version.

    Note: the output may differ from the files distributed by GENCODE, whose
    mRNA records tend to carry a polyA tail after the 3'UTR.
    """
    import seq as SeqFunc

    # 'with' guarantees the handle is closed even if a lookup below raises.
    with open(transcriptomeFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        rnas = self.gtf_container['RNA']
        total = len(rnas)
        for count, rna_ID in enumerate(rnas, 1):
            if count % 1000 == 0:
                # Single-argument print(...) behaves the same as the print statement.
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            rna = rnas[rna_ID]

            # Transcript information placed after the ID in the FASTA header.
            GeneID = rna['gene_id']
            GeneName = rna['gene_name']
            GeneType = rna['trans_type']
            RNA_info = GeneID + '|' + GeneName + '|' + GeneType

            # Strand and chromosome.
            strand = rna['strand']
            Chr = rna['chr']
            if onlyChr and not Chr.startswith('chr'):
                continue

            # Transcript ID, optionally without the '.suffix' part.
            TransID = rna_ID.split('.')[0] if pureTransID else rna_ID

            exons = self.gtf_container['exon'][rna_ID]
            exonsList = [(int(exon['start']), int(exon['end'])) for exon in exons]

            # Collect the pieces and join once instead of repeated '+='.
            pieces = []
            for exon_start, exon_end in exonsList:
                try:
                    pieces.append(Seq.fetch(Chr, exon_start - 1, exon_end, strand))
                except KeyError:
                    # NOTE(review): a failed exon is skipped but the transcript
                    # is still written with the remaining exons (possibly
                    # empty) -- confirm that this is intended.
                    if verbose:
                        print('Warning: KeyError -> %s\t%d\t%d\t%s' % (
                            Chr, exon_start - 1, exon_end, strand))

            OUT.write('>%s\t%s\n%s\n' % (
                TransID, RNA_info, SeqFunc.cutSeq(''.join(pieces))))
def writeTranscriptome(self, genomeFileName, transcriptomeFileName, genomeChrSym='chr', pureTransID=False, verbose=True):
    """Write a transcriptome FASTA built from the GFF3 annotation and a genome FASTA.

    Transcripts are read from ``self.gff3_container['RNA']``; only those whose
    chromosome accession starts with 'NC' are used.  The exon intervals listed
    in ``self.gff3_container['exon'][rna_ID]`` are fetched from the genome as
    ``(start - 1, end)`` and concatenated in the order they are stored.  Each
    record is written as::

        >TransID
        <sequence formatted by seq.cutSeq>

    Parameters:
        genomeFileName        -- genome FASTA path handed to ``seq.seqClass``.
        transcriptomeFileName -- output path; opened with mode 'w' (overwritten).
        genomeChrSym          -- when equal to 'chr', chromosome accessions
                                 are translated through
                                 ``self.build_NC_To_chr_dict(pureNCID=False)``
                                 before the genome lookup; any other value
                                 uses the accession unchanged.
        pureTransID           -- if true, keep only the part of the transcript
                                 ID before the first '.'.
        verbose               -- if true, print a warning for every exon whose
                                 genome lookup raises KeyError.

    Returns None.  The output file is closed in every case.

    Example (paths are illustrative, ``obj`` is an instance that already
    holds ``gff3_container``)::

        obj.writeTranscriptome(HOME + "/lipan/DYNAMIC/GTF/ncbi/GRCh38.p7.fa",
                               HOME + "/hg38_ncbi.fa",
                               genomeChrSym='chr', pureTransID=True)

    NOTE(review): the previous docstring's examples passed a GFF3 container as
    a third positional argument, which does not match this signature --
    presumably left over from an earlier free-function version.

    Note: the output may differ from the files distributed by NCBI, whose
    mRNA records tend to carry a polyA tail after the 3'UTR.
    """
    import seq as SeqFunc

    useChrNames = (genomeChrSym == 'chr')
    if useChrNames:
        NCToChr = self.build_NC_To_chr_dict(pureNCID=False)

    # 'with' guarantees the handle is closed even if a lookup below raises.
    with open(transcriptomeFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        rnas = self.gff3_container['RNA']
        total = len(rnas)
        for count, rna_ID in enumerate(rnas, 1):
            if count % 1000 == 0:
                # Single-argument print(...) behaves the same as the print statement.
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            rna = rnas[rna_ID]

            # Strand and chromosome.
            strand = rna['strand']
            Chr = rna['chr']
            if not Chr.startswith('NC'):
                continue
            if useChrNames:
                Chr = NCToChr[Chr]

            # Transcript ID, optionally without the '.suffix' part.
            TransID = self.rnaid_2_transID[rna_ID]
            if pureTransID:
                TransID = TransID.split('.')[0]

            exons = self.gff3_container['exon'][rna_ID]
            exonsList = [(int(exon['start']), int(exon['end'])) for exon in exons]

            # Collect the pieces and join once instead of repeated '+='.
            pieces = []
            for exon_start, exon_end in exonsList:
                try:
                    pieces.append(Seq.fetch(Chr, exon_start - 1, exon_end, strand))
                except KeyError:
                    # NOTE(review): a failed exon is skipped but the transcript
                    # is still written with the remaining exons (possibly
                    # empty) -- confirm that this is intended.
                    if verbose:
                        print('Warning: KeyError -> %s\t%d\t%d\t%s' % (
                            Chr, exon_start - 1, exon_end, strand))

            OUT.write('>%s\n%s\n' % (TransID, SeqFunc.cutSeq(''.join(pieces))))
def writeTranscriptome(self, genomeFileName, transcriptomeFileName, pureTransID=False, onlyChr=True, verbose=True):
    """Write a transcriptome FASTA built from the GTF annotation and a genome FASTA.

    Transcripts are read from ``self.gtf_container['RNA']``.  The exon
    intervals listed in ``self.gtf_container['exon'][rna_ID]`` are fetched
    from the genome as ``(start - 1, end)`` and concatenated in the order they
    are stored.  Each record is written as::

        >TransID<TAB>gene_id|gene_name|trans_type
        <sequence formatted by seq.cutSeq>

    Parameters:
        genomeFileName        -- genome FASTA path handed to ``seq.seqClass``.
        transcriptomeFileName -- output path; opened with mode 'w' (overwritten).
        pureTransID           -- if true, keep only the part of the transcript
                                 ID before the first '.'.
        onlyChr               -- if true, skip transcripts whose chromosome
                                 name does not start with 'chr'.
        verbose               -- if true, print a warning for every exon whose
                                 genome lookup raises KeyError.

    Returns None.  The output file is closed in every case.

    Example (paths are illustrative, ``obj`` is an instance that already
    holds ``gtf_container``)::

        obj.writeTranscriptome(HOME + "/lipan/DYNAMIC/GTF/GENCODE/GRCh38.p7.fa",
                               HOME + "/hg38_GENCODE.fa", pureTransID=True)

    NOTE(review): the previous docstring's examples passed a GTF container as
    a third positional argument, which does not match this signature --
    presumably left over from an earlier free-function version.

    Note: the output may differ from the files distributed by GENCODE, whose
    mRNA records tend to carry a polyA tail after the 3'UTR.
    """
    import seq as SeqFunc

    # 'with' guarantees the handle is closed even if a lookup below raises.
    with open(transcriptomeFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        rnas = self.gtf_container['RNA']
        total = len(rnas)
        for count, rna_ID in enumerate(rnas, 1):
            if count % 1000 == 0:
                # Single-argument print(...) behaves the same as the print statement.
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            rna = rnas[rna_ID]

            # Transcript information placed after the ID in the FASTA header.
            GeneID = rna['gene_id']
            GeneName = rna['gene_name']
            GeneType = rna['trans_type']
            RNA_info = GeneID + '|' + GeneName + '|' + GeneType

            # Strand and chromosome.
            strand = rna['strand']
            Chr = rna['chr']
            if onlyChr and not Chr.startswith('chr'):
                continue

            # Transcript ID, optionally without the '.suffix' part.
            TransID = rna_ID.split('.')[0] if pureTransID else rna_ID

            exons = self.gtf_container['exon'][rna_ID]
            exonsList = [(int(exon['start']), int(exon['end'])) for exon in exons]

            # Collect the pieces and join once instead of repeated '+='.
            pieces = []
            for exon_start, exon_end in exonsList:
                try:
                    pieces.append(Seq.fetch(Chr, exon_start - 1, exon_end, strand))
                except KeyError:
                    # NOTE(review): a failed exon is skipped but the transcript
                    # is still written with the remaining exons (possibly
                    # empty) -- confirm that this is intended.
                    if verbose:
                        print('Warning: KeyError -> %s\t%d\t%d\t%s' % (
                            Chr, exon_start - 1, exon_end, strand))

            OUT.write('>%s\t%s\n%s\n' % (
                TransID, RNA_info, SeqFunc.cutSeq(''.join(pieces))))
def writeGene(self, genomeFileName, geneFileName, pureGeneID=False, onlyChr=True, verbose=True):
    """Write a FASTA file holding the full genomic span (introns included) of every gene.

    Genes are read from ``self.gtf_container['gene']``.  For each gene the
    interval ``(start - 1, end)`` on its chromosome and strand is fetched from
    the genome via ``seq.seqClass(genomeFileName).fetch`` and written as::

        >GeneID<TAB>gene_name<TAB>len(sequence)<TAB>chr:start-end:strand
        <sequence formatted by seq.cutSeq>

    Parameters:
        genomeFileName -- genome FASTA path handed to ``seq.seqClass``.
        geneFileName   -- output path; opened with mode 'w' (overwritten).
        pureGeneID     -- if true, keep only the part of the gene ID before
                          the first '.'.
        onlyChr        -- if true, skip genes whose chromosome name does not
                          start with 'chr'.
        verbose        -- accepted but not consulted by this method; the
                          progress line is printed unconditionally.

    Returns None.  Exceptions raised while fetching a sequence are not caught
    here; the output file is closed in every case.

    Example (paths are illustrative, ``obj`` is an instance that already
    holds ``gtf_container``)::

        obj.writeGene(HOME + "/lipan/DYNAMIC/GTF/hg38.fa",
                      HOME + "/hg38.gene.fa", pureGeneID=True)

    NOTE(review): the previous docstring's examples passed a GTF container as
    a third positional argument, which does not match this signature --
    presumably left over from an earlier free-function version.

    Note: the output may differ from the files distributed by GENCODE, whose
    mRNA records tend to carry a polyA tail after the 3'UTR.
    """
    import seq as SeqFunc

    # 'with' guarantees the handle is closed even if a lookup below raises.
    with open(geneFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        genes = self.gtf_container['gene']
        total = len(genes)
        for count, gene_ID in enumerate(genes, 1):
            if count % 1000 == 0:
                # Single-argument print(...) behaves the same as the print statement.
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            gene = genes[gene_ID]

            # Strand and chromosome.
            strand = gene['strand']
            Chr = gene['chr']
            if onlyChr and not Chr.startswith('chr'):
                continue

            # Gene ID, optionally without the '.suffix' part.
            GeneID = gene_ID.split('.')[0] if pureGeneID else gene_ID

            start = int(gene['start'])
            end = int(gene['end'])
            gene_seq = Seq.fetch(Chr, start - 1, end, strand)
            gene_name = gene['gene_name']

            OUT.write('>%s\t%s\t%d\t%s:%d-%d:%s\n%s\n' % (
                GeneID, gene_name, len(gene_seq), Chr, start, end, strand,
                SeqFunc.cutSeq(gene_seq)))
def _report_tMaf_error(code, count, human, mouse):
    """Print ``Error <code> <count>`` followed by the two offending alignment lines."""
    print('Error %d %d' % (code, count))
    print(human)
    print(mouse)


def _tMaf_seqs_agree(genomeSeq, transcriptomeSeq, record):
    """Return True when genome, transcriptome and aligned sequence of one record agree.

    Comparison is case-insensitive.  The genome is queried with
    (genome_s, genome_e, strand) as given; the transcriptome is queried with
    (trans_s - 1, trans_e).
    """
    (Chr, genome_s, genome_e, strand, transID, trans_s, trans_e, aligned) = record
    from_genome = genomeSeq.fetch(Chr, genome_s, genome_e, strand).upper()
    # Fetched once and reused for both comparisons; assumes fetch is deterministic.
    from_trans = transcriptomeSeq.fetch(transID, trans_s - 1, trans_e).upper()
    return (from_genome == from_trans
            and from_trans == check_tMaf_Methods.pure_seq(aligned).upper())


def _first_tMaf_violation(record1, record2, sp1_genomeSeq, sp1_transcriptomeSeq,
                          sp2_genomeSeq, sp2_transcriptomeSeq):
    """Return the number (1-7) of the first failed consistency check, or 0 if all pass.

    1: the two aligned strings differ in length.
    2/3: base count of species 1/2 differs from trans_e - trans_s + 1.
    4/5: base count of species 1/2 differs from genome_e - genome_s.
    6/7: genome, transcriptome and aligned sequence of species 1/2 disagree.
    """
    (Chr1, genome_s1, genome_e1, strand1, transID1, trans_s1, trans_e1, seq1) = record1
    (Chr2, genome_s2, genome_e2, strand2, transID2, trans_s2, trans_e2, seq2) = record2

    if len(seq1) != len(seq2):
        return 1
    # Base counts are computed once each; assumes base_len is a pure function.
    bases1 = check_tMaf_Methods.base_len(seq1)
    if bases1 != trans_e1 - trans_s1 + 1:
        return 2
    bases2 = check_tMaf_Methods.base_len(seq2)
    if bases2 != trans_e2 - trans_s2 + 1:
        return 3
    if bases1 != genome_e1 - genome_s1:
        return 4
    if bases2 != genome_e2 - genome_s2:
        return 5
    if not _tMaf_seqs_agree(sp1_genomeSeq, sp1_transcriptomeSeq, record1):
        return 6
    if not _tMaf_seqs_agree(sp2_genomeSeq, sp2_transcriptomeSeq, record2):
        return 7
    return 0


def check_transcriptome_MAF(tMaf_FileName, sp1_genome_FileName, sp1_transcriptome_FileName, sp2_genome_FileName, sp2_transcriptome_FileName, verbose=False):
    """Check every record of a transcriptome MAF file against both species' sequences.

    The file is consumed three lines at a time: one line per species (parsed
    with ``check_tMaf_Methods.sparse_tMaf_Line``) followed by a third line
    whose content is ignored; reading stops as soon as that third line is
    empty (end of file).

    For each record seven checks are applied in order (see
    ``_first_tMaf_violation``).  On the first failure ``Error <k> <record
    number>`` and the two raw lines are printed and the function returns.
    A KeyError raised during the checks skips the record (reported only when
    ``verbose`` is true).  If no check fails, 'All Right' is printed.

    Parameters:
        tMaf_FileName              -- path of the file to check.
        sp1_genome_FileName        -- species-1 genome, opened with seq.seqClass.
        sp1_transcriptome_FileName -- species-1 transcriptome, same.
        sp2_genome_FileName        -- species-2 genome, same.
        sp2_transcriptome_FileName -- species-2 transcriptome, same.
        verbose                    -- print details for records skipped on KeyError.

    Returns None in every case.
    """
    sp1_genomeSeq = seq.seqClass(sp1_genome_FileName)
    sp1_transcriptomeSeq = seq.seqClass(sp1_transcriptome_FileName)
    sp2_genomeSeq = seq.seqClass(sp2_genome_FileName)
    sp2_transcriptomeSeq = seq.seqClass(sp2_transcriptome_FileName)

    # 'with' closes the input on the early error returns as well as on success.
    with open(tMaf_FileName) as IN:
        human = IN.readline()
        mouse = IN.readline()
        line = IN.readline()
        count = 1
        last_chr = ''
        # NOTE(review): a final record that is not followed by a third line is
        # never checked, because the loop is driven by that third line.
        while line:
            record1 = check_tMaf_Methods.sparse_tMaf_Line(human)
            record2 = check_tMaf_Methods.sparse_tMaf_Line(mouse)
            (Chr1, genome_s1, genome_e1, strand1, transID1, trans_s1, trans_e1, seq1) = record1
            (Chr2, genome_s2, genome_e2, strand2, transID2, trans_s2, trans_e2, seq2) = record2

            if Chr1 != last_chr:
                print('Now Checking ' + Chr1)
                last_chr = Chr1

            try:
                violation = _first_tMaf_violation(
                    record1, record2,
                    sp1_genomeSeq, sp1_transcriptomeSeq,
                    sp2_genomeSeq, sp2_transcriptomeSeq)
            except KeyError:
                if verbose:
                    print('Check tMaf: Key Error -> ')
                    # Two spaces after the colon reproduce the output of the
                    # former "print label, tuple" statements.
                    print('\t\tsp_1:  %s' % ((Chr1, genome_s1, genome_e1, strand1,
                                              transID1, trans_s1, trans_e1),))
                    print('\t\tsp_2:  %s' % ((Chr2, genome_s2, genome_e2, strand2,
                                              transID2, trans_s2, trans_e2),))
            else:
                if violation:
                    _report_tMaf_error(violation, count, human, mouse)
                    return

            human = IN.readline()
            mouse = IN.readline()
            line = IN.readline()
            count += 1

    del sp1_genomeSeq
    del sp2_genomeSeq
    del sp1_transcriptomeSeq
    del sp2_transcriptomeSeq
    print('All Right')
elif m6A_site < hnrnpc_s < hnrnpc_e: if strand == '+': s = 100 + (hnrnpc_s - m6A_site) e = 100 + (hnrnpc_e - m6A_site) if strand == '-': s = 100 - (hnrnpc_e - m6A_site) e = 100 - (hnrnpc_s - m6A_site) for i in range(s, e + 1): count[i] += 1 return count import tools, seq handle = seq.seqClass( "/150T/zhangqf/GenomeAnnotation/genome/GRCh37.p13.genome.fa") count = statistic_HNRNPC_m6A("/tmp/HNRNPC_m6A.hg19.txt", handle) tools.plt.plot(range(-100, 100), count) tools.plt.savefig("figs/hnrnpc_m6A.pdf") tools.plt.show() #################### # 2. Shuffle m6A or PBR #################### def read_HNRNPC_m6A(inFile): HNRNPC = {} m6A = {}