def writeGene(self, genomeFileName, geneFileName, pureGeneID=False, onlyChr=True, verbose=True):
    """Write one FASTA record per annotated gene, introns included.

    Every gene in ``self.gtf_container['gene']`` is cut out of the genome
    as a single span with ``Seq.fetch(Chr, start - 1, end, strand)`` and
    written as::

        >GeneID<TAB>gene_name<TAB>length<TAB>chr:start-end:strand
        <sequence formatted by seq.cutSeq>

    The ``start - 1`` presumably converts a 1-based inclusive annotation
    start into the 0-based start expected by ``seqClass.fetch`` --
    TODO confirm against the ``seq`` module.

    Args:
        genomeFileName: genome FASTA path handed to ``seq.seqClass``.
        geneFileName: output path; opened with mode ``'w'`` (overwritten).
        pureGeneID: if true, keep only the part of the gene ID that
            precedes the first ``'.'``.
        onlyChr: if true, skip genes whose chromosome name does not start
            with ``'chr'``.
        verbose: accepted for interface compatibility; this method does
            not read it (the progress line is always printed).

    Returns:
        None. The result is the file written to ``geneFileName``.

    Example (paths taken from the original notes; ``parser`` stands for an
    instance holding a parsed GTF)::

        home = os.environ.get("HOME")
        parser.writeGene(home + "/lipan/DYNAMIC/GTF/hg38.fa",
                         home + "/hg38.gene.fa", pureGeneID=True)
        parser.writeGene(home + "/lipan/DYNAMIC/GTF/GENCODE/GRCh38.p7.fa",
                         home + "/hg38_GENCODE.gene.fa", pureGeneID=True)

    Note (original author): the output may differ from the file downloaded
    directly from GENCODE, whose mRNAs tend to carry a polyA tail appended
    to the 3'UTR.
    """
    import seq as SeqFunc

    # The with-block guarantees the output is flushed and closed even when
    # a container lookup or a genome fetch raises half-way through.
    with open(geneFileName, 'w') as OUT:
        Seq = SeqFunc.seqClass(genomeFileName)
        genes = self.gtf_container['gene']
        total = len(genes)
        for count, gene_ID in enumerate(genes, 1):
            # Progress is counted over all genes, including skipped ones.
            if count % 1000 == 0:
                print('\tProcessed %.2f%% ...' % (100.0 * count / total))

            record = genes[gene_ID]
            strand = record['strand']
            Chr = record['chr']
            if onlyChr and not Chr.startswith('chr'):
                continue

            GeneID = gene_ID.split('.')[0] if pureGeneID else gene_ID
            start = int(record['start'])
            end = int(record['end'])
            gene_seq = Seq.fetch(Chr, start - 1, end, strand)
            gene_name = record['gene_name']
            OUT.write('>%s\t%s\t%d\t%s:%d-%d:%s\n%s\n' % (
                GeneID, gene_name, len(gene_seq), Chr, start, end, strand,
                SeqFunc.cutSeq(gene_seq)))
Beispiel #2
0
 def writeTranscriptome(self,
                        genomeFileName,
                        transcriptomeFileName,
                        genomeChrSym='chr',
                        pureTransID=False,
                        verbose=True):
     """Write a transcriptome FASTA assembled from GFF3 exon annotations.

     For every RNA in ``self.gff3_container['RNA']`` whose sequence name
     starts with ``'NC'``, the exons listed under
     ``self.gff3_container['exon'][rna_ID]`` are fetched from the genome and
     concatenated in the order the container stores them (assumes that
     order is already 5'->3' along the transcript -- TODO confirm). Each
     transcript is written as ``>TransID`` followed by the sequence
     formatted by ``seq.cutSeq``.

     Args:
         genomeFileName: genome FASTA path handed to ``seq.seqClass``.
         transcriptomeFileName: output path; opened with mode ``'w'``.
         genomeChrSym: when equal to ``'chr'`` the ``NC*`` sequence names
             are translated through ``self.build_NC_To_chr_dict`` before
             the genome is queried; any other value uses the names as-is.
         pureTransID: if true, keep only the part of the transcript ID
             that precedes the first ``'.'``.
         verbose: if true, print a warning for every exon whose fetch
             raises ``KeyError``.

     Returns:
         None. The result is the file written to ``transcriptomeFileName``.

     An exon whose fetch raises ``KeyError`` is skipped on its own; the
     transcript is still written from the remaining exons (possibly as an
     empty sequence).

     Example (paths taken from the original notes; ``parser`` stands for an
     instance holding a parsed GFF3)::

         home = os.environ.get("HOME")
         parser.writeTranscriptome(home + "/lipan/DYNAMIC/GTF/hg38.fa",
                                   home + "/hg38.fa",
                                   genomeChrSym='chr', pureTransID=True)

     Note (original author): the output may differ from the file downloaded
     directly from NCBI, whose mRNAs tend to carry a polyA tail appended to
     the 3'UTR.
     """
     import seq as SeqFunc

     # The NC* -> chr* table is only needed when the genome uses chr names.
     rename_to_chr = genomeChrSym == 'chr'
     if rename_to_chr:
         NCToChr = self.build_NC_To_chr_dict(pureNCID=False)

     # The with-block closes the output even if a lookup raises mid-run.
     with open(transcriptomeFileName, 'w') as OUT:
         Seq = SeqFunc.seqClass(genomeFileName)
         total = len(self.gff3_container['RNA'])
         for count, rna_ID in enumerate(self.gff3_container['RNA'], 1):
             if count % 1000 == 0:
                 print('\tProcessed %.2f%% ...' % (100.0 * count / total))

             rna = self.gff3_container['RNA'][rna_ID]
             strand = rna['strand']
             Chr = rna['chr']
             if not Chr.startswith('NC'):
                 continue
             if rename_to_chr:
                 Chr = NCToChr[Chr]

             TransID = self.rnaid_2_transID[rna_ID]
             if pureTransID:
                 TransID = TransID.split('.')[0]

             # Convert every coordinate first so a malformed exon fails
             # before any sequence is fetched for this transcript.
             exonsList = [(int(exon['start']), int(exon['end']))
                          for exon in self.gff3_container['exon'][rna_ID]]
             pieces = []
             for exon_start, exon_end in exonsList:
                 try:
                     pieces.append(
                         Seq.fetch(Chr, exon_start - 1, exon_end, strand))
                 except KeyError:
                     if verbose:
                         print('Warning: KeyError -> %s\t%d\t%d\t%s' % (
                             Chr, exon_start - 1, exon_end, strand))
             OUT.write('>%s\n%s\n' % (
                 TransID, SeqFunc.cutSeq(''.join(pieces))))
Beispiel #3
0
 def writeTranscriptome(self,
                        genomeFileName,
                        transcriptomeFileName,
                        pureTransID=False,
                        onlyChr=True,
                        verbose=True):
     """Write a transcriptome FASTA assembled from GTF exon annotations.

     For every transcript in ``self.gtf_container['RNA']`` the exons listed
     under ``self.gtf_container['exon'][rna_ID]`` are fetched from the
     genome and concatenated in the order the container stores them
     (assumes that order is already 5'->3' along the transcript -- TODO
     confirm). Each record is written as::

         >TransID<TAB>gene_id|gene_name|trans_type
         <sequence formatted by seq.cutSeq>

     Args:
         genomeFileName: genome FASTA path handed to ``seq.seqClass``.
         transcriptomeFileName: output path; opened with mode ``'w'``.
         pureTransID: if true, keep only the part of the transcript ID
             that precedes the first ``'.'``.
         onlyChr: if true, skip transcripts whose chromosome name does not
             start with ``'chr'``.
         verbose: if true, print a warning for every exon whose fetch
             raises ``KeyError``.

     Returns:
         None. The result is the file written to ``transcriptomeFileName``.

     An exon whose fetch raises ``KeyError`` is skipped on its own; the
     transcript is still written from the remaining exons (possibly as an
     empty sequence).

     Example (paths taken from the original notes; ``parser`` stands for an
     instance holding a parsed GTF)::

         home = os.environ.get("HOME")
         parser.writeTranscriptome(home + "/lipan/DYNAMIC/GTF/hg38.fa",
                                   home + "/hg38.fa", pureTransID=True)

     Note (original author): the output may differ from the file downloaded
     directly from GENCODE, whose mRNAs tend to carry a polyA tail appended
     to the 3'UTR.
     """
     import seq as SeqFunc

     # The with-block closes the output even if a lookup raises mid-run.
     with open(transcriptomeFileName, 'w') as OUT:
         Seq = SeqFunc.seqClass(genomeFileName)
         transcripts = self.gtf_container['RNA']
         total = len(transcripts)
         for count, rna_ID in enumerate(transcripts, 1):
             if count % 1000 == 0:
                 print('\tProcessed %.2f%% ...' % (100.0 * count / total))

             # The header fields are read before the chromosome filter, as
             # before, so a record missing one of them still raises.
             rna = transcripts[rna_ID]
             RNA_info = '|'.join(
                 (rna['gene_id'], rna['gene_name'], rna['trans_type']))

             strand = rna['strand']
             Chr = rna['chr']
             if onlyChr and not Chr.startswith('chr'):
                 continue

             TransID = rna_ID.split('.')[0] if pureTransID else rna_ID

             exonsList = [(int(exon['start']), int(exon['end']))
                          for exon in self.gtf_container['exon'][rna_ID]]
             pieces = []
             for exon_start, exon_end in exonsList:
                 try:
                     pieces.append(
                         Seq.fetch(Chr, exon_start - 1, exon_end, strand))
                 except KeyError:
                     if verbose:
                         print('Warning: KeyError -> %s\t%d\t%d\t%s' % (
                             Chr, exon_start - 1, exon_end, strand))
             OUT.write('>%s\t%s\n%s\n' % (
                 TransID, RNA_info, SeqFunc.cutSeq(''.join(pieces))))
Beispiel #4
0
 def writeTranscriptome(self, genomeFileName, transcriptomeFileName, genomeChrSym='chr', pureTransID=False, verbose=True):
     """Extract spliced transcript sequences (GFF3 annotation) into FASTA.

     Only RNAs of ``self.gff3_container['RNA']`` whose sequence name starts
     with ``'NC'`` are written. Their exons, taken from
     ``self.gff3_container['exon'][rna_ID]`` in stored order (assumed to be
     transcript order -- TODO confirm), are fetched from the genome and
     joined; the record is ``>TransID`` plus the sequence formatted by
     ``seq.cutSeq``.

     Args:
         genomeFileName: genome FASTA path handed to ``seq.seqClass``.
         transcriptomeFileName: output path; opened with mode ``'w'``.
         genomeChrSym: ``'chr'`` translates ``NC*`` names through
             ``self.build_NC_To_chr_dict`` before fetching; any other
             value queries the genome with the ``NC*`` name itself.
         pureTransID: if true, drop everything from the first ``'.'`` of
             the transcript ID onwards.
         verbose: if true, report each exon whose fetch raises
             ``KeyError``.

     Returns:
         None. The result is the file written to ``transcriptomeFileName``.

     A failing exon is skipped individually, so a transcript may be written
     with fewer exons than annotated, or with an empty sequence.

     Example (paths taken from the original notes; ``parser`` stands for an
     instance holding a parsed GFF3)::

         home = os.environ.get("HOME")
         parser.writeTranscriptome(home + "/lipan/DYNAMIC/GTF/ncbi/GRCh38.p7.fa",
                                   home + "/hg38_ncbi.fa",
                                   genomeChrSym='chr', pureTransID=True)

     Note (original author): the output may differ from the file downloaded
     directly from NCBI, whose mRNAs tend to carry a polyA tail appended to
     the 3'UTR.
     """
     import seq as SeqFunc

     translate_names = genomeChrSym == 'chr'
     if translate_names:
         NCToChr = self.build_NC_To_chr_dict(pureNCID=False)

     # Context manager: the file is closed on success and on any exception.
     with open(transcriptomeFileName, 'w') as OUT:
         Seq = SeqFunc.seqClass(genomeFileName)
         n_rna = len(self.gff3_container['RNA'])
         for count, rna_ID in enumerate(self.gff3_container['RNA'], 1):
             if count % 1000 == 0:
                 print('\tProcessed %.2f%% ...' % (100.0 * count / n_rna))

             entry = self.gff3_container['RNA'][rna_ID]
             strand = entry['strand']
             Chr = entry['chr']
             if not Chr.startswith('NC'):
                 continue
             if translate_names:
                 Chr = NCToChr[Chr]

             TransID = self.rnaid_2_transID[rna_ID]
             if pureTransID:
                 TransID = TransID.split('.')[0]

             exons = self.gff3_container['exon'][rna_ID]
             exonsList = [(int(exon['start']), int(exon['end'])) for exon in exons]
             fragments = []
             for first, last in exonsList:
                 try:
                     fragments.append(Seq.fetch(Chr, first - 1, last, strand))
                 except KeyError:
                     if verbose:
                         print('Warning: KeyError -> %s\t%d\t%d\t%s' % (Chr, first - 1, last, strand))
             # Join once instead of growing a string exon by exon.
             OUT.write('>%s\n%s\n' % (TransID, SeqFunc.cutSeq(''.join(fragments))))
 def writeTranscriptome(self, genomeFileName, transcriptomeFileName, pureTransID=False, onlyChr=True, verbose=True):
     """Extract spliced transcript sequences (GTF annotation) into FASTA.

     Each transcript of ``self.gtf_container['RNA']`` is rebuilt by fetching
     the exons of ``self.gtf_container['exon'][rna_ID]`` in stored order
     (assumed to be transcript order -- TODO confirm) and joining them. The
     record header is ``>TransID<TAB>gene_id|gene_name|trans_type`` and the
     sequence is formatted by ``seq.cutSeq``.

     Args:
         genomeFileName: genome FASTA path handed to ``seq.seqClass``.
         transcriptomeFileName: output path; opened with mode ``'w'``.
         pureTransID: if true, drop everything from the first ``'.'`` of
             the transcript ID onwards.
         onlyChr: if true, skip transcripts whose chromosome name does not
             start with ``'chr'``.
         verbose: if true, report each exon whose fetch raises
             ``KeyError``.

     Returns:
         None. The result is the file written to ``transcriptomeFileName``.

     A failing exon is skipped individually, so a transcript may be written
     with fewer exons than annotated, or with an empty sequence.

     Example (paths taken from the original notes; ``parser`` stands for an
     instance holding a parsed GTF)::

         home = os.environ.get("HOME")
         parser.writeTranscriptome(home + "/lipan/DYNAMIC/GTF/GENCODE/GRCh38.p7.fa",
                                   home + "/hg38_GENCODE.fa", pureTransID=True)

     Note (original author): the output may differ from the file downloaded
     directly from GENCODE, whose mRNAs tend to carry a polyA tail appended
     to the 3'UTR.
     """
     import seq as SeqFunc

     # Context manager: the file is closed on success and on any exception.
     with open(transcriptomeFileName, 'w') as OUT:
         Seq = SeqFunc.seqClass(genomeFileName)
         n_rna = len(self.gtf_container['RNA'])
         for count, rna_ID in enumerate(self.gtf_container['RNA'], 1):
             if count % 1000 == 0:
                 print('\tProcessed %.2f%% ...' % (100.0 * count / n_rna))

             entry = self.gtf_container['RNA'][rna_ID]
             # Header fields are read before the chromosome filter (as in
             # the previous version), so missing keys still raise.
             GeneID = entry['gene_id']
             GeneName = entry['gene_name']
             GeneType = entry['trans_type']
             RNA_info = GeneID + '|' + GeneName + '|' + GeneType

             strand = entry['strand']
             Chr = entry['chr']
             if onlyChr and not Chr.startswith('chr'):
                 continue

             TransID = rna_ID.split('.')[0] if pureTransID else rna_ID

             exons = self.gtf_container['exon'][rna_ID]
             exonsList = [(int(exon['start']), int(exon['end'])) for exon in exons]
             fragments = []
             for first, last in exonsList:
                 try:
                     fragments.append(Seq.fetch(Chr, first - 1, last, strand))
                 except KeyError:
                     if verbose:
                         print('Warning: KeyError -> %s\t%d\t%d\t%s' % (Chr, first - 1, last, strand))
             # Join once instead of growing a string exon by exon.
             OUT.write('>%s\t%s\n%s\n' % (TransID, RNA_info, SeqFunc.cutSeq(''.join(fragments))))
Beispiel #6
0
 def writeGene(self,
               genomeFileName,
               geneFileName,
               pureGeneID=False,
               onlyChr=True,
               verbose=True):
     """Extract full gene sequences (exons and introns) into a FASTA file.

     Each gene of ``self.gtf_container['gene']`` is fetched as one span via
     ``Seq.fetch(Chr, start - 1, end, strand)`` and written with the header
     ``>GeneID<TAB>gene_name<TAB>length<TAB>chr:start-end:strand``; the
     sequence is formatted by ``seq.cutSeq``. The ``start - 1`` presumably
     maps a 1-based annotation start onto the 0-based start that
     ``seqClass.fetch`` expects -- TODO confirm against the ``seq`` module.

     Args:
         genomeFileName: genome FASTA path handed to ``seq.seqClass``.
         geneFileName: output path; opened with mode ``'w'`` (overwritten).
         pureGeneID: if true, drop everything from the first ``'.'`` of
             the gene ID onwards.
         onlyChr: if true, skip genes whose chromosome name does not start
             with ``'chr'``.
         verbose: accepted for interface compatibility; this method does
             not read it (the progress line is always printed).

     Returns:
         None. The result is the file written to ``geneFileName``.

     Example (paths taken from the original notes; ``parser`` stands for an
     instance holding a parsed GTF)::

         home = os.environ.get("HOME")
         parser.writeGene(home + "/lipan/DYNAMIC/GTF/mm10.fa",
                          home + "/mm10.gene.fa", pureGeneID=True)
         parser.writeGene(home + "/lipan/DYNAMIC/GTF/GENCODE/GRCm38.p4.fa",
                          home + "/mm10_GENCODE.gene.fa", pureGeneID=True)

     Note (original author): the output may differ from the file downloaded
     directly from GENCODE, whose mRNAs tend to carry a polyA tail appended
     to the 3'UTR.
     """
     import seq as SeqFunc

     # Context manager: the file is closed on success and on any exception
     # (previously a failing fetch left the handle open).
     with open(geneFileName, 'w') as OUT:
         Seq = SeqFunc.seqClass(genomeFileName)
         gene_table = self.gtf_container['gene']
         n_genes = len(gene_table)
         for count, gene_ID in enumerate(gene_table, 1):
             # Progress counts every gene seen, skipped ones included.
             if count % 1000 == 0:
                 print('\tProcessed %.2f%% ...' % (100.0 * count / n_genes))

             info = gene_table[gene_ID]
             strand = info['strand']
             Chr = info['chr']
             if onlyChr and not Chr.startswith('chr'):
                 continue

             GeneID = gene_ID.split('.')[0] if pureGeneID else gene_ID
             start = int(info['start'])
             end = int(info['end'])
             gene_seq = Seq.fetch(Chr, start - 1, end, strand)
             gene_name = info['gene_name']
             OUT.write('>%s\t%s\t%d\t%s:%d-%d:%s\n%s\n' % (
                 GeneID, gene_name, len(gene_seq), Chr, start, end, strand,
                 SeqFunc.cutSeq(gene_seq)))
Beispiel #7
0
 def check_transcriptome_MAF(tMaf_FileName,
                             sp1_genome_FileName,
                             sp1_transcriptome_FileName,
                             sp2_genome_FileName,
                             sp2_transcriptome_FileName,
                             verbose=False):
     """Validate a transcriptome MAF (tMaf) file against both species' FASTAs.

     The file is read three lines at a time: the species-1 line, the
     species-2 line and a third line that is only tested for being
     non-empty (presumably a blank separator -- TODO confirm). Both species
     lines are parsed with ``check_tMaf_Methods.sparse_tMaf_Line`` and the
     record is checked, in this order:

         1. both aligned strings have the same length;
         2. / 3. ``base_len(seq) == trans_e - trans_s + 1`` for species 1 / 2;
         4. / 5. ``base_len(seq) == genome_e - genome_s`` for species 1 / 2;
         6. / 7. for species 1 / 2, the genome slice, the transcriptome slice
            and ``pure_seq(seq)`` are all equal, ignoring case.

     On the first failing check ``Error <n> <record number>`` and the two
     offending lines are printed and the function returns immediately. If
     every record passes, ``All Right`` is printed.

     Args:
         tMaf_FileName: path of the tMaf file to verify.
         sp1_genome_FileName, sp1_transcriptome_FileName: genome and
             transcriptome FASTA paths for the first line of each record.
         sp2_genome_FileName, sp2_transcriptome_FileName: the same for the
             second line of each record.
         verbose: if true, print the coordinates of records whose check
             raised ``KeyError``; such records are skipped either way.

     Returns:
         None in every case; the outcome is reported on stdout only.

     NOTE(review): the loop stops as soon as the third line of a record is
     empty, so a final record that is not followed by a separator line is
     never checked.
     """
     sp1_genomeSeq = seq.seqClass(sp1_genome_FileName)
     sp1_transcriptomeSeq = seq.seqClass(sp1_transcriptome_FileName)
     sp2_genomeSeq = seq.seqClass(sp2_genome_FileName)
     sp2_transcriptomeSeq = seq.seqClass(sp2_transcriptome_FileName)

     parse_line = check_tMaf_Methods.sparse_tMaf_Line
     base_len = check_tMaf_Methods.base_len
     pure_seq = check_tMaf_Methods.pure_seq

     def sequences_agree(genomeSeq, transcriptomeSeq, Chr, genome_s,
                         genome_e, strand, transID, trans_s, trans_e,
                         aligned):
         # Genome slice == transcriptome slice == ungapped alignment text.
         genome_part = genomeSeq.fetch(Chr, genome_s, genome_e,
                                       strand).upper()
         trans_part = transcriptomeSeq.fetch(transID, trans_s - 1,
                                             trans_e).upper()
         return (genome_part == trans_part
                 and trans_part == pure_seq(aligned).upper())

     # The with-block closes the tMaf file on every exit path, including
     # the early returns taken when a check fails.
     with open(tMaf_FileName) as IN:
         human = IN.readline()
         mouse = IN.readline()
         line = IN.readline()
         count = 1
         last_chr = ''
         while line:
             (Chr1, genome_s1, genome_e1, strand1, transID1, trans_s1,
              trans_e1, seq1) = parse_line(human)
             (Chr2, genome_s2, genome_e2, strand2, transID2, trans_s2,
              trans_e2, seq2) = parse_line(mouse)
             if Chr1 != last_chr:
                 print('Now Checking ' + Chr1)
                 last_chr = Chr1

             failed = None
             try:
                 # Same order as the numbered list in the docstring; the
                 # first failing check decides the reported error number.
                 if len(seq1) != len(seq2):
                     failed = 1
                 elif base_len(seq1) != trans_e1 - trans_s1 + 1:
                     failed = 2
                 elif base_len(seq2) != trans_e2 - trans_s2 + 1:
                     failed = 3
                 elif base_len(seq1) != genome_e1 - genome_s1:
                     failed = 4
                 elif base_len(seq2) != genome_e2 - genome_s2:
                     failed = 5
                 elif not sequences_agree(
                         sp1_genomeSeq, sp1_transcriptomeSeq, Chr1,
                         genome_s1, genome_e1, strand1, transID1, trans_s1,
                         trans_e1, seq1):
                     failed = 6
                 elif not sequences_agree(
                         sp2_genomeSeq, sp2_transcriptomeSeq, Chr2,
                         genome_s2, genome_e2, strand2, transID2, trans_s2,
                         trans_e2, seq2):
                     failed = 7
             except KeyError:
                 # A KeyError skips the record instead of aborting the run.
                 if verbose:
                     print('Check tMaf: Key Error -> ')
                     print('\t\tsp_1:  %s' % (
                         (Chr1, genome_s1, genome_e1, strand1, transID1,
                          trans_s1, trans_e1),))
                     print('\t\tsp_2:  %s' % (
                         (Chr2, genome_s2, genome_e2, strand2, transID2,
                          trans_s2, trans_e2),))

             if failed is not None:
                 print('Error %d %s' % (failed, count))
                 print(human)
                 print(mouse)
                 return

             human = IN.readline()
             mouse = IN.readline()
             line = IN.readline()
             count += 1

     # Drop the local references to the sequence objects before reporting.
     del sp1_genomeSeq
     del sp2_genomeSeq
     del sp1_transcriptomeSeq
     del sp2_transcriptomeSeq
     print('All Right')
        elif m6A_site < hnrnpc_s < hnrnpc_e:
            if strand == '+':
                s = 100 + (hnrnpc_s - m6A_site)
                e = 100 + (hnrnpc_e - m6A_site)
            if strand == '-':
                s = 100 - (hnrnpc_e - m6A_site)
                e = 100 - (hnrnpc_s - m6A_site)
        for i in range(s, e + 1):
            count[i] += 1

    return count


import tools, seq

# Script section: compute the HNRNPC-around-m6A position profile and plot it.
# NOTE(review): the genome and the input table are hard-coded,
# machine-specific paths.
handle = seq.seqClass(
    "/150T/zhangqf/GenomeAnnotation/genome/GRCh37.p13.genome.fa")
count = statistic_HNRNPC_m6A("/tmp/HNRNPC_m6A.hg19.txt", handle)

# The x axis is range(-100, 100), i.e. 200 offsets from -100 to 99, so
# `count` is expected to hold 200 values; presumably index 100 is the m6A
# site itself -- confirm against statistic_HNRNPC_m6A.
tools.plt.plot(range(-100, 100), count)
# The figure is written to a path relative to the working directory and then
# displayed (tools.plt is presumably matplotlib.pyplot -- TODO confirm).
tools.plt.savefig("figs/hnrnpc_m6A.pdf")
tools.plt.show()

####################
# 2. Shuffle m6A or PBR
####################


def read_HNRNPC_m6A(inFile):
    HNRNPC = {}
    m6A = {}