Exemple #1
0
def geneCoordinates(species,symbols):
    genes=[]
    import EnsemblImport
    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True)
    for symbol in symbols:
        ens_geneid = ensembl_annotation_db[symbol]
        genes.append((ens_geneid,symbol))
    
    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    search_locations=[]
    for (gene,symbol) in genes:
        chr,strand,start,end = gene_location_db[gene]
        if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953'
        if len(chr)>6: print symbol, 'bad chromosomal reference:',chr
        else:
            search_locations.append([chr,strand,start,end,symbol])
Exemple #2
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(
        species)
    export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(
        gene_annotation_file, 'RNASeq')

    print 'Annotations for', len(gene_location_db), 'genes imported'

    sorted_list = []
    protein_coding = 0
    for gene in gene_location_db:
        chr, strand, start, end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding += 1

        else: biotype = 'NA'
        if len(chr) < 7:
            sorted_list.append(
                [chr, strand, int(start),
                 int(end), gene, biotype])
        #else: print chr;sys.exit()
    print len(sorted_list), 'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()
    for values in sorted_list:
        chr, strand, start, end, gene, biotype = values
        try:
            symbol = annotate_db[gene].Symbol()
        except Exception:
            symbol = ''
        values = [gene, symbol, chr, strand, str(start), str(end), biotype]
        export_data.write(string.join(values, '\t') + '\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n', export_path
def geneCoordinates(species,symbols):
    genes=[]
    import EnsemblImport
    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True)
    for symbol in symbols:
        if symbol in ensembl_annotation_db:
            ens_geneid = ensembl_annotation_db[symbol]
            genes.append((ens_geneid,symbol))
        else:
            print symbol, 'not found'
    
    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    search_locations=[]
    for (gene,symbol) in genes:
        chr,strand,start,end = gene_location_db[gene]
        #if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953'
        if len(chr)>6: print symbol, 'bad chromosomal reference:',chr
        else:
            search_locations.append([chr,strand,start,end,symbol])
Exemple #4
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)
    export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq')
      
    print 'Annotations for',len(gene_location_db),'genes imported'
    
    sorted_list=[]; protein_coding=0 
    for gene in gene_location_db:
        chr,strand,start,end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding+=1
                
        else: biotype = 'NA'
        if len(chr)<7:
            sorted_list.append([chr,strand,int(start),int(end),gene,biotype])
        #else: print chr;sys.exit()
    print len(sorted_list),'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()        
    for values in sorted_list:
        chr,strand,start,end,gene,biotype=values
        try: symbol = annotate_db[gene].Symbol()
        except Exception: symbol = ''
        values = [gene,symbol,chr,strand,str(start),str(end),biotype]
        export_data.write(string.join(values,'\t')+'\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n',export_path
Exemple #5
0
def reformatPolyAdenylationCoordinates(species,force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version={}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for',species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/'+species + '/'
    if force == 'yes':
        filename, status = update.download(url,output_dir,'')
    else: filename = output_dir+'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations; import OBO_import; import EnsemblImport; import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,'Ensembl-UniGene')
        print len(ens_unigene),'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene); use_entrez='no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,'Ensembl-EntrezGene')
        print len(ens_entrez),'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez); use_entrez='yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    
    export_bedfile = output_dir+species+'_polyADB_2_predictions.bed'
    print 'exporting',export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#'+species+'\t'+'polyADB_2'+'\t'+version[species]+'\n'
    export_data.write(header)
    
    fn=filepath(filename); x=0; not_found={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            siteid,llid,chr,sitenum,position,supporting_EST,cleavage = string.split(data,'\t')
            if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr'+chr
                strand = '+'; geneid = siteid
                pos_start = str(int(position)-1); pos_end = position
                if use_entrez=='no':
                    external_geneid = string.join(string.split(siteid,'.')[:2],'.')
                else: external_geneid=llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-'+ens_geneid
                    chr,strand,start,end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid]=[]
                    bed_format = string.join([chr,pos_start,pos_end,geneid,'0','-'],'\t')+'\n' ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join([chr,pos_start,pos_end,geneid,'0',strand],'\t')+'\n'
                export_data.write(bed_format)
    export_data.close()   
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,
                                                     'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,
                                                    'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0: x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(
                data, '\t')
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(
                        string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join(
                        [chr, pos_start, pos_end, geneid, '0', '-'], '\t'
                    ) + '\n'  ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join(
                    [chr, pos_start, pos_end, geneid, '0', strand],
                    '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()