Example #1
0
def geneCoordinates(species,symbols):
    genes=[]
    from build_scripts import EnsemblImport
    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True)
    for symbol in symbols:
        if symbol in ensembl_annotation_db:
            ens_geneid = ensembl_annotation_db[symbol]
            genes.append((ens_geneid,symbol))
        else:
            print symbol, 'not found'
    
    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    search_locations=[]
    for (gene,symbol) in genes:
        chr,strand,start,end = gene_location_db[gene]
        #if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953'
        if len(chr)>6: print symbol, 'bad chromosomal reference:',chr
        else:
            search_locations.append([chr,strand,start,end,symbol])
Example #2
0
def alignProbesetsToTranscripts(species,
                                array_type,
                                Analysis_type,
                                Force,
                                CoordinateBasedMatching=False):
    """Match exon or junction probeset sequences to Ensembl and UCSC mRNA transcripts.

    Side effects: stores Force, Analysis_type and CoordinateBasedMatching in the
    module globals `force`, `analysis_type` and `coordinateBasedMatching`.

    Args:
        species: species code used to build database paths.
        array_type: 'AltMouse', 'junction' or 'RNASeq' (handled as junction
            data) or 'exon'.
        Analysis_type: 'reciprocal' or 'single'; only consulted for the
            junction-style array types.
        Force: stored in the global `force`; not otherwise used here.
        CoordinateBasedMatching: when exactly True and array_type is 'RNASeq',
            junction IDs are matched to transcript exon IDs instead of
            aligning sequences.

    Raises:
        ValueError: if array_type, or Analysis_type for a junction-style array,
            is not one of the supported values. (Previously these cases failed
            later with an UnboundLocalError.)
    """
    global force
    global analysis_type
    global coordinateBasedMatching
    force = Force
    analysis_type = Analysis_type
    coordinateBasedMatching = CoordinateBasedMatching

    if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
        if analysis_type not in ('reciprocal', 'single'):
            raise ValueError(
                "Unsupported Analysis_type %r for array_type %r "
                "(expected 'reciprocal' or 'single')" % (analysis_type, array_type))
        data_type = 'junctions'
        ### NOTE(review): a displaced comment in the original suggests this selects
        ### whether information is stored at the level of genes or probesets -- confirm
        biotype = 'gene'
        start_time = time.time()
        if analysis_type == 'reciprocal':
            probeset_seq_db, pairwise_probeset_combinations = importJunctionAnnotationDatabaseAndSequence(
                species, array_type, biotype)
        else:
            probeset_seq_db, pairwise_probeset_combinations = importAllJunctionSequences(
                species, array_type)
        time_diff = int(time.time() - start_time)
        print("Analyses finished in %d seconds" % time_diff)
    elif array_type == 'exon':
        data_type = 'exon'
        probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
        ### Import probe-level associations
        exon_db = importSplicingAnnotationDatabase(probeset_annotations_file)
        start_time = time.time()  ### Only the sequence import is timed
        probeset_seq_db = importProbesetSequences(exon_db, species)
        time_diff = int(time.time() - start_time)
        print("Analyses finished in %d seconds" % time_diff)
    else:
        raise ValueError(
            "Unsupported array_type %r (expected 'AltMouse', 'junction', "
            "'RNASeq' or 'exon')" % (array_type,))

    ### Match probesets to mRNAs
    from build_scripts import EnsemblImport
    ### Deliberately compared with == True: non-boolean truthy values (e.g. 'no') must not enable this path
    if coordinateBasedMatching == True and array_type == 'RNASeq':
        EnsemblImport.exportTranscriptExonIDAssociations(species)
        ### no sequences in probeset_seq_db, just junctionIDs
        matchTranscriptExonIDsToJunctionIDs(species, array_type, probeset_seq_db)
    else:
        importEnsemblTranscriptSequence(species, array_type, probeset_seq_db)
        try:
            mRNASeqAlign.importUCSCTranscriptSequences(species, array_type,
                                                       probeset_seq_db)
        except Exception:
            ### Best-effort: if the species is not supported by UCSC, the UCSC file is
            ### not written, but the other mRNA_alignments files should be available
            pass

    probeset_seq_db = {}  ### Re-set db (drop the reference to the sequence database)

    ### Import results if junction array to make comparisons valid for junction-pairs rather than a single probeset
    if data_type == 'junctions':
        ### Re-import matches from above and export matching and non-matching transcripts for each probeset to a new file
        import_dir = '/AltDatabase/' + species + '/SequenceData/output'
        g = GrabFiles()
        g.setdirectory(import_dir)
        align_files = g.searchdirectory('mRNA_alignments')
        reAnalyzeRNAProbesetMatches(align_files, species, array_type,
                                    pairwise_probeset_combinations)
def getAnnotations(Species, array_type, reannotate_exon_seq, force):
    """Annotate array probesets/critical exons with Ensembl data and export them.

    Side effects: sets the module globals `species`, `test`, `test_cluster`,
    `ensembl_exon_db`, `exon_clusters` and `exon_region_db`; exports the
    Ensembl-linked probeset file and then renames
    '<species>_Ensembl_probesets.txt' to
    '<species>_Ensembl_<array_type>_probesets.txt' (copy, then delete).

    Args:
        Species: species code; stored in the global `species`.
        array_type: array platform name, used in file paths and passed on.
        reannotate_exon_seq: when 'yes', critical exon sequences are
            re-annotated through JunctionArray.
        force: passed through to importCriticalExonLocations.
    """
    global species
    global test
    global test_cluster
    global ensembl_exon_db
    global exon_clusters
    global exon_region_db

    species = Species
    test = 'no'
    test_cluster = ['TC0701360']
    data_type = 'mRNA'

    ensembl_associations = EnsemblImport.getEnsemblAssociations(
        species, data_type, test)
    (ensembl_exon_db, ensembl_annot_db, exon_clusters, intron_clusters,
     exon_region_db, intron_retention_db, ucsc_splicing_annot_db,
     ens_transcript_db) = ensembl_associations

    ### Get pre-computed genomic locations for critical exons, then annotate them
    ensembl_probeset_db = importCriticalExonLocations(
        species, array_type, ensembl_exon_db, force)
    ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons(
        ensembl_probeset_db, exon_clusters, ensembl_exon_db, exon_region_db,
        intron_retention_db, intron_clusters, ucsc_splicing_annot_db)
    ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(
        array_type, ensembl_probeset_db, species)
    print("\nCritical exon data exported coordinates, exon associations and splicing annotations exported...")

    ### Change filenames to reflect junction array type
    export_filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
    original_path = filepath(export_filename)
    export_replacement = export_filename.replace('_probe',
                                                 '_' + array_type + '_probe')
    renamed_path = filepath(export_replacement)
    ### Copy file to a new name, then drop the original
    shutil.copyfile(original_path, renamed_path)
    os.remove(original_path)

    ### Export full exon sequence for probesets/critical exons to replace the original incomplete sequence (used for miRNA analyses)
    if reannotate_exon_seq == 'yes':
        JunctionArray.reAnnotateCriticalExonSequences(species, array_type)
def importAndReformatEnsemblJunctionAnnotations(species, array_type,
                                                nonconstitutive_junctions):
    """Rewrite the array-specific Ensembl probeset file, merging junction half-entries.

    Reads 'AltDatabase/<species>/<array_type>/<species>_Ensembl_<array_type>_probesets.txt'
    and writes 'AltDatabase/<species>/<array_type>/<species>_Ensembl_probesets.txt'.
    The header line is copied through. Every data row must hold exactly 17
    tab-separated columns and a probeset ID of the form '<prefix>:<probeset>'.

    Rows whose probeset ID contains '|5' or '|3' are held back as the left or
    right half of a junction, keyed by the ID minus its last two characters.
    All other rows are written out immediately with the prefix stripped from
    the ID. After the file is read, each key that has both halves and whose
    two region IDs start with different blocks (text before the first '-')
    yields one merged row.

    Both files are now closed deterministically, also when a row fails to parse.

    Args:
        species: species code used to build the file paths.
        array_type: array platform name used to build the file paths.
        nonconstitutive_junctions: container supporting `in`; junction keys
            found in it are marked constitutive 'no'.

    Returns:
        (ensembl_exon_db, exon_gene_db): ensembl_exon_db maps probeset or
        junction ID to its EnsemblImport.ExonStructureData object; exon_gene_db
        maps (ensembl_gene_id, regionid) to the probeset ID for non-junction rows.
    """
    filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_' + array_type + '_probesets.txt'
    export_filepath = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'

    ensembl_exon_db = {}
    exon_gene_db = {}
    left = {}
    right = {}
    nonjunction_aligning = {}

    efn = filepath(export_filepath)
    with open(efn, 'w') as export_data:
        fn = filepath(filename)
        with open(fn, 'rU') as annotation_file:
            header_written = False
            for line in annotation_file:
                data = cleanUpLine(line)
                if not header_written:
                    header_written = True
                    export_data.write(data + '\n')
                    continue

                t = data.split('\t')
                ### Unpacking doubles as a check that the row has exactly 17 columns
                (probeset, exon_id, ensembl_gene_id, transcript_cluster_id,
                 chrom, strand, probeset_start, probeset_stop, affy_class,
                 constitutitive_probeset, ens_exon_ids, exon_annotations,
                 regionid, r_start, r_stop, splice_event,
                 splice_junctions) = t
                if len(regionid) < 1:
                    ### No region ID supplied: fall back to the exon ID (also in the exported row)
                    regionid = exon_id
                    t[12] = exon_id
                ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
                if chrom == 'chrM':
                    chrom = 'chrMT'
                if chrom == 'M':
                    chrom = 'MT'
                tc, probeset = probeset.split(':')
                regionid = regionid.replace('-', '.')
                original_region_id = regionid
                r_starts = r_start.split('|')
                r_stops = r_stop.split('|')
                ed = EnsemblImport.ExonStructureData(ensembl_gene_id, chrom, strand,
                                                     probeset_start, probeset_stop,
                                                     constitutitive_probeset,
                                                     ens_exon_ids, [])
                ed.reSetExonID(regionid)

                junction_key = probeset[:-2]
                if '|5' in probeset:
                    left[junction_key] = ed, t
                    ### If the junction probesets DO NOT align to the region coordinates,
                    ### then the probeset maps to a junction outside the database
                    if strand == '+':
                        if probeset_stop not in r_stops:
                            nonjunction_aligning[junction_key] = (
                                original_region_id + '_' + probeset_stop, 'left')
                    elif probeset_start not in r_starts:
                        nonjunction_aligning[junction_key] = (
                            original_region_id + '_' + probeset_start, 'left')
                elif '|3' in probeset:
                    right[junction_key] = ed, t
                    if strand == '+':
                        if probeset_start not in r_starts:
                            nonjunction_aligning[junction_key] = (
                                original_region_id + '_' + probeset_start, 'right')
                    elif probeset_stop not in r_stops:
                        nonjunction_aligning[junction_key] = (
                            original_region_id + '_' + probeset_stop, 'right')
                else:
                    ### Not a junction half: export the row as-is with the stripped ID
                    t[0] = probeset
                    ensembl_exon_db[probeset] = ed
                    export_data.write('\t'.join(t) + '\n')
                    for single_region_id in regionid.split('|'):
                        exon_gene_db[ensembl_gene_id, single_region_id] = probeset

        ### Merge the left and right halves of each junction into a single exported row
        for probeset in left:
            if probeset not in right:
                continue
            l, pl = left[probeset]
            r, pr = right[probeset]
            if l.Constitutive() != r.Constitutive():
                ### used to determine if a junction is alternative or constitutive
                l.setConstitutive('no')
            if probeset in nonconstitutive_junctions:
                l.setConstitutive('no')
            l.setJunctionCoordinates(l.ExonStart(), l.ExonStop(),
                                     r.ExonStart(), r.ExonStop())

            ### Read every field that is needed before any column of pl is overwritten
            ens_exon_idsl = pl[10]
            ens_exon_idsr = pr[10]
            exon_idl = pl[1].replace('-', '.')
            exon_idr = pr[1].replace('-', '.')
            regionidl = pl[12]
            regionidr = pr[12]
            splice_junctionsl = pl[-1]
            splice_junctionsr = pr[-1]

            if regionidl.split('-')[0] == regionidr.split('-')[0]:
                ### The junction is probing a single exon block and thus is not informative
                continue

            exon_id = exon_idl + '-' + exon_idr
            regionid = regionidl.replace('-', '.') + '-' + regionidr.replace('-', '.')
            if probeset in nonjunction_aligning:
                new_region_id, side = nonjunction_aligning[probeset]
                regionid = renameJunction(regionid, side, new_region_id)

            l.reSetExonID(regionid)
            ensembl_exon_db[probeset] = l

            merged_exon_ids = unique.unique(
                ens_exon_idsl.split('|') + ens_exon_idsr.split('|'))
            pl[10] = '|'.join(merged_exon_ids)
            pl[12] = regionid
            pl[1] = exon_id
            pl[-1] = splice_junctionsl + splice_junctionsr
            pl[13] = l.ExonStart() + '|' + l.ExonStop()
            pl[14] = r.ExonStart() + '|' + r.ExonStop()
            ### Columns 6 and 7 receive the junction splice-sites, which depend on strand
            if pl[5] == '+':
                pl[6] = l.ExonStop()
                pl[7] = r.ExonStart()
            else:
                pl[6] = l.ExonStart()
                pl[7] = r.ExonStop()

            pl[0] = probeset
            pl[9] = l.Constitutive()
            export_data.write('\t'.join(pl) + '\n')

    return ensembl_exon_db, exon_gene_db
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs

    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for species_full in species_annot_db:
        taxid = species_annot_db[species_full].TaxID()
        tax_db[taxid] = species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export
        import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData(
            'ftp.expasy.org',
            '/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',
            filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file, 'uniprot_temp/', '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath
                              )  ### Not sure why this works now and not before
                except OSError:
                    status = status

    species_uniprot_db = {}
    altanalyze_species_uniprot_db = {}
    dir = read_directory('/uniprot_temp')
    for filename in dir:
        fn = filepath('uniprot_temp/' + filename)
        for line in open(fn, 'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data, '=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species, ' ')[:2]
                species_full = string.join(species, ' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/' + filename
                ss = string.split(species_full, ' ')
                if len(
                        ss
                ) == 2:  ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try:
                        species_uniprot_db[species_full].append(
                            (taxid, 'ftp://' + url + '.gz'))
                    except KeyError:
                        species_uniprot_db[species_full] = [
                            (taxid, 'ftp://' + url + '.gz')
                        ]
                taxid = ''
                species_full = ''
    from build_scripts import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(
        species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values) > 1:
            found = 'no'
            for (taxid, url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]:
                        found = 'yes'
                        print 'ambiguity resolved:', species_full
                        break
                if found == 'yes': break
        else: (taxid, url) = values[0]
        up.write(string.join([species_full, taxid, url], '\t') + '\n')
    up.close()
Example #6
0
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    from import_scripts import OBO_import
    from build_scripts import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,
                                                     'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,
                                                    'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0: x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(
                data, '\t')
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(
                        string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join(
                        [chr, pos_start, pos_end, geneid, '0', '-'], '\t'
                    ) + '\n'  ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join(
                    [chr, pos_start, pos_end, geneid, '0', strand],
                    '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()