Esempio n. 1
0
def runProgram(Species,Array_type,mir_source,stringency,Force):
    """Import critical-exon probeset sequences and align miRNA predictions to them.

    Species, Array_type and Force are published as the module globals
    ``species``, ``array_type`` and ``force``. The directory
    AltDatabase/<species>/<array_type>/ is scanned for a critical-exon
    sequence file; a name containing 'critical-exon-seq_updated' is preferred
    over one that only contains 'critical-exon-seq'. When no such file is
    present, or the import raises UnboundLocalError, the '_updated' file is
    requested through update.downloadCurrentVersion and the import is retried.

    mir_source and stringency are passed straight through to
    ExonSeqModule.alignmiRNAData. Nothing is returned.
    """
    global species; global array_type; global force
    process_microRNA_predictions = 'yes'

    species = Species; array_type = Array_type; force = Force

    import_dir = '/AltDatabase/'+species+'/'+array_type
    filedir = import_dir[1:]+'/'
    dir_list = read_directory(import_dir)  #send a sub_directory to a function to identify all files in a directory

    ### Both candidates start out empty so a directory without any sequence file
    ### falls through to the download below instead of raising UnboundLocalError here
    updated_seq_file = ''; original_seq_file = ''
    for input_file in dir_list:
        if 'critical-exon-seq_updated' in input_file: updated_seq_file = filedir+input_file
        elif 'critical-exon-seq' in input_file: original_seq_file = filedir+input_file
    probeset_seq_file = updated_seq_file or original_seq_file

    data_type = 'critical-exons'
    download_required = not probeset_seq_file
    if not download_required:
        try: splice_event_db = getParametersAndExecute(probeset_seq_file,array_type,species,data_type)
        except UnboundLocalError: download_required = True ### Same recovery path the original code used
    if download_required:
        probeset_seq_file = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical-exon-seq_updated.txt'
        update.downloadCurrentVersion(probeset_seq_file,array_type,'txt')
        splice_event_db = getParametersAndExecute(probeset_seq_file,array_type,species,data_type)

    if process_microRNA_predictions == 'yes':
        print('stringency: %s' % (stringency,))
        try:
            ensembl_mirna_db = ExonSeqModule.importmiRNATargetPredictionsAdvanced(species)
            ExonSeqModule.alignmiRNAData(array_type,mir_source,species,stringency,ensembl_mirna_db,splice_event_db)
        except Exception as e:
            ### The miRNA step stays best-effort, but the failure is reported rather than hidden
            print('WARNING: miRNA alignment was skipped (%s)' % (e,))
Esempio n. 2
0
def _iterTabDelimitedDataRows(filename):
    """Yield the tab-split fields of every data row in filename.

    The first line is treated as a header and skipped. Trailing line
    terminators are stripped, so a final line without a newline is read
    normally, and empty lines are ignored. The file is closed once the
    generator is exhausted or discarded.
    """
    fn = filepath(filename)
    with open(fn,'r') as handle:
        header_skipped = False
        for line in handle:
            if not header_skipped:
                header_skipped = True; continue
            data = line.rstrip('\r\n')
            if not data: continue
            yield data.split('\t')

def importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype):
    """Build a database of junction probeset sequences for reciprocal junction pairs.

    Reads three tab-delimited files under AltDatabase/<species>/<array_type>/:
    the array-gene to Ensembl relationships (AltMouse only), the critical
    junction sequences and the junction comparisons. For 'junction' and
    'RNASeq' arrays the gene ID found in the comparisons file is used as its
    own Ensembl gene ID.

    Returns a dictionary of ExonSeqModule.JunctionDataSimple objects. When
    biotype is 'gene' it maps each Ensembl gene ID to a list of objects;
    otherwise it maps each probeset ID to a single object that carries the
    whole list of Ensembl gene IDs.
    """
    file_prefix = 'AltDatabase/'+species+'/'+array_type+'/'+array_type

    array_ens_db={}
    if array_type == 'AltMouse':
        filename = file_prefix+'-Ensembl_relationships.txt'
        update.verifyFile(filename,array_type) ### Will force download if missing
        for array_gene,ens_gene in _iterTabDelimitedDataRows(filename):
            array_ens_db.setdefault(array_gene,[]).append(ens_gene)

    probeset_seq_db={}
    for probeset,probeset_seq,junction_seq in _iterTabDelimitedDataRows(file_prefix+'_critical-junction-seq.txt'):
        probeset_seq_db[probeset] = probeset_seq,junction_seq.replace('|','')

    ###Import reciprocal junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two functions in ExonSeqModule with exon level data
    probeset_gene_seq_db={}
    for array_gene,probeset1,probeset2,critical_exons in _iterTabDelimitedDataRows(file_prefix+'_junction-comparisons.txt'):
        probesets = [probeset1,probeset2]
        if array_type == 'junction' or array_type == 'RNASeq': array_ens_db[array_gene]=[array_gene]
        if array_gene not in array_ens_db: continue
        ensembl_gene_ids = array_ens_db[array_gene]
        for probeset_id in probesets:
            if probeset_id not in probeset_seq_db: continue
            probeset_seq,junction_seq = probeset_seq_db[probeset_id]
            if biotype == 'gene':
                for ensembl_gene_id in ensembl_gene_ids:
                    probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_id,array_gene,probesets,critical_exons)
                    probe_data.SetExonSeq(probeset_seq)
                    probe_data.SetJunctionSeq(junction_seq)
                    probeset_gene_seq_db.setdefault(ensembl_gene_id,[]).append(probe_data)
            else: ### Used for probeset annotations downstream of sequence alignment in LinkEST, analogous to exon_db for exon analyses
                probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_ids,array_gene,probesets,critical_exons)
                probe_data.SetExonSeq(probeset_seq)
                probe_data.SetJunctionSeq(junction_seq)
                probeset_gene_seq_db[probeset_id] = probe_data
    print('%d genes with probeset sequence associated' % len(probeset_gene_seq_db))
    return probeset_gene_seq_db
Esempio n. 3
0
def getParametersAndExecute(probeset_seq_file,array_type,species,data_type):
    """Import probeset sequences for the requested data_type and report the elapsed time.

    For 'critical-exons' the Ensembl exon/probeset annotations are loaded first
    and only the sequence import that follows is timed. For 'junctions' the
    junction annotation and sequence database is built at the gene level.
    Returns the resulting probeset sequence database.
    """
    if data_type == 'critical-exons':
        ### RNASeq annotations live in a differently named file than array probeset annotations
        if array_type == 'RNASeq': file_suffix = '_Ensembl_exons.txt'
        else: file_suffix = '_Ensembl_'+array_type+'_probesets.txt'
        probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+file_suffix
        ###Import probe-level associations
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
        began = time.time()
        ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
        probeset_seq_db = importProbesetSeqeunces(probeset_seq_file,exon_db,species)
        time_diff = int(time.time()-began)
    elif data_type == 'junctions':
        began = time.time()
        ### 'gene' indicates that information is stored at the level of genes rather than probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(species,array_type,'gene')
        time_diff = int(time.time()-began)
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db
Esempio n. 4
0
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    """Run the database build stages selected by the 'yes'/'no' update flags.

    array_type may be given as '<array_type>|<specific_array_type>'; without a
    '|' both names are the same. update_all == 'yes' switches on the uniprot,
    ensembl, probeset-to-ensembl, domain and miRs stages (update_miR_seq is
    left as passed in). The stages run in this order: Ensembl tables, UniProt
    annotations, probeset-to-Ensembl annotations, domain/isoform alignment,
    miRNA target alignment. Afterwards, regardless of the flags, junction and
    RNASeq IDs are annotated as exons, the combined miRNA target file is copied
    into the ensembl folder, and probeset-probe files are verified for every
    array type except RNASeq.

    Calls sys.exit() when the junction or RNASeq post-processing raises
    IOError. Returns nothing.
    """
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    if update_probeset_to_ensembl == 'yes':
        ### Each platform has its own annotation builder; plain exon/gene arrays use the final branch
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':
        if array_type == 'RNASeq':
            only_rely_on_coordinate_mapping = True ### This will provide more accurate results as many junctions have missing sequences
        else:
            only_rely_on_coordinate_mapping = False

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms
        
        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
    
        run_seqcomp = 'no'
        ### Sequence-based isoform/domain analysis is skipped for RNASeq (coordinate mapping only)
        if only_rely_on_coordinate_mapping == False:
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        if array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
                IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        ### The miRNA alignment is run twice per platform: once 'strict' and once 'lax'
        if array_type == 'exon' or array_type == 'gene':        
            from build_scripts import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ### NOTE(review): ExonSeqModule.runProgram is defined outside this view - confirm it accepts these five positional arguments
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ### NOTE(review): ExonArray is not imported inside this function - presumably a module-level import; verify
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            from build_scripts  import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    ### The steps below run regardless of which update flags were set
    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    ### Best-effort copy of the combined miRNA target file into AltDatabase/ensembl/<species>/; any failure is ignored
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        ### 'lue' matches specific array names containing it, such as hGlue (see server_folder below)
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
def annotateJunctionIDsAsExon(species, array_type):
    """Export a junction-probeset to exon-probeset lookup table.

    Loads the junction annotations for array_type (the Ensembl exons file when
    array_type is 'RNASeq') and the exon-array annotations, indexes the exon
    entries by (gene ID, exon region ID), and maps every junction probeset to
    the exon probeset that shares one of its exon regions. When several regions
    of one junction match, the last match is kept.

    The result is written as a two-column, tab-delimited file to
    AltDatabase/<species>/<array_type>/<species>_<array_type>-exon_probesets.txt.
    Nothing is returned.
    """
    from build_scripts import ExonSeqModule
    junction_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_junction_probesets-filtered.txt'
    if array_type == 'RNASeq':
        junction_annotations_file = junction_annotations_file.replace(
            'junction_probesets-filtered', 'exons')
    junction_exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
        junction_annotations_file, array_type)
    exon_annotations_file = 'AltDatabase/' + species + '/exon/' + species + '_Ensembl_probesets.txt'
    exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
        exon_annotations_file, array_type)

    ### Extract unique exon regions from Exon Array annotations
    multiple_exon_regions = {}
    unique_exon_regions = {}
    for probeset in exon_db:
        exon_entry = exon_db[probeset]
        geneid = exon_entry.GeneID()
        region_id = exon_entry.ExonRegionID()
        if '|' in region_id:
            for exonid in region_id.split('|'):
                multiple_exon_regions[geneid, exonid] = exon_entry
        else:
            unique_exon_regions[geneid, region_id] = exon_entry
    ### Add missing exons to unique (an entry covering a single region takes precedence)
    for uid in multiple_exon_regions:
        if uid not in unique_exon_regions:
            unique_exon_regions[uid] = multiple_exon_regions[uid]

    ### Extract unique exon regions from Junction Array annotation
    junction_to_exonids = {}
    for probeset in junction_exon_db:
        junction_entry = junction_exon_db[probeset]
        geneid = junction_entry.GeneID()
        region_id = junction_entry.ExonRegionID()
        if '|' in region_id:
            lookup_keys = [(geneid, exonid) for exonid in region_id.split('|')]
        else:
            ### Single-region IDs are looked up with '.' replaced by '-'
            lookup_keys = [(geneid, region_id.replace('.', '-'))]
        for lookup_key in lookup_keys:
            if lookup_key in unique_exon_regions:
                junction_to_exonids[probeset] = unique_exon_regions[lookup_key].Probeset()

    output_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_' + array_type + '-exon_probesets.txt'
    ### 'with' guarantees the export file is closed even if a write fails
    with open(filepath(output_file), 'w') as data:
        data.write(array_type + '_probeset\texon_probeset\n')
        for probeset in junction_to_exonids:
            data.write(probeset + '\t' + junction_to_exonids[probeset] + '\n')