def agilentSummarize(exp_file_location_db):
    print 'Agilent array import started'
    
    global red_channel_db
    global green_channel_db
    red_channel_db={}
    green_channel_db={}
    
    for dataset in exp_file_location_db: ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        output_dir = fl.OutputDir()
        array_dir=fl.CELFileDir()
        group_dir = fl.GroupsFile() ### provides the list of array_files
        channel_to_extract = fl.ChannelToExtract()
        expression_file = fl.ExpFile()
        array_group_list = UI.importArrayGroupsSimple(group_dir,[])[0]
        normalization_method = fl.NormMatrix()

    arrays = map(lambda agd: agd.Array(), array_group_list) ### Pull the array names out of this list of objects

    dir_list = unique.read_directory(array_dir)
    count=0
    for array in dir_list:
        if array in arrays: ### Important since other text files may exist in that directory
            count+=1
            filename = array_dir+'/'+array
            importAgilentExpressionValues(filename,array,channel_to_extract)
            if count == 50:
                print '' ### For progress printing
                count = 0
            
    if len(green_channel_db)>0:
        filename = output_dir+ '/'+ 'gProcessed/gProcessed-'+dataset+'-raw.txt'
        exportExpressionData(filename,green_channel_db)
        if 'quantile' in normalization_method:
            print '\nPerforming quantile normalization on the green channel...'
            green_channel_db = RNASeq.quantileNormalizationSimple(green_channel_db)
            filename = output_dir+ '/'+ 'gProcessed/gProcessed-'+dataset+'-quantile.txt'
            exportExpressionData(filename,green_channel_db)
        final_exp_db = green_channel_db
        
    if len(red_channel_db)>0:
        filename = output_dir+ '/'+ 'rProcessed/rProcessed-'+dataset+'-raw.txt'
        exportExpressionData(filename,red_channel_db)
        if 'quantile' in normalization_method:
            print '\nPerforming quantile normalization on the red channel...'
            red_channel_db = RNASeq.quantileNormalizationSimple(red_channel_db)
            filename = output_dir+ '/'+ 'rProcessed/rProcessed-'+dataset+'-quantile.txt'
            exportExpressionData(filename,red_channel_db)
        final_exp_db = red_channel_db

    if len(red_channel_db)>0 and len(green_channel_db)>0:
        if channel_to_extract == 'green/red ratio':
            final_exp_db = calculateRatios(green_channel_db,red_channel_db)
        elif channel_to_extract == 'red/green ratio':
            final_exp_db = calculateRatios(red_channel_db,green_channel_db)
            
    exportExpressionData(expression_file,final_exp_db)
    print 'Exported expression input file to:',expression_file
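
### Illustrative sketch only: calculateRatios() is defined elsewhere in this
### module and is not shown on this page. Assuming each channel db maps a
### probe ID to a list of per-array intensities (an assumption, not the
### confirmed structure), a minimal log2 ratio helper could look like:
import math

def calculateRatiosSketch(numerator_db, denominator_db):
    ratio_db = {}
    for probe_id in numerator_db:
        if probe_id in denominator_db:
            pairs = zip(numerator_db[probe_id], denominator_db[probe_id])
            ### log2(numerator/denominator) for each array, for this probe
            ratio_db[probe_id] = [math.log(float(n)/d, 2) for n, d in pairs]
    return ratio_db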
Example #2
def summarizeExpressionData(filename, qc_type):
    start_time = time.time()
    fn = filepath(filename)
    matrix = []
    row_header = []
    import RNASeq
    platform = RNASeq.checkExpressionFileFormat(fn, "3'array")
    x = 0
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#': x = 0
        elif x == 0:
            group_db, column_header, qc_db = assignGroupColors(t[1:], qc_type)
            x = 1
        else:
            if ' ' not in t and '' not in t:  ### Occurs for rows with missing data
                if qc_type == 'distribution':
                    #values = map(lambda x: round(float(x), 1), t[1:]) ### report value to one decimal place
                    values = map(lambda x: float(x), t[1:])
                    i = 0
                    for r in values:
                        if r != 0:
                            if 'counts' in dataset_name or platform == 'RNASeq':
                                r = round(math.log(r, 2), 1)
                            else:
                                r = round(r, 1)
                            try:
                                qc_db[column_header[i]][
                                    r] += 1  ### count this rounded expression value once for this filename
                            except Exception:
                                qc_db[column_header[i]][r] = 1
                        i += 1
                if qc_type == 'feature' or qc_type == 'totals':
                    if 'counts' in dataset_name:
                        feature_id = string.split(t[0], '=')[0]
                        if '-' in feature_id: feature = 'junction'
                        elif ':I' in feature_id: feature = 'intron'
                        elif ':E' in feature_id: feature = 'exon'
                        values = map(lambda x: float(x), t[1:])
                        i = 0
                        for r in values:
                            if r != 0:
                                if qc_type == 'feature':
                                    r = round(math.log(r, 2), 1)
                                try:
                                    qc_db[column_header[i]][feature].append(
                                        r)  ### add all expression values
                                except Exception:
                                    qc_db[column_header[i]][feature] = [r]
                            i += 1
            x += 1

    time_diff = str(round(time.time() - start_time, 1))
    print 'Dataset import in %s seconds' % time_diff
    return qc_db, dataset_name
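
### Illustrative sketch only: for qc_type 'distribution', qc_db above becomes
### a nested histogram of the form qc_db[sample][rounded_value] = count. The
### same counting idea on a plain list of floats (log2 transform for counts
### or RNASeq data, mirroring the loop above):
import math

def buildValueHistogram(values, log_transform=False):
    histogram = {}
    for v in values:
        if v != 0:
            r = round(math.log(v, 2), 1) if log_transform else round(v, 1)
            try: histogram[r] += 1
            except KeyError: histogram[r] = 1
    return histogram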
Example #3
def normalizeDataset(filename,
                     output=None,
                     normalization='quantile',
                     platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(
            filename) + 'Non-Normalized/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            pass

    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(
            sample_expression_db)
        exportExpressionData(output, sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir, filename, platform)
    print 'Exported expression input file to:', output
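
### Illustrative sketch only: RNASeq.quantileNormalizationSimple() is not
### shown on this page. Standard quantile normalization, which it presumably
### implements, replaces each value with the mean of all samples' values at
### the same rank. Assumes sample_expression_db maps sample -> equal-length
### list of floats; tie handling is omitted for brevity:
def quantileNormalizeSketch(sample_expression_db):
    samples = list(sample_expression_db)
    n = len(sample_expression_db[samples[0]])
    sorted_cols = dict((s, sorted(sample_expression_db[s])) for s in samples)
    ### mean across samples at each rank position
    rank_means = [sum(sorted_cols[s][i] for s in samples)/float(len(samples))
                  for i in range(n)]
    normalized = {}
    for s in samples:
        order = sorted(range(n), key=lambda i: sample_expression_db[s][i])
        normalized[s] = [0.0]*n
        for rank, i in enumerate(order):
            normalized[s][i] = rank_means[rank]  ### assign the rank's mean
    return normalized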
Example #4
def summarizeExpressionData(filename,qc_type):
    start_time = time.time()
    fn = filepath(filename)
    matrix=[]
    row_header=[]
    import RNASeq
    platform = RNASeq.checkExpressionFileFormat(fn,"3'array")
    x=0
    if '/' in filename:
        dataset_name = string.split(filename,'/')[-1][:-4]
    else:
        dataset_name = string.split(filename,'\\')[-1][:-4]
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if data[0] =='#': x=0
        elif x==0:
            group_db, column_header, qc_db = assignGroupColors(t[1:],qc_type)
            x=1
        else:
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                if qc_type == 'distribution':
                    #values = map(lambda x: round(float(x), 1), t[1:]) ### report value to one decimal place
                    values = map(lambda x: float(x), t[1:])
                    i=0
                    for r in values:
                        if r!=0:
                            if 'counts' in dataset_name or platform == 'RNASeq':
                                r = round(math.log(r,2),1)
                            else:
                                r = round(r,1)
                            try:
                                qc_db[column_header[i]][r]+=1 ### count this rounded expression value once for this filename
                            except Exception:
                                qc_db[column_header[i]][r]=1
                        i+=1
                if qc_type == 'feature' or qc_type == 'totals':
                    if 'counts' in dataset_name:
                        feature_id = string.split(t[0],'=')[0]
                        if '-' in feature_id: feature = 'junction'
                        elif ':I' in feature_id: feature = 'intron'
                        elif ':E' in feature_id: feature = 'exon'
                        values = map(lambda x: float(x), t[1:])
                        i=0
                        for r in values:
                            if r!=0:
                                if qc_type == 'feature':
                                    r = round(math.log(r,2),1)
                                try:
                                    qc_db[column_header[i]][feature].append(r) ### add all expression values
                                except Exception:
                                    qc_db[column_header[i]][feature] = [r]
                            i+=1
            x+=1
            
    time_diff = str(round(time.time()-start_time,1))
    print 'Dataset import in %s seconds' % time_diff
    return qc_db,dataset_name
Example #5
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(filename) + "Non-Quantile/" + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print "Moved original expression file to:"
            print "\t" + moved_exp_dir
        except Exception:
            pass

    exportExpressionData(output, sample_expression_db)
    print "Exported expression input file to:", output
Example #6
def normalizeDataset(filename,output = None, normalization='quantile',platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """
    
    if output==None:
        output = filename
        moved_exp_dir = export.findParentDir(filename)+'Non-Normalized/'+export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t'+moved_exp_dir
        except Exception: pass
        
    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."    
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output,sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir,filename,platform)    
    print 'Exported expression input file to:',output
Example #7
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(
        sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(
            filename) + 'Non-Quantile/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            pass

    exportExpressionData(output, sample_expression_db)
    print 'Exported expression input file to:', output
Example #8
def CompleteWorkflow(full_PSI_InputFile,EventAnnot,rho_cutoff,strategy,seq,gsp,forceBroadClusters,AnalysisRound):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """
    
    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    filtered_EventAnnot_dir=filterEventAnnotation.FilterFile(full_PSI_InputFile,EventAnnot,AnalysisRound)
    
    try:
        print "Running splice-ICGS for feature selection - Round"+str(AnalysisRound)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        species = gsp.Species()
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber*0.25))
            
        print 'Number of varying samples to identify:',gsp.SamplesDiffering()
        
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', full_PSI_InputFile,mlp,exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        dPSI_results_fn=graphic_links3[-1][-1]
        dPSI_results_fn=dPSI_results_fn[:-4]+'.txt'
       
        print "Running block identification for k analyses - Round"+str(AnalysisRound)
        ### Parameters are fixed as they are distinct 
        RNASeq_blockIdentification.correlateClusteredGenesParameters(dPSI_results_fn,rho_cutoff=0.4,hits_cutoff=4,hits_to_report=50,ReDefinedClusterBlocks=True,filter=True) 
        dPSI_results_fn_block=dPSI_results_fn[:-4]+'-BlockIDs.txt'
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn_block,full_PSI_InputFile,AnalysisRound)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting k=0' 
        print traceback.format_exc()
        k=0
    
    print "Round =", AnalysisRound,'and k =', k
    if AnalysisRound == 1:
        if force_broad_round1:
            k = 2
        else:
            NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone
    if k < 2:
        NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone
        #k = 2
        
    print "Round =", AnalysisRound,'and k =', k
    if k>1:
        ### ADJUST THE k - MUST UPDATE!!!!
        if AnalysisRound == 1:
            if k < 2:
                k = 30
        else:
            if k > 2:
                k = 30
        print "Round =", AnalysisRound,'and k =', k
        
        try:
            flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy)
        except:
            print traceback.format_exc()
            k+=1
            print 'Adjusted k =',k
            try:
                flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                print traceback.format_exc()
            except:
                k = 30
                print 'Adjusted k = 30'
                try:
                    flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy)
                    print traceback.format_exc()
                except:
                    flag = True
                    pass ### will force k-means below
    
    if k<2:
        if k==1:
            try:
                print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
                header=[]
                header=Kmeans.header_file(dPSI_results_fn_block)
                Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
                if AnalysisRound == 1:
                    flag=True
                else:
                    flag=False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag=False
     
    return flag,full_PSI_InputFile,filtered_EventAnnot_dir
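
### Illustrative sketch only: the try/except ladder above retries NMF with an
### adjusted k before deferring to k-means. The same control flow, with a
### hypothetical run_nmf callable standing in for performNMF():
def nmfWithFallback(run_nmf, k, fallback_k=30):
    for candidate_k in (k, k + 1, fallback_k):
        try:
            return candidate_k, run_nmf(candidate_k)
        except Exception:
            print 'NMF failed with k =', candidate_k
    return None, None  ### the caller then falls back to k-means clustering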
Example #9
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':
        if array_type == 'RNASeq':
            only_rely_on_coordinate_mapping = True ### This will provide more accurate results as many junctions have missing sequences
        else:
            only_rely_on_coordinate_mapping = False

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms
        
        ### Get UCSC associations for all Ensembl-linked genes (download databases if necessary)
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
    
        run_seqcomp = 'no'
        if only_rely_on_coordinate_mapping == False:
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        if array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
                IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Peform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIosofmrs)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':        
            from build_scripts import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            from build_scripts  import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
Example #10
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp,
                     forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))

        print 'Number of varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting Rank=0'
        #print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        if turn == 1:
            if force_broad_round1:
                #Rank=2
                Rank = Rank
            else:
                if Rank > 2:
                    Rank = 30
        else:
            if Rank > 2:
                Rank = 30
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False

        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
Example #11
def agilentSummarize(exp_file_location_db):
    print 'Agilent array import started'

    global red_channel_db
    global green_channel_db
    red_channel_db = {}
    green_channel_db = {}

    for dataset in exp_file_location_db:  ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        output_dir = fl.OutputDir()
        array_dir = fl.CELFileDir()
        group_dir = fl.GroupsFile()  ### provides the list of array_files
        channel_to_extract = fl.ChannelToExtract()
        expression_file = fl.ExpFile()
        array_group_list = UI.importArrayGroupsSimple(group_dir, [])[0]
        normalization_method = fl.NormMatrix()

    arrays = map(
        lambda agd: agd.Array(),
        array_group_list)  ### Pull the array names out of this list of objects

    dir_list = unique.read_directory(array_dir)
    count = 0
    for array in dir_list:
        if array in arrays:  ### Important since other text files may exist in that directory
            count += 1
            filename = array_dir + '/' + array
            importAgilentExpressionValues(filename, array, channel_to_extract)
            if count == 50:
                print ''  ### For progress printing
                count = 0

    if len(green_channel_db) > 0:
        filename = output_dir + '/' + 'gProcessed/gProcessed-' + dataset + '-raw.txt'
        exportExpressionData(filename, green_channel_db)
        if 'quantile' in normalization_method:
            print '\nPerforming quantile normalization on the green channel...'
            green_channel_db = RNASeq.quantileNormalizationSimple(
                green_channel_db)
            filename = output_dir + '/' + 'gProcessed/gProcessed-' + dataset + '-quantile.txt'
            exportExpressionData(filename, green_channel_db)
        final_exp_db = green_channel_db

    if len(red_channel_db) > 0:
        filename = output_dir + '/' + 'rProcessed/rProcessed-' + dataset + '-raw.txt'
        exportExpressionData(filename, red_channel_db)
        if 'quantile' in normalization_method:
            print '\nPerforming quantile normalization on the red channel...'
            red_channel_db = RNASeq.quantileNormalizationSimple(red_channel_db)
            filename = output_dir + '/' + 'rProcessed/rProcessed-' + dataset + '-quantile.txt'
            exportExpressionData(filename, red_channel_db)
        final_exp_db = red_channel_db

    if len(red_channel_db) > 0 and len(green_channel_db) > 0:
        if channel_to_extract == 'green/red ratio':
            final_exp_db = calculateRatios(green_channel_db, red_channel_db)
        elif channel_to_extract == 'red/green ratio':
            final_exp_db = calculateRatios(red_channel_db, green_channel_db)

    exportExpressionData(expression_file, final_exp_db)
    print 'Exported expression input file to:', expression_file
Example #12
def summarizeExpressionData(filename, qc_type):
    start_time = time.time()
    fn = filepath(filename)
    matrix = []
    row_header = []
    import RNASeq

    platform = RNASeq.checkExpressionFileFormat(fn, "3'array")
    x = 0
    if "/" in filename:
        dataset_name = string.split(filename, "/")[-1][:-4]
    else:
        dataset_name = string.split(filename, "\\")[-1][:-4]
    for line in open(fn, "rU").xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, "\t")
        if data[0] == "#":
            x = 0
        elif x == 0:
            group_db, column_header, qc_db = assignGroupColors(t[1:], qc_type)
            x = 1
        else:
            if " " not in t and "" not in t:  ### Occurs for rows with missing data
                if qc_type == "distribution":
                    # values = map(lambda x: round(float(x), 1), t[1:]) ### report value to one decimal place
                    values = map(lambda x: float(x), t[1:])
                    i = 0
                    for r in values:
                        if r != 0:
                            if "counts" in dataset_name or platform == "RNASeq":
                                r = round(math.log(r, 2), 1)
                            else:
                                r = round(r, 1)
                            try:
                                qc_db[column_header[i]][
                                    r
                                ] += 1  ### count this rounded expression value once for this filename
                            except Exception:
                                qc_db[column_header[i]][r] = 1
                        i += 1
                if qc_type == "feature" or qc_type == "totals":
                    if "counts" in dataset_name:
                        feature_id = string.split(t[0], "=")[0]
                        if "-" in feature_id:
                            feature = "junction"
                        elif ":I" in feature_id:
                            feature = "intron"
                        elif ":E" in feature_id:
                            feature = "exon"
                        values = map(lambda x: float(x), t[1:])
                        i = 0
                        for r in values:
                            if r != 0:
                                if qc_type == "feature":
                                    r = round(math.log(r, 2), 1)
                                try:
                                    qc_db[column_header[i]][feature].append(r)  ### add all expression values
                                except Exception:
                                    qc_db[column_header[i]][feature] = [r]
                            i += 1
            x += 1

    time_diff = str(round(time.time() - start_time, 1))
    print "Dataset import in %s seconds" % time_diff
    return qc_db, dataset_name
Example #13
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl-linked genes (download databases if necessary)
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign; analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
       
        import IdentifyAltIsoforms; run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
        import FeatureAlignment; import JunctionArray
        FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Peform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIosofmrs)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':        
            import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    if array_type == 'junction':
        try:
            import JunctionArray; import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        import JunctionArray; import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
Example #14
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):

    species = "Hs"
    row_method = 'hopach'
    column_method = 'hopach'
    row_metric = 'correlation'
    column_metric = 'euclidean'
    color_gradient = 'yellow_black_blue'
    contrast = 3
    vendor = "RNASeq"
    GeneSelection = ''
    PathwaySelection = ''
    GeneSetSelection = 'None Selected'
    excludeCellCycle = False
    #rho_cutoff = 0.4
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    JustShowTheseIDs = ''
    removeOutliers = False
    PathwaySelection = []
    array_type = "RNASeq"
    #rho_cutoff=0.4
    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff,
                                     SamplesDiffering, removeOutliers,
                                     featurestoEvaluate, restrictBy,
                                     excludeCellCycle, column_metric,
                                     column_method, rho_cutoff)
    #Run splice ICGS
    """import UI
        species='Mm'; platform = "3'array"; vendor = 'Ensembl'
        gsp = UI.GeneSelectionParameters(species,platform,vendor)
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect('')
        gsp.setGeneSelection('')
        gsp.setJustShowTheseIDs('')
        gsp.setNormalize('median')
        gsp.setSampleDiscoveryParameters(0,0,1.5,3,
        False,'PSI','protein_coding',False,'cosine','hopach',0.35)"""

    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)
        #except Exception:Rank=0
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!!'
        print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        print 'Current turn:', turn, 'k =',
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False
        print Rank
        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False

    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)

                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
Example #15
def generateConstitutiveExpression(exp_dbase,constitutive_gene_db,probeset_gene_db,pre_filtered_db,array_names,filename):
    """Generate Steady-State expression values for each gene for analysis in the main module of this package"""
    steady_state_db={}; k=0; l=0
    remove_nonexpressed_genes = 'no' ### By default set to 'no'

    ###1st Pass: Identify probesets for steady-state calculation
    for gene in probeset_gene_db:
        if avg_all_probes_for_steady_state == 'yes': average_all_probesets[gene] = probeset_gene_db[gene] ### These are all exon aligning (not intron) probesets
        else:
            if gene not in constitutive_gene_db: average_all_probesets[gene] = probeset_gene_db[gene]
            else:
                constitutive_probeset_list = constitutive_gene_db[gene]
                constitutive_filtered=[] ###Added this extra code to eliminate constitutive probesets not in exp_dbase (gene level filters are more efficient when dealing with this many probesets)
                for probeset in constitutive_probeset_list:
                    if probeset in probeset_gene_db[gene]: constitutive_filtered.append(probeset)
                if len(constitutive_filtered)>0: average_all_probesets[gene] = constitutive_filtered
                else: average_all_probesets[gene] = probeset_gene_db[gene]

    ###2nd Pass: Remove probesets that have no detected expression (keep all if none are expressed)
    if excludeLowExpressionExons:
        non_expressed_genes={} ### keep track of these for internal QC
        for gene in average_all_probesets:
            gene_probe_list=[]; x = 0
            for probeset in average_all_probesets[gene]:
                if probeset in pre_filtered_db: gene_probe_list.append(probeset); x += 1
            ### If no constitutive probesets remain but probesets with detected expression exist, replace the entry
            if x >0: average_all_probesets[gene] = gene_probe_list
            elif remove_nonexpressed_genes == 'yes': non_expressed_genes[gene]=[]   

    if remove_nonexpressed_genes == 'yes':
        for gene in non_expressed_genes: del average_all_probesets[gene]
    ###3rd Pass: Make sure the probesets are present in the input set (this is not typical unless a user is loading a pre-filtered probeset expression dataset)
    for gene in average_all_probesets:
        v=0
        for probeset in average_all_probesets[gene]:
            try: null = exp_dbase[probeset]; v+=1
            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
            if v==0: ###Therefore, no probesets were found that were previously predicted to be best constitutive
                try: average_all_probesets[gene] = probeset_gene_db[gene] ###expand the average_all_probesets to include any exon linked to the gene
                except KeyError: print gene, probeset, len(probeset_gene_db), len(average_all_probesets);kill
    
    for probeset in exp_dbase:
        array_count = len(exp_dbase[probeset]); break

    try: null = array_count
    except Exception:
        print 'WARNING...CRITICAL ERROR. Make sure the correct array type is selected and that all input expression files are indeed present (array_count ERROR).'; forceError
        
    ###Calculate avg expression for each array for each probeset (using constitutive values)
    gene_count_db={}
    for gene in average_all_probesets:
        x = 0 ###For each array, average all probeset expression values
        gene_sum=0
        probeset_list = average_all_probesets[gene]#; k+= len(average_all_probesets[gene])
        if array_type != 'RNASeq': ### Just retain the list of probesets for RNA-seq
            while x < array_count:
                exp_list=[] ### average all exp values for constitutive probesets for each array
                for probeset in probeset_list:
                    try:
                        exp_val = exp_dbase[probeset][x]
                        exp_list.append(exp_val)
                    except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                try:
                    if len(exp_list)==0:                
                        for probeset in probeset_list:
                            try:
                                exp_val = exp_dbase[probeset][x]
                                exp_list.append(exp_val)
                            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                    avg_const_exp=statistics.avg(exp_list)
                    ### Add one averaged expression value per array in this loop
                    try: steady_state_db[gene].append(avg_const_exp)
                    except KeyError: steady_state_db[gene] = [avg_const_exp]
                except ZeroDivisionError: null=[] ### Occurs when processing a truncated dataset (for testing usually) - no values for the gene should be included
                x += 1

    l = len(probeset_gene_db) - len(steady_state_db)
    steady_state_export = filename[0:-4]+'-steady-state.txt'
    steady_state_export = string.replace(steady_state_export,'counts.','exp.')
    fn=filepath(steady_state_export); data = open(fn,'w'); title = 'Gene_ID'
    
    if array_type == 'RNASeq':
        import RNASeq
        steady_state_db, pre_filtered_db = RNASeq.calculateGeneLevelStatistics(steady_state_export,species,average_all_probesets,normalize_feature_exp,array_names,UserOptions,excludeLowExp=excludeLowExpressionExons)
        ### This "pre_filtered_db" replaces the above since the RNASeq module performs the exon and junction-level filtering, not ExonArray (RPKM and count based)
        ### Use pre_filtered_db to exclude non-expressed features for multi-group alternative exon analysis
        removeNonExpressedProbesets(pre_filtered_db,full_dataset_export_dir)
        reload(RNASeq)
    
    for array in array_names: title = title +'\t'+ array
    data.write(title+'\n')
    for gene in steady_state_db:
        ss_vals = gene
        for exp_val in steady_state_db[gene]:
            ss_vals = ss_vals +'\t'+ str(exp_val)
        data.write(ss_vals+'\n')
    data.close()
    exp_dbase={}; steady_state_db={}; pre_filtered_db ={}
    #print k, "probesets were not found in the expression file, that could be used for the constitutive expression calculation"
    #print l, "genes were also not included that did not have such expression data"
    print "Steady-state data exported to",steady_state_export