Python filterFile Examples

Programming Language: Python

Namespace/Package Name: sampleIndexSelection

Method/Function: filterFile

Examples at hotexamples.com: 5

Python filterFile - 5 examples found. These are the top-rated real-world Python examples of sampleIndexSelection.filterFile, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy):
    """ Run NMF and determine the number of valid clusters based on the magnitude of detected differential splicing """
    
    use_adjusted_p=True
           
    print "Running NMF analyses for dimension reduction using "+str(k)+" k - Round"+str(AnalysisRound)
    NMFResult,BinarizedOutput,metaData,Annotation=NMF_Analysis.NMFAnalysis(NMFinput,k,AnalysisRound,strategy) ### This is where we get the correct version
    print "Running metaData Analyses for finding differential splicing events"
    rootdir,CovariateQuery=metaDataAnalysis.remoteAnalysis(species,filtered_EventAnnot_dir,metaData,'PSI',0.1,use_adjusted_p,0.05,Annotation)
    counter=1
    dPSI_results_dir=rootdir+CovariateQuery
    global upd_guides
    upd_guides=[]
    name=[]
    group=[]
    grplst=[]
    for filename in os.listdir(dPSI_results_dir):
        if filename.startswith("PSI."):
            dPSI_results_fn=os.path.join(dPSI_results_dir, filename)
            dPSI_comparison_alt_name=string.replace(filename,"PSI.","")
            omitcluster=FindTopUniqueEvents(dPSI_results_fn,dPSI_comparison_alt_name,dPSI_results_dir)
            if omitcluster==0: ### Hence, clustering succeeded and did not fail in this dPSI comparison
                group.append(counter)
                name.append(string.replace(filename,"PSI.",""))
                counter+=1
                
    print counter, 'robust splicing subtypes identified in round',AnalysisRound
    if counter>0: #counter>2 --- changed to 0 to force NMF
        dire = export.findParentDir(full_PSI_InputFile)
        output_dir = dire+'OncoInputs'
        if os.path.exists(output_dir)==False:
            export.createExportFolder(output_dir)

        output_file = output_dir+'/SVMInput-Round'+str(AnalysisRound)+'.txt'
        ExpandSampleClusters.filterRows(full_PSI_InputFile,output_file,filterDB=upd_guides,logData=False)
        header=ExpandSampleClusters.header_file(output_file)
        print "Running SVM prediction for improved subtypes - Round"+str(AnalysisRound)
        #print 'AAAAAAAAAAAAAAAAAAAAAAAA',output_file
        #print 'BBBBBBBBBBBBBBBBBBBBBBBB',BinarizedOutput
        train=ExpandSampleClusters.TrainDataGeneration(output_file,BinarizedOutput,name)
        grplst.append(group)
        ExpandSampleClusters.Classify(header,train,output_file,grplst,name,AnalysisRound) ### This is where we write the worng version
        header=Correlationdepletion.header_file(NMFResult)
        
        output_file=output_dir+'/DepletionInput-Round'+str(AnalysisRound)+".txt"
        sampleIndexSelection.filterFile(full_PSI_InputFile,output_file,header)
        print "Running Correlation Depletion - Round"+str(AnalysisRound)
        commonkeys,count=Correlationdepletion.FindCorrelations(NMFResult,output_file,name)
        Depleted=Correlationdepletion.DepleteSplicingevents(commonkeys,output_file,count,full_PSI_InputFile)
        full_PSI_InputFile=Depleted
    
        flag=True ### Indicates that K-means was not run - hence, another round of splice-ICGS should be performed
    """"
    else:
        try:
            print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
            header=[]
            header=Kmeans.header_file(dPSI_results_fn_block)
            Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
            flag=True
        except Exception:
            print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
            print traceback.format_exc()
            AnalysisRound = True
    """
    return flag,full_PSI_InputFile

Example #2

Show file

    if EnrichmentOnly==False:
        
        print 'PSI input files:',EventAnnot
        print 'Using a rho-cutoff of:',rho_cutoff
    
        if filters==True: ### Filter based on a default percentage of samples with detected PSI values
            EventAnnot,SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=True)
        else:
            SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=False)
        output_dir = dire+'ExpressionInput'
    
        export.createExportFolder(output_dir)
        full_PSI_InputFile=output_dir+"/exp.input.txt"
        header=header_list(EventAnnot)
        sampleIndexSelection.filterFile(EventAnnot,full_PSI_InputFile,header,FirstCol=False)
        
        ### Set Splice-ICGS defaults
        gsp = UI.GeneSelectionParameters(species,platform,platform)
        gsp.setNormalize('median')
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        gsp.setJustShowTheseIDs('')
        gsp.setSampleDiscoveryParameters(ExpressionCutoff,CountsCutoff,FoldDiff,SamplesDiffering,removeOutliers,
                        featurestoEvaluate,restrictBy,excludeCellCycle,column_metric,column_method,rho_cutoff)
        
        AnalysisRound=1
        if mode == "single":
            """ Perform a single round of Splice-ICGS (RNASeq.py module) """
            flag,full_PSI_InputFile,EventAnnot=CompleteWorkflow(full_PSI_InputFile,EventAnnot,rho_cutoff,strategy,seq,gsp,forceBroadClusters,AnalysisRound)

Example #3

Show file

File: Correlationdepletion.py Project: venkatmi/oncosplice

if __name__ == '__main__':

    import getopt

    ################  Comand-line arguments ################
    if len(
            sys.argv[1:]
    ) <= 1:  ### Indicates that there are insufficient number of command-line arguments
        print "Warning! Insufficient command line flags supplied."
        sys.exit()
    else:
        analysisType = []

        options, remainder = getopt.getopt(sys.argv[1:], '',
                                           ['Guidefile=', 'PSIfile='])
        for opt, arg in options:
            if opt == '--Guidefile': Guidefile = arg
            elif opt == '--PSIfile': PSIfile = arg

            else:
                print "Warning! Command-line argument: %s not recognized. Exiting..." % opt
                sys.exit()

#filename="/Users/meenakshi/Documents/leucegene/ICGS/Clustering-exp.Hs_RNASeq_top_alt_junctions367-Leucegene-75p_no149-Guide1 TRAK1&ENSG00000182606&I1.1_42075542-E2.1__E-hierarchical_cosine_correlation.txt"
#PSIfile="/Users/meenakshi/Documents/leucegene/ExpressionInput/exp.Hs_RNASeq_top_alt_junctions-PSI_EventAnnotation-367-Leucegene-75p-unique-filtered-filtered.txt"
#keylabel="/Users/meenakshi/Documents/leucegene/ExpressionInput/exp.round2_glmfilteredKmeans_label.txt"
    header = header_file(Guidefile)
    output_file = PSIfile[:-4] + "-filtered.txt"
    sampleIndexSelection.filterFile(PSIfile, output_file, header)
    commonkeys, count = FindCorrelations(Guidefile, output_file)
    DepleteSplicingevents(commonkeys, output_file, count)

Example #4

Show file

File: testing.py Project: venkatmi/oncosplice

def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp,
                     forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))

        print 'Number varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting Rank=0'
        #print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        if turn == 1:
            if force_broad_round1:
                #Rank=2
                Rank = Rank
            else:
                if Rank > 2:
                    Rank = 30
        else:
            if Rank > 2:
                Rank = 30
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False

        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot

Example #5

Show file

def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):

    species = "Hs"
    row_method = 'hopach'
    column_method = 'hopach'
    row_metric = 'correlation'
    column_metric = 'euclidean'
    color_gradient = 'yellow_black_blue'
    contrast = 3
    vendor = "RNASeq"
    GeneSelection = ''
    PathwaySelection = ''
    GeneSetSelection = 'None Selected'
    excludeCellCycle = False
    #rho_cutoff = 0.4
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    JustShowTheseIDs = ''
    removeOutliers = False
    PathwaySelection = []
    array_type = "RNASeq"
    #rho_cutoff=0.4
    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff,
                                     SamplesDiffering, removeOutliers,
                                     featurestoEvaluate, restrictBy,
                                     excludeCellCycle, column_metric,
                                     column_method, rho_cutoff)
    #Run splice ICGS
    """import UI
        species='Mm'; platform = "3'array"; vendor = 'Ensembl'
        gsp = UI.GeneSelectionParameters(species,platform,vendor)
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect('')
        gsp.setGeneSelection('')
        gsp.setJustShowTheseIDs('')
        gsp.setNormalize('median')
        gsp.setSampleDiscoveryParameters(0,0,1.5,3,
        False,'PSI','protein_coding',False,'cosine','hopach',0.35)"""

    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)
        #except Exception:Rank=0
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!!'
        print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        print 'Current turn:', turn, 'k =',
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False
        print Rank
        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False

    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)

                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot