Example #1
0
def pipeline(trainingDir,testingDir,GetTrainingFeatures,GetTestFeatures, classType):
    """Extract features, train a forest classifier, and predict test-set labels.

    Parameters
    ----------
    trainingDir : str
        Folder with the training data; featExt writes
        trainingSetFeatures.csv / trainingSetNormParams.csv here.
    testingDir : str
        Folder with the test data; predictions are written to
        testingSetResults.csv here.
    GetTrainingFeatures : bool
        When True, (re)extract features from the training set.
    GetTestFeatures : bool
        When True, extract test-set features and predict their labels.
    classType : str
        Class-labelling scheme passed through to featExt.
    """
    if GetTrainingFeatures:
        print('Starting to extract features from training set')
        # Temporary measure: if features were already extracted and saved,
        # disable the following call to avoid re-extracting training features.
        # NOTE: time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for wall-clock interval timing.
        t0 = time.perf_counter()
        featExt(directory=trainingDir, trainingSetFlag=True,
                classType=classType, normFlag=True, normParams='.')
        time1 = time.perf_counter() - t0
        print('Extracted training data features')

    print('Starting to train model')
    # BUG FIX: the original never reset t0 here (and assigned t1 twice), so
    # time2 used a stale t0 -- or raised NameError when GetTrainingFeatures
    # was False.
    t0 = time.perf_counter()
    model, lb_encoder = trainClassifier(trainingDir+'\\trainingSetFeatures.csv', False, 'forest', 0, False, False)
    time2 = time.perf_counter() - t0
    print('Created and trained model')

    if GetTestFeatures:
        print('Starting to extract features from testing set')
        t0 = time.perf_counter()
        featExt(directory=testingDir, trainingSetFlag=False, classType='dir',
                normFlag=True, normParams=trainingDir+'\\trainingSetNormParams.csv')
        time3 = time.perf_counter() - t0
        print('Extracted testing data features')
        # DataFrame.from_csv was removed from pandas; read_csv(index_col=0)
        # is the documented replacement.
        df = pd.read_csv(testingDir+'\\testingSetFeatures.csv', index_col=0)
        features = df.values
        print('Predicting labels')
        t0 = time.perf_counter()
        results = model.predict(features)
        time4 = time.perf_counter() - t0

        # Decode all predictions in one vectorised call instead of the
        # original chained-assignment row loop, which triggers
        # SettingWithCopyWarning and can silently fail to write.
        df['classname'] = lb_encoder.inverse_transform(results)
        df = df['classname']
        df.to_csv(testingDir+'\\testingSetResults.csv')
        print('saved results to ' + testingDir+'\\testingSetResults.csv')
Example #2
0
def train_model(dataset_files, classformat, outputmodel, classifiertype):
    """Given set of files with fasta sequences, class format (e.g., file),
    filename to save model and required model type (e.g., forest)
    Train the model and save it to the given file
    """
    output_features_file = "ProFET_features.csv"
    # Writes the extracted features to output_features_file as a side effect;
    # the returned dict itself is not used here, so the unused local is gone.
    extract_datasets_features(dataset_files, classformat, output_features_file)
    print("Learning %s model" % classifiertype)
    model, label_encoder, scaler, feature_names = trainClassifier(
        output_features_file, classifiertype, kbest=0, alpha=False, optimalFlag=False, normFlag=True
    )
    # Save model and additional data to file.
    # BUG FIX: use a context manager so the file handle is closed even if
    # pickle.dump raises; the original leaked the handle returned by open().
    with open(outputmodel, "wb") as model_file:
        pickle.dump(
            (model, label_encoder, scaler, feature_names), model_file, protocol=pickle.HIGHEST_PROTOCOL
        )
    print("Done")
Example #3
0
def train_model(dataset_files, classformat, outputmodel, classifiertype):
    '''Given set of files with fasta sequences, class format (e.g., file), 
    filename to save model and required model type (e.g., forest)
    Train the model and save it to the given file
    '''
    output_features_file = 'ProFET_features.csv'
    # Writes the extracted features to output_features_file as a side effect;
    # the returned dict itself is not used here, so the unused local is gone.
    extract_datasets_features(dataset_files, classformat,
                              output_features_file)
    print('Learning %s model' % classifiertype)
    model, label_encoder, scaler, feature_names = trainClassifier(
        output_features_file,
        classifiertype,
        kbest=0,
        alpha=False,
        optimalFlag=False,
        normFlag=True)
    # Save model and additional data to file.
    # BUG FIX: use a context manager so the file handle is closed even if
    # pickle.dump raises; the original leaked the handle returned by open().
    with open(outputmodel, 'wb') as model_file:
        pickle.dump((model, label_encoder, scaler, feature_names),
                    model_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
    print('Done')
Example #4
0
def pipeline(trainingDir,testingDir,resultsDir, GetTrainingFeatures,GetTestFeatures, classType):
    """Extract features, train a forest classifier, predict test-set labels,
    and (when resultsDir is writable) write classified fastas.

    Parameters
    ----------
    trainingDir : str
        Folder with the training data; featExt writes
        trainingSetFeatures.csv / trainingSetNormParams.csv here.
    testingDir : str
        Folder with the test data; predictions go to
        PredictedTestSetResults.csv here.
    resultsDir : str
        Destination for classified fasta files (skipped if not writable).
    GetTrainingFeatures : bool
        When True, (re)extract features from the training set.
    GetTestFeatures : bool
        When True, train a model, extract test features and predict labels.
    classType : str
        Class-labelling scheme passed to featExt / writeClassifiedFastas.
    """
    print(profiler)

    if GetTrainingFeatures:
        print('Starting to extract features from training set')
        # Temporary measure: if features were already extracted and saved,
        # disable the following call to avoid re-extracting training features.
        featExt(directory=trainingDir, trainingSetFlag=True,
                classType=classType, normParams='.')
        print('Extracted training data features')

    # TODO: separate model training/prediction from feature extraction.
    if GetTestFeatures:
        print('Training predictive model')
        model, lb_encoder = trainClassifier(filename=trainingDir+'/trainingSetFeatures.csv',
                                            normFlag=False, classifierType='forest',
                                            kbest=0, alpha=False, optimalFlag=False)
        print('Model trained')

    # TODO: change to "if GetPredictions" once such a parameter exists.
    if GetTestFeatures:
        ## TODO: If more than 4k seqs, predict in chunks.
        print()
        print('Extracting features from test set')
        print("trainingDir: ",trainingDir)
        featExt(directory=testingDir, trainingSetFlag=False, classType='dir',
                normParams=(trainingDir+'/trainingSetNormParams.csv'))
        print('Extracted test data features')
        # DataFrame.from_csv was removed from pandas; read_csv(index_col=0)
        # is the documented replacement.
        dfTesting = pd.read_csv(testingDir+'/testingSetFeatures.csv', index_col=0)
        # Only the header is needed: the training columns define the feature
        # set (and order) that the trained model expects.
        dfTraining = pd.io.parsers.read_csv(trainingDir+'/trainingSetFeatures.csv',nrows=2)

        # FeatureFilt: keep only features that are in the training set, and
        # zero-pad any training feature missing from the test set. This is
        # crucial whenever feature filtering/selection was applied upstream.
        feature_cols = [col for col in dfTraining.columns
                        if col not in ('classname', 'Id', 'proteinname')]
        missing_cols = [col for col in feature_cols if col not in dfTesting.columns]

        print("Orig dfTesting.shape:", dfTesting.shape)
        print("Missing_cols (in dfTesting): \n", missing_cols)

        # BUG FIX: the original appended padded columns at the END of the
        # frame, so the column order fed to model.predict() did not match the
        # training order. reindex keeps the exact training column order and
        # fills missing features with 0 in a single pass.
        dfTesting = dfTesting.reindex(columns=feature_cols, fill_value=0)
        # Guard against NaNs produced during extraction.
        dfTesting.fillna(0, inplace=True)

        print("dfTraining (shape) was:", dfTraining.shape)
        print("dfTesting shape (after padding features):", dfTesting.shape)
        print("Features matched")

        features = dfTesting.values

        print('Predicting labels')
        results = model.predict(features)
        labels = lb_encoder.inverse_transform(results)
        dfTesting['classname'] = labels
        df2 = dfTesting['classname']
        # BUG FIX: this path used '\\' while every other path in this function
        # uses '/'; the backslash form produced a bogus filename on POSIX.
        df2.to_csv(testingDir+'/PredictedTestSetResults.csv')
        print('Saved results to ' + testingDir+'/PredictedTestSetResults.csv')
        if os.access(resultsDir, os.F_OK) and os.access(resultsDir, os.W_OK):
            writeClassifiedFastas(classType, testingDir, resultsDir, df2)
        else:
            print("Classified fastas were not written - no access to %s" % resultsDir)

        profiler.dump_stats('profile.txt')
Example #5
0
def _check_dir(dirPath, label):
    """Exit the program unless dirPath exists and is readable, writable and
    traversable. `label` names the directory in the error messages."""
    if not os.path.exists(dirPath):
        print("%s dir doesn't exist" % label)
        exit()
    if not (os.access(dirPath, os.R_OK) and os.access(dirPath, os.X_OK) and os.access(dirPath, os.W_OK)):
        # BUG FIX: the original message read "don' have permission".
        print("don't have permission to access %s dir" % label)
        exit()


def pipeline():
    """Command-line entry point: parse args, validate directories, extract
    features, train a classifier and predict labels for the test set.

    Reads all options from the module-level `parser` (argparse). Predictions
    are written to PredictedTestSetResults.csv in the testing directory and,
    when the results directory is writable, classified fastas are written
    there as well.
    """
    results = parser.parse_args()
    trainingDir = results.trainingDir
    testingDir = results.testingDir
    resultsDir = results.resultsDir
    GetTrainingFeatures = results.GetTrainingFeatures
    GetTestFeatures = results.GetTestFeatures
    classType = results.classType
    classifierType = results.classifierType
    outputTrainedModel = results.outputTrainedModel

    # Validate every directory the run will touch before doing any work
    # (the original repeated this stanza three times inline).
    if trainingDir:
        _check_dir(trainingDir, 'training')
    if testingDir:
        _check_dir(testingDir, 'testing')
    if resultsDir:
        _check_dir(resultsDir, 'results')

    print(profiler)

    if GetTrainingFeatures:
        print('Starting to extract features from training set')
        # Temporary measure: if features were already extracted and saved,
        # disable the following call to avoid re-extracting training features.
        featExt(directory=trainingDir, trainingSetFlag=True,
                classType=classType, normParams='.')
        print('Extracted training data features')

    # TODO: separate model training/prediction from feature extraction.
    if GetTestFeatures or outputTrainedModel:
        print('Training predictive model')
        model, lb_encoder = trainClassifier(filename=trainingDir+'/trainingSetFeatures.csv',
                                            normFlag=False, classifierType=classifierType,
                                            kbest=0, alpha=False, optimalFlag=False)
        print('Model trained')

    # TODO: change to "if GetPredictions" once such a parameter exists.
    if GetTestFeatures:
        ## TODO: If more than 4k seqs, predict in chunks.
        print()
        print('Extracting features from test set')
        print("trainingDir: ",trainingDir)
        featExt(directory=testingDir, trainingSetFlag=False, classType='dir',
                normParams=(trainingDir+'/trainingSetNormParams.csv'))
        print('Extracted test data features')
        # DataFrame.from_csv was removed from pandas; read_csv(index_col=0)
        # is the documented replacement.
        dfTesting = pd.read_csv(testingDir+'/testingSetFeatures.csv', index_col=0)
        # Only the header is needed: the training columns define the feature
        # set (and order) that the trained model expects.
        dfTraining = pd.io.parsers.read_csv(trainingDir+'/trainingSetFeatures.csv',nrows=2)

        # FeatureFilt: keep only features that are in the training set, and
        # zero-pad any training feature missing from the test set. This is
        # crucial whenever feature filtering/selection was applied upstream.
        feature_cols = [col for col in dfTraining.columns
                        if col not in ('classname', 'Id', 'proteinname')]
        missing_cols = [col for col in feature_cols if col not in dfTesting.columns]

        print("Orig dfTesting.shape:", dfTesting.shape)
        print("Missing_cols (in dfTesting): \n", missing_cols)

        # BUG FIX: the original appended padded columns at the END of the
        # frame, so the column order fed to model.predict() did not match the
        # training order. reindex keeps the exact training column order and
        # fills missing features with 0 in a single pass.
        dfTesting = dfTesting.reindex(columns=feature_cols, fill_value=0)
        # Guard against NaNs produced during extraction.
        dfTesting.fillna(0, inplace=True)

        print("dfTraining (shape) was:", dfTraining.shape)
        print("dfTesting shape (after padding features):", dfTesting.shape)
        print("Features matched")

        features = dfTesting.values

        print('Predicting labels')
        results = model.predict(features)
        labels = lb_encoder.inverse_transform(results)
        dfTesting['classname'] = labels
        df2 = dfTesting['classname']
        # BUG FIX: this path used '\\' while every other path in this function
        # uses '/'; the backslash form produced a bogus filename on POSIX.
        df2.to_csv(testingDir+'/PredictedTestSetResults.csv')
        print('Saved results to ' + testingDir+'/PredictedTestSetResults.csv')
        if os.access(resultsDir, os.F_OK) and os.access(resultsDir, os.W_OK):
            writeClassifiedFastas(classType, testingDir, resultsDir, df2)
        else:
            print("Classified fastas were not written - no access to %s" % resultsDir)

        profiler.dump_stats('profile.txt')