def fit(parameters):
    # Load the training data
    D_train = data.loadData(parameters["training_fasta"])
    # Iterate through the fasta files
    for fasta in os.listdir(parameters["k_mers_path"]):
        # Get the k-mers of the current file
        K = kmers.loadKmers(parameters["k_mers_path"] + "/" + fasta)
        # Generate the samples matrix (X_train) and the target values (y_train)
        X_train, y_train = matrix.generateSamplesTargets(D_train, K, parameters["k"])
        # Instantiate a linear SVM classifier
        clf = SVC(kernel='linear', C=1, probability=True, random_state=0, cache_size=1000)
        # Fit the classifier
        clf.fit(X_train, y_train)
        # Get the index of the extension separator
        index = fasta.index(".")
        # Get the file name without its extension
        file_name = fasta[0:index]
        # Save the model
        joblib.dump(clf, parameters["model_path"] + "/" + file_name + ".pkl")
        # Information message
        print("Model: " + file_name + ".pkl saved at: " + parameters["model_path"])
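# A minimal usage sketch for the ensemble training routine above: one linear SVM is
# fitted and pickled per k-mer file found in k_mers_path. It assumes the usual imports
# (os, joblib, numpy, sklearn's SVC and classification_report, joblib's Parallel/delayed)
# and the project's data/kmers/matrix modules are available. Every path and the value
# of k below are hypothetical placeholders, not values shipped with the project.
example_parameters = {
    "training_fasta": "train.fasta",   # hypothetical training FASTA file
    "k_mers_path": "kmers",            # hypothetical directory of extracted k-mer files
    "model_path": "models",            # hypothetical output directory for the .pkl files
    "k": 9                             # hypothetical k-mer length
}
fit(example_parameters)                # trains and saves one sub-model per k-mer file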
def identifyPerfectMatch(parameters):
    # Display information
    print("\nIdentify perfect matches...")
    # Initialize the results dictionary
    Results = {}
    # Get the discriminative motifs
    Kmers = kmers.loadKmers(str(parameters["k_mers_path"]))
    # Get the sequence dataset
    Data = data.loadData(str(parameters["training_fasta"]))
    # Add the reference sequence
    Data = data.loadReferenceSequence(Data, str(parameters["refence_sequence_genbank"]))
    # Iterate through the k-mers
    for kmer in Kmers:
        # Display the current motif
        print("Signature: " + kmer)
        # Get the current k-mer
        query = kmer
        # Check, in parallel, whether the current k-mer aligns perfectly with each sequence
        informations = Parallel(n_jobs=-1)(
            delayed(perfectLocalPairwiseSequenceAlignment)(data, query) for data in Data)
        # Save the information of each sequence for the current k-mer
        Results[kmer] = informations
    # Return the dictionary of results
    return Results
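# A minimal sketch of a worker compatible with the Parallel call in identifyPerfectMatch
# above. The record layout (identifier first, nucleotide sequence second) and the returned
# fields are assumptions for illustration, not the project's actual implementation; a
# perfect local match of a k-mer reduces to an exact substring search.
def perfectLocalPairwiseSequenceAlignment(record, query):
    # Unpack the sequence identifier and the nucleotide sequence (assumed layout)
    sequence_id, sequence = record[0], str(record[1])
    # Locate the first exact occurrence of the k-mer (-1 if absent)
    position = sequence.find(query)
    # Report whether the signature matches perfectly and, if so, where
    return {"id": sequence_id, "perfect_match": position != -1, "position": position}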
def predict(parameters):
    # Get the path of the model file
    model_path = str(parameters["model_path"])
    # Get the path of the k-mers file
    k_mers_path = str(parameters["k_mers_path"])
    # Get the testing fasta file
    file_path = str(parameters["testing_fasta"])
    # Get the prediction file path
    prediction_path = str(parameters["prediction_path"])
    # Get the evaluation mode
    evaluation_mode = str(parameters["evaluation_mode"])
    # Load the testing data
    D = data.loadData(file_path)
    # Get the set of k-mers
    K = kmers.loadKmers(k_mers_path)
    # Get the k-mers length
    k = len(list(K.keys())[0])
    # Generate the samples matrix (X) and the target values (y)
    X, y = matrix.generateSamplesTargets(D, K, k)
    # Load the classifier
    clf = joblib.load(model_path)
    # Predict the classes of the sequences
    y_pred = clf.predict(X)
    # If the evaluation mode is enabled
    if evaluation_mode == "True":
        # If the target values list is empty
        if len(y) == 0:
            print("Evaluation cannot be performed because target values are not given")
        # Else display the classification report
        else:
            print("Classification report \n", classification_report(y, y_pred))
    # Save the predictions
    f = open(prediction_path, "w")
    # Write the header
    f.write("id,y_pred\n")
    # Iterate through the predictions
    for i, y_hat in enumerate(y_pred):
        # Save the current prediction
        f.write(D[i][0] + "," + y_hat + "\n")
    # Close the file
    f.close()
    # Display a confirmation message
    print("Predictions saved at the path:", prediction_path)
def fit(parameters):
    # Get the path of the model file
    model_path = str(parameters["model_path"])
    # Get the path of the k-mers file
    k_mers_path = str(parameters["k_mers_path"])
    # Get the path of the training fasta file
    file_path = str(parameters["training_fasta"])
    # Load the training data
    D = data.loadData(file_path)
    # Get the set of k-mers
    K = kmers.loadKmers(k_mers_path)
    # Get the k-mers length
    k = len(list(K.keys())[0])
    # Generate the samples matrix (X) and the target values (y)
    X, y = matrix.generateSamplesTargets(D, K, k)
    # Instantiate a linear SVM classifier
    clf = svm()
    # Fit the classifier
    clf.fit(X, y)
    # Save the model
    joblib.dump(clf, model_path)
    # Display a confirmation message
    print("Model saved at the path:", model_path)
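# A minimal usage sketch chaining the single-model fit() and predict() routines defined
# directly above. The parameter keys match those read by both functions; every path is a
# hypothetical placeholder, and evaluation_mode is passed as the string "True" because
# the code compares it against that literal.
single_model_parameters = {
    "training_fasta": "train.fasta",        # hypothetical training FASTA file
    "testing_fasta": "test.fasta",          # hypothetical testing FASTA file
    "k_mers_path": "kmers.fasta",           # hypothetical file of selected k-mers
    "model_path": "model.pkl",              # hypothetical pickled model file
    "prediction_path": "prediction.csv",    # hypothetical output CSV
    "evaluation_mode": "True"               # enables the classification report
}
fit(single_model_parameters)      # train and pickle the linear SVM
predict(single_model_parameters)  # predict, optionally evaluate, and write the CSV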
def predict(parameters):
    # Table of predictions
    y_pred = []
    # Table of classes
    classes = []
    # Table of membership probabilities
    probabilities = numpy.empty(0, float)
    # Load the testing data
    D_test = data.loadData(parameters["testing_fasta"])
    # Compute the membership probabilities for each model
    for fasta, model in zip(os.listdir(parameters["k_mers_path"]), os.listdir(parameters["model_path"])):
        # Load the current model
        clf = joblib.load(parameters["model_path"] + "/" + model)
        # Save the classes of the first model
        if len(classes) == 0:
            classes = clf.classes_
        # Get the current k-mers
        K = kmers.loadKmers(parameters["k_mers_path"] + "/" + fasta)
        # Generate the samples matrix (X_test) and the target values (y_test)
        X_test, y_test = matrix.generateSamplesTargets(D_test, K, parameters["k"])
        # Compute the membership probabilities for the initial sub-model
        if probabilities.shape[0] == 0:
            probabilities = clf.predict_proba(X_test)
        # Sum the membership probabilities of the additional sub-models
        else:
            probabilities += clf.predict_proba(X_test)
    # Iterate through the membership probabilities
    for p in probabilities:
        # Get the maximum score of the array
        max_score = numpy.max(p)
        # Get the index associated with the highest score of the array
        index = numpy.where(p == max_score)
        # Save the prediction
        y_pred.append(classes[index][0])
    # If the evaluation mode is enabled
    if parameters["evaluation_mode"] == "True":
        # If the target values list is empty
        if len(y_test) == 0:
            print("Evaluation cannot be performed because target values are not given")
        # Else display the classification report
        else:
            print("Classification report \n", classification_report(y_test, y_pred))
    # Save the predictions
    f = open(parameters["prediction_path"] + "/prediction.csv", "w")
    # Write the header
    f.write("id,y_pred\n")
    # Iterate through the predictions
    for i, y_hat in enumerate(y_pred):
        # Save the current prediction
        f.write(D_test[i][0] + "," + y_hat + "\n")
    # Close the file
    f.close()
    # Display a confirmation message
    print("Predictions saved at the path:", parameters["prediction_path"])
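# A minimal usage sketch for the ensemble prediction routine above: the class membership
# probabilities of every pickled sub-model are summed, and the class with the highest
# cumulative probability is reported for each sequence. Paths and k are hypothetical
# placeholders matching the keys read by this predict().
ensemble_parameters = {
    "testing_fasta": "test.fasta",     # hypothetical testing FASTA file
    "k_mers_path": "kmers",            # hypothetical directory of k-mer files
    "model_path": "models",            # hypothetical directory of pickled sub-models
    "prediction_path": "predictions",  # hypothetical output directory (prediction.csv is created inside)
    "k": 9,                            # hypothetical k-mer length
    "evaluation_mode": "True"          # enables the classification report when targets exist
}
predict(ensemble_parameters)           # writes predictions/prediction.csv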