Example #1
0
def fit(parameters):
    """Train and save one linear SVM classifier per k-mers file.

    Every entry found in ``parameters["k_mers_path"]`` is loaded as a set of
    k-mers, a classifier is fitted on the training sequences with those
    k-mers, and the fitted model is dumped into ``parameters["model_path"]``
    as ``<entry name up to its first dot>.pkl``.

    Args:
        parameters: Mapping providing the keys ``"training_fasta"``,
            ``"k_mers_path"``, ``"model_path"`` and ``"k"``.
    """
    # Load the training data once; it is shared by every sub-model
    D_train = data.loadData(parameters["training_fasta"])
    # Iterate through the k-mers files
    for fasta in os.listdir(parameters["k_mers_path"]):
        # Get the k-mers of the current file
        K = kmers.loadKmers(os.path.join(parameters["k_mers_path"], fasta))
        # Generate the samples matrix (X_train) and the target values (y_train)
        X_train, y_train = matrix.generateSamplesTargets(
            D_train, K, parameters["k"])
        # Instantiate a linear svm classifier; probability=True is needed so
        # the saved model can later expose predict_proba
        clf = SVC(kernel='linear',
                  C=1,
                  probability=True,
                  random_state=0,
                  cache_size=1000)
        # Fit the classifier
        clf.fit(X_train, y_train)
        # Keep the part of the name before the first dot. partition() returns
        # the whole name when there is no dot, where str.index() would raise
        # ValueError and abort the whole training run.
        file_name = fasta.partition(".")[0]
        # Save the model
        joblib.dump(clf,
                    os.path.join(parameters["model_path"], file_name + ".pkl"))
        # Information message
        print("Model: " + file_name + ".pkl saved at: " +
              parameters["model_path"])
Example #2
0
def identifyPerfectMatch(parameters):
    """Align every discriminative k-mer against every loaded sequence.

    Args:
        parameters: Mapping providing the keys ``"k_mers_path"``,
            ``"training_fasta"`` and ``"refence_sequence_genbank"``.

    Returns:
        A dict keyed by k-mer. Each value is what the parallel run of
        ``perfectLocalPairwiseSequenceAlignment`` produced for that k-mer,
        one result per sequence.
    """
    # Display information
    print("\nIndentify perfect matches...")
    # Results are collected per k-mer
    matches = {}
    # Load the discriminative motifs
    motifs = kmers.loadKmers(str(parameters["k_mers_path"]))
    # Load the sequence dataset, then add the reference sequence to it
    sequences = data.loadData(str(parameters["training_fasta"]))
    sequences = data.loadReferenceSequence(
        sequences, str(parameters["refence_sequence_genbank"]))
    for motif in motifs:
        # Display the current motif
        print("Signature: " + motif)
        # One alignment job per sequence, spread over all available cores
        jobs = (delayed(perfectLocalPairwiseSequenceAlignment)(sequence, motif)
                for sequence in sequences)
        matches[motif] = Parallel(n_jobs=-1)(jobs)
    return matches
Example #3
0
def predict(parameters):
    """Predict the class of each testing sequence and save the result as CSV.

    The saved classifier is loaded from ``parameters["model_path"]``, applied
    to the samples matrix built from the testing fasta file and the k-mers
    file, and the predictions are written to ``parameters["prediction_path"]``
    with the header ``id,y_pred``.

    Args:
        parameters: Mapping providing the keys ``"model_path"``,
            ``"k_mers_path"``, ``"testing_fasta"``, ``"prediction_path"`` and
            ``"evaluation_mode"``. A classification report is printed only
            when the string form of ``"evaluation_mode"`` equals ``"True"``.
    """
    # Read every parameter up front so a missing key fails before any work
    model_path = str(parameters["model_path"])
    k_mers_path = str(parameters["k_mers_path"])
    file_path = str(parameters["testing_fasta"])
    prediction_path = str(parameters["prediction_path"])
    evaluation_mode = str(parameters["evaluation_mode"])
    # Load the testing data
    D = data.loadData(file_path)
    # Get the set of k-mers
    K = kmers.loadKmers(k_mers_path)
    # The k-mers length is taken from the first key of the set
    k = len(list(K.keys())[0])
    # Generate the samples matrix (X) and the target values (y)
    X, y = matrix.generateSamplesTargets(D, K, k)
    # Load the classifier
    clf = joblib.load(model_path)
    # Predict the sequences
    y_pred = clf.predict(X)
    # Evaluation is only possible when target values are available
    if evaluation_mode == "True":
        if len(y) == 0:
            print("Evaluation cannot be performed because target values are not given")
        else:
            print("Classification report \n", classification_report(y, y_pred))
    # Save the predictions; the context manager closes the file even if a
    # write fails part-way through
    with open(prediction_path, "w") as f:
        # Write the header
        f.write("id,y_pred\n")
        # A dedicated loop name avoids shadowing the target values ``y``
        for i, label in enumerate(y_pred):
            f.write(D[i][0] + "," + label + "\n")
    # Displays a confirmation message
    print("Predictions saved at the path:", prediction_path)
Example #4
0
def fit(parameters):
    """Fit a classifier on the training sequences and save it to disk.

    Args:
        parameters: Mapping providing the keys ``"model_path"``,
            ``"k_mers_path"`` and ``"training_fasta"``; each value is
            converted to ``str`` before use.
    """
    # Resolve the paths (same lookup order as before)
    model_file = str(parameters["model_path"])
    kmers_file = str(parameters["k_mers_path"])
    training_file = str(parameters["training_fasta"])
    # Load the training sequences and the set of k-mers
    sequences = data.loadData(training_file)
    kmer_set = kmers.loadKmers(kmers_file)
    # The k-mers length is taken from the first key of the set
    first_kmer = list(kmer_set.keys())[0]
    kmer_length = len(first_kmer)
    # Build the samples matrix and the target values
    samples, targets = matrix.generateSamplesTargets(
        sequences, kmer_set, kmer_length)
    # NOTE(review): svm is defined outside this block; presumably it builds
    # the linear SVM classifier - confirm against its definition
    classifier = svm()
    classifier.fit(samples, targets)
    # Persist the fitted model
    joblib.dump(classifier, model_file)
    # Displays a confirmation message
    print("Model saved at the path:", model_file)
Example #5
0
def predict(parameters):
    """Predict classes by summing the probabilities of several sub-models.

    Each k-mers file in ``parameters["k_mers_path"]`` is paired with a saved
    model in ``parameters["model_path"]``. The membership probabilities of
    all sub-models are summed, the class with the highest total is kept for
    each sequence, and the result is written to ``prediction.csv`` inside
    ``parameters["prediction_path"]`` with the header ``id,y_pred``.

    Args:
        parameters: Mapping providing the keys ``"testing_fasta"``,
            ``"k_mers_path"``, ``"model_path"``, ``"k"``,
            ``"evaluation_mode"`` and ``"prediction_path"``. A classification
            report is printed only when ``"evaluation_mode"`` equals the
            string ``"True"``.
    """
    # Load the testing data
    D_test = data.loadData(parameters["testing_fasta"])
    k_mers_dir = parameters["k_mers_path"]
    model_dir = parameters["model_path"]

    # Class labels of the first sub-model; None until a model is loaded
    classes = None
    # Running sum of the membership probabilities; None until a model is used
    probabilities = None
    # Defined up front so evaluation mode does not hit an unbound name when
    # the directories contain no model
    y_test = []

    # os.listdir returns entries in arbitrary order, so both listings are
    # sorted to pair files deterministically.
    # NOTE(review): assumes matching k-mers and model files sort into the
    # same positions (e.g. they share a name stem) - confirm naming scheme
    for fasta, model in zip(sorted(os.listdir(k_mers_dir)),
                            sorted(os.listdir(model_dir))):
        # Load the current model (once per iteration)
        clf = joblib.load(os.path.join(model_dir, model))
        if classes is None:
            classes = clf.classes_
        # Get the current k-mers
        K = kmers.loadKmers(os.path.join(k_mers_dir, fasta))
        # Generate the samples matrix (X_test) and the target values (y_test)
        X_test, y_test = matrix.generateSamplesTargets(D_test, K,
                                                       parameters["k"])
        # Add the membership probabilities of this sub-model to the total
        scores = clf.predict_proba(X_test)
        if probabilities is None:
            probabilities = scores
        else:
            probabilities = probabilities + scores

    # Keep, for each sequence, the class with the highest summed score;
    # argmax returns the first maximum, like the previous where()[0] lookup
    if probabilities is None:
        y_pred = []
    else:
        y_pred = [classes[numpy.argmax(p)] for p in probabilities]

    # Evaluation is only possible when target values are available
    if parameters["evaluation_mode"] == "True":
        if len(y_test) == 0:
            print(
                "Evaluation cannot be performed because target values are not given"
            )
        else:
            print("Classification report \n",
                  classification_report(y_test, y_pred))

    # Save the predictions; the context manager closes the file even if a
    # write fails part-way through
    output_file = os.path.join(parameters["prediction_path"], "prediction.csv")
    with open(output_file, "w") as f:
        # Write the header
        f.write("id,y_pred\n")
        for i, label in enumerate(y_pred):
            f.write(D_test[i][0] + "," + str(label) + "\n")
    # Displays a confirmation message
    print("Predictions saved at the path:", parameters["prediction_path"])