Beispiel #1
0
def quantify_data():
    sets = all_set_filepath()
    classification_to_number = {"H":0, "E": 1, "C":2}
    pc = get_given_principal_components()
    for dataset in sets:
        train_classification_file = []
        train_sequence_file = os.path.join(dataset,"SVM_Training_Categorical_Feature.txt")
        train_sequence = get_string(train_sequence_file)
        for i in range(5):
            filename = "SVM_Training_Classification_Transformed_Rule_" + str(i+1) + ".txt"
            train_classification_file.append(os.path.join(dataset,filename))
        for file in train_classification_file:
            print file
            number = file[-5]
            tf = get_tendency_factor(train_sequence_file, file)
            classification = get_string(file)
            filename = os.path.join(dataset,"SVM_Training_Feature_" + number + ".txt")
            create_file(filename)
            filename = os.path.join(dataset,"SVM_Training_Label_" + number + ".txt")
            create_file(filename)
            list = []
            for i in range(len(classification)):
                code = tf[train_sequence[i]] + pc[train_sequence[i]]
                a,b,c,d,e,f = code
                code = (a,b,c,d,e,f,classification_to_number[classification[i]])
                list.append(code)
            for a,b,c,d,e,f,g in list:
                file = open(os.path.join(dataset,"SVM_Training_Feature_" + number + ".txt"), "a+")
                file.write(str(round(a,2))+","+str(round(b,2))+","+str(round(c,2))+","+str(round(d,2))+","+str(round(e,2))+","+str(round(f,2))+"\n")
                file.close()
                file = open(os.path.join(dataset,"SVM_Training_Label_" + number + ".txt"), "a+")
                file.write(str(g) + "\n")
                file.close()
def predict_and_test():  # better named results
    """Run the full prediction pipeline over every dataset directory.

    First performs similarity-based prediction (window 15, threshold 0.9)
    for all sets, then fills the gaps with SVM prediction and appends the
    accuracy figures to Overall_Results.txt.
    """
    create_file("Overall_Results.txt")
    sets = all_set_filepath()
    # Iterate the paths directly instead of indexing by range(len(...)).
    for dataset in sets:
        similarity_prediction(dataset, 15, 0.9)
    for dataset in sets:
        svm_prediction(dataset)
        get_accuracy_state(dataset)
def svm_prediction(setpath):
    number = setpath[-1]
    pc = get_given_principal_components()
    test_sequence_file = os.path.join(setpath,
                                      "SVM_Testing_Categorical_Feature.txt")

    with open(test_sequence_file) as f:
        test_proteins = {
            line.split("|")[0]: line.split("|")[1][:-1]
            for line in f
        }

    for m in range(5):
        filename = "SVM_Set_" + number + "_" + str(m + 1) + ".pkl"
        print filename
        classifier = joblib.load(filename)
        print classifier
        test_classification_file = os.path.join(
            setpath, "SVM_Testing_Classification_Transformed_Rule_" +
            str(m + 1) + ".txt")
        tf = get_tendency_factor(test_sequence_file, test_classification_file)

        filename = "Prediction_Based_On_Similarity_" + str(m + 1) + ".txt"
        print filename
        print "started", strftime("%Y-%m-%d %H:%M:%S", gmtime())
        with open(os.path.join(setpath, filename)) as f:
            predictions = {
                line.split("|")[0]: line.split("|")[1][:-1]
                for line in f
            }
        print len(predictions)
        create_file(os.path.join(setpath, "temp.txt"))
        for name, item in predictions.iteritems():
            for i in range(len(item)):
                pass
                if item[i] == " ":
                    code = tf[test_proteins[name][i]] + pc[test_proteins[name]
                                                           [i]]
                    a, b, c, d, e, f = code
                    code = (round(a, 2), round(b, 2), round(c, 2), round(d, 2),
                            round(e, 2), round(f, 2))
                    nparray = np.array(list(code))
                    reshaped_array = nparray.reshape(1, -1)
                    svm_output = int(
                        classifier.predict(reshaped_array).tolist()[0])
                    svm_output_to_categorical = {0: "H", 1: "E", 2: "C"}
                    item = item[:i] + svm_output_to_categorical[
                        svm_output] + item[i + 1:]
                    predictions[name] = item
        file = open(os.path.join(setpath, "temp.txt"), "a+")
        for name, item in predictions.iteritems():
            file.write(name + "|" + item + "\n")
        file.close()
        os.remove(os.path.join(setpath, filename))
        os.rename(os.path.join(setpath, "temp.txt"),
                  os.path.join(setpath, filename))
        print "completed", strftime("%Y-%m-%d %H:%M:%S", gmtime())
Beispiel #4
0
def purify_data():
    """Rebuild the training sequence/classification files with unavailable
    entries removed, replacing the originals in place."""
    create_file("new_sequence.txt")
    create_file("new_secondary_structure.txt")
    cleaned = remove_unavailable(get_all_sequence())
    write_file(cleaned)
    update_secondary_structure(cleaned.keys())
    # Drop the old files, then promote the cleaned ones to their names.
    for stale in ("Training_Sequence.txt", "Training_Classification.txt"):
        os.remove(stale)
    os.rename("new_sequence.txt", "Training_Sequence.txt")
    os.rename("new_secondary_structure.txt", "Training_Classification.txt")
Beispiel #5
0
def split_categorical_data(proportion):
    path = os.path.realpath("Training_Set_1")
    while os.path.exists(path):
        path = path[:-1] + str(int(path[-1])+1)
    os.mkdir(path)
    
    sequence = get_all_sequence()
    classification = get_all_classification()
    
    if len(classification.keys()) != len(sequence.keys()):
        print "classification and sequence are of different length"
        return
    if proportion*len(classification) < 1 and proportion*len(classification) > 0:
        print "proportion too small"
        return
    elif proportion >= 1 or proportion <= 0:
        print "proportion must be within (0,1)"
        return
    
    create_file(os.path.join(path,"SVM_Training_Categorical_Feature.txt"))
    create_file(os.path.join(path,"SVM_Training_Categorical_Label.txt"))
    create_file(os.path.join(path,"SVM_Testing_Categorical_Feature.txt"))
    create_file(os.path.join(path,"SVM_Testing_Categorical_Label.txt"))
    create_file(os.path.join(path,"SVM_Reference_Categorical_Feature.txt"))
    create_file(os.path.join(path,"SVM_Reference_Categorical_Label.txt"))
    
    keylist = classification.keys()
    shuffle(keylist)
    training_length = proportion*len(classification)
    for i in range(len(keylist)):
        if i < training_length:
            file = open(os.path.join(path,"SVM_Training_Categorical_Feature.txt"),"a+") 
            file.write(keylist[i]+"|"+sequence[keylist[i]]+"\n")
            file.close()
            file = open(os.path.join(path,"SVM_Training_Categorical_Label.txt"),"a+")
            file.write(keylist[i]+"|"+classification[keylist[i]] + "\n")
            file.close()
        elif i < 2*training_length:
            file = open(os.path.join(path,"SVM_Reference_Categorical_Feature.txt"),"a+")
            file.write(keylist[i]+"|"+sequence[keylist[i]]+"\n")
            file.close()
            file = open(os.path.join(path,"SVM_Reference_Categorical_Label.txt"),"a+")
            file.write(keylist[i]+"|"+classification[keylist[i]] + "\n")
            file.close()
        else:
            file = open(os.path.join(path,"SVM_Testing_Categorical_Feature.txt"),"a+") 
            file.write(keylist[i]+"|"+sequence[keylist[i]]+"\n")
            file.close()
            file = open(os.path.join(path,"SVM_Testing_Categorical_Label.txt"),"a+")
            file.write(keylist[i]+"|"+classification[keylist[i]] + "\n")
            file.close()
    if length_of(os.path.join(path,"SVM_Training_Categorical_Feature.txt")) + length_of(os.path.join(path,"SVM_Testing_Categorical_Feature.txt")) + length_of(os.path.join(path,"SVM_Reference_Categorical_Feature.txt"))== len(keylist):
        print "successful separation"
def similarity_prediction(setpath, window_size, threshold):

    file = open("Overall_Results.txt", "a+")
    file.write("Trial " + setpath[-1] + ":\n")
    file.close()
    if window_size % 2 == 0:
        sidelength = window_size / 2
    else:
        sidelength = (window_size - 1) / 2
    for m in range(5):
        with open(
                os.path.join(setpath,
                             "SVM_Reference_Categorical_Feature.txt")) as f:
            list = [line.split("|")[0] for line in f]
        filename = "SVM_Reference_Classification_Transformed_Rule_" + str(
            m + 1) + ".txt"
        with open(os.path.join(setpath, filename)) as f:
            list2 = [line.split("|")[0] for line in f]
        if list != list2:
            return

        with open(
                os.path.join(setpath,
                             "SVM_Reference_Categorical_Feature.txt")) as f:
            refs = {line.split("|")[0]: line.split("|")[1][:-1] for line in f}
        with open(os.path.join(setpath,
                               "SVM_Testing_Categorical_Feature.txt")) as f:
            test = {line.split("|")[0]: line.split("|")[1][:-1] for line in f}
        filename = "SVM_Reference_Classification_Transformed_Rule_" + str(
            m + 1) + ".txt"
        with open(os.path.join(setpath, filename)) as f:
            ref_structure = {
                line.split("|")[0]: line.split("|")[1][:-1]
                for line in f
            }

        unpredictable_amino_acids = 0
        total_amino_acids = 0
        bad_proteins = {}
        predicted_proteins = {}
        test_length = 0
        for protein in test.itervalues():
            test_length += len(protein)
        print test_length
        for name, protein in test.iteritems():
            prediction = ""
            for i in range(len(protein)):
                total_amino_acids += 1
                if i - sidelength < 0 or i + sidelength >= len(protein):
                    prediction += " "
                    unpredictable_amino_acids += 1
                    continue
                protein_sequence = protein[i - sidelength:i + sidelength + 1]
                protein_sequence_3d = Complicated_Math.get_coordinate(
                    (1, 2, 7), protein_sequence)
                similar = similar_amino_acid(protein_sequence_3d, refs,
                                             window_size, threshold)
                if similar == None:
                    prediction += " "
                    unpredictable_amino_acids += 1
                    continue
                else:
                    sim_name, index = similar
                    prediction += ref_structure[sim_name][index]
            if len(prediction) != len(protein):
                print "Protein", name, "predicted secondary structure length is problematic."
                bad_proteins[name] = prediction
                continue
            predicted_proteins[name] = prediction
            print str(total_amino_acids), "/", str(test_length)

        file = open("Overall_Results.txt", "a+")
        file.write(
            "When using translation method " + str(m + 1) + ", " +
            str(round(unpredictable_amino_acids /
                      float(total_amino_acids), 1)) +
            "% of Proteins cannot be predicted via similarity analysis.\n")
        file.close()

        filename = "Prediction_Based_On_Similarity_" + str(m + 1) + ".txt"
        create_file(os.path.join(setpath, filename))
        file = open(os.path.join(setpath, filename), "a+")
        for name, item in predicted_proteins.iteritems():
            file.write(name + "|" + item + "\n")
        file.close()

        filename = "Prediction_Pure_SVM_" + str(m + 1) + ".txt"
        create_file(os.path.join(setpath, filename))
        file = open(os.path.join(setpath, filename), "a+")
        for name, item in predicted_proteins.iteritems():
            file.write(name + "|")
            for i in item:
                file.write(" ")
            file.write("\n")
        file.close()