def knn(train, test, k=1, randFlag=False):
    """
    Score every loop in ``test`` against its nearest neighbor from ``train``.

    Each test instance is first paired with the result of
    ``neighbors(instance, train, k)`` (or ``rand_neighbors(...)`` when
    ``randFlag`` is truthy).  The first element of that result is treated as
    the best match: the test instance is converted with ``Model.fromLoop`` and
    compared against it via ``compare(..., max_rmsd=-1)``.

    Returns a list with one ``(neighbors_result, compare_score)`` tuple per
    test instance, in the iteration order of ``test``.

    NOTE(review): the first tuple element is the complete value returned by the
    neighbor search, not only its first entry, although the original inline
    comment called it "best_model" -- confirm which one callers expect.

    @author: Travis Peters
    """
    # Phase 1: run every neighbor search before any comparison takes place.
    # The search function is chosen per instance, so neither helper is looked
    # up when ``test`` is empty.
    candidates = [
        (instance, (rand_neighbors if randFlag else neighbors)(instance, train, k))
        for instance in test
    ]

    # Phase 2: compare each test instance with the head of its neighbor result.
    results = []
    for test_loop, matches in candidates:
        top_match = matches[0]
        score = Model.fromLoop(test_loop).compare(top_match, max_rmsd=-1)
        results.append((matches, score))
    return results
def compute_score_naive(bin_clusters, first_only=True):
    """Naive self-test: check whether each loop is classified back into its own model.

    ``bin_clusters`` is a sized sequence of ``(bin_data, models)`` pairs; only
    ``models`` is used.  For every cluster the loops of each model are gathered
    through ``model.get_loops``; models yielding two or fewer loops are left
    out, and a cluster with no remaining loops is dropped from the averages.

    Every remaining loop is ranked against the cluster's models with
    ``classify_loop_seq(loop.seq, models, blosum62)`` (sorted by descending
    score, i.e. element 1 of each entry) and contributes two numbers:

    * model score -- with ``first_only`` true: 1.0 if the top-ranked model
      contains the loop, otherwise 0.0.  With ``first_only`` false:
      ``(n - tries) / n``, where ``n`` is the number of ranked models and
      ``tries`` is how many models are ranked ahead of the first one that
      contains the loop.
    * structure score -- ``top_model.compare(Model.fromLoop(loop),
      max_rmsd=-1, verbose=False)``.

    With ``first_only`` true the structure scores are additionally averaged
    separately for hits and misses; with ``first_only`` false those two
    averages are never filled and are reported as NaN.

    Scores are averaged per cluster and then across the clusters that were
    kept.  The result is printed, not returned: either a "Total score" line or
    "Insufficient data to compute score" when no cluster was kept.
    """
    total_model_score = 0
    total_structure_score = 0
    # [sum of per-cluster means, number of contributing clusters] for
    # hits (index 0) and misses (index 1).
    total_partial = [[0, 0], [0, 0]]
    total_clusters = len(bin_clusters)

    for _bin_data, models in bin_clusters:
        # Gather the loops to test.  Original author's rationale for the
        # threshold: reject exact and close-to-exact matches -- why test what
        # we know is going to hit 100%?
        loop_set = []
        for model in models:
            model_loops = _collect_model_loops(model)
            if len(model_loops) > 2:
                loop_set += model_loops

        # Nothing suitable in this cluster: exclude it from the averages.
        if not loop_set:
            total_clusters -= 1
            continue

        cluster_model_score = 0
        cluster_structure_score = 0
        # [sum of structure scores, count] for hits (0) and misses (1).
        cluster_partial = [[0, 0], [0, 0]]

        for loop in loop_set:
            scores = classify_loop_seq(loop.seq, models, blosum62)
            scores = sorted(scores, key=lambda x: -x[1])
            candidate = Model.fromLoop(loop)
            top_model = scores[0][0]

            if first_only:
                # Only the top-ranked model is checked.
                top_loops = _collect_model_loops(top_model)
                structure_score = top_model.compare(candidate, max_rmsd=-1, verbose=False)
                if loop in top_loops:
                    model_score = 1.0
                    bucket = cluster_partial[0]
                else:
                    model_score = 0.0
                    bucket = cluster_partial[1]
                bucket[0] += structure_score
                bucket[1] += 1
            else:
                structure_score = top_model.compare(candidate, max_rmsd=-1, verbose=False)
                # Count how many ranked models precede the first one that
                # contains this loop (0 means the top-ranked model matched).
                tries = 0
                for entry in scores:
                    if loop in _collect_model_loops(entry[0]):
                        break
                    tries += 1
                # Higher score is better; float() keeps true division.
                model_score = (len(scores) - tries) / float(len(scores))

            cluster_model_score += model_score
            cluster_structure_score += structure_score

        cluster_model_score /= len(loop_set)
        cluster_structure_score /= len(loop_set)

        # Fold this cluster's hit/miss means into the totals.  A bucket that
        # received no loops is skipped explicitly instead of relying on a
        # swallowed ZeroDivisionError.
        for total_bucket, (score_sum, count) in zip(total_partial, cluster_partial):
            if count:
                total_bucket[0] += score_sum / count
                total_bucket[1] += 1

        total_model_score += cluster_model_score
        total_structure_score += cluster_structure_score

    if total_clusters != 0:
        total_model_score /= total_clusters
        total_structure_score /= total_clusters
        ps1 = _mean_or_nan(total_partial[0][0], total_partial[0][1])
        ps2 = _mean_or_nan(total_partial[1][0], total_partial[1][1])
        print("Total score: (%f, %f (%f, %f))" % (total_model_score, total_structure_score, ps1, ps2))
    else:
        print("Insufficient data to compute score")


def _collect_model_loops(model):
    """Return the loops that ``model.get_loops`` adds to a fresh, empty list."""
    loops = []
    model.get_loops(loops)
    return loops


def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    if not count:
        return float("nan")
    return total / count