def eps_neighbourhood(X, index, eps, metric):
    """
    Return the indices of the points of ``X`` lying within ``eps`` of ``X[index]``.

    A KD-tree is built over ``X`` on every call and queried with a single
    point, so the result is the index array for that one query.

    :param X: data the tree is built from
    :param index: position in ``X`` of the point whose neighbourhood is wanted
    :param eps: search radius
    :param metric: distance metric forwarded to ``KDTree``
    :return: vector of indices of the points found inside the radius
    """
    kd_tree = KDTree(X, leaf_size=2, metric=metric)
    # query_radius answers a batch of queries; wrap the single point into a
    # one-element batch and unwrap the single result.
    per_query_hits = kd_tree.query_radius([X[index]], r=eps)
    return per_query_hits[0]
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Compare ``KDTree.query_radius`` with a brute-force radius search.

    Random points are drawn from ``[-1, 1)`` per coordinate; for 100 radii
    between the norm of the first and of the last sample, the indices
    returned by the tree for a query at the origin must match the indices
    whose norm is within the radius.
    """
    random_state = check_random_state(0)
    points = 2 * random_state.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    tol = 1E-15  # roundoff error can cause test to fail
    tree = KDTree(points, leaf_size=5)

    # Brute-force distance of every sample to the query point.
    offsets = points - origin
    norms = np.sqrt((offsets ** 2).sum(1))

    radii = np.linspace(norms[0], norms[-1], 100)
    for radius in radii:
        limit = radius + tol
        found = tree.query_radius([origin], limit)[0]
        expected = np.where(norms <= limit)[0]

        # The tree returns indices in no particular order.
        found.sort()
        expected.sort()

        assert_array_almost_equal(expected, found)
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Check ``KDTree.query_radius`` against a brute-force radius search.

    Random points are drawn from ``[-1, 1)`` per coordinate. For 100 radii
    spanning the norm of the first sample to the norm of the last one, the
    index set returned by the tree for a query at the origin must equal the
    set of samples whose norm is within the radius.
    """
    # Local generator: yields the same stream as np.random.seed(0) followed by
    # np.random.random, but leaves the global NumPy RNG state untouched.
    rng = np.random.RandomState(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # query_radius expects a 2-D batch of query points; a bare 1-D vector
        # is rejected by current scikit-learn, so pass a one-row batch.
        ind = kdt.query_radius([query_pt], r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        # The tree returns indices in no particular order.
        ind.sort()
        i.sort()

        assert_allclose(i, ind)
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Radius queries at the origin must agree with a direct norm comparison.

    Builds a ``KDTree`` over uniformly random samples in ``[-1, 1)`` and, for
    a sweep of 100 radii, asserts that the tree's neighbour indices equal the
    indices of the samples whose Euclidean norm does not exceed the radius.
    """
    generator = check_random_state(0)
    samples = 2 * generator.random_sample(size=(n_samples, n_features)) - 1
    centre = np.zeros(n_features, dtype=float)
    slack = 1E-15  # roundoff error can cause test to fail
    index_tree = KDTree(samples, leaf_size=5)
    sample_norms = np.sqrt(((samples - centre)**2).sum(1))

    # Sweep from the norm of the first sample to the norm of the last one.
    for cutoff in np.linspace(sample_norms[0], sample_norms[-1], 100):
        from_tree = np.sort(index_tree.query_radius([centre], cutoff + slack)[0])
        brute_force = np.sort(np.where(sample_norms <= cutoff + slack)[0])
        assert_array_almost_equal(brute_force, from_tree)
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances reported by ``KDTree.query_radius`` must match recomputed ones.

    For a sweep of 100 radii around the origin, the distances returned with
    ``return_distance=True`` are compared with the Euclidean distances
    computed directly from the returned neighbour indices.
    """
    random_state = check_random_state(0)
    points = 2 * random_state.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    tol = 1E-15  # roundoff error can cause test to fail
    tree = KDTree(points, leaf_size=5)
    norms = np.sqrt(((points - origin) ** 2).sum(1))

    radii = np.linspace(norms[0], norms[-1], 100)
    for radius in radii:
        idx_batch, dist_batch = tree.query_radius(
            [origin], radius + tol, return_distance=True
        )
        # Single query point, so only the first entry of each batch is used.
        neighbour_idx = idx_batch[0]
        reported = dist_batch[0]

        offsets = origin - points[neighbour_idx]
        recomputed = np.sqrt((offsets ** 2).sum(1))

        assert_array_almost_equal(recomputed, reported)
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check the distances returned by ``KDTree.query_radius``.

    For 100 radii spanning the norm of the first sample to the norm of the
    last one, the distances returned with ``return_distance=True`` for a
    query at the origin must match the Euclidean distances recomputed from
    the returned indices.
    """
    # Local generator: yields the same stream as np.random.seed(0) followed by
    # np.random.random, but leaves the global NumPy RNG state untouched.
    rng = np.random.RandomState(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # query_radius expects a 2-D batch of query points; a bare 1-D vector
        # is rejected by current scikit-learn, so pass a one-row batch.
        ind, dist = kdt.query_radius([query_pt], r + eps, return_distance=True)

        # Single query point, so only the first entry of each result is used.
        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_allclose(d, dist)
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Verify that ``KDTree.query_radius`` reports correct neighbour distances.

    A tree is built over uniformly random samples in ``[-1, 1)``. For each of
    100 radii around the origin, the distances returned alongside the
    neighbour indices are compared with distances recomputed directly from
    the data.
    """
    # A private RandomState(0) reproduces the values np.random.seed(0) +
    # np.random.random would give without reseeding the global generator.
    rng = np.random.RandomState(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # Pass the query as a one-row 2-D batch: query_radius is documented to
        # take an array of query points, and 1-D input is rejected by current
        # scikit-learn.
        ind, dist = kdt.query_radius([query_pt], r + eps, return_distance=True)

        # Unwrap the results belonging to the single query point.
        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_allclose(d, dist)
def fit_predict(self, xs: np.ndarray, ys: np.ndarray = None):
    """Cluster the rows of ``xs`` and return one cluster id per row.

    Every point whose ``self.eps``-radius neighbourhood (as returned by the
    KD-tree) holds at least ``self.min_samples`` points is merged with all
    of those neighbours in a disjoint-set structure. Each resulting set
    becomes one cluster; ids are consecutive integers starting at 0,
    assigned to the sets in increasing order of their representative's
    index. Points that were never merged keep a set of their own and thus
    receive their own id.

    :param xs: data, one point per row (``xs.shape[0]`` points)
    :param ys: accepted for signature compatibility only; it is neither
        read nor modified
    :return: list of ``xs.shape[0]`` integer cluster ids
    """
    kd_tree = KDTree(xs, metric=self.metric, leaf_size=self.leaf_size)
    n_points = xs.shape[0]
    neighbours = kd_tree.query_radius(X=xs, r=self.eps)

    dsu = DisjointSetUnion(n_points)
    for i, neighs in enumerate(neighbours):
        # Only points with a dense enough neighbourhood pull neighbours in.
        if neighs.shape[0] < self.min_samples:
            continue
        for j in neighs:
            dsu.merge(i, j)

    # Resolve every point's representative once, after all merges are done.
    roots = [dsu.find(i) for i in range(n_points)]

    # Number the sets by increasing representative index. The ids live in a
    # fresh mapping so that a caller-supplied ``ys`` is never overwritten.
    cluster_ids = {}
    for i, root in enumerate(roots):
        if i == root:
            cluster_ids[i] = len(cluster_ids)

    return [cluster_ids[root] for root in roots]
def compute_average_scores(pdb_path, cat, it, bu):
    """Map patch-level predictions onto residues and write per-residue scores.

    For every file matching ``<pdb_path>*_<it>_<bu>.pdb`` the function reads
    the companion ``*_patch_coord.txt`` / ``*_patch_score.txt`` files, labels
    each patch (via an isolation forest when ``outlier_fraction[(cat, it, bu)]``
    is truthy, otherwise by the sign of the thresholded score), spreads the
    positively labelled patches onto nearby residues and writes the averaged
    residue scores to ``<pdb_path>our_prediction/<pdb_id>_residue_scores.txt``.

    Relies on module-level names not visible here: ``threshold``,
    ``outlier_fraction``, ``n_iterations``, ``mapping_distance``, ``p``,
    ``isHydrogen`` and ``isHETATM``. The code uses Python 2 syntax
    (``print`` statement, ``xrange``).

    :param pdb_path: path prefix; it is concatenated directly with file names,
        so it presumably has to end with a path separator -- TODO confirm
    :param cat: first component of the key used to look up ``threshold`` and
        ``outlier_fraction``
    :param it: second key component; also part of the PDB file name pattern
    :param bu: third key component; also part of the PDB file name pattern
    :return: None; results are written to disk and progress is printed
    """
    files = glob("%s*_%s_%s.pdb" % (pdb_path, it, bu))
    for pdb_filename in sorted(files) :
        # strip the 4-character ".pdb" suffix to obtain the identifier
        pdb_id = basename(pdb_filename)[:-4]
        pdb_patch_coord = ("%s%s_patch_coord.txt" % (pdb_path, pdb_id))
        pdb_patch_score = ("%s%s_patch_score.txt" % (pdb_path, pdb_id))
        with open(pdb_patch_coord) as coord, open(pdb_patch_score) as score:
            # one whitespace-separated coordinate row per line
            patch_coord = [[float(x) for x in a.split()] for a in coord.readlines()]
            # one score per line, shifted so that the threshold sits at zero
            patch_score = [float(x) - threshold[(cat, it, bu)] for x in score.readlines()]
        min_v = min(patch_score)
        max_v = max(patch_score)
        # negative scores are scaled by the minimum (and stay negative),
        # non-negative scores are scaled by the maximum
        # NOTE(review): a score of exactly 0 combined with max_v == 0 would
        # divide by zero here -- confirm whether that input can occur
        patch_score_scaled = [(lambda x: -(x / min_v) if x < 0 else (x / max_v))(x) for x in patch_score]
        # training set: coordinates of the patches with non-negative scaled
        # score, weighted by that score
        X = np.array([a[0] for a in zip(patch_coord, patch_score_scaled) if a[1] >= 0])
        X_weights = np.array([x for x in patch_score_scaled if x >= 0])
        pdb_structure = p.get_structure(pdb_id, pdb_filename)
        # coordinates of all atoms that are neither hydrogens nor HETATMs
        atoms = np.array([atm.get_coord() for atm in pdb_structure.get_atoms() if not isHydrogen(atm) and not isHETATM(atm)])
        atoms_tree = KDTree(atoms)
        # lookup from an atom's coordinate tuple to the residue containing it
        residues_coord = {}
        for residue in pdb_structure.get_residues() :
            for atm in residue :
                residues_coord[tuple(atm.get_coord())] = residue
        average_residues_scores = {residue : 0 for residue in pdb_structure.get_residues()}
        # since the isolation forest algorithm is random, we run it several times to assess the average performance of the method
        if outlier_fraction[(cat, it, bu)] :
            reps = n_iterations
        else :
            reps = 1
        for iteration in xrange(reps) :
            print "Running iteration %d of %d" % (iteration + 1, reps)
            if outlier_fraction[(cat, it, bu)] :
                forest = IsolationForest(contamination=outlier_fraction[(cat, it, bu)], n_jobs=-1)
                forest.fit(X, sample_weight=X_weights)
                # the forest is fitted on the non-negative patches only but
                # predicts a label for every patch
                prediction_isolation_forest = forest.predict(patch_coord)
                patch_pred_no_outliers = [copysign(1, x) for x in prediction_isolation_forest]
            else :
                # no outlier removal: the label is the sign of the shifted score
                patch_pred_no_outliers = [copysign(1, x) for x in patch_score]
            # here we map the patch predictions on the underlying residues
            for i in xrange(len(patch_coord)) : # for each patch
                # if it was predicted as non-interface continue to the next
                if patch_pred_no_outliers[i] < 0 :
                    continue
                # multiple residues can be underneath a given patch, we do not want to consider the same residue more than once
                marked_residues = set()
                # get all atoms within mapping_distance from the given patch center
                indexes = atoms_tree.query_radius([patch_coord[i]], r=mapping_distance, count_only = False, return_distance=True, sort_results = True)
                # ind[0] is used as an atom index and ind[1] as its distance;
                # with sort_results the closest atoms presumably come first,
                # so each residue is scored by its nearest atom -- TODO confirm
                for ind in zip(indexes[0][0], indexes[1][0]) :
                    # which residue does the current atom belong to?
                    current_res = residues_coord[tuple(atoms[ind[0]])]
                    # if already considered continue to the next
                    if current_res in marked_residues :
                        continue
                    # increase the score of the current residue
                    average_residues_scores[current_res] += 1 / (1.0 + ind[1]) # patch_pred_no_outliers[i] / (1.0 + ind[1])
                    # mark as seen for the current patch
                    marked_residues.add(current_res)
        # turn the accumulated sums into averages over the repetitions
        average_residues_scores.update((x, y / reps) for x, y in average_residues_scores.items())
        # build (full_id[2], str(full_id[3][1]) + full_id[3][2], score) tuples;
        # presumably chain id and residue number + insertion code -- TODO confirm
        residues_with_scores = [(lambda x, y : (x[2], str(x[3][1]) + x[3][2], y))(residue.get_full_id(), score) for residue, score in average_residues_scores.items()]
        # two stable sorts: final order is by first field, ties broken by the
        # second field (compared as strings)
        residues_with_scores.sort(key=lambda x : x[1])
        residues_with_scores.sort(key=lambda x : x[0])
        prediction_path = pdb_path + "our_prediction/"
        if not path.exists(prediction_path) :
            makedirs(prediction_path)
        print pdb_id
        # one "field0;field1;score" line per residue
        with open("%s%s_residue_scores.txt" % (prediction_path, pdb_id), "wb") as output_residue_scores :
            for r in residues_with_scores :
                output_residue_scores.write("%s;%s;%f\n" %(r[0], r[1], r[2]))