Beispiel #1
0
    def eps_neighbourhood(X, index, eps, metric):
        """
        Return the indices of the points of ``X`` found within ``eps`` of one point.

        A KD-tree is built over ``X`` on every call and queried a single time,
        using ``X[index]`` as the centre of the search.

        :param X: data the tree is built from
        :param index: position in ``X`` of the point used as query centre
        :param eps: search radius
        :param metric: distance metric handed to ``KDTree``
        :return: the entry of the ``query_radius`` result that belongs to the
            single query point (a vector of indices into ``X``)
        """
        kd_tree = KDTree(X, metric=metric, leaf_size=2)
        centre = X[index]
        # query_radius yields one result per query row; only one row was sent.
        return kd_tree.query_radius([centre], r=eps)[0]
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Check ``KDTree.query_radius`` against a brute-force radius search.

    Seeded random points are indexed in a tree. For 100 radii evenly spaced
    between the origin-distance of the first sample and that of the last
    sample, the indices returned by the tree must match the indices obtained by
    thresholding the explicitly computed distances.

    :param n_samples: number of random points to generate
    :param n_features: dimensionality of each point
    """
    random_state = check_random_state(0)
    points = 2 * random_state.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    # Small slack so round-off at the boundary cannot flip membership.
    tolerance = 1E-15
    tree = KDTree(points, leaf_size=5)
    dist_to_origin = np.sqrt(np.sum((points - origin) ** 2, axis=1))

    radii = np.linspace(dist_to_origin[0], dist_to_origin[-1], 100)
    for radius in radii:
        limit = radius + tolerance
        found = np.sort(tree.query_radius([origin], limit)[0])
        expected = np.sort(np.where(dist_to_origin <= limit)[0])

        assert_array_almost_equal(expected, found)
Beispiel #3
0
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Check ``KDTree.query_radius`` against a brute-force radius search.

    Seeded random points are indexed in a tree. For 100 radii evenly spaced
    between the origin-distance of the first sample and that of the last
    sample, the indices returned by the tree must match the indices obtained by
    thresholding the explicitly computed distances.

    :param n_samples: number of random points to generate
    :param n_features: dimensionality of each point
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # query_radius takes a 2D (n_queries, n_features) array of query
        # points; a bare 1D vector is rejected by current scikit-learn, so the
        # single query point is wrapped into a one-row batch.
        ind = kdt.query_radius([query_pt], r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        # Order is irrelevant for the comparison, so normalise both sides.
        ind.sort()
        i.sort()

        assert_allclose(i, ind)
Beispiel #4
0
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Verify that radius queries on a ``KDTree`` agree with direct distances.

    The query point is the origin. Radii sweep 100 evenly spaced values from
    the origin-distance of the first generated sample to that of the last one,
    and for each radius the tree result is compared with a plain threshold on
    the precomputed distances.

    :param n_samples: number of random points to generate
    :param n_features: dimensionality of each point
    """
    generator = check_random_state(0)
    samples = 2 * generator.random_sample(size=(n_samples, n_features)) - 1
    centre = np.zeros(n_features, dtype=float)

    slack = 1E-15  # guards against round-off at the radius boundary
    kd_tree = KDTree(samples, leaf_size=5)
    offsets = samples - centre
    norms = np.sqrt((offsets ** 2).sum(1))

    first_norm, last_norm = norms[0], norms[-1]
    for cutoff in np.linspace(first_norm, last_norm, 100):
        from_tree = kd_tree.query_radius([centre], cutoff + slack)[0]
        brute_force = np.where(norms <= cutoff + slack)[0]

        # Compare as sorted index vectors; neighbour order is not part of the check.
        assert_array_almost_equal(np.sort(brute_force), np.sort(from_tree))
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check the distances reported by ``KDTree.query_radius``.

    For 100 radii evenly spaced between the origin-distance of the first and
    of the last generated sample, the tree is queried with
    ``return_distance=True`` and the reported distances are compared with
    Euclidean distances recomputed from the returned indices.

    :param n_samples: number of random points to generate
    :param n_features: dimensionality of each point
    """
    random_state = check_random_state(0)
    points = 2 * random_state.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    # Small slack so round-off at the boundary cannot flip membership.
    tolerance = 1E-15
    tree = KDTree(points, leaf_size=5)
    dist_to_origin = np.sqrt(np.sum((points - origin) ** 2, axis=1))

    for radius in np.linspace(dist_to_origin[0], dist_to_origin[-1], 100):
        all_ind, all_dist = tree.query_radius(
            [origin], radius + tolerance, return_distance=True
        )
        # One query row was sent, so only the first entry of each result is used.
        neighbour_ind, reported = all_ind[0], all_dist[0]

        recomputed = np.sqrt(np.sum((origin - points[neighbour_ind]) ** 2, axis=1))

        assert_array_almost_equal(recomputed, reported)
Beispiel #6
0
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check the distances reported by ``KDTree.query_radius``.

    For 100 radii evenly spaced between the origin-distance of the first and
    of the last generated sample, the tree is queried with
    ``return_distance=True`` and the reported distances are compared with
    Euclidean distances recomputed from the returned indices.

    :param n_samples: number of random points to generate
    :param n_features: dimensionality of each point
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # query_radius takes a 2D (n_queries, n_features) array of query
        # points; a bare 1D vector is rejected by current scikit-learn, so the
        # single query point is wrapped into a one-row batch.
        ind, dist = kdt.query_radius([query_pt], r + eps, return_distance=True)

        # Results come back per query row; keep those of the only row.
        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_allclose(d, dist)
Beispiel #7
0
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Compare ``KDTree.query_radius`` distances with recomputed ones.

    The origin is used as query point. For each of 100 radii, evenly spaced
    between the origin-distance of the first and of the last generated sample,
    the distances returned alongside the neighbour indices must match the
    Euclidean distances computed directly from those neighbours.

    :param n_samples: number of random points to generate
    :param n_features: dimensionality of each point
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # The query must be a 2D (n_queries, n_features) array; passing the
        # 1D point directly fails input validation in current scikit-learn.
        ind, dist = kdt.query_radius([query_pt], r + eps, return_distance=True)

        # Results come back per query row; keep those of the only row.
        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_allclose(d, dist)
Beispiel #8
0
    def fit_predict(self, xs: np.ndarray, ys: np.ndarray = None):
        """
        Group the rows of ``xs`` by uniting dense points with their radius neighbours.

        Every point is queried against a KD-tree for neighbours within
        ``self.eps``. A point whose query returns at least ``self.min_samples``
        indices is merged with all of them in a disjoint-set structure; the
        resulting sets are the clusters.

        :param xs: data; ``xs.shape[0]`` is taken as the number of points
        :param ys: optional labels indexable by point position. When given,
            each point receives the label stored at the position of its set
            representative. When ``None``, representatives are numbered
            consecutively from 0 in index order.
        :return: list with one label per point
        """
        tree = KDTree(xs, metric=self.metric, leaf_size=self.leaf_size)
        total = xs.shape[0]
        radius_hits = tree.query_radius(X=xs, r=self.eps)
        components = DisjointSetUnion(total)
        for point, hits in enumerate(radius_hits):
            # Only sufficiently dense points pull their neighbours into a set.
            if hits.shape[0] >= self.min_samples:
                for other in hits:
                    components.merge(point, other)

        if ys is None:
            # No labels supplied: give each set representative a fresh id.
            ys = [0] * total
            next_id = 0
            for point in range(total):
                if components.find(point) == point:
                    ys[point] = next_id
                    next_id += 1

        return [ys[components.find(point)] for point in range(total)]
Beispiel #9
0
def compute_average_scores(pdb_path, cat, it, bu):
    """
    Map patch-level scores onto residues and write per-residue scores to disk.

    For every file matching ``<pdb_path>*_<it>_<bu>.pdb`` this reads the
    companion ``<pdb_id>_patch_coord.txt`` and ``<pdb_id>_patch_score.txt``
    files, optionally filters patches with an isolation forest, accumulates a
    distance-weighted score on each residue that has atoms near a retained
    patch, averages over the repetitions and writes the result to
    ``<pdb_path>our_prediction/<pdb_id>_residue_scores.txt``.

    NOTE: this is Python 2 code (``print`` statements, ``xrange``). It relies
    on module-level names not visible in this block: ``threshold``,
    ``outlier_fraction``, ``n_iterations``, ``mapping_distance``, ``p``,
    ``isHydrogen`` and ``isHETATM``.

    :param pdb_path: path prefix; it is joined to file names by plain string
        formatting/concatenation, so it presumably has to end with a path
        separator -- confirm against callers
    :param cat: first component of the ``(cat, it, bu)`` key used to look up
        ``threshold`` and ``outlier_fraction``
    :param it: second key component; also part of the PDB file name pattern
    :param bu: third key component; also part of the PDB file name pattern
    :return: nothing; results are written to text files
    """
    files = glob("%s*_%s_%s.pdb" % (pdb_path, it, bu))
    
    for pdb_filename in sorted(files) :
        # file name without its 4-character extension (".pdb")
        pdb_id = basename(pdb_filename)[:-4]
        
        pdb_patch_coord = ("%s%s_patch_coord.txt" % (pdb_path, pdb_id))
        pdb_patch_score = ("%s%s_patch_score.txt" % (pdb_path, pdb_id))
      
        with open(pdb_patch_coord) as coord, open(pdb_patch_score) as score:
            # one patch per line: whitespace-separated coordinates / a single score
            patch_coord = [[float(x) for x in a.split()] for a in coord.readlines()]
            # scores are shifted so that the configured threshold sits at zero
            patch_score = [float(x) - threshold[(cat, it, bu)] for x in score.readlines()]
 
        min_v = min(patch_score)
        max_v = max(patch_score)
         
        # negative scores are divided by the minimum (sign kept negative),
        # non-negative scores by the maximum
        patch_score_scaled = [(lambda x: -(x / min_v) if x < 0 else (x / max_v))(x) for x in patch_score]
     
        # training set for the forest: coordinates of the patches with a
        # non-negative scaled score, weighted by that score
        X = np.array([a[0] for a in zip(patch_coord, patch_score_scaled) if a[1] >= 0])
        X_weights = np.array([x for x in patch_score_scaled if x >= 0])
        
        pdb_structure = p.get_structure(pdb_id, pdb_filename)
        # hydrogens and HETATM records are left out of the spatial index
        atoms = np.array([atm.get_coord() for atm in pdb_structure.get_atoms() if not isHydrogen(atm) and not isHETATM(atm)])
        atoms_tree = KDTree(atoms)     
        
        # lookup from an atom's coordinate tuple to the residue that owns it
        residues_coord = {}
        for residue in pdb_structure.get_residues() :
            for atm in residue :
                residues_coord[tuple(atm.get_coord())] = residue
    
        average_residues_scores = {residue : 0 for residue in pdb_structure.get_residues()}

        # since the isolation forest algorithm is random, we run it several times to assess the average performance of the method
        
        # without an outlier fraction no forest is used, so one pass is enough
        if outlier_fraction[(cat, it, bu)] : reps = n_iterations
        else : reps = 1
        
        for iteration in xrange(reps) :
            print "Running iteration %d of %d" % (iteration + 1, reps)
            if outlier_fraction[(cat, it, bu)] : 
                forest = IsolationForest(contamination=outlier_fraction[(cat, it, bu)], n_jobs=-1)
                forest.fit(X, sample_weight=X_weights)
         
                # the forest is fitted on the non-negative patches only but
                # predicts for every patch
                prediction_isolation_forest = forest.predict(patch_coord)
                patch_pred_no_outliers = [copysign(1, x) for x in prediction_isolation_forest]
            else : 
                # no forest: the sign of the shifted score is the prediction
                patch_pred_no_outliers = [copysign(1, x) for x in patch_score]
            # here we map the patch predictions on the underlying residues
            for i in xrange(len(patch_coord)) : # for each patch
                # if it was predicted as non-interface continue to the next
                if patch_pred_no_outliers[i] < 0 : continue 
                # multiple residues can be underneath a given patch, we do not want to consider the same residue more than once
                marked_residues = set() 
                # get all atoms within mapping_distance from the given patch center
                indexes = atoms_tree.query_radius([patch_coord[i]], r=mapping_distance, count_only = False, return_distance=True, sort_results = True)
                # iterate over (atom index, distance) pairs of the single query point
                for ind in zip(indexes[0][0], indexes[1][0]) :
                    # which residue does the current atom belong to?
                    current_res = residues_coord[tuple(atoms[ind[0]])] 
                    # if already considered continue to the next
                    if current_res in marked_residues : continue 
                    # increase the score of the current residue
                    average_residues_scores[current_res] += 1 / (1.0 + ind[1]) # patch_pred_no_outliers[i] / (1.0 + ind[1])
                    # mark as seen for the current patch
                    marked_residues.add(current_res)
             
        # turn the accumulated sums into averages over the repetitions
        average_residues_scores.update((x, y / reps) for x, y in average_residues_scores.items())
        
        # build (full_id[2], str(full_id[3][1]) + full_id[3][2], score) tuples;
        # presumably chain id and residue number + insertion code -- confirm
        # against the structure parser's full-id layout
        residues_with_scores = [(lambda x, y : (x[2], str(x[3][1]) + x[3][2], y))(residue.get_full_id(), score) for residue, score in average_residues_scores.items()]
        # two stable sorts: final order is by first field, ties broken by the
        # second field (compared as strings)
        residues_with_scores.sort(key=lambda x : x[1])
        residues_with_scores.sort(key=lambda x : x[0])

        prediction_path = pdb_path + "our_prediction/"
        if not path.exists(prediction_path) : makedirs(prediction_path)
        print pdb_id
        # one "first;second;score" line per residue
        with open("%s%s_residue_scores.txt" % (prediction_path, pdb_id), "wb") as output_residue_scores :
            for r in residues_with_scores :
                output_residue_scores.write("%s;%s;%f\n" %(r[0], r[1], r[2]))