Example #1
def distance(vector1, vector2, alpha=2, metric='euclidean'):
    '''
    Helper function that calculates the distance (norm) between two vectors.

    :param vector1: a vector
    :type vector1: list of doubles
    :param vector2: a vector
    :type vector2: list of doubles
    :param metric: euclidean, mahalanobis, seuclidean, cityblock or hamming
    :type metric: string
    :return: norm between vectors A and B
    '''

    alpha = numpy.float64(1.0 * alpha)
    vector1 = numpy.float64(numpy.array(vector1))
    vector2 = numpy.float64(numpy.array(vector2))

    if metric == 'euclidean':
        vector_norm = distances.euclidean(vector1, vector2)
    elif metric == 'mahalanobis':
        vi = numpy.linalg.inv(
            numpy.cov(numpy.concatenate((vector1, vector2)).T))
        vector_norm = distances.mahalanobis(vector1, vector2, vi)
    elif metric == 'seuclidean':
        vector_norm = distances.seuclidean(vector1, vector2)
    elif metric == 'cityblock':
        vector_norm = distances.cityblock(vector1, vector2)
    elif metric == 'hamming':
        vector_norm = distances.hamming(vector1, vector2)
    else:
        print "Unknown metric"
        return None

    return vector_norm
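
A minimal usage sketch for the helper above (an assumption for illustration: the surrounding module imports numpy and scipy.spatial.distance as numpy and distances, which the excerpt does not show):

# hypothetical call; 'euclidean' and 'cityblock' behave as shown, while
# 'mahalanobis' needs enough data for a non-singular covariance and scipy's
# seuclidean additionally expects a variance vector argument
print(distance([0.0, 1.0, 2.0], [1.0, 1.0, 0.0]))                      # ~2.236 (Euclidean)
print(distance([0.0, 1.0, 2.0], [1.0, 1.0, 0.0], metric='cityblock'))  # 3.0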
Example #2
def match_state_seq(sts_true, sts_pred, K):
    """ Matchs the set of states in sts_pred such that it minimizes the hamming
        distance between sts_pred and sts_true. We assume here that the states
        are labeled 0, ..., K - 1.

        sts_true : A numpy array of integers.

        sts_pred : A numpy array of integers.

        K : Number of states in case sts_true doesn't cover all states.
    """

    sts = np.arange(K, dtype='int')
    sts_true = sts_true.astype('int')
    sts_pred = sts_pred.astype('int')
    min_perm = None
    min_hd = np.inf
    for p in itertools.permutations(sts):
        cur_sts = np.array(p)[sts_pred]
        hd = distance.hamming(sts_true, cur_sts)
        if hd < min_hd:
            min_hd = hd
            min_perm = p

    return np.array(min_perm)
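
A small usage sketch with toy data (assuming numpy, itertools and scipy.spatial.distance are imported as np, itertools and distance, as the function requires): a prediction that equals the truth up to a relabelling is mapped back exactly.

sts_true = np.array([0, 0, 1, 1, 2, 2])
sts_pred = np.array([2, 2, 0, 0, 1, 1])   # same segmentation, permuted labels
perm = match_state_seq(sts_true, sts_pred, K=3)
print(perm)            # [1 2 0]
print(perm[sts_pred])  # [0 0 1 1 2 2] -- sts_true recovered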
Example #3
def eval_ind(individual, initial_pop, model, base_biomass, exp_ess, distance):
    # Set this as warning
    model.solver = 'gurobi'
    old_biomass = list(linear_reaction_coefficients(model).keys())[0]  # index removed
    old_biomass.remove_from_model()
    # Make a biomass reaction and optimize for it
    biomass = Reaction('BIOMASS')
    model.add_reaction(biomass)
    index = initial_pop.index
    for i in range(len(index)):
        if individual[i] == 1:
            biomass.add_metabolites({initial_pop.index[i]: -0.1})
    biomass.add_metabolites(base_biomass)
    biomass.objective_coefficient = 1.
    # Generate deletion results --> BOTTLENECK FOR SURE
    deletion_results = single_gene_deletion(model, model.genes, processes=1)

    # Filter the results to get a boolean result
    a = [(str(next(iter(i))), 1) for i in deletion_results[deletion_results['growth'] > 1e-3].index]
    b = [(str(next(iter(i))), 0) for i in deletion_results[deletion_results['growth'] <= 1e-3].index]
    c = a + b
    pred_ess = pd.DataFrame(c, columns=['Genes', 'Predicted_growth'])
    compare_df = pd.merge(right=exp_ess, left=pred_ess, on='Genes', how='inner')

    # Apply hamming distance
    u = np.array([f for f in compare_df.Measured_growth])
    v = np.array([x for x in compare_df.Predicted_growth])
    if distance == 'hd':
        dist = hamming(u, v)
    elif distance == 'mcc':
        dist = matthews_corrcoef(u, v)
    else:
        raise ValueError('Invalid distance metric')

    return dist, sum(individual)
Example #4
def k_means(data, k=2, distance='e'):
    centers = np.array(random.sample(list(data), k))
    
    centers_steps = [centers.tolist()]
    
    changed = True
    while changed:
        prev_centers = np.copy(centers)
        data_nr = data.shape[0]
        clusters = np.empty((data_nr, k))
        for i in range(data_nr):
            if distance == 'e':
                clusters[i] = np.array([euclidean(data[i], centers[j]) for j in range(k)])
            elif distance == 'm':
                clusters[i] = np.array([cityblock(data[i], centers[j]) for j in range(k)])
            elif distance == 'h':
                clusters[i] = np.array([hamming(data[i], centers[j]) for j in range(k)])
            else:
                raise ValueError('Unrecognized distance')
        clusters = np.argmin(clusters, axis=1)
        
        for i in range(k):
            centers[i] = np.mean(data[np.where(clusters == i)], axis=0)
        
        changed = not np.intersect1d(prev_centers, centers).size == centers.size
        centers_steps.append(centers.tolist())
        
    return centers, centers_steps
Example #5
def hamming_z_score(seq1, seq2, p1=None, p2=None):
    """
    Return the z score of the hamming distance under the specified 3 assumptions:
    1. P(X_i = Y_i) = P(X_j = Y_j). In words, this means (X_i, Y_i) and
    (X_j, Y_j) are identically jointly distributed. In other words, all data
    points are equally easy (or hard) to learn (this is an empirically false
    assumption).
    2. X_i and Y_i are conditionally independent (conditioned on i). In other
    words, the predictions between any two learned models on the same test
    example are independent (obviously false assumption).
    3. (X_i = Y_i) and (X_j = Y_j) are independent. Given assumptions 1 and 2,
    this amounts to adding the assumption that the ith and jth predictions from
    the same classifier are independent (obviously false assumption).
    """
    # Use given p1 and p2
    if p1 is not None and p2 is not None:
        expected = expected_hamming(p1, p2)
    # Use p1 as single p
    elif p1 is not None:
        expected = expected_hamming(p1)
    # Calculate and use p1 and p2
    elif p1 is None and p2 is None:
        p1 = torch.mean(seq1)
        p2 = torch.mean(seq2)
        expected = expected_hamming(p1, p2)
    else:
        raise ValueError('Invalid arguments: p1 is None and p2 is not None')
    std = hamming_std(len(seq1), p1, p2)
    return (hamming(seq1, seq2) - expected) / std
Example #6
def similarity_function(x, y):
    """ Similarity function for comparing user features.

    This actually really should be implemented in taar.similarity_recommender
    and then imported here for consistency.
    """

    def safe_get(field, row, default_value):
        # Safely get a value from the Row. If the value is None, get the
        # default value.
        return row[field] if row[field] is not None else default_value

    # Extract the values for the categorical and continuous features for both
    # the x and y samples. Use an empty string as the default value for missing
    # categorical fields and 0 for the continuous ones.
    x_categorical_features = [safe_get(k, x, "") for k in CATEGORICAL_FEATURES]
    y_categorical_features = [safe_get(k, y, "") for k in CATEGORICAL_FEATURES]
    x_continuous_features = [
        float(safe_get(k, x, 0)) for k in CONTINUOUS_FEATURES
    ]
    y_continuous_features = [
        float(safe_get(k, y, 0)) for k in CONTINUOUS_FEATURES
    ]

    # Here a larger distance indicates a poorer match between categorical variables.
    j_d = distance.hamming(x_categorical_features, y_categorical_features)
    j_c = distance.canberra(x_continuous_features, y_continuous_features)

    # Take the product of similarities to attain a univariate similarity score.
    # Add a minimal constant to prevent zero values from categorical features.
    # Note: since both distance functions return a NumPy type, we need to
    # call the |item| function to get the underlying Python type. If we don't
    # do that this job will fail when performing KDE due to SPARK-20803 on
    # Spark 2.2.0.
    return abs((j_c + 0.001) * j_d).item()
def one_simulation(number, opt_k):
    print("\nThe result for %dth experiment with optimal k=%d:" %
          (number, opt_k))
    kMeans_model = KMeans(n_clusters=opt_k).fit(data_X)
    kMeans_labels = kMeans_model.labels_
    df['cluster'] = kMeans_labels

    labels = ["Family", "Genus", "Species"]
    df['Family_label'] = df['cluster'].apply(family_label)
    df['Genus_label'] = df['cluster'].apply(genus_label)
    df['Species_label'] = df['cluster'].apply(species_label)

    ham_dist_once = []
    for cluster_id in range(0, opt_k):
        cluster = df[df['cluster'] == cluster_id]
        for label in labels:
            cluster_label = label + "_label"
            dist = distance.hamming(cluster[label], cluster[cluster_label])
            ham_dist_once.append(dist)
            print("Hamming score of cluster %d for label %s is %f" %
                  (cluster_id, label, dist))

    avg_dist = np.mean(ham_dist_once)
    ham_dist_all.append(avg_dist)
    print("The average hamming score for %dth experiment is %f\n\n" %
          (number, avg_dist))
Example #8
def hamming_diff(seq1, seq2, p1=None, p2=None, ps=None):
    """
    Return the difference between the hamming distance of the two sequences and
    the expected hamming distance between two random vectors.
    If p1 is specified, it is used as the common p for both vectors. Otherwise,
    calculate each vector's respective p and use those.
    """
    # Use the list of ps (not identically distributed)
    if ps is not None:
        if isinstance(ps, torch.FloatTensor):
            assert (len(ps) == len(seq1))
            expected = expected_hamming_nonid(ps)
        else:
            raise ValueError('ps is not FloatTensor')
    # Use given p1 and p2
    elif p1 is not None and p2 is not None:
        expected = expected_hamming(p1, p2)
    # Use p1 as single p
    elif p1 is not None:
        expected = expected_hamming(p1)
    # Calculate and use p1 and p2
    elif p1 is None and p2 is None:
        p1 = torch.mean(seq1)
        p2 = torch.mean(seq2)
        expected = expected_hamming(p1, p2)
    else:
        raise ValueError('Invalid arguments: p1 is None and p2 is not None')
    return hamming(seq1, seq2) - expected
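
expected_hamming itself is not part of this listing; the following is only a plausible sketch of what it computes, assuming independent Bernoulli entries with rates p1 and p2.

def expected_hamming(p1, p2=None):
    # with a single rate, both vectors are assumed to share it
    if p2 is None:
        p2 = p1
    # for independent entries, P(X_i != Y_i) = p1*(1 - p2) + (1 - p1)*p2;
    # scipy's hamming() averages per-position mismatches, so this is also
    # the expected value of hamming(seq1, seq2)
    return p1 * (1 - p2) + (1 - p1) * p2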
Example #9
def find_matches(
    pred,  #features from user selected image
    collection_features,  #list of features in the collection
    images,  #list of filenames associated with the features
    dist='cosine'  #distance metric - only cosine is good
):
    '''
    Finds matches for the features of the selected image, 
    according to the distance metric specified.
    Distance metrics use the scipy package
    '''
    pred = pred.flatten()

    nimages = len(collection_features)
    #vectorize cosine similarity
    #    sims= inner(pred,collection_features)/norm(pred)/norm(collection_features,axis=1)
    sims = []
    for i in range(0, nimages):
        if dist == 'euclidean':
            sims.append(
                distance.euclidean(pred.flatten(),
                                   collection_features[i].flatten()))
        elif dist == 'hamming':
            pred[pred > 0] = 1
            sims.append(
                distance.hamming(pred.flatten(),
                                 collection_features[i].flatten()))
        else:  #default to cosine
            sims.append(
                distance.cosine(pred.flatten(),
                                collection_features[i].flatten()))
    print('max sim = ' + str(max(sims)))
    similar_images = pd.DataFrame({'imgfile': images, 'simscore': sims})
    return (similar_images)
def example_of_cross_validation_with_detailed_info(raw_data, labels, num_subjects, num_epochs_per_subj):
    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    for i in range(num_subjects):
        leave_start = i * num_epochs_per_subj
        leave_end = (i+1) * num_epochs_per_subj
        training_data = raw_data[0:leave_start] + raw_data[leave_end:]
        test_data = raw_data[leave_start:leave_end]
        training_labels = labels[0:leave_start] + labels[leave_end:]
        test_labels = labels[leave_start:leave_end]
        clf.fit(list(zip(training_data, training_data)), training_labels)
        # joblib can be used for saving and loading models
        #joblib.dump(clf, 'model/logistic.pkl')
        #clf = joblib.load('model/svm.pkl')
        predict = clf.predict(list(zip(test_data, test_data)))
        print(predict)
        print(clf.decision_function(list(zip(test_data, test_data))))
        incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
        logger.info(
            'when leaving subject %d out for testing, the accuracy is %d / %d = %.2f' %
            (i, num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
             (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
        )
        print(clf.score(list(zip(test_data, test_data)), test_labels))
Example #11
    def compute_clients_dist(self, client_data, cache):
        client_categorical_feats = [
            client_data.get(specified_key)
            for specified_key in CATEGORICAL_FEATURES
        ]
        client_continuous_feats = [
            client_data.get(specified_key)
            for specified_key in CONTINUOUS_FEATURES
        ]

        # Compute the distances between the user and the cached continuous features.
        cont_features = distance.cdist(
            cache["continuous_features"],
            np.array([client_continuous_feats]),
            "canberra",
        )

        # Compute the distances between the user and the cached categorical features.
        cat_features = np.array(
            [[distance.hamming(x, client_categorical_feats)]
             for x in cache["categorical_features"]])

        # See the "Note about cdist optimization" in README.md for why we only use cdist once.

        # Take the product of similarities to attain a univariate similarity score.
        # Note that the addition of 0.001 to the continuous features
        # sets a floor value to the distance in continuous similarity
        # scores.  There is no such floor value set for categorical
        # features so this adjustment prioritizes categorical
        # similarity over continuous similarity
        return (cont_features + FLOOR_DISTANCE_ADJUSTMENT) * cat_features
Example #12
def get_error_hamming_distributions_from_results(results: Sequence[Sequence[Sequence[int]]]) \
        -> Sequence[Sequence[float]]:
    """
    Get the distribution of the hamming weight of the error vector (number of bits flipped
    between output and expected answer) for each possible pair of two n_bit summands using
    results output by get_n_bit_adder_results

    :param results: a list of results output from a call to get_n_bit_adder_results
    :return: the relative frequency of observing each hamming weight, 0 to n_bits+1, for the error
        that occurred when adding each pair of two n_bit summands
    """
    num_shots = len(results[0])
    n_bits = len(results[0][0]) - 1

    hamming_wt_distrs = []
    # loop over all binary strings of length n_bits
    for result, bits in zip(results, all_bitstrings(2 * n_bits)):
        # Input nums are written from (MSB .... LSB) = (a_n, ..., a_1, a_0)
        num_a = bit_array_to_int(bits[:n_bits])
        num_b = bit_array_to_int(bits[n_bits:])

        # add the numbers
        ans = num_a + num_b
        ans_bits = int_to_bit_array(ans, n_bits + 1)

        # record the fraction of shots that resulted in an error of the given weight
        hamming_wt_distr = [0. for _ in range(len(ans_bits) + 1)]
        for shot in result:
            # multiply relative hamming distance by the length of the output for the weight
            wt = len(ans_bits) * hamming(ans_bits, shot)
            hamming_wt_distr[int(wt)] += 1. / num_shots

        hamming_wt_distrs.append(hamming_wt_distr)

    return hamming_wt_distrs
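
The weight conversion above relies on scipy's hamming() being normalized to the fraction of differing positions; a quick self-contained check:

from scipy.spatial.distance import hamming
# two of four positions differ: normalized distance 0.5, bit-flip count 2
assert hamming([1, 0, 1, 1], [1, 1, 1, 0]) == 0.5
assert int(4 * hamming([1, 0, 1, 1], [1, 1, 1, 0])) == 2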
Example #13
def hmmg(X):
    hashcode=np.sign(X)
    hammin = np.zeros((X.shape[0], X.shape[0]))  # penalized hamming(Ci,Cj)
    for i in range(X.shape[0] - 1):
        for j in range(i + 1, X.shape[0]):
            hammin[i,j]=hamming(hashcode[i],hashcode[j])
            hammin[j,i]=hammin[i,j]
    return hammin
def get_class_from_ECOC(testing_predictions, class_codes):
    label = []
    for i in range(testing_predictions.shape[0]):
        hamming_distances = []
        for j in range(class_codes.shape[0]):
            hamming_distances.append(distance.hamming(testing_predictions[i], class_codes[j]))
        label.append(np.array(hamming_distances).argmin())
    return np.array(label)
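
The same nearest-codeword decoding can be written without the explicit Python loops; this is only a sketch using scipy's cdist, not code from the original project.

import numpy as np
from scipy.spatial.distance import cdist

def get_class_from_ECOC_vectorized(testing_predictions, class_codes):
    # one row of hamming distances per test sample, one column per class codeword
    dists = cdist(testing_predictions, class_codes, metric='hamming')
    return dists.argmin(axis=1)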
Example #15
def distance(user1, user2):
    try:
        user1Ratings = userItemRatingMatrix.transpose()[str(user1)]
        user2Ratings = userItemRatingMatrix.transpose()[str(user2)]
        distance = hamming(user1Ratings, user2Ratings)
    except:
        distance = np.NaN
    return distance
def computeDistance(user1, user2):
    try:
        user1Ratings = userItemRatingMatrix.transpose()[user1]
        user2Ratings = userItemRatingMatrix.transpose()[user2]
        distance = hamming(user1Ratings, user2Ratings)
    except:
        distance = np.NAN
    return distance
Example #17
def get_score(motifs):
    motif_mx = np.vstack(motifs)
    profile = get_profile(motif_mx)
    consensus = get_consensus(profile)
    return sum(
        distance.hamming(motif, consensus)
        for motif in motif_mx
    )
def distance(x, y):
    d = 0.
    for i in xrange(len(x)):
        a = np.int32(list(binary(x[i])))
        b = np.int32(list(binary(y[i])))
        d = d + hamming(a,b)
    d = d / len(x)
    return d
def hamming_distance(training_set_vectors, query_vector, top_n=50):

    distances = []
    # comparing each image to all training set
    for i in range(len(training_set_vectors)):
        distances.append(hamming(training_set_vectors[i], query_vector[0]))
    # return sorted indices of the top_n most similar images
    return np.argsort(distances)[:top_n]
Example #20
def mapfn(k, v):
    import kdt_config # import has to be under function. cf. mincemeat README.
    from dkdt_chunk_ends import chunk_ends
    from scipy.spatial.distance import hamming
    import numpy as np
    sims = [( hamming(np.asarray(chunk),np.asarray(v)) , i ) for i,chunk in enumerate(chunk_ends)]
    sims = sorted(sims)
    yield sims[-1][1],v # yield one pair so each record is indexed at only one leaf tree.
Example #21
    def _init_similar_matrix(self):
        for a1 in self.agents:
            for a2 in self.agents:

                similarity = 1 - hamming(a1.traits, a2.traits)
                similar = similarity >= self.similarity_threshold

                self.similar_matrix[a1.index, a2.index] = similar
Example #22
def hamming_distance(v1, v2):
    print('v1: ', v1.shape)
    print('v2: ', v2.shape)
    if (len(v2) < len(v1)):
        v2 = np.concatenate((v2, np.zeros(len(v1) - len(v2))), axis=0)
    if (len(v2) > len(v1)):
        v1 = np.concatenate((v1, np.zeros(len(v2) - len(v1))), axis=0)
    return distance.hamming(v1, v2)
Example #23
 def get_ratio(self,line_list, CLASS_list):
     # replaces the matches with empty strings and subtracts that from the total --> fraction of matches
     #SIZE = float(len(line_list))
     #replace = line_list.replace(str(CLASS_list), "")
     #print(str(CLASS_list).replace("]","").replace("[",""))
     ratio = distance.hamming(line_list,CLASS_list)
     #float(((SIZE - float(len(replace)) )) / SIZE)
     return (1 - ratio)
Example #24
def _instability_score(predicted_label, test_label, k):
    """Computes the stability score (see Lange) for `predicted_label` and
    `test_label` assuming `k` possible labels.
    """
    # find optimal permutation of labels between predicted and test
    test_label_ = _permute(
        test_label, _get_optimal_permutation(predicted_label, test_label, k))
    # return hamming distance
    return hamming(predicted_label, test_label_)
Example #25
 def jaccard(self, id1, id2):
     """
     Approximate Jaccard coefficient using minhash
     :param id1: Doc ID (key)
     :param id2: Doc ID (key)
     :return j: Approximate Jaccard coefficient
     """
     j = 1 - hamming(self.signatures[id1], self.signatures[id2])
     return j
def get_displacement(image0, image1):
    """
    Gets the displacement (in pixels, I think) between 2 images using scikit-image;
    not as accurate as the OpenCV version, I think.

    :param image0: reference image
    :param image1: target image
    :return:
    """
    from skimage.feature import (match_descriptors, ORB, plot_matches)
    from skimage.color import rgb2gray
    from scipy.spatial.distance import hamming
    from scipy import misc
    image0_gray = rgb2gray(image0)
    image1_gray = rgb2gray(image1)
    descriptor_extractor = ORB(n_keypoints=200)

    descriptor_extractor.detect_and_extract(image0_gray)
    keypoints1 = descriptor_extractor.keypoints
    descriptors1 = descriptor_extractor.descriptors

    descriptor_extractor.detect_and_extract(image1_gray)
    keypoints2 = descriptor_extractor.keypoints
    descriptors2 = descriptor_extractor.descriptors

    matches12 = match_descriptors(descriptors1, descriptors2, cross_check=True)

    # Sort the matches based on distance.  Least distance
    # is better
    distances12 = []
    for match in matches12:
        distance = hamming(descriptors1[match[0]], descriptors2[match[1]])
        distances12.append(distance)

    indices = np.arange(len(matches12))
    indices = [index for (_, index) in sorted(zip(distances12, indices))]
    matches12 = matches12[indices]

    # collect displacement from the first 10 matches
    dx_list = []
    dy_list = []
    for mat in matches12[:10]:
        # Get the matching key points for each of the images
        img1_idx = mat[0]
        img2_idx = mat[1]

        # x - columns
        # y - rows
        (x1, y1) = keypoints1[img1_idx]
        (x2, y2) = keypoints2[img2_idx]
        dx_list.append(abs(x1 - x2))
        dy_list.append(abs(y1 - y2))

    dx_median = np.median(np.asarray(dx_list, dtype=np.double))
    dy_median = np.median(np.asarray(dy_list, dtype=np.double))
    # plot_matches(image0, image1, descriptors1, descriptors2, matches12[:10])
    return dx_median, dy_median
Example #27
def get_hamming_distance_matrix(spike_nums_dur):
    n_cells = spike_nums_dur.shape[0]
    hamm_dist_matrix = np.zeros((n_cells, n_cells))
    for i in np.arange(n_cells):
        for j in np.arange(n_cells):
            hamm_dist_matrix[i,
                             j] = sci_sp_dist.hamming(spike_nums_dur[i, :],
                                                      spike_nums_dur[j, :])
    return hamm_dist_matrix
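
An equivalent loop-free construction of the same symmetric matrix, sketched with scipy's pdist/squareform under the assumption that spike_nums_dur is the cells-by-time binary array used above:

from scipy.spatial.distance import pdist, squareform

def get_hamming_distance_matrix_fast(spike_nums_dur):
    # pdist gives all pairwise normalized hamming distances between rows;
    # squareform expands the condensed vector into the full n_cells x n_cells matrix
    return squareform(pdist(spike_nums_dur, metric='hamming'))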
Example #28
 def jaccard(self, id1, id2):
     """
     Approximate Jaccard coefficient using minhash
     :param id1: Doc ID (key)
     :param id2: Doc ID (key)
     :return j: Approximate Jaccard coefficient
     """
     j = 1 - hamming(self.signatures[id1], self.signatures[id2])
     return j
Example #29
    def calculateHEM(self):
        from scipy.spatial.distance import hamming

        N = self.Errors.shape[0]
        self.HEM = np.zeros((N, N))

        for i, ei in enumerate(self.Errors):
            for j, ej in enumerate(self.Errors):
                self.HEM[i, j] = hamming(ei, ej)
Example #30
def vote_hamming_distance(votes1, votes2):
    ids = np.array(range(len(votes1)))
    both_voted = ids[(votes1 != 0) & (votes2 != 0)]
    if len(both_voted) == 0:
        return 0
    v1 = votes1[both_voted]
    v2 = votes2[both_voted]
    distance = hamming(v1, v2)
    return distance
    def compute_accuracy(self, X, y):
        """
        Computes accuracy as a fraction of correct labels. Uses scipy lib.
        :param X: Data batch. d x n
        :param y: Labels batch. n vector
        :return: Prediction accuracy as a fraction between 0 and 1
        """

        return 1 - hamming(y, self.predict(X))
 def match_iris(self,iris,irisList):
     check = False
     img = ''
     for i in irisList:
         if distance.hamming(iris.ravel(),i.ravel()) == 0:
             check = True
             img = i.ravel()
             break
     return (check, img)
Example #33
def ComputeFeatureDistance(F1, F2, dis='L2'):
    res = np.zeros([F1.shape[0], F2.shape[0]])
    for i in range(F1.shape[0]):
        for j in range(F2.shape[0]):
            if (dis == 'L2'):
                res[i][j] = (np.linalg.norm(F1[i] - F2[j]))
            elif (dis == 'hamming'):
                res[i][j] = hamming(F1[i], F2[j])
    return res
Example #34
 def distance(self, otherPoint):
     max_len = max(len(otherPoint.data), len(self.data))
     dist = 0.
     for i in range(max_len):
         l1, l2 = pad_to_match(self.get_data(i), otherPoint.get_data(i))
         l1.sort()
         l2.sort()
         dist += hamming(l1, l2)/max_len
     return dist
    def dist_between_matrices(A, B):

        A_unweighted = np.zeros(A.shape)
        B_unweighted = np.zeros(B.shape)

        A_unweighted[A != 0] = 1
        B_unweighted[B != 0] = 1

        return hamming(A_unweighted.flatten(), B_unweighted.flatten())
Example #36
def hamming_distance(a, b):
    '''
    compares distance for binary arrays
    returns the fraction of features that are not the same
    '''
    if max(a) > 1:
        a[a > 0] = 1
        b[b > 0] = 1
    return (distance.hamming(a, b))
Example #37
def stability_score(predicted_label, test_label, k):
    """Computes the stability score (see Lange) for `predicted_label` and
    `test_label` assuming `k` possible labels.
    """
    # find optimal permutation of labels between predicted and test
    test_label_ = permute(test_label,
                          get_optimal_permutation(predicted_label,
                                                  test_label, k))
    # return hamming distance
    return hamming(predicted_label, test_label_)
Example #38
def cam_corr(M, cell, test):
    cor = []
    for i in test:
        #r = pearsonr(M[cell,:],M[i,:])[0]
        r = hamming(M[cell, :], M[i, :])
        if np.isnan(r):
            continue
        else:
            cor.append(r)
    return cor
Example #39
 def compute_similarity(self, arr1, arr2):
     if self.simfcn == "cosine":
         return self.d_to_sim(cosine(arr1, arr2))
     elif self.simfcn == "pearson":
         return self.d_to_sim(correlation(arr1, arr2))
     elif self.simfcn == "hamming":
         return 1 - hamming(arr1, arr2)
     elif self.simfcn == "jaccard":
         return 1 - jaccard(arr1, arr2)
     else:
         print "Similiarity Function Not Yet Supported"
         exit()
Example #40
 def dist(self, other):
     """Return the hamming distance between self and other
     
     Attempts to have other provide its own numpy array, but able
     to produce one if necessary.
     
     """
     try:
         other_seq_array = other.get_seq_array()
     except AttributeError:
         other_seq_array = sp.array(list(other))
     return hamming(self.get_seq_array(), other_seq_array)
Example #41
 def __find__(self,videorequest):
     """
     :param videorequest: videorequest object
     :type: object
     :rtype: tuple of 2 numpy.array (frames number, hamming distances)   
     """        
     reqsig = videorequest.get_feature('bindct')
     lreq = len(reqsig)
     dists = []
     for i in range(len(self.sigs)-lreq):
         print "frame", i 
         hdist = ssd.hamming(self.sigs[i:i+lreq],reqsig)
         dists.append(hdist)
     return self.__local_minima_fancy__(dists,window=lreq),lreq
Example #42
def pair_distance(genome_sig_list, file_list):
    H_dist = [[0 for x in range(genome_sig_list.shape[0])] for x in range(genome_sig_list.shape[0])]
    E_dist = [[0 for x in range(genome_sig_list.shape[0])] for x in range(genome_sig_list.shape[0])]
    C_dist = [[0 for x in range(genome_sig_list.shape[0])] for x in range(genome_sig_list.shape[0])]

    for i in range(0, genome_sig_list.shape[0]):
        for j in range(0, genome_sig_list.shape[0]):
            H_dist[i][j] = distance.hamming(genome_sig_list[i],genome_sig_list[j])
            E_dist[i][j] = distance.euclidean(genome_sig_list[i],genome_sig_list[j])
            C_dist[i][j] = distance.cosine(genome_sig_list[i],genome_sig_list[j])

    output_distance(H_dist, file_list, "hamming_distance.csv")
    output_distance(E_dist, file_list, "euclidean_distance.csv")
    output_distance(C_dist, file_list, "cosine_distance.csv")
Example #43
def rd_dist(new_perm, dist_metric):
    #some metric where
    # [0, 9, 8, 1, 2, 3, 4, 7, 5, 6, 10, 11]
    # is worse than
    # [0, 1, 2, 3, 8, 9, 4, 7, 5, 6, 10, 11] or [0, 3, 2, 1, 4, 9, 8, 7, 5, 6, 10, 11]
    # beta = 0.5
    goal = range(len(new_perm))
    # metric = ((beta**2 + 1) * ssd.hamming(goal, new_perm) * ssd.euclidean(goal, new_perm))/((beta**2) * ssd.hamming(goal, new_perm) + ssd.euclidean(goal,new_perm))
    # metric = ssd.hamming(goal, new_perm) + ssd.euclidean(goal, new_perm)
    if dist_metric == "hamming":
        metric = ssd.hamming(goal, new_perm)
    elif dist_metric == "euclidean":
        metric = ssd.euclidean(goal, new_perm)
        
    return metric
def test_hamming_loss():
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    refscore = hamming(true, predicted)
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    f = theano.function([yt, yp], tmetrics.classification.hamming_loss(yt, yp), allow_input_downcast=True)
    score = f(true, predicted)
    print('true')
    print(true)
    print('predicted')
    print(predicted)
    print('refscore {}'.format(refscore))
    print('score {}'.format(score))
    assert np.allclose(refscore, score)
Example #45
def analyze_performance(predicted_labels, actual_labels):
    """ Returns the proportion of total labels that are acurately matched 
    
    Parameters
    ----------
    predicted_labels : 1d array
        Consists of numeric labels   
    actual_labels: 1d array 
        Consists of numeric labels     
    Returns
    -------
    distance : float
        'Distance' here equals #matching entries / length(predicted_labels)
    """
    normed_distance = hamming(predicted_labels, actual_labels) #between 0 - 1
    return 1 - normed_distance
def test_classification():
    fake_raw_data = [create_epoch(i) for i in range(20)]
    labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
    # 4 subjects, 4 epochs per subject
    epochs_per_subj = 4
    # svm
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    training_data = fake_raw_data[0: 12]
    clf = Classifier(svm_clf, epochs_per_subj=epochs_per_subj)
    clf.fit(training_data, labels)
    expected_confidence = np.array([-1.18234421, 0.97403604, -1.04005679, 
                                    0.92403019, -0.95567738, 1.11746593,
                                    -0.83275891, 0.9486868])
    recomputed_confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence), 
			       np.sign(recomputed_confidence))
    assert hamming_distance <= 1, \
        'decision function of SVM with recomputation ' \
        'does not provide correct results'
    y_pred = clf.predict(fake_raw_data[12:])
    expected_output = [0, 0, 0, 1, 0, 1, 0, 1]
    hamming_distance = hamming(y_pred, expected_output) * len(y_pred)
    assert hamming_distance <= 1, \
        'classification via SVM does not provide correct results'
    confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence),
			       np.sign(confidence))
    assert hamming_distance <= 1, \
        'decision function of SVM without recomputation ' \
        'does not provide correct results'
    # logistic regression
    lr_clf = LogisticRegression()
    clf = Classifier(lr_clf, epochs_per_subj=epochs_per_subj)
    clf.fit(training_data, labels[0:12])
    expected_confidence = np.array([-4.49666484, 3.73025553, -4.04181695, 
                                    3.73027436, -3.77043872, 4.42613412,
                                    -3.35616616, 3.77716609])
    recomputed_confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence), 
			       np.sign(recomputed_confidence))
    assert hamming_distance <= 1, \
        'decision function of logistic regression with recomputation ' \
        'does not provide correct results'
    y_pred = clf.predict(fake_raw_data[12:])
    expected_output = [0, 0, 0, 1, 0, 1, 0, 1]
    hamming_distance = hamming(y_pred, expected_output) * len(y_pred)
    assert hamming_distance <= 1, \
        'classification via logistic regression ' \
        'does not provide correct results'
    confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence), 
			       np.sign(confidence))
    assert hamming_distance <= 1, \
        'decision function of logistic regression without precomputation ' \
        'does not provide correct results'
Example #47
def generate_one_hit_codon_table():
    """Creates a lookup table for one-hit codon changes using a numbered index

    Returns
    -------
    OrderedDict

    """
    _one_hit_codon = OrderedDict()
    for i in range(1, 65):
        for j in range(1, 65):
            anc = GENETIC_CODE.by_index(i)[0]
            der = GENETIC_CODE.by_index(j)[0]
            dist = hamming(list(anc), list(der))
            if dist <= 1/float(3):
                _one_hit_codon[(i,j)] = Codon_change(GENETIC_CODE.by_index(i), GENETIC_CODE.by_index(j))
    return _one_hit_codon
Example #48
    def hamming_dist(self, full_var_x, true_sts):
        """ This function returns the hamming distance between the full
            variational distribution on the states, and the true state
            sequence, after matching via the munkres algorithm
            
            full_var_x: variational distribution of state sequence.  Generate
                        it with self.full_local_update().

            true_sts: true state sequence

            Returns float with hamming distance and best permutation to match
            the states.
        """

        state_sq = np.argmax(full_var_x, axis=1).astype(int) #these are learned states
        best_match = util.munkres_match(true_sts, state_sq, self.K)
        return dist.hamming(true_sts, best_match[state_sq]), best_match
Example #49
    def eval(self, heldout):
        """ evaluate under two metrics: 1-best error rate and hamming """

        mistakes = dd(int)
        err, ham  = 0, 0
        N = float(len(heldout))

        for x, y, e, c in heldout:
            y_pred = self.predict(x, e, c)
            err += 0.0 if (y == y_pred).all() else 1
            ham += hamming(y, y_pred) * len(y)

            for i, (y1, y2) in enumerate(zip(y, y_pred)):
                if y1 != y2:
                    mistakes[i] += 1
                    
        return err / N, ham / N, dict(mistakes)
def getDisplacement(Image0, Image1):
    Image0Gray = rgb2gray(Image0)
    Image1Gray = rgb2gray(Image1)
    descriptor_extractor = ORB(n_keypoints=200)

    descriptor_extractor.detect_and_extract(Image0Gray)
    keypoints1 = descriptor_extractor.keypoints
    descriptors1 = descriptor_extractor.descriptors

    descriptor_extractor.detect_and_extract(Image1Gray)
    keypoints2 = descriptor_extractor.keypoints
    descriptors2 = descriptor_extractor.descriptors

    matches12 = match_descriptors(descriptors1, descriptors2, cross_check=True)

    # Sort the matches based on distance.  Least distance
    # is better
    distances12 = []
    for match in matches12:
        distance = hamming(descriptors1[match[0]], descriptors2[match[1]])
        distances12.append(distance)

    indices = np.arange(len(matches12))
    indices = [index for (_, index) in sorted(zip(distances12, indices))]
    matches12 = matches12[indices]

    # collect displacement from the first 10 matches
    dxList = []
    dyList = []
    for mat in matches12[:10]:
        # Get the matching keypoints for each of the images
        img1_idx = mat[0]
        img2_idx = mat[1]

        # x - columns
        # y - rows
        (x1, y1) = keypoints1[img1_idx]
        (x2, y2) = keypoints2[img2_idx]
        dxList.append(abs(x1 - x2))
        dyList.append(abs(y1 - y2))

    dxMedian = np.median(np.asarray(dxList, dtype=np.double))
    dyMedian = np.median(np.asarray(dyList, dtype=np.double))
    plot_matches(Image0, Image1, descriptors1, descriptors2, matches12[:10])
    return dxMedian, dyMedian
 def predict(self, x_test):
     prediction = np.zeros( x_test.shape[0] )
     for i in range( x_test.shape[0] ):
         x = x_test[i]
         new_code = np.zeros( self.P )
         for p in range(self.P):
             new_code[p] = self.forests[p].predict(x)
         # predict the class whose codeword has the smallest hamming distance to new_code
         from scipy.spatial.distance import hamming
         min_dist = float(np.inf)
         c_predicted = -1
         for c in range(self.C):
             dist = hamming(new_code, self.code_matrix[c,:])
             if dist < min_dist:
                 min_dist = dist
                 c_predicted = c
         prediction[i] = c_predicted
     return prediction
Example #52
 def detect(self,video,hamming_threshold=21.0/64):
     """
     :param video: video object
     :type: video
     """         
     dctcuts = []
     dctcuts.append(0)
     sigs = video.get_feature('bindct')  
     s = sigs[0]
     #TODO use GPGPU module if possible or numpy.ediff1d
     for i,s_ in enumerate (sigs[1:]):
         if ssd.hamming(s,s_) > hamming_threshold :
             dctcuts.append(i+1)
         s= s_
     dctcuts.append(len(sigs))
     self.cuts = numpy.array(dctcuts)
     self.video = video 
     return self.cuts
Example #53
def ecoc_test():
    svms = loader.load_pickle_file(model_path)
    te_data= loader.load_pickle_file(te_data_path)
    pred = []

    for f in te_data[0]:
        min_hamming_dist = 1.
        match_label = 0
        code = []
        for s in svms:
            c_pred = s.predict([f])[0]
            code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
        for ind, c in enumerate(ecoc):
            cur_hd = hamming(c, code)
            if cur_hd < min_hamming_dist:
                min_hamming_dist = cur_hd
                match_label = ind
        pred.append(match_label)

    return (pred == te_data[1]).sum() / len(te_data[1])
def example_of_correlating_two_components(raw_data, raw_data2, labels, num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    num_training_samples=num_epochs_per_subj*(num_subjects-1)
    clf.fit(list(zip(raw_data[0:num_training_samples], raw_data2[0:num_training_samples])),
            labels[0:num_training_samples])
    X = list(zip(raw_data[num_training_samples:], raw_data2[num_training_samples:]))
    predict = clf.predict(X)
    print(predict)
    print(clf.decision_function(X))
    test_labels = labels[num_training_samples:]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portion, the test data is already in
    print(clf.score(X, test_labels))
Example #55
def ecoc_prediction_single(feature, boosts, ecoc):
    '''

    :param feature:
    :param boosts:
    :param ecoc: ecoc for predicting
    :return:
    '''
    min_hamming_dist = 1.
    match_label = 0
    code = []
    for b in boosts:
        c_pred = b.predict_single(feature)

        code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
    for ind, c in enumerate(ecoc):
        cur_hd = hamming(c, code)
        if cur_hd < min_hamming_dist:
            min_hamming_dist = cur_hd
            match_label = ind
    return match_label
def example_of_aggregating_sim_matrix(raw_data, labels, num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, num_processed_voxels=1000, epochs_per_subj=num_epochs_per_subj)
    rearranged_data = raw_data[num_epochs_per_subj:] + raw_data[0:num_epochs_per_subj]
    rearranged_labels = labels[num_epochs_per_subj:] + labels[0:num_epochs_per_subj]
    clf.fit(list(zip(rearranged_data, rearranged_data)), rearranged_labels,
            num_training_samples=num_epochs_per_subj*(num_subjects-1))
    predict = clf.predict()
    print(predict)
    print(clf.decision_function())
    test_labels = labels[0:num_epochs_per_subj]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portion, the test data is already in
    print(clf.score(None, test_labels))
Example #57
def load_data(data_name, dir_to_data):
    """
    Reading data (also transforming string and binary data to appropriate shape)
    """
    if data_name in ["actg1", "actg2", "actg3"]:
        data = np.genfromtxt(op.join(dir_to_data, data_name + ".data.gz"), dtype = 'str')
        N = len(data)
        data_input = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                data_input[i][j] = Levenshtein.distance(data[i], data[j])
    elif data_name in ["binstr1", "binstr2", "binstr3"]:
        data = np.genfromtxt(op.join(dir_to_data, data_name + ".data.gz"), dtype = 'str')
        N = len(data)
        data_input = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                data_input[i][j] = hamming(list(data[i]), list(data[j]))
    else:
        data_input = np.loadtxt(op.join(dir_to_data, data_name + ".data.gz"), ndmin = 2)
    labels = np.loadtxt(op.join(dir_to_data, data_name + ".labels.gz"), dtype = 'int')
    return (data_input, labels)
Example #58
def mapfn(k, v):
    from dkdt_chunk_ends import chunk_ends
    from scipy.spatial import kdtree
    kdtree.node      = kdtree.KDTree.node
    kdtree.leafnode  = kdtree.KDTree.leafnode
    kdtree.innernode = kdtree.KDTree.innernode
    #k is 0, ..., M
    #v is serialized KDTree
    import kdt_config # import has to be under function. cf. mincemeat README.
    import cPickle
    dkdt = cPickle.loads(v)
    from scipy.spatial.distance import hamming
    import numpy as np
    for i , q in enumerate(kdt_config.queries):
        sims = [(hamming(np.asarray(chunk), np.asarray(q)), j) for j, chunk in enumerate(chunk_ends)]
        sims = sorted(sims)
        sims = sims[-kdt_config.dkdt_S:]
        sims = [str(s[1]) for s in sims]
        if (str(k) in sims): # search if current tree is attached to a similar top leaf
            # check if the ith kdtree is close to query q.
            nearestNeighbors = dkdt.query(q)
            yield i , nearestNeighbors
Example #59
def get_ecoc(ecoc_path, num_ecoc, class_num):
    if path.isfile(ecoc_path):
        print('Loading the ecoc...')
        best_ecoc = loader.load_pickle_file(ecoc_path)
    else:
        print('Creating the ecoc...')
        best_ecoc = [0, [], []]     # distance, ecoc for training, ecoc for predicting
        for i in range(100):
            n = int(math.pow(2, num_ecoc))
            codes = choice(n, class_num)
            ecoc_func_codes = []
            for i in range(num_ecoc):
                ecoc_func_codes.append([])
            c_ecoc = []
            for c in codes:
                bin_s = ('{0:0' + str(num_ecoc) + 'b}').format(c)
                bin_s = [int(ss) for ss in bin_s]
                c_ecoc.append(bin_s)
                for i in range(num_ecoc):
                    ecoc_func_codes[i].append(bin_s[i])
            c_hamming_dist = 0
            has_same_code = False
            for j in range(len(c_ecoc)):
                for k in range(len(c_ecoc)):
                    if j != k:
                        c_hd = hamming(c_ecoc[j], c_ecoc[k])
                        if c_hd == 0:
                            has_same_code = True
                        c_hamming_dist += c_hd
            if has_same_code:
                continue
            if c_hamming_dist > best_ecoc[0]:
                best_ecoc[0] = c_hamming_dist
                best_ecoc[1] = ecoc_func_codes
                best_ecoc[2] = c_ecoc

        # serialize the best ecoc
        loader.save(ecoc_path, best_ecoc)
    return best_ecoc
    ids=0
    for i in range(1,niter):
        data = pickle.load(open(model %(eta,i),'rb'))        
        Amedi=data['Ubs'][1][:,:rk]; 
        Acodei=data['Ubs'][2][:,:rk]; 
        for j in range(i+1,niter):            
            data = pickle.load(open(model %(eta,j),'rb'))
            Amedj=data['Ubs'][1][:,:rk]; 
            Acodej=data['Ubs'][2][:,:rk]; 
            
            # Med stats
            cc=[1-cosine(Amedi[:,r],Amedj[:,r]) for r in range(rk)]
            c['med_cosine']=c['med_cosine']+cc
            cmean['med_cosine']=cmean['med_cosine']+[np.mean(cc)]

            cc=[1-hamming(Amedi[:,r]>=1e-15,Amedj[:,r]>=1e-15) for r in range(rk)]
            c['med_hamming']=c['med_hamming']+cc
            cmean['med_hamming']=cmean['med_hamming']+[np.mean(cc)]
            
            t1=[np.argsort(Amedi[:,r])[::-1][:K] for r in range(rk)]
            t2=[np.argsort(Amedj[:,r])[::-1][:K] for r in range(rk)]
            
            cc=[len(np.intersect1d(t1[r],t2[r]))/float(len(np.union1d(t1[r],t2[r]))) for r in range(rk)]
            c['med_jaccard_topK']=c['med_jaccard_topK']+cc
            cmean['med_jaccard_topK']=cmean['med_jaccard_topK']+[np.mean(cc)]
            
            cc=[len(np.intersect1d(t1[r],t2[r]))/float(K) for r in range(rk)]
            c['med_hamming_topK']=c['med_hamming_topK']+cc
            cmean['med_hamming_topK']=cmean['med_hamming_topK']+[np.mean(cc)]
            
            # code stats