Example no. 1
 def fun(grid, alt):
   n = len(grid)
   tl, tr, bl, br = grid[0][n-2], grid[0][n-1], grid[n-1][n-2], alt
   s_top = similarity(modifier(tl), tr)
   s_bot = similarity(modifier(bl), br)
   # return min(s_top, s_bot)
   return s_top * s_bot
def remove_noise(data, cutoff, radius, colors=None):
    if colors == []: 
        colors = None
    sim = similarity.similarity(data, cutoff, radius, colors=colors)
    sim.bin_data()
    sim.delete_bins()

    newdata = []
    newcolors = []
    for bin in sim.bins:
        for b in bin:
            d = sim.data[b]
            newdata.append(d)
            if colors is not None:
                newcolors.append(colors[b])

    if colors is not None:
        indices = []
        for n in newcolors:
            if n not in indices:
                indices.append(n)

        for i in range(len(newcolors)):
            newcolors[i] = indices.index(newcolors[i])

    return newdata, newcolors
Example no. 3
    def filter(self, s, id_set):
        """
        返回需要预警的id_set
        """
        word_list = self.tf_idf_hd.get_top_n_tf_idf(s)
        word_list_len = len(word_list)
        
        print "/".join(word_list)
        
        repeat_tid_set = set()
        ret_set = id_set
        for i in range(word_list_len):
            key_word_list = [self.word_key_pre + word for word in word_list]
            if word_list_len > 1:
                del key_word_list[i]
            tid_set_s = self.r_hd.sinter(key_word_list)
            tid_set = set([int(i) for i in tid_set_s])
            # tid_set is the set of duplicate ids; add it to the overall duplicate id set
            repeat_tid_set |= tid_set

        key_word_list = [self.word_key_pre + word for word in word_list]
        if repeat_tid_set:
            repeat_tid_list = list(repeat_tid_set)
            if word_list_len < self.sim_judge_limit:
                title_key_list = [self.title_id_pre + str(i) for i in repeat_tid_list]
                # fetch all titles and compare their similarity
                title_list = self.r_hd.mget(title_key_list)
                idx = -1
                for title in title_list:
                    idx += 1
                    if similarity(s, title) > 0.5:
                        break
                if idx >= 0:
                    l_id_set_s = self.r_hd.smembers(self.uid_pre + str(repeat_tid_list[idx]))
                    l_id_set = set([int(i) for i in l_id_set_s])
                    overtime_uid_set = self.check_for_uid_overtime(repeat_tid_set, id_set)
                    ret_set = (id_set - l_id_set) | overtime_uid_set
                    self.update_uid_to_redis(repeat_tid_list, key_word_list, ret_set)
                else:
                    # if no similar title was found, insert it
                    self.insert_s_to_redis(s, key_word_list, ret_set)
            else:
                tid_uid_key_list = [self.uid_pre + str(tid) for tid in repeat_tid_set]
                l_id_set_s = self.r_hd.sunion(tid_uid_key_list)
                l_id_set = set([int(i) for i in l_id_set_s])
                overtime_uid_set = self.check_for_uid_overtime(repeat_tid_set, id_set)
                print "overtime uid set:", overtime_uid_set
                print "repeat tid set:", repeat_tid_set
                ret_set = (id_set - l_id_set) | overtime_uid_set
                self.update_uid_to_redis(repeat_tid_list, key_word_list, ret_set)
        else:
            # if nothing matched at all, insert directly
            self.insert_s_to_redis(s, key_word_list, id_set)
        return ret_set
Example no. 4
def xor_rows_then_compare(grid, alt):
  if len(grid) == 2:
      return 0
  grid = copy.deepcopy(grid)
  grid[2].append(alt)
  top = reduce(lambda a, b: a^b, grid[0])
  bot = reduce(lambda a, b: a^b, grid[2])
  a_top = reduce(lambda a, b: a&b, grid[0], top)
  a_bot = reduce(lambda a, b: a&b, grid[2], bot)
  top_xor = pymorph.open(top^a_top)
  bot_xor = pymorph.open(bot^a_bot)
  return similarity(top_xor, bot_xor)
Example no. 5
def compare_vectors(word_vector1, word_vector2):
    
    all_words = list(set(word_vector1).union(set(word_vector2)))
    #print all_words
    frequency_dict1 = word_frequencies(word_vector1)
    #print frequency_dict1
    frequency_dict2 = word_frequencies(word_vector2)
    #print frequency_dict2

    frequency_vector1 = [frequency_dict1.get(word, 0) for word in all_words]
    frequency_vector2 = [frequency_dict2.get(word, 0) for word in all_words]
    #print frequency_vector1,frequency_vector2

    return similarity(frequency_vector1, frequency_vector2)
Example no. 6
def compare_vectors(word_vector1, word_vector2):
    """Numerical similarity between lists of words. Higher is better.

    Uses cosine similarity.
    Result range: 0 (bad) - 1 (uses all the same words in the same proportions)

    """
    all_words = list(set(word_vector1).union(set(word_vector2)))
    frequency_dict1 = word_frequencies(word_vector1)
    frequency_dict2 = word_frequencies(word_vector2)

    frequency_vector1 = [frequency_dict1.get(word, 0) for word in all_words]
    frequency_vector2 = [frequency_dict2.get(word, 0) for word in all_words]

    return similarity(frequency_vector1, frequency_vector2)
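The two helpers used above are not shown in the snippet; a minimal sketch, assuming `word_frequencies` simply counts occurrences and `similarity` is plain cosine similarity over numeric vectors (the original implementations may differ):

from collections import Counter
from math import sqrt

def word_frequencies(word_vector):
    # Assumed helper: map each word to the number of times it occurs in the list.
    return dict(Counter(word_vector))

def similarity(v1, v2):
    # Assumed helper: cosine similarity between two equal-length numeric vectors.
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = sqrt(sum(a * a for a in v1))
    norm2 = sqrt(sum(b * b for b in v2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)

# Example: compare_vectors("big red dog".split(), "small red dog".split()) -> roughly 0.67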
Example no. 7
def match(
            E1, E2, kpts1, kpts2, desc1, desc2,
            cang, crat, cdesc, th_e, th_p, verb):
    '''
    E1, E2: hyperedge lists of img1 and img2, respectively
    '''
    # indices_taken = []
    hyperedge_matches = []
    point_matches = []
    sel_point_matches = set()

    if verb:
        count = 0
        size = len(E1) * len(E2)

    for i, e_i in enumerate(E1):
        max_similarity = -float('inf')
        for j, e_j in enumerate(E2):
            p = [np.array(kpts1[e_i[k]].pt) for k in xrange(3)]
            q = [np.array(kpts2[e_j[k]].pt) for k in xrange(3)]
            dp = [np.array(desc1[e_i[k]]) for k in xrange(3)]
            dq = [np.array(desc2[e_j[k]]) for k in xrange(3)]
            _point_idx, _sim, sim_a, sim_r, sim_d = similarity(
                p, q,
                dp, dq,
                cang, crat, cdesc
            )
            if verb:
                count += 1
                print '{}/{} = {:.2}%'.format(count, size, 100.0 * count / size)
            if _sim > max_similarity:
                best_index = j
                max_similarity = _sim
                s_ang = sim_a
                s_ratios = sim_r
                s_desc = sim_d
                e_idx = [(e_i[l], e_j[m]) for l, m in _point_idx]
        if max_similarity >= th_e:
            hyperedge_matches.append(
                (i, best_index, max_similarity, s_ang, s_ratios, s_desc)
            )
            for l, m in e_idx:
                dist = LA.norm(np.subtract(desc1[l], desc2[m]))
                sim = exp(-dist / SIGMA)
                if not (l, m) in sel_point_matches and sim >= th_p:
                    point_matches.append(cv2.DMatch(l, m, dist))
                    sel_point_matches.add((l, m))
    return hyperedge_matches, point_matches
Example no. 8
def isomorphic(G1, G2, name=None):
    if G1.number_of_nodes() != G2.number_of_nodes():
        print "Non-isomorphic: different number of nodes."
        return False
    [B1, B2] = stable_colouring([G1, G2])
    print "%d colours found." % len(B1)
    L1 = [np.sum(M) for M in B1]
    L2 = [np.sum(M) for M in B2]
    if L1 != L2:
        print "Non-isomorphic: different stable colouring."
        return False
    #field = next_prime_field(len(B1))
    field = GF(2)
    p = field.getCharacteristic()
    print "Testing similarity over GF(%d)" % p
    # Convert to numpy matrices, ignoring colours with zero entries
    C1 = [numpy_to_nzmath(B1[i], field) for i in range(len(B1)) if L1[i] != 0 ]
    C2 = [numpy_to_nzmath(B2[i], field) for i in range(len(B2)) if L2[i] != 0 ]
    assert len(C1) == len(C2)
    if similarity(C1, C2, name) is None:
        print "Non-isomorphic: no simultaneous similarity in GF(%d)." % p
        return False
    print "Isomorphic or too difficult to tell."
    return True
Example no. 9
def echo_all(updates):
    for update in updates["result"]:
        try:
            found = 0
            text = update["message"]["text"]
            prev_score = 0
            instances = faq.query.all()
            print(instances)
            for instance in instances:
                score, match = similarity(text, instance.question)
                if match and prev_score < score:
                    Atext = instance.answer
                    chat = update["message"]["chat"]["id"]
                    found = 1
                    prev_score = score

            if found != 1:
                Atext = "Sorry I didn't quite get you"
                chat = update["message"]["chat"]["id"]
                send_message(Atext, chat)
            else:
                send_message(Atext, chat)
        except Exception as e:
            print(e)
Example no. 10

# Generate text matrix with help of simple class TmgSimple
tm = TmgSimple(filename='../Data/textDocs.txt', stopwords_filename='../Data/stopWords.txt', stem=True)

# Extract variables representing data
X = np.mat(tm.get_matrix(sort=True))
attributeNames = tm.get_words(sort=True)

# Query vector
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])


# Method 1 ('for' loop - slow)
N = np.shape(X)[0]; # get the number of data objects
sim = np.zeros((N,1)) # allocate a vector for the similarity
for i in range(N):
    x = X[i,:] # Get the i'th data object (here: document)
    sim[i] = q/linalg.norm(q) * x.T/linalg.norm(x) # Compute cosine similarity

# Method 2 (one line of code with no iterations - faster)
sim = (q*X.T).T / (np.sqrt(np.power(X,2).sum(axis=1)) * np.sqrt(np.power(q,2).sum(axis=1)))

# Method 3 (use the "similarity" function)
sim = similarity(X, q, 'cos');


# Display the result
print('Query vector:\n {0}\n'.format(q))
print('Similarity results:\n {0}'.format(sim))
Example no. 11

def w(s):
    sys.stdout.write(s)


if __name__ == '__main__':
    conn = sqlite3.connect(DATABASE)
    files = os.listdir(ROOT_DIR)[15:30]
    nb = len(files)
    for x in xrange(nb):
        f = open(os.path.join(ROOT_DIR, files[x]))
        count = len(json.loads(f.read())['tags'])
        print "%-3d - %s (%d tags)" % (x, files[x], count)
    w("\n\n")
    # Print the matrix.
    w("   ")
    for x in xrange(nb):
        w("  %5d" % x)
    w("\n")
    for x in xrange(nb):
        w("%3d" % x)
        fx = open(os.path.join(ROOT_DIR, files[x]))
        tx = json.loads(fx.read())
        for y in xrange(nb):
            fy = open(os.path.join(ROOT_DIR, files[y]))
            ty = json.loads(fy.read())
            sim = similarity.similarity(tx, ty, conn)
            w("  %5.2f" % sim)
        w("\n")
Example no. 12
 def test_similarity(self):
     self.assertAlmostEqual(similarity([1, 1], [-1, -1]), -1)
     self.assertAlmostEqual(similarity([1, 1], [1, 1]), 1)
     self.assertAlmostEqual(similarity([0, 1], [1, 0]), 0)
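These assertions pin the behaviour down to cosine similarity: -1 for opposite vectors, 0 for orthogonal ones, 1 for identical ones. A minimal sketch that would satisfy them (hypothetical, not necessarily this project's implementation):

import numpy as np

def similarity(u, v):
    # Cosine similarity of two numeric vectors.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))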
Example no. 13
 def test_exact_similiarity(self):
     s = similarity("layer 4 neuron", "layer 4 neuron", use_inter_similarity=False)
     self.assertEqual(s[0], 1.0)
Example no. 14
    print filename
    # compute Mel filterbank features
    mel.compute_mfb(filename)
    # compute dynamic features
    dynfeat.compute_dynamic_features(filename)
    selecfeat.mutual_information_features(filename)
'''

for filename in filenames:
    # select a small set of features
    trainFilenames = filenames[:]  # copy the filename list
    trainFilenames.remove(filename)
    selectFeatures = selecfeat.select_features(filename, trainFilenames)

    # compute the full similarity matrix and the upper diagonal (frame to frame distance)
    similarityUpperDiag = similarity.similarity(filename)
    similarityMatrix = similarity.full_similarity(selectFeatures)

    # get the segment dates
    similarSegmentsInd = similarity.segments_indices(filename)

    # plot the upper diagonal
    nPoints = len(similarityUpperDiag)
    rate = numpy.load(settings.DIR_SAMPLE_RATE + filename + '.npy')
    x = timeconv.timeconv(numpy.arange(nPoints), rate, 'feat', 'second')
    plt.plot(x, similarityUpperDiag)
    plt.vlines(timeconv.timeconv(similarSegmentsInd, rate, 'feat', 'second'),
               min(similarityUpperDiag), 1)
    gtmap = numpy.loadtxt(settings.DIR_LABELS + filename + '.csv',
                          dtype=int,
                          delimiter=',')
Example no. 15
doc_theta = [random.dirichlet([alpha[m]] * K) for m in range(M)]

#beta = random.gamma(1, 1)
beta = random.beta(1, 1)
#corpus_phi = [ random.dirichlet([ beta ] * K) for k in range(K) ]
corpus_phi = [random.dirichlet([beta] * V) for k in range(K)]

# Network.
graph = networkx.Graph()

for i in range(M):
    for j in range(M):
        if i == j:
            continue

        s_ij = similarity.similarity(doc_theta[i], doc_theta[j], 'kld')
        s_ji = similarity.similarity(doc_theta[j], doc_theta[i], 'kld')

        if s_ij < T and s_ji < T:
            graph.add_edge(i, j)

E_size_comp = (M * (M - 1)) / 2
E_size_graph = len(graph.edges())

print('%d / %d - %f' %
      (E_size_graph, E_size_comp, float(E_size_graph) / E_size_comp))

with file(sys.argv[1] + '.edges', 'w') as opened:
    opened.write(json.dumps(graph.edges()))

docs = []
Example no. 16
def distance(q1, q2):
    if q1 == q2:
        return 0
    return similarity(q1, q2)
Example no. 17
#beta = random.gamma(1, 1)
beta = random.beta(1, 1)
#corpus_phi = [ random.dirichlet([ beta ] * K) for k in range(K) ]
corpus_phi = [ random.dirichlet([ beta ] * V) for k in range(K) ]


# Network.
graph = networkx.Graph()

for i in range(M):
    for j in range(M):
        if i == j:
            continue

        s_ij = similarity.similarity(doc_theta[i], doc_theta[j], 'kld')
        s_ji = similarity.similarity(doc_theta[j], doc_theta[i], 'kld')

        if s_ij < T and s_ji < T:
            graph.add_edge(i, j)

E_size_comp = (M * (M-1)) / 2
E_size_graph = len(graph.edges())

print('%d / %d - %f' % (E_size_graph, E_size_comp, float(E_size_graph) / E_size_comp))

with file(sys.argv[1] + '.edges', 'w') as opened:
    opened.write(json.dumps(graph.edges()))


docs = []
Example no. 18
 def test_exact_similiarity(self):
     s = similarity('layer 4 neuron', 'layer 4 neuron', use_inter_similarity=False)
     self.assertEqual(s[0], 1.0)
Example no. 19
    print filename
    # compute Mel filterbank features
    mel.compute_mfb(filename)
    # compute dynamic features
    dynfeat.compute_dynamic_features(filename)
    selecfeat.mutual_information_features(filename)
'''

for filename in filenames:
    # select a small set of features
    trainFilenames = filenames[:]   # copy the filename list
    trainFilenames.remove(filename)
    selectFeatures = selecfeat.select_features(filename, trainFilenames)

    # compute the full similarity matrix and the upper diagonal (frame to frame distance)
    similarityUpperDiag = similarity.similarity(filename)
    similarityMatrix = similarity.full_similarity(selectFeatures)

    # get the segment dates
    similarSegmentsInd = similarity.segments_indices(filename)
    
    # plot the upper diagonal
    nPoints = len(similarityUpperDiag)
    rate = numpy.load(settings.DIR_SAMPLE_RATE + filename + '.npy')
    x = timeconv.timeconv(numpy.arange(nPoints), rate, 'feat', 'second')
    plt.plot(x, similarityUpperDiag)
    plt.vlines(timeconv.timeconv(similarSegmentsInd, rate, 'feat', 'second'), min(similarityUpperDiag), 1)
    gtmap = numpy.loadtxt(settings.DIR_LABELS + filename + '.csv', dtype=int, delimiter=',')
    plt.vlines(gtmap[:,0], min(similarityUpperDiag), 1, color='r')
    plt.title('Segmentation based on frame to frame similarity : ' + filename)
    plt.show()
Example no. 20
plt.ylabel("Density")
plt.subplot(1, 3, 3)
wsm = np.mean(writing_score)
ws_m = writing_score - wsm
sns.distplot(writing_score,
             hist=True,
             kde=True,
             bins=int(100 / 10),
             color='red',
             hist_kws={'edgecolor': 'black'},
             kde_kws={'linewidth': 1})
plt.xlabel("Writing score")
plt.ylabel("Density")
# %%

sim = similarity(writing_score, reading_score, 'cor')
print(sim)

score_attribute = attributeNames[5:8]
print(score_attribute)
i = 0
j = 1
fig7, ax8 = plt.subplots()
for att in range(3):
    ax8.arrow(0, 0, V[att, i], V[att, j])
    ax8.text(V[att, i], V[att, j], score_attribute[att])
ax8.set_xlim([-1, 1])
ax8.set_ylim([-1, 1])
ax8.set_xlabel('PC' + str(i + 1))
ax8.set_ylabel('PC' + str(j + 1))
ax8.grid()
Example no. 21

def w(s):
    sys.stdout.write(s)


if __name__ == "__main__":
    conn = sqlite3.connect(DATABASE)
    files = os.listdir(ROOT_DIR)[15:30]
    nb = len(files)
    for x in xrange(nb):
        f = open(os.path.join(ROOT_DIR, files[x]))
        count = len(json.loads(f.read())["tags"])
        print "%-3d - %s (%d tags)" % (x, files[x], count)
    w("\n\n")
    # Print the matrix.
    w("   ")
    for x in xrange(nb):
        w("  %5d" % x)
    w("\n")
    for x in xrange(nb):
        w("%3d" % x)
        fx = open(os.path.join(ROOT_DIR, files[x]))
        tx = json.loads(fx.read())
        for y in xrange(nb):
            fy = open(os.path.join(ROOT_DIR, files[y]))
            ty = json.loads(fy.read())
            sim = similarity.similarity(tx, ty, conn)
            w("  %5.2f" % sim)
        w("\n")
# exercise 3.2.2

import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5;
x = np.mat(np.random.rand(1,M))
y = np.mat(np.random.rand(1,M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print "Cosine scaling: %.4f " % (similarity(x,y,'cos') - similarity(a*x,y,'cos'))[0,0]
print "ExtendedJaccard scaling: %.4f " % (similarity(x,y,'ext') - similarity(a*x,y,'ext'))[0,0]
print "Correlation scaling: %.4f " % (similarity(x,y,'cor') - similarity(a*x,y,'cor'))[0,0]
print "Cosine translation: %.4f " % (similarity(x,y,'cos') - similarity(b+x,y,'cos'))[0,0]
print "ExtendedJaccard translation: %.4f " % (similarity(x,y,'ext') - similarity(b+x,y,'ext'))[0,0]
print "Correlation translation: %.4f " % (similarity(x,y,'cor') - similarity(b+x,y,'cor'))[0,0]
Example no. 23
 def closest_center_index(self, vector):
     """Get the index of the cluster center closest to `vector`."""
     similarity_to_vector = lambda center: similarity(center,vector)
     center = max(self.centers, key=similarity_to_vector)
     return self.centers.index(center)
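The method above presumes a host object with a `centers` attribute and a `similarity` helper; a minimal self-contained sketch under those assumptions (the class name, cosine measure, and example data are hypothetical):

import numpy as np

def similarity(u, v):
    # Assumed helper: cosine similarity between two numeric vectors.
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

class Clusterer(object):
    # Hypothetical minimal host class: only the attribute the method relies on.
    def __init__(self, centers):
        self.centers = centers

    def closest_center_index(self, vector):
        similarity_to_vector = lambda center: similarity(center, vector)
        center = max(self.centers, key=similarity_to_vector)
        return self.centers.index(center)

# Clusterer([[1, 0], [0, 1]]).closest_center_index([0.9, 0.1]) -> 0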
Example no. 24
import similarity
!curl 'http://www.gutenberg.org/files/11/11-0.txt' -o aliceText.txt
# take all words from alice and store them in memory

aliceFile = open("aliceText.txt")

wordCorpus = []

for line in aliceFile:
    
    # remove newlines
    line = line.strip().lower()
    
    # get words
    words = line.split(" ")
    
    for word in words:
        if word.isalnum():
            if word not in wordCorpus:
                wordCorpus.append(word)
                
print similarity.similarity("rabbi",wordCorpus)
Example no. 25
# exercise 3.2.2

import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5;
x = np.mat(np.random.rand(1,M))
y = np.mat(np.random.rand(1,M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print "Cosine scaling: %.4f " % (similarity(x,y,'cos') - similarity(a*x,y,'cos'))[0,0]
print "ExtendedJaccard scaling: %.4f " % (similarity(x,y,'ext') - similarity(a*x,y,'ext'))[0,0]
print "Correlation scaling: %.4f " % (similarity(x,y,'cor') - similarity(a*x,y,'cor'))[0,0]
print "Cosine translation: %.4f " % (similarity(x,y,'cos') - similarity(b+x,y,'cos'))[0,0]
print "ExtendedJaccard translation: %.4f " % (similarity(x,y,'ext') - similarity(b+x,y,'ext'))[0,0]
print "Correlation translation: %.4f " % (similarity(x,y,'cor') - similarity(b+x,y,'cor'))[0,0]
Example no. 26
    def filter(self, s, id_set):
        """
        返回需要预警的id_set
        """
        word_list = self.tf_idf_hd.get_top_n_tf_idf(s)
        word_list_len = len(word_list)

        print "/".join(word_list)

        repeat_tid_set = set()
        ret_set = id_set
        for i in range(word_list_len):
            key_word_list = [self.word_key_pre + word for word in word_list]
            if word_list_len > 1:
                del key_word_list[i]
            tid_set_s = self.r_hd.sinter(key_word_list)
            tid_set = set([int(i) for i in tid_set_s])
            # tid_set is the set of duplicate ids; add it to the overall duplicate id set
            repeat_tid_set |= tid_set

        key_word_list = [self.word_key_pre + word for word in word_list]
        if repeat_tid_set:
            repeat_tid_list = list(repeat_tid_set)
            if word_list_len < self.sim_judge_limit:
                title_key_list = [
                    self.title_id_pre + str(i) for i in repeat_tid_list
                ]
                # fetch all titles and compare their similarity
                title_list = self.r_hd.mget(title_key_list)
                idx = -1
                for title in title_list:
                    idx += 1
                    if similarity(s, title) > 0.5:
                        break
                if idx >= 0:
                    l_id_set_s = self.r_hd.smembers(self.uid_pre +
                                                    str(repeat_tid_list[idx]))
                    l_id_set = set([int(i) for i in l_id_set_s])
                    overtime_uid_set = self.check_for_uid_overtime(
                        repeat_tid_set, id_set)
                    ret_set = (id_set - l_id_set) | overtime_uid_set
                    self.update_uid_to_redis(repeat_tid_list, key_word_list,
                                             ret_set)
                else:
                    # if no similar title was found, insert it
                    self.insert_s_to_redis(s, key_word_list, ret_set)
            else:
                tid_uid_key_list = [
                    self.uid_pre + str(tid) for tid in repeat_tid_set
                ]
                l_id_set_s = self.r_hd.sunion(tid_uid_key_list)
                l_id_set = set([int(i) for i in l_id_set_s])
                overtime_uid_set = self.check_for_uid_overtime(
                    repeat_tid_set, id_set)
                print "overtime uid set:", overtime_uid_set
                print "repeat tid set:", repeat_tid_set
                ret_set = (id_set - l_id_set) | overtime_uid_set
                self.update_uid_to_redis(repeat_tid_list, key_word_list,
                                         ret_set)
        else:
            # if nothing matched at all, insert directly
            self.insert_s_to_redis(s, key_word_list, id_set)
        return ret_set
Example no. 27
 def closest_center_index(self, vector):
     """Get the index of the cluster center closest to `vector`."""
     similarity_to_vector = lambda center: similarity(center, vector)
     center = max(self.centers, key=similarity_to_vector)
     return self.centers.index(center)
Example no. 28
# # Project the centered data onto principal component space
K = centeredMatrix * V
#print np.size(K,0)
#print np.size(K,1)

# # Compute variance explained by principal components
var = (S * S) / (S * S).sum()
print sum(var[:2]), "fraction of variance explained by the first two principal components"

# # This matrix is used to check the correlation
KTranspose = np.mat(K).T

# # Checking for correlation
correlationMat = np.mat(similarity(KTranspose, KTranspose, 'cor'))
#print np.size(correlationMat,0)
#print np.size(correlationMat,1)
for i in range(len(correlationMat)):
    for j in range(len(correlationMat)):
        if i != j:
            if round(correlationMat[i, j]) == -1 or round(
                    correlationMat[i, j]) == 1:
                print 'correlation between', i, 'and', j

# # Plot the first Direction
f = plt.figure(1)
plt.title('Direction of First Component ')
plt.plot([np.arange(13)], V[0, :], 'o', color='black')
plt.xlabel('Attributes')
plt.ylabel('Weights')
Example no. 29
tm = TmgSimple(filename='../02450Toolbox_Python/Data/textDocs.txt', stopwords_filename='../02450Toolbox_Python/Data/stopWords.txt', stem=True)

# Extract variables representing data
X = tm.get_matrix(sort=True)
attributeNamesWithStop = tm.get_words(sort=True)

# Display the result
print('Now with stopwords !!!')
print attributeNamesWithStop
print X

"""
3.1.5
calculating similarity
Using the similarity lib.
"""
# q is our similarity query; the words are "solving", "rank" & "matrix"
q = np.matrix([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])

sim = similarity(X, q, 'cos')

print('Similarity results:\n {0}'.format(sim))

"""
3.2.1
"""




Example no. 30
 def test_inter_similiarity_PV(self):
     s = similarity("PV neuron", "fast-spiking neuron", use_inter_similarity=True)
     self.assertEqual(s[0], 0.9, "inter similarity works for PV and fast-spiking")
     s_reverse = similarity("fast-spiking neuron", "PV neuron")
     self.assertEqual(s_reverse[0], 0.9, "inter similarity works in both directions")
Example no. 31
#!/usr/bin/env python

import time
import pandas as pd
from collections import OrderedDict
from similarity import similarity
from helpertools import helperTools
import sys

start_time = time.time()
obj1 = helperTools()
obj = similarity()

df = pd.read_csv("/home/hadoop/50review.csv")
domainThesaurus = OrderedDict({
    "SERVICE": ["SERVICE", "WAITER", "STAFF", "SERVER"],
    "ROOM": ["BED", "ROOM", "BATHROOM"],
    "SHOPPING": ["MALL", "SHOPPING", "STORE", "MARKET"],
    "CLEANLINESS": ["DIRTY", "GRUBBY", "CLEAN", "NEAT"],
    "FOOD": [
        "EAT", "DISHES", "DINNER", "FOOD", "BREAKFAST", "DELICIOUS", "MEAL",
        "RESTAURANT", "LUNCH"
    ],
    "VALUE": ["PRICE", "CHEAP", "WORTH", "MONEY", "EXPENSIVE", "PAY"],
    "TRANSPORTATION": ["RELATEDWAY", "STOP", "TRANSPORTATION", "BUS"],
    "FAMILY/FRIENDS": [
        "MOTHER", "FRIEND", "FATHER", "FAMILY", "DAUGHTER", "HUSBAND", "CHILD",
        "SON", "KID", "WIFE"
    ],
    "LOCATION": ["FAR", "NEAR", "LACATION"],
    "VIEW": ["VIEW"],
# exercise 3.2.2

import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5
x = np.mat(np.random.rand(1, M))
y = np.mat(np.random.rand(1, M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print("Cosine scaling: %.4f " %
      (similarity(x, y, 'cos') - similarity(a * x, y, 'cos'))[0, 0])
print("ExtendedJaccard scaling: %.4f " %
      (similarity(x, y, 'ext') - similarity(a * x, y, 'ext'))[0, 0])
print("Correlation scaling: %.4f " %
      (similarity(x, y, 'cor') - similarity(a * x, y, 'cor'))[0, 0])
print("Cosine translation: %.4f " %
      (similarity(x, y, 'cos') - similarity(b + x, y, 'cos'))[0, 0])
print("ExtendedJaccard translation: %.4f " %
      (similarity(x, y, 'ext') - similarity(b + x, y, 'ext'))[0, 0])
print("Correlation translation: %.4f " %
      (similarity(x, y, 'cor') - similarity(b + x, y, 'cor'))[0, 0])
# exercise 3.2.2

import numpy as np
from similarity import similarity

# Generate two data objects with M random attributes
M = 5
x = np.mat(np.random.rand(1, M))
y = np.mat(np.random.rand(1, M))

# Two constants
a = 1.5
b = 1.5

# Check the statements in the exercise
print "Cosine scaling: %.4f " % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
print "ExtendedJaccard scaling: %.4f " % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
print "Correlation scaling: %.4f " % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
print "Cosine translation: %.4f " % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
print "ExtendedJaccard translation: %.4f " % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
print "Correlation translation: %.4f " % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
Example no. 34
i = 1

# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' 
similarity_measure = 'smc'

# Load the CBCL face database
# Load Matlab data file to python dict structure
X = loadmat('../Data/wildfaces_grayscale.mat')['X']
N, M = shape(X)


# Search the face database for similar faces
# Index of all other images than i
noti = range(0,i) + range(i+1,N) 
# Compute similarity between image i and all others
sim = similarity(X[i,:], X[noti,:], similarity_measure)
sim = sim.tolist()[0]
# Tuples of sorted similarities and their indices
sim_to_index = sorted(zip(sim,noti))


# Visualize query image and 5 most/least similar images
figure(figsize=(12,8))
subplot(3,1,1)
imshow(np.reshape(X[i],(40,40)).T, cmap=cm.gray)
xticks([]); yticks([])
title('Query image')
ylabel('image #{0}'.format(i))


for ms in range(5):
if __name__ == '__main__':
    p = [
        np.array([119.91277313232422, 252.8047332763672]),
        np.array([139.75482177734375, 284.2823181152344]),
        np.array([109.2437744140625, 257.5870056152344])
    ]
    dp = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    # q = [trans(rot(pp, 10 * math.pi / 180), 2, 3) for pp in p]
    q = [
        np.array([374.65393066, 144.14517212]),
        np.array([161.24133301, 217.84576416]),
        np.array([99.27768707, 161.40586853])
    ]
    dq = [[1, 2, 3.1], [4, 5.1, 6], [7.1, 8, 9]]

    point_match, max_sim, sim_a, sim_r, sim_d = similarity(
        p, q, dp, dq, 1, 1, 0.4, True)
    print p
    print q
    print 'max sim    ->  {}'.format(max_sim)
    print 'sim angles  :  {}'.format(sim_a)
    print 'sim ratios  :  {}'.format(sim_r)
    print 'sim desc    :  {}'.format(sim_d)

    xp, yp = zip(*p)
    xq, yq = zip(*q)
    plt.plot(xp + (xp[0], ), yp + (yp[0], ), 'r-')
    plt.plot(xq + (xq[0], ), yq + (yq[0], ), 'b-')
    plt.xlim((min(xp + xq), max(xp + xq)))
    plt.ylim((min(yp + yq), max(yp + yq)))
    plt.gca().invert_yaxis()
    plt.show()
Example no. 36
 def test_inter_similiarity_PV(self):
     s = similarity('PV neuron', 'fast-spiking neuron', use_inter_similarity=True)
     self.assertEqual(s[0], 0.9, 'inter similarity works for PV and fast-spiking')
     s_reverse = similarity('fast-spiking neuron', 'PV neuron')
     self.assertEqual(s_reverse[0], 0.9, 'inter similarity works in both directions')
Example no. 37
def testing_news_all_values(d, test_news, threshold_prob_fake, min_common_en, component_selector, Dice_intersection__intensity):
    ## Initialising the count for prediction "fake news"
    count_predictions_Fake = 0
    predictions_Fake = []
    predictions_Fake_value = []
    ## There must be at least n_tested_news tested news
    # Initialising the index in the list of test news to read
    len_test_news = len(test_news)
    # Initialising the count of valid tested news
    valid_tested_news = 0
    for i in range(len_test_news):
        ## Reading test news number i
        test_news_dict = test_news[i]

        ## Filtering the knowledge base graph through the test news
        knowledge_filtered, error_size_KF = knowledge_filtered_fake(d, min_common_en, test_news_dict)

        ## If the filtered knowledge graph is too small, pass to the next news.
        if error_size_KF == 1:
            predictions_Fake_value.append(float("nan"))
        ## Otherwise continue the process
        else:
            ## Appending the fake news to the filtered knowledge graph. The fake news is in the last position
            knowledge_filtered[test_news_dict["SOURCE"]] = test_news_dict

            ## It is necessary to apply a previous filter (regarding Entity Names and Related Words) to the
            # obtained filtered knowledge graph including the fake news
            rwords_news_min = 10
            rwords_en_min = 1
            knowledge_filtered = previous_filter(knowledge_filtered, rwords_news_min, rwords_en_min)

            ## Obtaining the similarity and the dissimilarity matrices with the selected components
            #  Fixing some basic parameters for the similarity measure
            optionSimbSim = "Ichino_yaguchi"
            gamma = 0.2  # In case of Ichino-Yaguchi similarity
            # Similarity calculations
            dis_matrix = similarity(knowledge_filtered, component_selector, optionSimbSim,
                                    Dice_intersection__intensity, gamma)

            ## Automatic selection of parameters epsilon and min_samples in the DBSCAN algorithm
            epsilon, min_samples, error_parameters_DBSCAN = DBSCAN_parameters_epsilon_minsamples(dis_matrix)
            ## If the filtered knowledge graph is too small, pass to the next news
            if error_parameters_DBSCAN == 1:
                predictions_Fake_value.append(float("nan"))
            ## Otherwise continue the process
            else:
                ## DBSCAN algorithm results
                dbscan_labels, dbscan_n_clusters, dbscan_n_noise, var_exp, label_fake, \
                prob_fake = dbscan_clustering(dis_matrix, epsilon, min_samples, None, False)

                predictions_Fake_value.append(prob_fake)

                ## Deciding whether the test news is fake or not and, if so, adding it to the
                # count of "Fake" predictions
                if (prob_fake > threshold_prob_fake):
                    count_predictions_Fake += 1
                    predictions_Fake.append(i)

                valid_tested_news += 1

        print("i = ", i, ";     valid_tested_news = ", valid_tested_news)

    ## Computing the number of predictions "NotFake"
    count_predictions_NotFake = valid_tested_news - count_predictions_Fake

    return count_predictions_Fake, count_predictions_NotFake, predictions_Fake, predictions_Fake_value
Example no. 38
        arr_ins_textValid.append(arr_ins_text)
        arr_ins_imgValid.append(arr_ins_img)
        arr_ins_timeValid.append(arr_ins_time)
        arr_labelValid.append(arr_label)
    executor.shutdown()
    endtime = datetime.datetime.now()
    print(" Time Cost: ", (endtime - starttime))


config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.15
# tf.reset_default_graph()
sess = tf.Session(config=config)
print("Loading Model", end="")
similarityV = similarity(batch_size1)
saver = tf.train.Saver(max_to_keep=3)
sess.run(tf.global_variables_initializer())
if loadModel:
    model_file = tf.train.latest_checkpoint('./model/')
    saver.restore(sess, model_file)

print(" finally", end="")

print(" finished")


def npyCos(vector1, vector2):
    return np.dot(
        vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
Example no. 39
def calc_price_sim(price1, price2):
    return similarity(price1, price2)
Example no. 40
def main(args):
    print('Start test')

    creds = ReadDictJson(args.credentails)
    if not creds:
        print('Failed to load credentials file {}. Exiting'.format(args.credentails))
        return False

    s3def = creds['s3'][0]
    s3 = s3store(s3def['address'], 
                 s3def['access key'], 
                 s3def['secret key'], 
                 tls=s3def['tls'], 
                 cert_verify=s3def['cert_verify'], 
                 cert_path=s3def['cert_path']
                 )

    trainingset = '{}/{}/'.format(s3def['sets']['trainingset']['prefix'] , args.trainingset)
    print('Load training set {}/{} to {}'.format(s3def['sets']['trainingset']['bucket'],trainingset,args.trainingset_dir ))
    s3.Mirror(s3def['sets']['trainingset']['bucket'], trainingset, args.trainingset_dir)

    trainingsetDescriptionFile = '{}/description.json'.format(args.trainingset_dir)
    trainingsetDescription = json.load(open(trainingsetDescriptionFile))
    
    config = {
        'batch_size': args.batch_size,
        'trainingset': trainingsetDescription,
        'input_shape': [args.training_crop[0], args.training_crop[1], args.train_depth],
        'classScale': 0.001, # scale value for each product class
        'augment_rotation' : 5., # Rotation in degrees
        'augment_flip_x': False,
        'augment_flip_y': True,
        'augment_brightness':0.,
        'augment_contrast': 0.,
        'augment_shift_x': 0.0, # in fraction of image
        'augment_shift_y': 0.0, # in fraction of image
        'scale_min': 0.75, # in fraction of image
        'scale_max': 1.25, # in fraction of image
        'ignore_label': trainingsetDescription['classes']['ignore'],
        'classes': trainingsetDescription['classes']['classes'],
        'epochs': 1,
        'area_filter_min': 25,
        'weights': None,
        'channel_order': args.channel_order,
        's3_address':s3def['address'],
        's3_sets':s3def['sets'],
        'initialmodel':args.initialmodel,
        'training_dir': None, # used by LoadModel
        'learning_rate': 1e-3, # used by LoadModel
        'clean' : True,
        'test_archive': trainingset,
        'run_archive': '{}{}/'.format(trainingset, args.initialmodel),
        'min':args.min,
    }

    trainingsetDescriptionFile = '{}/description.json'.format(args.trainingset_dir)
    trainingsetDescription = json.load(open(trainingsetDescriptionFile))

    strategy = None
    if(args.strategy == 'mirrored'):
        strategy = tf.distribute.MirroredStrategy(devices=args.devices)

    else:
        device = "/gpu:0"
        if args.devices is not None and len(args.devices) > 0:
            device = args.devices[0]

        strategy = tf.distribute.OneDeviceStrategy(device=device)

    # Prepare datasets for similarity computation
    objTypes = {}
    for objType in trainingsetDescription['classes']['objects']:
        if objType['trainId'] not in objTypes:
            objTypes[objType['trainId']] = copy.deepcopy(objType)
            # set name to category for objTypes and id to trainId
            objTypes[objType['trainId']]['name'] = objType['category']
            objTypes[objType['trainId']]['id'] = objType['trainId']

    results = {'class similarity':{}, 'config':config, 'image':[]}

    for objType in objTypes:
        results['class similarity'][objType] = {'union':0, 'intersection':0} 

    with strategy.scope(): # Apply training strategy 
        model =  LoadModel(config, s3)
        accuracy = tf.keras.metrics.Accuracy()

        # Display model
        model.summary()

        #train_dataset = input_fn('train', args.trainingset_dir, config)
        val_dataset = input_fn('val', args.trainingset_dir, config)

        trainingsetdesc = {}
        validationsetdec = {}
        for dataset in config['trainingset']['sets']:
            if dataset['name'] == 'val':
                validationsetdec = dataset
            if dataset['name'] == 'train':
                trainingsetdesc = dataset

        print("Begin inferences")
        dtSum = 0.0
        accuracySum = 0.0
        total_confusion = None
        iterator = iter(val_dataset)
        numsteps = int(validationsetdec['length']/config['batch_size'])

        if(config['min']):
            numsteps=min(args.min_steps, numsteps)

        try:
            for i in tqdm(range(numsteps)):
                image, annotation  = iterator.get_next()
                initial = datetime.now()
                logits = model.predict(image, batch_size=config['batch_size'], steps=1)
                segmentation = tf.argmax(logits, axis=-1)
                dt = (datetime.now()-initial).total_seconds()
                dtSum += dt
                imageTime = dt/config['batch_size']
                for j in range(config['batch_size']):
                    img = tf.squeeze(image[j]).numpy().astype(np.uint8)
                    ann = tf.squeeze(annotation[j]).numpy().astype(np.uint8)
                    seg = tf.squeeze(segmentation[j]).numpy().astype(np.uint8)

                    accuracy.update_state(ann,seg)
                    seg_accuracy = accuracy.result().numpy()

                    accuracySum += seg_accuracy
                    imagesimilarity, results['class similarity'], unique = jaccard(ann, seg, objTypes, results['class similarity'])

                    confusion = tf.math.confusion_matrix(ann.flatten(),seg.flatten(), config['classes']).numpy().astype(np.int64)
                    if total_confusion is None:
                        total_confusion = confusion
                    else:
                        total_confusion += confusion
                        

                    results['image'].append({'dt':imageTime,'similarity':imagesimilarity, 'accuracy':seg_accuracy.astype(float), 'confusion':confusion.tolist()})
        except Exception as e:
            print("Error: test exception {} step {}".format(e, i))
            numsteps = i
        except:
            print("Error: test exception step {}".format(i))
            numsteps = i

    num_images = numsteps*config['batch_size']
    average_time = dtSum/num_images
    average_accuracy = accuracySum/num_images
    sumIntersection = 0
    sumUnion = 0
    sumAccuracy = 0.0
    dataset_similarity = {}
    for key in results['class similarity']:
        intersection = results['class similarity'][key]['intersection']
        sumIntersection += intersection
        union = results['class similarity'][key]['union']
        sumUnion += union
        class_similarity = similarity(intersection, union)

        # convert to int from int64 for json.dumps
        dataset_similarity[key] = {'intersection':int(intersection) ,'union':int(union) , 'similarity':class_similarity}

    results['class similarity'] = dataset_similarity
    total_similarity = similarity(sumIntersection, sumUnion)

    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
    test_summary = {'date':date_time, 'model':config['initialmodel']}
    test_summary['accuracy']=average_accuracy
    test_summary['class_similarity']=dataset_similarity
    test_summary['similarity']=total_similarity
    test_summary['confusion']=total_confusion.tolist()
    test_summary['images']=num_images
    test_summary['image time']=average_time
    test_summary['batch size']=config['batch_size']
    test_summary['test store'] =s3def['address']
    test_summary['test bucket'] = s3def['sets']['trainingset']['bucket']
    test_summary['results'] = results
    
    print ("Average time {}".format(average_time))
    print ('Similarity: {}'.format(dataset_similarity))

    # If there were a way to lock this object between read and write, it would prevent the possibility of losing data
    training_data = s3.GetDict(s3def['sets']['trainingset']['bucket'], config['test_archive']+args.tests_json)
    if training_data is None:
        training_data = []
    training_data.append(test_summary)
    s3.PutDict(s3def['sets']['trainingset']['bucket'], config['test_archive']+args.tests_json, training_data)

    test_url = s3.GetUrl(s3def['sets']['trainingset']['bucket'], config['test_archive']+args.tests_json)

    print("Test results {}".format(test_url))
Example no. 41
def calc_ordering_sim(ordering1, ordering2, matrix):
    return similarity(ordering1, ordering2, matrix)
Example no. 42
# Image to use as query
i = 1

# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
similarity_measure = 'smc'

# Load the CBCL face database
# Load Matlab data file to python dict structure
X = loadmat('../Data/wildfaces_grayscale.mat')['X']
N, M = shape(X)

# Search the face database for similar faces
# Index of all other images than i
noti = range(0, i) + range(i + 1, N)
# Compute similarity between image i and all others
sim = similarity(X[i, :], X[noti, :], similarity_measure)
sim = sim.tolist()[0]
# Tuples of sorted similarities and their indices
sim_to_index = sorted(zip(sim, noti))

# Visualize query image and 5 most/least similar images
figure(figsize=(12, 8))
subplot(3, 1, 1)

img_hw = int(sqrt(len(X[0])))
imshow(np.reshape(X[i], (img_hw, img_hw)).T, cmap=cm.gray)
xticks([])
yticks([])
title('Query image')
ylabel('image #{0}'.format(i))
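For reference, the 'smc' option above selects the simple matching coefficient, i.e. the fraction of attributes on which two binary vectors agree; a minimal sketch of that measure assuming already-binarised inputs (how the toolbox binarises the grayscale faces before applying it is not shown here):

import numpy as np

def smc(x, y):
    # Simple matching coefficient: fraction of positions where two binary vectors agree.
    x = np.asarray(x)
    y = np.asarray(y)
    return float(np.mean(x == y))

# smc([1, 0, 1, 1], [1, 1, 1, 0]) -> 0.5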
Example no. 43
def calc_cuisine_sim(cuisine1, cuisine2, matrix):
    return similarity(cuisine1, cuisine2, matrix)
for i in range(0, len(file_name)):
    for j in range(0, len(file_name)):
        if i != j:

            sentList1 = file_text[i]
            sentList2 = file_text[j]

            print(file_name[i], "&", file_name[j])

            for sentList1Text in sentList1:
                max = 0
                sim = 0
                comparedStatement = None
                for sentList2Text in sentList2:
                    sim = similarity(sentList1Text, sentList2Text)
                    if sim > 0.5:
                        if max < sim:
                            max = sim
                            comparedStatement = sentList2Text
                print(sentList1Text, "&", comparedStatement)
                print(max)
                simList.append(max)

            simListArr = np.array(simList)
            print("Similarity:", np.sqrt(np.mean(simListArr**2)))
            print("")

# Calculate execution time
end = time.time()
dur = end - start
        np.array([119.91277313232422, 252.8047332763672]),
        np.array([139.75482177734375, 284.2823181152344]),
        np.array([109.2437744140625, 257.5870056152344])
    ]
    dp = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    # q = [trans(rot(pp, 10 * math.pi / 180), 2, 3) for pp in p]
    q = [
        np.array([374.65393066, 144.14517212]),
        np.array([161.24133301, 217.84576416]),
        np.array([99.27768707, 161.40586853])
    ]
    dq = [[1, 2, 3.1], [4, 5.1, 6], [7.1, 8, 9]]

    point_match, max_sim, sim_a, sim_r, sim_d = similarity(
        p, q,
        dp, dq,
        1, 1, 0.4, True
    )
    print p
    print q
    print 'max sim    ->  {}'.format(max_sim)
    print 'sim angles  :  {}'.format(sim_a)
    print 'sim ratios  :  {}'.format(sim_r)
    print 'sim desc    :  {}'.format(sim_d)

    xp, yp = zip(*p)
    xq, yq = zip(*q)
    plt.plot(xp + (xp[0],), yp + (yp[0],), 'r-')
    plt.plot(xq + (xq[0],), yq + (yq[0],), 'b-')
    plt.xlim((min(xp + xq), max(xp + xq)))
    plt.ylim((min(yp + yq), max(yp + yq)))