Example #1
    def summarize(self,
                  content: str,
                  title: str = None,
                  splitChar='(。|!|\!|?|\?|\n|\t)',
                  proportion=0.3):
        contents = self._splitText(content, splitChar=splitChar)

        # Get the title vector
        if title is not None:
            title = title.strip()
            if splitChar.find(title[-1]) == -1:
                title += '。'
            contents.insert(0, title)
        # Get the full-article vector
        contents.append(content)

        # print(' len(contents)',  len(contents))
        if len(contents) <= 4:
            return contents

        sentencesVec = self.sif.getSentencesEmbedding(contents)

        sentencesVec = list(sentencesVec)
        contentVec = sentencesVec.pop()

        similarities = [(similarity.cosine_similarity(senVec,
                                                      contentVec), index)
                        for index, senVec in enumerate(sentencesVec)]
        similarities2 = [(similarity.cosine_similarity(senVec,
                                                       sentencesVec[0]), index)
                         for index, senVec in enumerate(sentencesVec)]
        similarities = [((sim1[0] * 0.382 + sim2[0] * 0.618), sim1[1])
                        for sim1, sim2 in zip(similarities, similarities2)]
        # Smooth the similarities with KNN
        similarities = self._knnSmooth(similarities)

        # Sort in descending order
        similarities.sort(reverse=True)

        summarySentenceIndexes = similarities[
            0:int(len(similarities) * proportion)]
        # print("summarySentenceIndexes:")
        # for i, sim in enumerate(summarySentenceIndexes):
        #     print(i, "index:", sim[1], sim, contents[sim[1]])
        summarySentences = [(index, contents[index])
                            for (cos, index) in summarySentenceIndexes]

        summarySentences.sort()

        return [sentence for (index, sentence) in summarySentences]
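Most of the snippets on this page call a small project-local helper, `similarity.cosine_similarity(vec_a, vec_b)` or a bare `cosine_similarity(vec_a, vec_b)`, that takes two dense vectors. The helper itself is not shown here; the following is only a minimal NumPy sketch of what such a function typically looks like (the zero-norm handling is an assumption, not the projects' actual behaviour):

import numpy as np


def cosine_similarity(vec_a, vec_b):
    """Cosine of the angle between two dense vectors (illustrative sketch)."""
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm == 0:
        # Convention assumed here: similarity is 0 if either vector is all zeros.
        return 0.0
    return float(np.dot(a, b) / norm)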
Example #2
def knn_classify(test_tf, train_tf, train_class, k):
    tf_distance = {}
    # Compute the distance (here, cosine similarity) between each training document's keyword term-frequency vector and the input vector
    for place in train_tf.keys():
        tf_distance[place] = cosine_similarity(train_tf.get(place), test_tf)

    # Sort the distances and take the classes of the k nearest neighbours

    class_count = {}
    # print('(2) Classes of the k nearest neighbours, k = %d' % k)
    for i, place in enumerate(
            sorted(tf_distance, key=tf_distance.get, reverse=True)):
        current_class = train_class.get(place)
        # print('\tTF(%s) = %f, class = %s' % (place, tf_distance.get(place), current_class))
        class_count[current_class] = class_count.get(current_class, 0) + 1
        if (i + 1) >= k:
            break

    print('(3) The most frequent class among the k nearest neighbours is taken as the final class')
    input_class = ''
    for i, c in enumerate(
            sorted(class_count, key=class_count.get, reverse=True)):
        if i == 0:
            input_class = c
        print('\t%s, %d' % (c, class_count.get(c)))

    print('(4) Classification result = %s' % input_class)

    return str(input_class)
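A hedged usage sketch for `knn_classify`: `train_tf` maps a document key to its term-frequency vector, `train_class` maps the same key to a class label, and `test_tf` is the vector to classify. The toy data below, and the assumption that a vector-based `cosine_similarity` (such as the sketch under Example #1) is in scope, are illustrative only.

# Toy term-frequency vectors over a shared four-term vocabulary (illustrative).
train_tf = {
    'doc_a': [3, 0, 1, 0],
    'doc_b': [0, 2, 0, 4],
    'doc_c': [2, 1, 1, 0],
}
train_class = {'doc_a': 'sports', 'doc_b': 'finance', 'doc_c': 'sports'}
test_tf = [1, 0, 2, 0]

predicted = knn_classify(test_tf, train_tf, train_class, k=2)
print(predicted)  # 'sports' for this toy data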
Example #3
def score_batch_cosine(users, user, user_id, movie_ids):
    weights = [cosine_similarity(user, u) for u in users]

    ratings = []
    for movie_id in movie_ids:
        sum_w = 0
        rating = 0

        for w, u_other in zip(weights, users):
            u_rating = u_other[movie_id]
            if u_rating == 0:
                continue

            sum_w += w
            rating += (w * u_rating)

        if sum_w != 0:
            rating /= sum_w
        else:
            # If no relevant info was found, guess a score of 3.
            rating = 3

        rating = int(np.rint(rating))
        ratings.append(rating)

    return clean_ratings(ratings)
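`score_batch_cosine` implements a weighted average: each candidate rating is the sum of other users' ratings weighted by their cosine similarity to the active user, divided by the sum of the weights, with 3 as a fallback when nobody has rated the movie. It also depends on a `clean_ratings` helper that is not shown on this page; the stub below (a clamp to the 1-5 scale) and the assumption that a vector-based `cosine_similarity` like the Example #1 sketch is in scope are only there so the sketch runs.

import numpy as np


def clean_ratings(ratings):
    # Hypothetical stand-in for the project's post-processing helper.
    return [min(5, max(1, r)) for r in ratings]


# Users are dense rating vectors indexed by movie id; 0 means "not rated".
users = [
    np.array([5, 3, 0, 1]),
    np.array([4, 0, 0, 1]),
    np.array([1, 1, 0, 5]),
]
active_user = np.array([5, 4, 0, 1])

print(score_batch_cosine(users, active_user, user_id=0, movie_ids=[2, 3]))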
Example #4
def quadratic_entropy(example, train_term_dist, word2id, word2vec):
    """Calculates Quadratic Entropy."""
    assert word2vec is not None, ('Error: Word vector representations have to '
                                  'be available for quadratic entropy.')
    summed = 0
    for word_1 in set(example):
        if word_1 not in word2id or word_1 not in word2vec:
            continue  # continue as the product will be 0
        for word_2 in set(example):
            if word_2 not in word2id or word_2 not in word2vec:
                continue  # continue as the product will be 0
            p_1 = train_term_dist[word2id[word_1]]
            p_2 = train_term_dist[word2id[word_2]]
            vec_1 = word2vec[word_1]
            vec_2 = word2vec[word_2]
            sim = similarity.cosine_similarity(vec_1, vec_2)
            summed += sim * p_1 * p_2
    return summed
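`quadratic_entropy` sums, over all pairs of words in the example, the cosine similarity of their embeddings weighted by both words' probabilities in the training-term distribution. The standalone sketch below computes the same quantity on toy data; the names and numbers are illustrative, not the original project's:

import numpy as np


def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


word2id = {'cat': 0, 'dog': 1}
train_term_dist = np.array([0.6, 0.4])  # p(word) in the training corpus
word2vec = {'cat': np.array([1.0, 0.0]), 'dog': np.array([0.8, 0.6])}

example = ['cat', 'dog', 'cat']
words = [w for w in set(example) if w in word2id and w in word2vec]
q = sum(cosine(word2vec[w1], word2vec[w2]) *
        train_term_dist[word2id[w1]] * train_term_dist[word2id[w2]]
        for w1 in words for w2 in words)
print(q)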
Example #5
def main_algo(features, tweetid, lastclusterid):

    fvecs, freqdict = tfidf_all.get_tfidf_freqdict(features)

    # Creating random vectors
    num_randvecs = 13
    random_vectors = randomvecs.getVecs(len(freqdict), num_randvecs)

    # Initialising prefix trees
    a = []
    b = []
    prime = 13
    P = []
    # modP = int(input("Enter number of permutations to be used : "))
    modP = 20
    for i in range(modP):
        atemp = random.uniform(1, prime)
        btemp = random.uniform(0, prime)
        a.append(atemp)
        b.append(btemp)
        P.append(pygtrie.Trie())

    index = 0
    wordindexmap = {}
    for key in freqdict.keys():
        wordindexmap[key] = index
        index = index + 1

    # MAIN TWEET LOOP

    tweetclustermap = {}
    clusterdict = {}
    for fvec in fvecs:
        tweetsign = signature.getSign(fvec, random_vectors, wordindexmap)

        # Insert tweet signature in prefix tree and find its nearest neighbor in that tree
        nearestNeighbours = []
        for i in range(modP):
            signPerm = [None] * len(tweetsign)
            for x in range(len(tweetsign)):
                ind = int(a[i] * x + b[i]) % prime
                signPerm[x] = tweetsign[ind]

            if P[i].has_key(signPerm):
                P[i][signPerm].append(tweetid)
            else:
                P[i][signPerm] = [tweetid]

            neighbor, hdist = nearest_neighbor.getNN(signPerm, P[i])

            if neighbor is None:
                pass
            elif hdist == 0:
                neighbor.remove(tweetid)
                nearestNeighbours.append((neighbor, hdist))
            elif hdist == 1:
                nearestNeighbours.append((neighbor, hdist))
            elif hdist > 1:
                templist = []
                for item in neighbor:
                    templist += item[1]
                nearestNeighbours.append((templist, hdist))

        # Keep only the neighbours found at the smallest Hamming distance
        mindist = len(signPerm) + 10
        closestNeighbors = []
        for pair in nearestNeighbours:
            if pair[1] <= mindist:
                mindist = pair[1]

        for pair in nearestNeighbours:
            if pair[1] == mindist:
                for i in range(len(pair[0])):
                    if not pair[0][i] in closestNeighbors:
                        closestNeighbors.append(pair[0][i])

        # T = float(input("Enter the similarity threshold : "))
        T = 0.05
        tweetclustermap[0] = 0
        clusterdict[0] = [0]

        for cneighbor in closestNeighbors:
            if (similarity.cosine_similarity(fvec, fvecs[cneighbor]) >= T):
                if (tweetid in tweetclustermap.keys()):
                    if (not (tweetclustermap[tweetid]
                             == tweetclustermap[cneighbor])):
                        tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                        clusterdict[tweetclustermap[cneighbor]].append(tweetid)
                else:
                    tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                    clusterdict[tweetclustermap[cneighbor]].append(tweetid)
            else:
                if (not (tweetid in tweetclustermap.keys())):
                    tweetclustermap[tweetid] = lastclusterid + 1
                    clusterdict[lastclusterid + 1] = [tweetid]
                    lastclusterid += 1

        tweetid = tweetid + 1
    return clusterdict, fvecs, freqdict
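The inner loop of `main_algo` scrambles each LSH signature with a simple affine map, `ind = int(a[i] * x + b[i]) % prime`, before inserting it into the i-th prefix tree. The standalone sketch below isolates that one step; the helper name `permute_signature` is made up for illustration, and, as in the original expression, the float coefficients mean the map is not guaranteed to be a true permutation:

import random


def permute_signature(sign, a, b, prime=13):
    """Reorder a bit signature with the affine map x -> int(a * x + b) % prime."""
    return [sign[int(a * x + b) % prime] for x in range(len(sign))]


sign = [random.randint(0, 1) for _ in range(13)]  # 13 = num_randvecs = prime
print(permute_signature(sign, a=random.uniform(1, 13), b=random.uniform(0, 13)))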
Example #6
            # average centre
            # classe_i = np.argwhere(labels == i)
            # cur_centre_item = np.mean(feature[classe_i], axis=0)

            # min-distance centre: use the sample closest to the cluster mean
            classe_i = np.argwhere(labels == i)
            mean_centre = np.mean(feature[classe_i], axis=0)
            eudistance = np.array(
                [eucliDist(mean_centre[0], r[0]) for r in feature[classe_i]])
            min_centre = classe_i[np.argmin(eudistance)]
            min_centre = min_centre[0]

            is_old = 0
            cur_centre = np.concatenate((cur_centre, [feature[min_centre]]), axis=0)
            if pre_centre.any():
                for item in pre_centre:
                    cos_value = cosine_similarity(item, feature[min_centre])
                    if cos_value > 0.8:
                        old.append(train_data[min_centre]+[str(labels[min_centre])])
                        is_old = 1
                        break
            if not is_old: new.append(train_data[min_centre]+[str(labels[min_centre])])
        pre_centre = cur_centre

        # write cluster result
        write_data = []
        print(train_data[0])
        for i, item in enumerate(train_data):
            item.append(labels[i])
            write_data.append(item)
        write_data.sort(key=lambda a: a[-1])
        write_data = ['\t'.join([l[0], l[4], l[1], str(l[5])]) for l in write_data]
Example #7
 def test_similar_cosine_similarity(self):
     with self.assertRaises(NotImplementedError):
         VideoHistory.similar('http://www.inf.puc-rio.br', 5,
                              sim.cosine_similarity())
Example #8
library_path = os.listdir(copyrighted_works)

# list of text files
files = [file for file in library_path if file.endswith('.txt')]

for file in files:
    # read text data for each copyrighted content
    # extract TF_IDF terms and values
    # calculate the similarity between this vector and uploaded text file vector
    
    full_path_file = os.path.join(copyrighted_works, file)
    # read raw texts one by one from copyright library path
    try:
        with open(full_path_file, "r", encoding='iso-8859-1') as ifile:
            raw_text = ifile.read() # raw text of one of copyrighted works
    except:
        with open(full_path_file, "r", encoding='utf-8') as ifile:
            raw_text = ifile.read() # raw text of one of copyrighted works

    tfs_dict2 = tf_idf(raw_text) # dictionary of tfidf terms and values {'tfs_values':tfs_values, 'tfs_term':tfs_term}

    # calculate cosine similarity between this copyrighted work and the newly uploaded text
    similarity = cosine_similarity(tfs_dict1["tfs_term"], tfs_dict1["tfs_values"],
                                   tfs_dict2["tfs_term"], tfs_dict2["tfs_values"])
    print(file, ":   ", similarity)
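Unlike the vector-based helpers in the earlier examples, the `cosine_similarity` called here takes four arguments: the TF-IDF terms and values of each document. Its implementation is not shown on this page; a compatible sketch over parallel term/value lists (names and zero-norm handling assumed) might look like this:

import math


def cosine_similarity(terms1, values1, terms2, values2):
    """Cosine similarity of two sparse TF-IDF vectors given as term/value lists."""
    v1 = dict(zip(terms1, values1))
    v2 = dict(zip(terms2, values2))
    dot = sum(v1[t] * v2[t] for t in v1 if t in v2)
    norm1 = math.sqrt(sum(v * v for v in v1.values()))
    norm2 = math.sqrt(sum(v * v for v in v2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)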
Example #9
 def find_users_similarity(self, user1, user2):
     x = self.user_interface.get_user_vector(user1)
     y = self.user_interface.get_user_vector(user2)
     return similarity.cosine_similarity(x, y)
Example #10
clusterdict = {}
for i in range(len(rlist)):
    pair = json.loads(rlist[i][0])
    clusterdict[pair[0]] = pair[1]

None        # for breakpoint


# for clusterdata2.csv
r = csv.reader(open('clusterdata2.csv', encoding='utf8'))
clusterlist = list(r)
for i in range(len(clusterlist)):
    clusterlist[i] = json.loads(clusterlist[i][0])

# Testing similarity of tweets in a cluster
r = csv.reader(open('tfidfdata.csv', encoding='utf8'))
fvecs = list(r)
for i in range(len(fvecs)):
    fvecs[i] = json.loads(fvecs[i][0])

#clusterid = int(input("Enter id of the cluster to be tested : ") )
clusterid = 23
simlist = [similarity.cosine_similarity(fvecs[tweetid1], fvecs[tweetid2])
           for tweetid1 in clusterdict[clusterid]
           for tweetid2 in clusterdict[clusterid]
           if tweetid1 != tweetid2]

false_positives = 0
for sim in simlist:
    if sim < 0.2:
        false_positives += 1

print(false_positives, false_positives/len(simlist))
Example #11
        else:
            test20_rated_indexes[user_id - 401].append(movie_id)

file.close()

#########################################################################################
test_user_averages = []
threshold = 0.84
k_users = 160
#########################################################################################
cosine_similarities = []

for i in range(100):
    test_user_sims = []
    for j in range(200):
        test_user_sims.append(cosine_similarity(users[j], test5_users[i]))

    cosine_similarities.append(test_user_sims)


for i, row in enumerate(cosine_similarities):
    for j, cell in enumerate(row):      # TODO: don't use threshold if only few have non zero weights
        if cell < threshold:
            cosine_similarities[i][j] = 0

all_ratings = []

for k, each_target_user in enumerate(test5_target_indexes):

    final_ratings = []
    for blank_rating in each_target_user:
Example #12
import similarity
import csv
import json
import random

r = csv.reader(open('tfidfdata.csv', encoding='utf8'))
fvecs = list(r)
for i in range(len(fvecs)):
    fvecs[i] = json.loads(fvecs[i][0])

count = 1
topsims = []
while (count > 0):
    tweetid = int(random.uniform(0, len(fvecs)))
    simlist = []
    for i in range(len(fvecs)):
        simlist.append((similarity.cosine_similarity(fvecs[tweetid],
                                                     fvecs[i]), (tweetid, i)))
    simlist.sort(reverse=True)
    topsims.append(simlist[:50])
    count -= 1

print(topsims)  # Top 50 cosine similarities and the tweet pairs that produce them, for each sampled tweet
Example #13
 def test_cosine_similarity(self):
     for vecs, expected in self.cosine_similarity_tests:
         result = cosine_similarity(vecs[0], vecs[1])
         self.assertAlmostEqual(result, expected, places=3)
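The fixture `self.cosine_similarity_tests` iterated above is not shown; each item appears to be a pair of input vectors together with the expected similarity. A hypothetical shape for such a fixture (toy values, not the project's data) would be:

cosine_similarity_tests = [
    (([1, 0], [0, 1]), 0.0),        # orthogonal vectors
    (([1, 1], [1, 1]), 1.0),        # identical vectors
    (([1, 2, 3], [2, 4, 6]), 1.0),  # parallel vectors
]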
Example #14
        mu, sigma, inf_mu, inf_sigma, z = model.get_distribution(
            center_vec, context_vec)

        #print(mu, sigma, inf_mu, inf_sigma, z)
        for gold_candidate in gold_dict[lst_item.target_word]:
            if gold_candidate not in vocab.index:
                # TODO print something out and track
                continue
            vec = torch.LongTensor(np.array([vocab[gold_candidate]]))

            if model.use_cuda:
                vec = vec.cuda()

            mu_s, sigma_s, inf_mu_s, inf_sigma_s, z_s = model.get_distribution(
                vec, context_vec)
            score_mu = cosine_similarity(mu.squeeze().detach().cpu().numpy(),
                                         mu_s.squeeze().detach().cpu().numpy())
            scores_mu[lst_item.complete_word, lst_item.sentence_id].append(
                (gold_candidate, score_mu))

            # TODO not sure if minus is reqd
            # posterior (word) (inf) || prior (candidate)
            kl_prior = -1 * kl_div(inf_mu, inf_sigma, mu_s, sigma_s)
            scores_kl_prior[lst_item.complete_word,
                            lst_item.sentence_id].append(
                                (gold_candidate, kl_prior.item()))

            kl_post = -1 * kl_div(inf_mu, inf_sigma, inf_mu_s, inf_sigma_s)
            scores_kl_post[lst_item.complete_word,
                           lst_item.sentence_id].append(
                               (gold_candidate, kl_post.item()))
Example #15
preprocessed_git_issues = preprocessing.preprocessing(git_issues)

#Preprocess Stack Overflow posts
print(">>> PREPROCESSING SO POSTS <<<")
preprocessed_so_posts = preprocessing.preprocessing(so_posts)

#Similarity
print(">>> TFIDF POSTS <<<")
tfidf_posts = similarity.tfidf(preprocessed_so_posts)
print(">>> TFIDF ISSUES <<<")
tfidf_issues = similarity.tfidf(preprocessed_git_issues)

for post in tfidf_posts:
    for issue in tfidf_issues:
        print(">>> COSINE SIMILARITY <<<")
        cosine_result = similarity.cosine_similarity(post, issue)
        print(">>> Cosine: ", cosine_result)


# TO TEST
# teste1 = [["Buying! fdsfsfsd. of", "Buyed? the bug2", "I bug3 bBg3"], ["BUG bug4", "abs bug5", "bug6 bug6"]]
# teste2 = [["Buying! fdsfsfsd. of", "Buyed? the bug2", "I bug3 bBg3"], ["BUG bug4", "abs bug5", "bug6 bug6"]]
# text1 = preprocessing.preprocessing(teste1)
# text2 = preprocessing.preprocessing(teste2)
# print("preprocessed 1: ", text1)
# print("preprocessed 2: ", text2)

# # Similarity
# result1 = similarity.tfidf(text1)
# result2 = similarity.tfidf(text2)