Example no. 1
def pilot_test():
	"""
	"""
	users_vectors = []
	vectorsums = []
	for i, user in enumerate(sample_users):
		df = pd.read_pickle('./fc8_100imgs_{}.pkl'.format(user))
		users_vectors.append(df)
		vectorsums.append(df.fc8.values.sum())

	corpus = []
	for vector in vectorsums:
		corpus.append(vector_to_document(vector))

	tfidf = TfidfVectorizer()
	tfidf_vectorized = tfidf.fit_transform(corpus)

	cosine_similarities = linear_kernel(tfidf_vectorized, tfidf_vectorized)

	new_docs = []
	for i, user in enumerate(sample_users):
		for j, img_vec in enumerate(users_vectors[i].fc8):
			doc = vector_to_document(img_vec)
			new_docs.append(doc)
			# vectorized = tfidf.transform([doc])
			# sims = linear_kernel(vectorized, tfidf_vectorized)[0]
			# most_sims = np.argsort(sims)[::-1]
			#
			# print '{} img {} most similar to \n{}'.format(user, j, [(sample_users[i], sims[i]) for i in most_sims] )

	new_docs_vectorized = tfidf.transform(new_docs)
	cosine_similarities = linear_kernel(new_docs_vectorized, tfidf_vectorized)

	for sim in cosine_similarities:
		print 'top score: {}     top user: {}'.format(sim.max(), sample_users[np.argmax(sim)])
Example no. 2
    def plot_hist_d_to_centroid(self, min_w=0):
        '''
        histograms of distance to centroids: overall vs. each cluster
        '''
        self.assign_cluster(min_w)
        self.cal_centroid()
        n_clusters = np.max(self.clusters)
        #fig = plt.figure(figsize=(20,8))
        X2_dense = self.X2.todense()
        centroid_overall = np.mean(X2_dense, axis=0)
        sim = linear_kernel(centroid_overall, X2_dense)
        max_sim = np.max(sim)
        min_sim = np.min(sim)

        # multiple plot, subplots
        ncols = 3
        nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
        # subplot preferred way
        fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
        axs = ax.flatten()
        i_plot = 0
        axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot = i_plot + 1

        for i in xrange(n_clusters):
            cond = self.clusters == i
            arr = X2_dense[cond]
            sim = linear_kernel(self.centroids[i], arr)
            axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
            axs[i_plot].set_xlim(min_sim, max_sim)
            i_plot = i_plot + 1
        fig.savefig(self.model_name + '_hist_dis_to_centroid.png')
        plt.close(fig)
Example no. 3
 def _build_similarity_matrix(self):
     """
     partitioned similarity matrix ('s' for source nodes and 't' for target nodes)
     S = [[S_ss, S_st],
          [S_ts, S_tt]]
     """
     normalize(self.source_features, norm='l2', copy=False)
     normalize(self.target_features, norm='l2', copy=False)
     self.ss = linear_kernel(self.source_features)
     self.st = linear_kernel(self.source_features, self.target_features)
     self.ts = self.st.T
     self.tt = linear_kernel(self.target_features)
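The blocks above match the partitioned matrix described in the docstring. A minimal sketch (toy data, not part of the original class) of how the four blocks could be assembled into the full matrix S with np.block:

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel

source_features = normalize(np.random.rand(4, 8), norm='l2')  # hypothetical source features
target_features = normalize(np.random.rand(3, 8), norm='l2')  # hypothetical target features

ss = linear_kernel(source_features)                    # S_ss, shape (4, 4)
st = linear_kernel(source_features, target_features)   # S_st, shape (4, 3)
tt = linear_kernel(target_features)                    # S_tt, shape (3, 3)

S = np.block([[ss, st],
              [st.T, tt]])                             # full (7, 7) partitioned matrix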
Example no. 4
    def plot_hist_d_to_centroid(self, min_w=0):
        '''
        plot histogram of distance to centroid, overall vs. per cluster
                - INPUT: self.X2
        '''
        self.assign_cluster(min_w)
        self.cal_centroid()
        n_clusters = np.max(self.clusters)
        #fig = plt.figure(figsize=(20,8))

        # multiple plot, subplots
        ncols = 3
        nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
        # subplot preferred way
        fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
        axs = ax.flatten()

        centroid_overall = np.mean(self.X2, axis=0)
        sim = linear_kernel(centroid_overall, self.X2)
        max_sim = np.max(sim)
        min_sim = np.min(sim)
        print 'sim shape: %s  X shape: %s centroid_overall shape: %s' % (sim.shape, self.X2.shape, centroid_overall.shape)
        print 'min %.2f max %.2f ' % (min_sim, max_sim)
        print sorted(sim.flatten(), reverse=True)[:5]
        print sorted(centroid_overall.getA().flatten(), reverse=True)[:5]

        max_sim = 1
        min_sim = 0

        i_plot = 0
        axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot = i_plot + 1

        for i in xrange(n_clusters + 1):
            cond = self.clusters == i
            arr = self.X2[cond]
            sim = linear_kernel(self.centroids[i], arr)
            print 'sim shape: %s  arr shape: %s  centroid shape: %s' % (sim.shape, arr.shape, self.centroids[i].shape)
            print sorted(sim.flatten(), reverse=True)[:5]
            print sorted(self.centroids[i].flatten(), reverse=True)[:5]
            axs[i_plot].hist(sim.flatten(), alpha=0.2)  # , ax=axs[i_plot])
            axs[i_plot].set_xlim(min_sim, max_sim)
            i_plot = i_plot + 1

        plt.show()
        fig.savefig(self.model_name + '_hist_dis_to_centroid.png')

        plt.close(fig)
Example no. 5
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]
    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    # 'This is the first document.',
    # 'This is the second second document.',
    # 'And the third one.',
    # 'Is this the first document?',
    # ]

    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
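For reference, a minimal self-contained sketch of the retrieval pattern these snippets repeat (toy corpus, names illustrative): vectorize with TF-IDF, score with linear_kernel, rank with argsort.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

corpus = ["the sky is blue", "the sun is bright", "the sun in the sky is bright"]
X = TfidfVectorizer().fit_transform(corpus)

# cosine similarities of document 0 against the whole corpus
sims = linear_kernel(X[0:1], X).flatten()
top = sims.argsort()[::-1][1:]   # most similar first, skipping document 0 itself
print(top, sims[top])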
Example no. 6
def __asyncable_similarity(tup):

    #  bs, beer_id_ref, ref_vect, s_ids, b_ids, X_t, top = tup

    # bs: beer similarity object for db commit
    # ref_vects from one style
    # ref_b_ids: beer ids for ref vecs
    # s_ids, b_ids: style and beer indices of X_t
    # X_t for beers in other styles to be compared to
    # keep top similarities by style
    bs, b_refs, X_t_ref, b_comps, X_t_comp, top = tup

    start = dt.now()
    print "Beer ct %s vs ct %s: Compute Similarity" % (len(b_refs), len(b_comps))
    try:
        for i in xrange(len(b_refs)):

            # compute similarity between beer_ref[i] and all b_comps
            lk = linear_kernel(X_t_ref.getrow(i), X_t_comp).flatten()

            # take #top of largest similarities
            n = len(lk)
            kp = min(top, n)
            m_ixs = lk.argsort()[-kp:]

            sims = [(b_refs[i], b_comps[j], lk[j]) for j in m_ixs if b_refs[i] != b_comps[j]]

            # bs.smooth_similarity(sims)
            bs.add_many(sims)

        print "Comparison Complete: %s" % (dt.now() - start)
        return (b_refs, None)
    except Exception as e:
        return (b_refs, e)
Example no. 7
 def __kernel_definition__(self):
     if self.Kf == 'rbf':
         return lambda X,Y : rbf_kernel(X,Y,self.rbf_gamma)
     if self.Kf == 'poly':
         return lambda X,Y : polynomial_kernel(X, Y, degree=self.poly_deg, gamma=None, coef0=self.poly_coeff)
     if self.Kf == None or self.Kf == 'linear':
         return lambda X,Y : linear_kernel(X,Y)
Example no. 8
def thread_diag_block(top_nbrs,dataM,job_ranges,r_offset, c_offset,
                    n_nbr=100,verbose=False):
    
    ''' Heap entries are (cos, idx) pairs.
        Note that in the min-heap, the first element is the smallest.
    '''

    for job_bd in job_ranges:
        crossV = linear_kernel(dataM[job_bd[0]:job_bd[1],:],dataM)
        n_doc1, n_doc2 = crossV.shape
        
        for i_doc in range(n_doc1):
            i_offset = i_doc + job_bd[0] + r_offset
            L = top_nbrs[i_offset]
            for j in range(n_doc2):            
                if i_offset == j+c_offset:
                    continue

                if len(L)<n_nbr:
                    heapq.heappush(L, (crossV[i_doc,j],j+c_offset))
                elif crossV[i_doc,j] > L[0][0]:
                    heapq.heapreplace(L, (crossV[i_doc,j],j+c_offset))
        
            top_nbrs[i_offset] = L

        if verbose:
            print('process range (%d,%d)'%(job_bd[0],job_bd[1]))
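A small sketch (separate from the function above) of the min-heap bookkeeping it relies on: keep only the k largest (similarity, index) pairs, with the heap root always holding the current smallest.

import heapq

def keep_top_k(heap, item, k):
    # push until the heap holds k items, then replace the smallest when a larger one arrives
    if len(heap) < k:
        heapq.heappush(heap, item)
    elif item > heap[0]:
        heapq.heapreplace(heap, item)

heap = []
for j, cos in enumerate([0.2, 0.9, 0.1, 0.7, 0.5]):
    keep_top_k(heap, (cos, j), k=3)
print(sorted(heap, reverse=True))   # [(0.9, 1), (0.7, 3), (0.5, 4)]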
Example no. 9
def get_related_news(articles ,base_art_index):
    
    if related_dict.get(base_art_index) is not None :
        return related_dict.get(base_art_index)
    
    corpus = []
    for art in articles :
        corpus.append( ' '.join( jieba.cut(art.context) ) )
    ls = [w for w in  WordCutLibs.stopwords.split('\n')]
    vectorizer = CountVectorizer(stop_words=ls)
    X = vectorizer.fit_transform(corpus)
    #word = vectorizer.get_feature_names()
    #stopword = vectorizer.get_stop_words()
    
    
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    #weight = tfidf.toarray()
    
    target = base_art_index  # set the target title (index order matches the SQL)
    
    
    cosine_similarities = linear_kernel(tfidf[target], tfidf).flatten().argsort()
    
    max_len = len(cosine_similarities)
    bnd = -11 if max_len >= 10 else -(max_len)
    related_docs_indices = cosine_similarities[: bnd:-1]
    
    res = [ articles[idx] for idx in related_docs_indices ]
    related_dict[base_art_index] = res
    return res
    
Example no. 10
    def print_most_cos_sim(self, thresh=0.675):
        '''
        Prints the two posts that have the highest cosine similarity
        '''
        cos_sims = linear_kernel(self.word_vecs, self.word_vecs)

        # Initialize max_sim = 0, only consider cos sims under threshold
        # so we know we're not recording a post compared with itself (1.0)
        max_cos_sim = 0.0
        thr = thresh

        # Find max_cos_sim
        for i, j in enumerate(cos_sims):
            for k, l in enumerate(j):
                if (float(l) >= max_cos_sim) and (float(l) < thr):
                    max_cos_sim = float(l)

        # Find indices of max_cos_sim
        double_break = False
        for i, j in enumerate(cos_sims):
            for k, l in enumerate(j):
                if float(l) == max_cos_sim:
                    ind1, ind2 = i, k
                    double_break = True
                    break
            if double_break:
                break

        print 'Posts with highest cosine similarity ({:.3f}):\n\nPost {}:\n{}\
            \n\nPost {}:\n{}'.format(max_cos_sim, ind1, self.posts[ind1],
                                    ind2, self.posts[ind2])
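A vectorized alternative (a sketch, not part of the original class): mask out values at or above the threshold, then locate the maximum with np.unravel_index instead of the two nested Python loops.

import numpy as np

def most_similar_pair(cos_sims, thresh=0.675):
    # push self-similarities and anything at or above the threshold to -inf
    masked = np.where(cos_sims < thresh, cos_sims, -np.inf)
    i, j = np.unravel_index(np.argmax(masked), masked.shape)
    return i, j, masked[i, j]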
Example no. 11
    def __init__(self, *args, **kwargs):
        super(QUIRE, self).__init__(*args, **kwargs)
        self.Uindex = [idx for idx, _ in self.dataset.get_unlabeled_entries()]
        self.Lindex = [idx for idx in range(len(self.dataset)) if idx not in self.Uindex]
        self.lmbda = kwargs.pop("lambda", 1.0)
        X, self.y = zip(*self.dataset.get_entries())
        self.y = list(self.y)
        self.kernel = kwargs.pop("kernel", "rbf")
        if self.kernel == "rbf":
            self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop("gamma", 1.0))
        elif self.kernel == "poly":
            self.K = polynomial_kernel(
                X=X, Y=X, coef0=kwargs.pop("coef0", 1), degree=kwargs.pop("degree", 3), gamma=kwargs.pop("gamma", 1.0)
            )
        elif self.kernel == "linear":
            self.K = linear_kernel(X=X, Y=X)
        elif hasattr(self.kernel, "__call__"):
            self.K = self.kernel(X=np.array(X), Y=np.array(X))
        else:
            raise NotImplementedError

        if not isinstance(self.K, np.ndarray):
            raise TypeError("K should be an ndarray")
        if self.K.shape != (len(X), len(X)):
            raise ValueError("kernel should have size (%d, %d)" % (len(X), len(X)))
        self.L = np.linalg.inv(self.K + self.lmbda * np.eye(len(X)))
Example no. 12
	def get(self):		
		query = self.get_argument('q', None)
		if query is None:
			return
		queryTerms = query.split()
		queryVector = np.array([self._logIDF[term] for term in queryTerms])
		docVectorDict = defaultdict(lambda: np.array([0.0] * len(queryTerms)))
		for i in range(len(queryTerms)):
			term = queryTerms[i].lower()
			newList = self._postingsLists[term]
			for item in newList:
				docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
		docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
		docIx = 0
		docIxToDocID = {}
		for docID in docVectorDict.keys():
			docMatrix[docIx][:] = docVectorDict[docID][:]
			docIxToDocID[docIx] = docID
			docIx += 1
		sims = linear_kernel(queryVector.reshape(1, -1), docMatrix).flatten()
		bestDocIxes = sims.argsort()[::-1]
		bestDocSims = sims[bestDocIxes]
		bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
		postings = zip(bestDocIDs, bestDocSims)
		self.write(json.dumps({"postings": postings}))
Example no. 13
def main(lists):
    #    titles = ["overexpression expression overexpression inhibition overexpression association association interaction binds binds interaction affinity affinity", "expression inducing expression detected lacking expression inducing expression detected lacking expression"]
    stopwords = ["and", "edition", "for", "in", "little", "of", "the", "to"]
    ignorechars = """,:'!"""

    mylsa = LSA(stopwords, ignorechars)
    for x in lists:
        mylsa.parse(x)
    mylsa.build()
    mylsa.printA()
    mylsa.calc()
    mylsa.printSVD()

    tfidf = mylsa.Vt

    #    print cosine_similarity(mylsa.Vt[0:1], mylsa.Vt)
    print tfidf, tfidf[0:1]
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    print "cosine similarities", cosine_similarities
    angle_list = []
    for a in cosine_similarities:
        try:
            angle_in_radians = math.acos(a)
            angle_in_degrees = math.degrees(angle_in_radians)
            angle_list.append(angle_in_degrees)
        except ValueError:
            angle_list.append(0)
    return_list = []
    return_list.append(angle_list[1])
    print return_list
    return return_list
Example no. 14
def predict(data, vect, user_list, tweet_list, word_counts):
	vector = vect.transform(data)
	result_matrix = linear_kernel(vector, word_counts)
	
	indices_of_tweets = []

	# For each tweet by the client, find the 50 most similar tweets
	# This list may include tweets by the client
	for row in result_matrix: 
		indices = row.argsort()[:][::-1]
		indices_of_tweets.append(indices[2:51])


	# Return the person that tweeted each of the 50 most similar tweets
	user_array = np.array(user_list)
	persons_per_tweet = []

	for row in indices_of_tweets: 
		persons_per_tweet.append(user_array[row])

	# Count up how many times each person shows up. 
	# Same weighting is given to people who have many tweets similar to one client tweet
	# and a tweet that matches a high number of client tweets.
	persons_counter = Counter()

	for row in persons_per_tweet: 
		persons_counter.update(row)

	# return the top 25 people in this list
	top_people_and_count = persons_counter.most_common(25)

	top_people = [tup[0] for tup in top_people_and_count]

	return top_people
Example no. 15
    def _apply_kernel(self, x, y):
        """Apply the selected kernel function to the data."""
        if self.kernel == 'linear':
            phi = linear_kernel(x, y)
        elif self.kernel == 'rbf':
            phi = rbf_kernel(x, y, self.coef1)
        elif self.kernel == 'poly':
            phi = polynomial_kernel(x, y, self.degree, self.coef1, self.coef0)
        elif callable(self.kernel):
            phi = self.kernel(x, y)
            if len(phi.shape) != 2:
                raise ValueError(
                    "Custom kernel function did not return 2D matrix"
                )
            if phi.shape[0] != x.shape[0]:
                raise ValueError(
                    "Custom kernel function did not return matrix with rows"
                    " equal to number of data points."""
                )
        else:
            raise ValueError("Kernel selection is invalid.")

        if self.bias_used:
            phi = np.append(phi, np.ones((phi.shape[0], 1)), axis=1)

        return phi
Example no. 16
def main(protein_dict,q1,q2):
    if not protein_dict:
        return ['9999']
    elif len(protein_dict) < 2:
        return ['9999']
    else:
        train_set = [' '.join(protein_dict[x]) for x in protein_dict]
        proteins = [x for x in protein_dict]        
        tfidf_vectorizer = TfidfVectorizer()
        tfidf = tfidf_vectorizer.fit_transform(train_set)  #finds the tfidf score with normalization
#        print 'tfidf[0:1]', tfidf[0:1]
#        print 'tfidf[0:2]', tfidf[0:2]
#        print 'tfidf', tfidf
        cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
#        print 'cosine_similarities', cosine_similarities
        related_docs_indices = cosine_similarities.argsort()[:-5:-1]

        degrees_list = []
        for a in (cosine_similarities[related_docs_indices].tolist()):

            try:
                angle_list = []
                angle_in_radians = math.acos(a)
                angle_in_degrees = math.degrees(angle_in_radians)
                degrees_list.append(angle_in_degrees) 
                angle_list.append(angle_in_degrees)   
            except ValueError:
                angle_list = ['9999']
                
        if len(degrees_list) > 1:
            return_list = [degrees_list[1]]
        else:
            return_list = degrees_list

        return return_list   
Example no. 17
def sim_char10(text1, text2):
    vect = HashingVectorizer(analyzer='char_wb', tokenizer=normalize, stop_words='english', ngram_range=(10, 10))
    texts = [text1, text2]
    matrix = vect.fit_transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Example no. 18
def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize, stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Example no. 19
def predict(data, vect, user_list, word_counts, sn):
	vector = vect.transform(data)
	result_matrix = linear_kernel(vector, word_counts)
	
	tweet_list = []

	# For each tweet by the client, find the 50 most similar tweets
	# This list may include tweets by the client
	for row in result_matrix: 
		indices = row.argsort()[-51:-1][::-1]
		tweet_list.append(indices)

	# Find the 50 tweets that showed up the most number of times in the tweet list
	# Now you have a list of the tweets that showed up the most number of times as 
	# being similar to your other tweets
	tweet_indexes = Counter([idx for sublist in tweet_list for idx in sublist])

	print tweet_indexes.most_common(50)

	top_indexes = [tup[0] for tup in tweet_indexes.most_common(50)]

	#find the users that wrote the tweets 
	user_array = np.array(user_list)

	people = user_array[top_indexes]

	print people

	# remove for duplicate people
	unique_people = set(people)

	print unique_people

	return [x for x in unique_people if x != sn]
Example no. 20
    def _train(self, ds):
        """
        Train the engine.

        Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product. The 'stop_words' param
        tells the TF-IDF module to ignore common English words like 'the', etc.

        Then we compute similarity between all products using scikit-learn's linear_kernel (which in this case is
        equivalent to cosine similarity).

        Iterate through each item's similar items and store the 100 most-similar. Stops at 100 because well...
        how many similar products do you really need to show?

        Similarities and their scores are stored in redis as a Sorted Set, with one set for each item.

        :param ds: A pandas dataset containing two fields: content & id
        :return: Nothing!
        """
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
        tfidf_matrix = tf.fit_transform(ds['content'])

        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

        for idx, row in ds.iterrows():
            similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
            similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]

            # First item is the item itself, so remove it.
            # This 'sum' turns a list of tuples into a single tuple: [(1,2), (3,4)] -> (1,2,3,4)
            flattened = sum(similar_items[1:], ())
            self._r.zadd(self.SIMKEY % row['id'], *flattened)
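A quick check of the docstring's claim, on a toy corpus only: because TfidfVectorizer L2-normalizes rows by default, linear_kernel and cosine_similarity produce the same matrix.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

X = TfidfVectorizer().fit_transform(["red apple", "green apple", "red car"])
assert np.allclose(linear_kernel(X, X), cosine_similarity(X, X))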
Example no. 21
def getRelevantPassages(query, k):
    queryVector = allTextVectorizer.transform([query])
    queryIndices = numpy.array([allTextVectorizer.vocabulary_.get(word) for word in allTextAnalyzer(query)])
    queryIndices = [i for i in queryIndices if i is not None]
    querySimilarityScores = linear_kernel(queryVector[:,queryIndices], allTextIndex[:,queryIndices]).flatten()
    relatedDocIndices = querySimilarityScores.argsort()[:-k:-1]
    return [allTextLines[i] for i in relatedDocIndices]
Example no. 22
def get_results(query):

    test = query
    response = tfidf.transform([test])

    print 'response: ', response

    RESULTS_ARRAY = []

    cosine_similarities = linear_kernel(response, tfs).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-10:-1]
    for i in related_docs_indices:
        if cosine_similarities[i] > 0:
            file_name = token_dict.keys()[i].split('.')[0] + '.pdf.html.json'
            data = {}
            data = summary_dict[file_name]
            data.update({"candidate": token_dict.keys()[i].split('.')[0],
                            "cosine": cosine_similarities[i]})
            # data = {"candidate": token_dict.keys()[i].split('.')[0],
            #                 "cosine": cosine_similarities[i]}

            RESULTS_ARRAY.append(data)
            # print "%-50s %.4f" % (token_dict.keys()[i].split('.')[0],cosine_similarities[i])

    # print RESULTS_ARRAY
    return RESULTS_ARRAY
Example no. 23
    def get(self):
        query = self.get_argument('q', None)
        if query is None:
            return
        queryTerms = query.split()
        # let's say we have N documents and M terms in query
        # Apparently we assume unique term in query
        # queryVector is a 1 * M dimension array
        queryVector = np.array([self._logIDF[term] for term in queryTerms])
        # docVectorDict maps docID -> an M-dimensional vector (default: zeros)
        docVectorDict = defaultdict(lambda: np.array([0.0]*len(queryTerms)))

        for i in range(len(queryTerms)):
            term = queryTerms[i].lower()
            newList = self._postingsList[term]
            for item in newList:  # newList is [(docID,tf)]
                docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
        docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
        docIx = 0
        docIxToDocID = {}
        for docID in docVectorDict.keys():
            docMatrix[docIx][:] = docVectorDict[docID][:]
            docIxToDocID[docIx] = docID
            docIx += 1
        # linear_kernel is used to compute the similarity
        sims = linear_kernel(queryVector.reshape(1, -1), docMatrix).flatten()
        # argsort return the index 
        bestDocIxes = sims.argsort()[::-1]
        bestDocSims = sims[bestDocIxes]
        bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
        postings = zip(bestDocIDs, bestDocSims)
        self.write(json.dumps({"postings":postings}))
Example no. 24
    def _apply_kernel(self, X, y=None):
        """Apply the selected kernel function to the data."""
        if self.kernel == 'linear':
            phi = linear_kernel(X, y)
        elif self.kernel == 'rbf':
            phi = rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == 'poly':
            phi = polynomial_kernel(X, y, degree=self.degree)
        elif callable(self.kernel):
            phi = self.kernel(X, y)
            if len(phi.shape) != 2:
                raise ValueError(
                    "Custom kernel function did not return 2D matrix"
                )
            if phi.shape[0] != X.shape[0]:
                raise ValueError(
                    "Custom kernel function did not return matrix with rows"
                    " equal to number of data points."""
                )
        else:
            raise ValueError("Kernel selection is invalid.")
        phi = phi.T
        if self.bias_used:
            phi = np.hstack((np.ones((phi.shape[0], 1)), phi))

        return phi
Example no. 25
def pairwise_similarity():
    import pickle
    import numpy as np
    import math
    import heapq
    from sklearn.metrics.pairwise import linear_kernel
    singular = 311363
    tfidf = pickle.load(open("D:\\Users\\yutao\\eclipse1\\two_tfidfs_dump_"))
    x = pickle.load(open("D:\\Users\\yutao\\eclipse1\\profiles"))
    ids = x['ids']
    sim_l = {}
    sim_l_index = {}
    flag = 0
    for i in range(1):
        print i
        v1 = tfidf[i]
        t_v1 = np.transpose(v1)
        sim_a = []
        for j in range(singular,len(ids)):
            if j%1000==0:
                print j
            if flag == 0:
                sim_l[j] = []
                sim_l_index[j] = []
            v2 = tfidf[j]
            t_v2 = np.transpose(v2)            
            sim = np.dot(v1, t_v2)[0,0] / (math.sqrt(np.dot(v1,t_v1)[0,0]) * math.sqrt(np.dot(v2,t_v2)[0,0]))
            if sim > 0:
                if len(sim_a)<=100:
                    heapq.heappush(sim_a,sim)
                else:
                    heapq.heappushpop(sim_a,sim)
                if len(sim_l[j])<=100:
                    heapq.heappush(sim_l[j],sim)
                else:
                    heapq.heappushpop(sim_l[j],sim)
    
    
    lil_tfidf = tfidf.tolil()
    flag = 0
    import pymongo
    col = pymongo.Connection("10.1.1.110",12345)['scrapy']['similarity']
    for i in range(singular+((len(ids)-singular)/6)*4,singular+((len(ids)-singular)/6)*5):
        try:
            print i
            print ids[i]
        except Exception, e:
            print e
        sim = {}
        sim['_id'] = ids[i]
        sim['sim'] = []
        lk = linear_kernel(tfidf[i], lil_tfidf[:singular]).flatten()
        for j in range(len(lk)):
            if lk[j]>0.1:
                sim['sim'].append({'id':ids[j],
                                   'sim':lk[j]})
        print len(sim['sim'])
        col.save(sim)
Example no. 26
    def calc_cos_sims(self):
        normalized = np.empty_like(self.X)
        for i, vec in enumerate(self.X):
            norm = np.linalg.norm(vec)
            for j, val in enumerate(vec):
                normalized[i, j] = val/norm

        sims = linear_kernel(normalized, normalized)
        return sims
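The manual row-normalization loop above can also be expressed with sklearn.preprocessing.normalize; an equivalent sketch with made-up data:

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel

X = np.random.rand(5, 10)                      # hypothetical feature matrix
normalized = normalize(X, norm='l2')           # row-wise L2 normalization
sims = linear_kernel(normalized, normalized)   # cosine similarities, shape (5, 5)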
Example no. 27
def solve():
    query = request.json
    doc = doc_remove_punc(query)
    doc = [lem(doc, porter)]
    query_vect = vectorizer.transform(doc)
    cos_sim = linear_kernel(vect, query_vect)
    top_sims = np.argsort(cos_sim, axis = None)[-1:-4:-1]
    top_posts = [docs[sim] for sim in top_sims]
    return jsonify(top_posts)
Example no. 28
 def singletextsimilarity(tfidf, index, listofstrings, printdeets=False):
     similarities = linear_kernel(tfidf[index], tfidf).flatten()
     if printdeets:
         most_related_docs_indices = similarities.argsort()[:-5:-1]
         most_related_similarities = similarities[most_related_docs_indices]
         print "docs most related to doc #%s are %s." % (
         index, ', '.join([str(el) for el in most_related_docs_indices]))
         print "there similarities are %s." % (', '.join([str(el) for el in most_related_similarities]))
     return similarities
Example no. 29
def tfidf_matrix(X, **kwargs):
     # get the tf-idf counts
    count_vect = CountVectorizer(**kwargs)
    counts     = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer().fit(counts)
    counts_tfidf   = tf_transformer.transform(counts)
    # compute cosine similarity
    matrix         = linear_kernel(counts_tfidf, counts_tfidf)
    return matrix
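Hypothetical usage of the helper above; the keyword arguments are passed through to CountVectorizer.

sims = tfidf_matrix(["first document here", "second document here", "a different text"],
                    stop_words='english')
print(sims.shape)   # (3, 3) cosine-similarity matrix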
Example no. 30
def top_n_posts(vect, ri, n, users, posts):

    cos_sim = linear_kernel(vect, vect)
    sim_sort = np.argsort(cos_sim, axis = 1)
    sim_sort = sim_sort[:, 0:-1]
    top_n = list(range(-1,-n-1,-1))
    doc = sim_sort[ri, :]
    user = users[ri]
    sim_users = list(doc[top_n])
    return (user, posts[ri]), [(users[sim], posts[sim]) for sim in sim_users]
Example no. 31
    data_i = np.asarray(data_i)

    for batch_j in batches:
        data_j = []
        for j in batch_j:
            data_j.append(np.load(output_data + listdir[j]).ravel())
        data_j = np.asarray(data_j)

        # Compute the kernels
        euclidean_norm[batch_i[0]:batch_i[-1] + 1, batch_j[0]:batch_j[-1] +
                       1] = (pairwise_distances(data_i,
                                                data_j,
                                                metric='euclidean')**2)

        lin_kernel[batch_i[0]:batch_i[-1] + 1,
                   batch_j[0]:batch_j[-1] + 1] = (linear_kernel(
                       data_i, data_j))

# Save the kernels in CSV files
linear_kernel_df = pd.DataFrame(lin_kernel, index=subjects, columns=subjects)
linear_kernel_df.to_csv(output_kernels + 'linear_kernel.csv')

euclidean_norm_df = pd.DataFrame(euclidean_norm,
                                 index=subjects,
                                 columns=subjects)
euclidean_norm_df.to_csv(output_kernels + 'euclidean_norm.csv')

# Save the target variable in a CSV file
# Change this path
df_y = pd.read_csv("/Volumes/dtlake01.aramis/users/clinica/pac2019/dataset/"
                   "PAC2019_BrainAge_Training.csv")
Example no. 32
def cosine_similarity(documentA, documentB):
    docs = [documentA, documentB]
    tfidf = TfidfVectorizer().fit_transform(docs)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    return cosine_similarities
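Hypothetical call to the helper above; the first value is documentA compared with itself (always 1.0 on L2-normalized TF-IDF rows), the second is the similarity of documentA to documentB.

print(cosine_similarity("the cat sat on the mat", "a cat sat on a mat"))
# -> array of length 2: [1.0, similarity(documentA, documentB)]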
Example no. 33
df_review_one_sentence = pd.read_csv(
    './hotel/onesentence_hotel_review_final.csv', index_col=0)
#print(df_review_one_sentence.head())
print(df_review_one_sentence.info())

# In[104]:

hotel_idx = df_review_one_sentence[df_review_one_sentence['hotel_name'] ==
                                   '제주 아름다운 리조트'].index[0]  # how to look up the index

# In[105]:

Tfidf = TfidfVectorizer()
Tfidf_matrix = Tfidf.fit_transform(
    df_review_one_sentence['review_one_sentence'])
#print(Tfidf_matrix.shape)
#print(Tfidf_matrix)

# In[106]:

cosine_sim = linear_kernel(Tfidf_matrix[hotel_idx],
                           Tfidf_matrix)  # similarity of this one hotel against all 588 hotels
#print(cosine_sim.shape)

# In[109]:

print(getRecommendation(cosine_sim))

# In[ ]:
Example no. 34
df.genre
#Based on Publisher
df["Publisher"].isnull().sum()  # Tio find NaN values

from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tf.fit_transform(df.Publisher)

tfidf_matrix.shape

from sklearn.metrics.pairwise import linear_kernel

cos_similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

print(cos_similarity_matrix)

df_index = pd.Series(df.index, index=df['title']).drop_duplicates()


def get_title_recommendations(title, topN):

    #topN = 10
    # Getting the movie index using its title
    df_id = df_index[title]

    # Getting the pair wise similarity score for all the df's with that
    # df
    cosine_scores = list(enumerate(cos_similarity_matrix[df_id]))
Example no. 35
# In[188]:

book_data['train'] = book_data.apply(combine, axis=1)

# In[190]:

word_stopped = TfidfVectorizer(stop_words='english')

book_data['train'] = book_data['train'].fillna('')

matrix = word_stopped.fit_transform(book_data['train'])

# In[192]:

co_sim = linear_kernel(matrix, matrix)

# In[194]:

book_data = book_data.reset_index()

# In[195]:

indexing = pd.Series(book_data.index,
                     index=book_data['title']).drop_duplicates()

# In[ ]:

indexing = pd.Series(book_data.index,
                     index=book_data['title', 'publisher']).drop_duplicates()
Example no. 36
print("There are " + str(num_docs) + " documents in the test data.");

documents = [open(path+f, 'r').read() for f in doc_ids];

print("Calculating tfidf scores");
tfidf = vectorizer.fit_transform(documents);
print("Finished calculating tfidf scores");

query_file = open('Scharrhud_Data/query/query.txt', "r");

query = vectorizer.transform(query_file);

feature_names = vectorizer.get_feature_names();

print("Calculating cosine similarity values");
cosine_sim = linear_kernel(query, tfidf).flatten();
print("Finished calculating cosine similarity scores");

doc_sim_values = dict(zip(doc_ids, cosine_sim));

ranked = sorted(doc_sim_values.items(), key=operator.itemgetter(1), reverse=True);

# Function to write results to an output file specified in the commandline arguments
# As default this is off and results are printed to console.
def write_to_output_file(ranked):
    output_file = open(config.output_file, "w");
    if config.bySimilarity:
        x = 0;
        for item in ranked:
            if item[1] >= float(config.simValue):
                output_file.write(str(x + 1) + ": ");
Example no. 37
def get_recommendation(request):
    #method put
    if request.method == "PUT":

        data = json.loads(request.body)
        id_movie = data["id"]
        #id_movie = abs(id_movie)
        ##content based:
        model = Word2Vec.load(model_path)  #load model
        #load csv
        ratings_df = pd.read_csv(csv_path3)
        print(ratings_df)
        movies_df = pd.read_csv(csv_path2)
        metadata_df = pd.read_csv(csv_path)

        #function that returns similar movies
        #function that returns similar movies
        def most_similar_movie(movieId):
            print("Similar of " + ratings_df[ratings_df['tmdbId'] == int(
                movieId)].iloc[0]['title'])
            #return [(int(x[0]), ratings_df[ratings_df['tmdbId'] == int(x[0])].iloc[0]['title']) for x in model.wv.most_similar(movieId)]
            return [(
                int(x[0]),
                ratings_df[ratings_df['tmdbId'] == int(x[0])].iloc[0]['title'])
                    for x in model.wv.most_similar(movieId)]

        def most_similar_gener(genres):
            count = 0
            for genre in genres:
                if count == 0:
                    vector = model[genre]
                    count = count + 1
                else:
                    vector = model[genre] + vector
            print("Similar of ", list(genres))
            #print(model.wv.most_similar([vector]))
            resp = []
            for x in model.wv.most_similar([vector]):
                try:
                    int(x[0])
                    resp.append((int(x[0]), ratings_df[
                        ratings_df['tmdbId'] == int(x[0])].iloc[0]['title']))
                except:
                    print(x)
            return resp

        #cosine
        description_df = metadata_df[['tmdbId', 'overview', 'title']]
        tfidf = TfidfVectorizer(stop_words='english')  #tfidf instance
        #Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
        overview_matrix = tfidf.fit_transform(description_df['overview'])
        similarity_matrix = linear_kernel(overview_matrix, overview_matrix)
        #movies index mapping
        mapping = pd.Series(description_df.index,
                            index=description_df['tmdbId'])

        def most_similar_description(movie_input):
            movie_input = int(movie_input)
            movie_index = mapping[movie_input]
            #get similarity values with other movies
            #similarity_score is the list of index and similarity matrix
            similarity_score = list(enumerate(similarity_matrix[movie_index]))
            #sort in descending order the similarity score of movie inputted with all the other movies
            similarity_score = sorted(similarity_score,
                                      key=lambda x: x[1],
                                      reverse=True)
            # Get the scores of the 15 most similar movies. Ignore the first movie.
            similarity_score = similarity_score[1:15]
            #return movie names using the mapping series
            movie_indices = [i[0] for i in similarity_score]
            movie_ids = description_df['tmdbId'].iloc[movie_indices].tolist()
            #return ([(int(movie),description_df[description_df['tmdbId'] == movie]['title'].to_string(index=False).strip()) for movie in movie_ids])
            return ([
                (int(movie), description_df[description_df['tmdbId'] == movie]
                 ['title'].to_string(index=False).strip())
                for movie in movie_ids
            ])

        def get_genres(movie_id):
            return metadata_df[metadata_df['tmdbId'] ==
                               movie_id].genres.to_string(index=False).strip()

        def content_based(movie_id):
            #search by simmilar description
            description_sim = most_similar_description(movie_id)
            #search by simmilar movie name
            try:
                movie_sim = most_similar_movie(movie_id)
            except:
                movie_sim = []
            set_1 = set(description_sim)
            set_2 = set(movie_sim)
            moviesSet2_notin_Set1 = list(set_2 - set_1)
            combined_sim = description_sim + moviesSet2_notin_Set1
            #search by simmilar genres
            genres_string = get_genres(
                int(movie_id))  #take the genres of the movie
            genres = genres_string.split()
            #print(genres)
            #use the function
            try:
                genres_sim = most_similar_gener(genres)
            except:
                genres_sim = []
            set_1 = set(combined_sim)
            set_2 = set(genres_sim)
            moviesSet2_notin_Set1 = list(set_2 - set_1)
            combined_sim = combined_sim + moviesSet2_notin_Set1
            #don't repeat movies
            combined_similar_movies = list(set(combined_sim))
            return combined_similar_movies

            ### Colaborative Filer

        prediction = content_based(id_movie)
        print(prediction)
        return JsonResponse({"recommendation": prediction}, status=201)
    else:
        return JsonResponse({"error": "PUT request required."}, status=400)
Example no. 38
matrix=vectorizer.fit_transform(news2_list) 
matrix 

for i, feature in enumerate(vectorizer.get_feature_names()):
    print(i,feature) 

tfidf=TfidfVectorizer(preprocessor= lambda x: x, tokenizer= lambda x:x)
tfidf_matrix=tfidf.fit_transform(news2_list)

#convert list to np array 
news2x=np.asarray(news2)
news5x=np.asarray(news_5_list)
news2x.shape,news5x.shape 
news5x1=news5x[0:3217] 
news5x1.shape 

#convert array to list
news2x1=news2x.tolist() 
news5x11=news5x1.tolist() 

#tfidf-vectorizer 
tfidf1=TfidfVectorizer().fit_transform(news2x1)
tfidf2=TfidfVectorizer().fit_transform(news5x11)

#cosine similarites (after tfidf)
from sklearn.metrics.pairwise import linear_kernel
cos_sim1=linear_kernel(tfidf1,tfidf2).flatten() 
cos_sim1 #cosine similarity score between the new york times and the washington post 


Example no. 39
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf(data):
    tfidf = TfidfVectorizer(stop_words='english', use_idf=True)
    tfidf_matrix = tfidf.fit_transform(data)
    return tfidf_matrix


tfidf_matrix = tfidf(meta_data['abstract'])

dir(tfidf_matrix)

# in order to explore which documents have more similar representations, cosine similarity can be used
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf_matrix[0:1], tfidf_matrix).flatten()

# 10 most related documents indices
related_docs_indices = cosine_similarities.argsort()[:-11:-1]
print("Related Document:", related_docs_indices)

# Cosine similarities of related documents
print("Cosine similarities of related documents",
      cosine_similarities[related_docs_indices])

meta_data.iloc[1]['abstract']

from wordcloud import WordCloud
import matplotlib.pyplot as plt

meta_data['index'] = meta_data.index
Example no. 40
book_description = pd.read_csv('description.csv', encoding='latin-1')

# checking if we have the right data
book_description.head()

# removing the stop words
books_tfidf = TfidfVectorizer(stop_words='english')
# replace NaN with empty strings
book_description['description'] = book_description['description'].fillna('')
# computing TF-IDF matrix required for calculating cosine similarity
book_description_matrix = books_tfidf.fit_transform(
    book_description['description'])

# Let's check the shape of computed matrix
book_description_matrix.shape

# computing cosine similarity matrix using linear_kernel of sklearn
cosine_similarity = linear_kernel(book_description_matrix,
                                  book_description_matrix)

# Get the pairwise similarity scores of all books compared to the book passed by index, sorting them and getting top 5
# here 2 is the index of the book in dataset
similarity_scores = list(enumerate(cosine_similarity[2]))
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
similarity_scores = similarity_scores[1:6]

# Get the similar books index
books_index = [i[0] for i in similarity_scores]

# Return the top 5 most similar books using integer-location based indexing (iloc)
print(book_description['name'].iloc[books_index])
Example no. 41
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]
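Hypothetical usage of find_similar on a toy corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["blue sky", "bright sun", "bright blue sky"]
m = TfidfVectorizer().fit_transform(docs)
print(find_similar(m, index=0, top_n=2))   # [(doc_index, similarity), ...], most similar first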
Example no. 42
def find_similar(tfidf_matrix, document):
    top_n = len(document) #change if need top_n
    index = 0
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(document[index-1], cosine_similarities[index]) for index in related_docs_indices][0:top_n]
Example no. 43
df = pd.read_csv('C:/Users/PGDM//Desktop/movies_metadata.csv')

#content based recommendation
#we use the overview column to extract words so that movies can be recommended
from sklearn.feature_extraction.text import TfidfVectorizer
df = df[:10000]

tfidf = TfidfVectorizer(stop_words='english')
df['overview'] = df['overview'].fillna('')
tfidf_mat = tfidf.fit_transform(df['overview'])

#since tfidf is used, cosine can be used for similarity score
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_mat, tfidf_mat)

#to identify index based on title
idx = pd.Series(df.index, index=df['title']).drop_duplicates()


#function to return recommendations
def recommendations(title, cosine_sim=cosine_sim):
    ind = idx[title]
    scores = list(enumerate(cosine_sim[ind]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[1:16]
    movieind = [i[0] for i in scores]
    return df['title'].iloc[movieind]

Example no. 44
def _train(path_input, path_output, numrow, numtop):

    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 3),
                         min_df=1,
                         stop_words='english',
                         encoding='utf-8')
    x = path_input['title'] + ' ' + path_input['genres'].str.replace(
        '|', ' ') + ' ' + path_input['directors'].str.replace(
            '|', ' ') + ' ' + path_input['writers'].str.replace('|', ' ')

    tfidf_matrix = tf.fit_transform(x.values.astype('U'))

    index = 0
    totalRow = len(path_input.index)
    print(totalRow)
    if int(totalRow) < int(numrow):
        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
        i = 0
        for idx in range(0, int(totalRow)):
            similar_indices = cosine_similarities[i].argsort(
            )[:-int(totalRow):-1]
            similar_items = [str(path_input['movieId'][idx])]
            for j in similar_indices:
                if (idx != j):
                    similar_items.append(
                        str(path_input['movieId'][j]) + "|" +
                        str(cosine_similarities[i][j]))
            To_CSV(similar_items, path_output)
            i = i + 1
        pass
    else:
        count = int(int(totalRow) / int(numrow))
        print('--count: %s' % count)
        remain = int(totalRow) - int(count) * int(numrow)
        while (index < count + 1):
            print('--index: %s' % index)
            begin = index * int(numrow)
            # print('---begin: %s' %(begin))
            if (index == count):
                if int(remain) == 0:
                    end = begin + int(numrow)
                    # print('---end:%s' %(end))
                else:
                    print('--remain: %s' % (remain))
                    end = begin + int(remain)
            else:
                end = begin + int(numrow)
                # print('---end:%s' %(end))

            # print(tfidf_matrix[begin:end])
            print('----begin: %s, end: %s' % (begin, end))
            cosine_similarities = linear_kernel(tfidf_matrix[begin:end],
                                                tfidf_matrix)
            i = 0
            for idx in range(begin, end):
                # print('---idx: %s----' %idx)
                similar_indices = cosine_similarities[i].argsort(
                )[:-int(numtop):-1]
                similar_items = [str(path_input['movieId'][idx])]
                for j in similar_indices:
                    if (idx != j):
                        similar_items.append(
                            str(path_input['movieId'][j]) + "|" +
                            str(cosine_similarities[i][j]))
                To_CSV(similar_items, path_output)
                i = i + 1
            index = index + 1
Example no. 45
    # Instantiating tfidf vectorizer
    vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=lematize)

    # Getting vectors from podcast descriptions
    vectors = vectorizer.fit_transform(df['description'])
    # Changing vectors to a pandas dataframe
    vectors = pd.DataFrame(vectors.todense())
    # Setting the tokens as the column names
    words = vectorizer.get_feature_names()
    vectors.columns = words
    df = pd.concat([df, vectors], axis=1)

    # Compare the documents to themselves; higher numbers are more similar
    # The diagonal is comparing a document to itself, so those are 1's (100% similar)
    cos_sims = linear_kernel(vectors, vectors)
    # Removing 1's on the diagonals
    np.place(cos_sims, cos_sims >= 0.99, 0)

    # let string lengths be as long as they need to be
    pd.set_option('display.max_colwidth', -1)

    # Getting the podcast that is most similar for each podcast
    most_similar = cos_sims.argsort(axis=1)[::-1]

    max_pods_to_recommend = 10

    most_similar_in_order = []

    for rankArr in most_similar:
        most_similar_this_pod_in_order = []
Example no. 46
X_count = vectorizer.fit_transform(df_tag_strings_new.loc[:, 'tags'].values)
#print(X_count)
X_dense = X_count.todense()  # For euclidean distances

# TF-IDF
tf = TfidfVectorizer(stop_words='english')
tf_idf = tf.fit_transform(df_tag_strings_new['tags'])

#print (tf_idf)
#print (tf_idf.shape)

#Similarity/ Distance measures
cos_sim_count = cosine_similarity(X_count)  # For count vectorizer

cos_sim_tfidf = linear_kernel(
    tf_idf,
    tf_idf)  # Dot Product because of TFIDF vectors and faster processing

man_dist_count = manhattan_distances(X_count)

euc_dist_count = euclidean_distances(X_dense)

euc_dist_tfidf = euclidean_distances(tf_idf)

#reverse lookup of title and movie indices
df_movie_indices = pd.Series(df_movies.index,
                             index=df_movies['title']).drop_duplicates()


def recommend_content(
        title, similarity,
Example no. 47
def main():
    client = MongoClient()
    wCollection = client.cs229.wArticlesCleaned
    nCollection = client.cs229.nytArticles

    # Get references.
    nArticles = list(nCollection.find().sort([
        ("wikipediaId", pymongo.ASCENDING)
    ]).limit(1000))

    # Fetch all the linked articles.
    wArticles = []
    wIdSet = {}
    for nArticle in nArticles:
        # Delete the id.
        del nArticle["_id"]

        # Fetch the wikipedia article(s) if necessary.
        for wikipediaId in nArticle["wikipediaId"]:
            if wikipediaId in wIdSet:
                continue
            wIdSet[wikipediaId] = 1
            wArticles.append(wCollection.find_one({"_id": wikipediaId}))

        # Fetch distance 1 wikipedia article(s) if necessary.
        for wikipediaId in nArticle["wikipediaId1"][:10]:
            if wikipediaId in wIdSet:
                continue
            wIdSet[wikipediaId] = 1
            wArticles.append(wCollection.find_one({"_id": wikipediaId}))

    print "Finished fetching data, nArticles: {}, wArticles: {}".format(
        len(nArticles), len(wArticles))
    sys.stdout.flush()

    # Split into train, dev, and test.
    shuffle(nArticles)
    nArticlesTrain = nArticles[:600]
    nArticlesDev = nArticles[600:800]
    nArticlesTest = nArticles[800:]

    # Set up tfidf matrix on training data only.
    corpus = []
    for article in wArticles:
        corpus.append(article["title"])
        article["titleCorpusIndex"] = len(corpus) - 1
        corpus.append(article["text"])
        article["textCorpusIndex"] = len(corpus) - 1
    for article in nArticlesTrain:
        corpus.append(article["scrapedTitle"])
        article["titleCorpusIndex"] = len(corpus) - 1
        corpus.append(article["scrapedText"])
        article["textCorpusIndex"] = len(corpus) - 1
    tf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 3),
                         min_df=0,
                         stop_words='english',
                         decode_error='ignore')
    tfidfMatrix = tf.fit_transform(corpus)
    cosineSimMatrix = linear_kernel(tfidfMatrix, tfidfMatrix)
    print "Finished tfidf for train"
    sys.stdout.flush()

    # Calculate cosine similarities for dev.
    cosineSimMatrixDev = calculateCosineSimilarities(nArticlesDev, tf,
                                                     tfidfMatrix)
    print "Finished tfidf for dev"
    sys.stdout.flush()

    # Calculate cosine similarities for test.
    cosineSimMatrixTest = calculateCosineSimilarities(nArticlesTest, tf,
                                                      tfidfMatrix)
    print "Finished tfidf for test"
    sys.stdout.flush()

    # Create the (w, n) pairs.
    wnPairsTrain = createWNPairs(nArticlesTrain, wArticles)
    wnPairsDev = createWNPairs(nArticlesDev, wArticles)
    wnPairsTest = createWNPairs(nArticlesTest, wArticles)
    print "Finished creating pairs"
    sys.stdout.flush()

    # Extract features for training data.
    XTrain, YTrain = extractFeatures(wnPairsTrain, cosineSimMatrix)
    np.savetxt("data/docMatchIITrainX.txt", XTrain)
    np.savetxt("data/docMatchIITrainY.txt", YTrain)
    print "Outputted training data, {}".format(len(YTrain))
    sys.stdout.flush()

    # Extract features for dev data.
    XDev, YDev = extractFeatures(wnPairsDev, cosineSimMatrixDev)
    np.savetxt("data/docMatchIIDevX.txt", XDev)
    np.savetxt("data/docMatchIIDevY.txt", YDev)
    print "Outputted dev data, {}".format(len(YDev))
    sys.stdout.flush()

    # Extract features for test data.
    XTest, YTest = extractFeatures(wnPairsTest, cosineSimMatrixTest)
    np.savetxt("data/docMatchIITestX.txt", XTest)
    np.savetxt("data/docMatchIITestY.txt", YTest)
    print "Outputted test data, {}".format(len(YTest))
    sys.stdout.flush()
Example no. 48
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

users = pd.read_csv('lucid.csv/lucid_table_users.csv', encoding='latin-1')
#users.head()

lucid_tfidf = TfidfVectorizer(stop_words='english')
# filling the missing values with empty string
users['short_bio'] = users['short_bio'].fillna('')
# computing TF-IDF matrix required for calculating cosine similarity
users_matrix = lucid_tfidf.fit_transform(users['short_bio'])

#users_matrix.shape
cosine_similarity = linear_kernel(users_matrix, users_matrix)
indices = pd.Series(users['name'].index)


def recommend(index, cosine_sim=cosine_similarity):
    id = indices[index]
    # Get the pairwise similarity scores of all names
    # sorting them and getting top 5
    similarity_scores = list(enumerate(cosine_sim[id]))
    similarity_scores = sorted(similarity_scores,
                               key=lambda x: x[1],
                               reverse=True)
    similarity_scores = similarity_scores[1:6]

    # Get the names index
    lucid_index = [i[0] for i in similarity_scores]
Example no. 49
sim_mat_content[np.isnan(sim_mat_content)] = 0
'''
    4. TF-IDF:
        - article title 

'''

articles = df_articles.copy()
articles['title'] = articles['title'].fillna("").astype('str')

tf_idf = TfidfVectorizer(analyzer='word',
                         ngram_range=(1, 2),
                         min_df=0,
                         stop_words='english')
tfidf_matrix = tf_idf.fit_transform(articles['title'])
sim_mat_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)

del articles, tfidf_matrix
'''
    5.  NMF

'''

dat = df_clicks.copy()
dat['click'] = 1
R = dat.pivot(index='userId', columns='articleId', values='click').fillna(0)

n_users = len(dat['userId'].unique())
n_items = len(dat['articleId'].unique())
R_shape = (n_users, n_items)
# print(R_shape)
Example no. 50
    def chapterize(self, tokens, boundaries=[], language='en', visual=False):
        """segment a document into coherent parts, using a TextTiling-inspired method
        
        Args:
            tokens (TranscriptToken): tokens to segment
            boundaries (list, optional): list of integers, additional boundaries to refine segmentation. Defaults to [].
            language (str, optional): language (ISO 639-1 language code). Defaults to 'en'.
            visual (bool, optional): show graph. Defaults to False.
        
        Returns:
            tuple: the concatenated segments and the token indices of the segment boundaries
        """

        from chapterize.preprocessor_helper import lemma
        from chapterize.document_vectorizer import DocumentVectorizer
        from write_chapters import Chapter

        import nltk
        from sklearn.metrics.pairwise import linear_kernel
        import numpy as np
        from scipy.signal import savgol_filter
        from scipy.signal import argrelextrema
        from math import floor
        import json

        # preprocess:
        # lowercase, lemmatize, remove stopwords
        # segment transcript into segments of window_width

        processed = []  # segments of width window_width
        end_times = []  # end times of every segment

        # batch preprocess tokens
        chunk_tokens_lemma = lemma([token.token for token in tokens], language)
        for i, token in enumerate(tokens):
            token.token = chunk_tokens_lemma[i]

        chunks = list(divide_chunks(tokens, self.window_width))
        for chunk in chunks:
            processed_section = ''
            for token in chunk:
                processed_section += ' ' + token.token
                last_end_time = token.time
            processed.append(processed_section)
            end_times.append(last_end_time)

        end_times.pop()

        # vectorize
        #dv = DocumentVectorizer('tfidf', tfidf_min_df=default_params.tfidf_min_df, tfidf_max_df=default_params.tfidf_max_df)

        dv = DocumentVectorizer(self.tfidf_min_df, self.tfidf_max_df)
        document_vectors = dv.vectorize_docs('ft_average',
                                             processed,
                                             language=language)

        print(document_vectors.shape[0])

        # calculate cosine similarity score for adjacent segments
        cosine_similarities = []
        print('\ncosine similarity scores:')
        for i, doc_vec in enumerate(document_vectors[:-1]):
            cosine_similarity = linear_kernel(doc_vec,
                                              document_vectors[i + 1])[0][0]
            cosine_similarities.append(cosine_similarity)
            print(cosine_similarity)

        # smooth curve with Savitzky-Golay filter
        if self.savgol_window_length == 0:
            self.savgol_window_length = min(9, len(cosine_similarities))
            if self.savgol_window_length % 2 == 0:
                self.savgol_window_length -= 1
        cosine_similarities_smooth = savgol_filter(cosine_similarities,
                                                   self.savgol_window_length,
                                                   self.savgol_polyorder)

        # calculate local minima
        minima = argrelextrema(cosine_similarities_smooth, np.less)[0]
        print('\nlocal minima found at {}\n'.format(minima))

        self.max_utterance_delta = floor(self.window_width * .4)
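        # this delta (40% of the window width) is the tolerance used below when deciding
        # whether a minimum may be snapped to the nearest utterance boundary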

        # concatenate processed segments between successive minima
        concat_segments = []
        for i, minimum in enumerate(minima):
            concat_segment = ''
            if i == 0:
                concat_segment += " ".join(processed[0:minimum + 1])
            else:
                concat_segment += " ".join(processed[minima[i - 1] +
                                                     1:minimum + 1])
            concat_segments.append(concat_segment)
        concat_segments.append(" ".join(
            processed[minima[-1] +
                      1:]))  # append last section (from last boundary to end)

        # find the closest utterance boundary for each local minimum
        segment_boundary_tokens = []
        segment_boundary_times = []
        for minimum in minima:
            closest = min(boundaries,
                          key=lambda x: abs(x - (
                              (minimum + 1) * self.window_width)))
            print(
                'for minimum at token {}, closest utterance boundary is at token {}'
                .format((minimum + 1) * self.window_width, closest))

            if abs((minimum + 1) * self.window_width -
                   closest) <= self.max_utterance_delta:
                segment_boundary_tokens.append(tokens[closest].token)
                segment_boundary_times.append(tokens[closest].time)
            else:
                print(
                    '  closest utterance boundary is too far from minimum boundary (max_utterance_delta exceeded), topic boundary set to {}'
                    .format(tokens[(minimum + 1) * self.window_width].token))
                segment_boundary_tokens.append(tokens[(minimum + 1) *
                                                      self.window_width].token)
                segment_boundary_times.append(tokens[(minimum + 1) *
                                                     self.window_width].time)

        # print("Segment boundary tokens:\n", segment_boundary_tokens)

        if visual:
            visualize(cosine_similarities_smooth, cosine_similarities, minima,
                      segment_boundary_times, end_times)

        boundary_indices = [0] + [
            minimum * self.window_width for minimum in minima
        ]

        return concat_segments, boundary_indices
Esempio n. 51
0
def main():
    # Switch for algorithms
    # 1 - ucs with tfidf
    # 2 - ucs with lsa (needs more data, probably)
    # 3 - Structured Perceptron with tfidf features
    # 4 - ucs with tfidf over whole article
    algoNum = 4

    # Solve utf errors.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Train if necessary.
    if algoNum == 3:
        weights = train()
        print weights

    # LSA over entire corpus.
    # lsa = None
    # if algoNum == 2:
    #     corpus = getCorpus("data/wArticlesCleaned/wArticlesCleaned.0.json")
    #     lsa = getLsa(corpus, 100)
    #     corpus = None

    # Load data.
    filename = "data/sentInDev.json"
    dataList = []
    with open(filename, 'rb') as inFile:
        dataList = json.load(inFile)

    # Filter out sections that contain bullet points.
    killBullets = True
    if killBullets:
        tempDataList = []
        for data in dataList:
            hasBullet = False
            for sentence in data["prelimSection"]:
                if sentence.startswith("*"):
                    hasBullet = True
                    break
            if not hasBullet:
                tempDataList.append(data)
        dataList = tempDataList

    # Go through data.
    predictedList = []
    sectionsToTest = len(dataList) * 1.0
    sumSentencesPerSection = 0.0
    numToInsert = 0.0
    numInserted = 0.0
    numInsertedCorrectly = 0.0
    sumSentencesAway = 0.0
    for dataNum, data in enumerate(dataList):
        # Call the algorithm.
        predictedSection = None
        if algoNum == 1:
            corpus = list(data["prelimSection"])
            cosineSimMatrix = getTfIdfCosineSimMatrix(corpus)
            def cost1(s1, s2, sI):
                index1 = data["prelimSection"].index(s1)
                index2 = data["prelimSection"].index(s2)
                if sI is None:
                    return 1.0 / (2 * (cosineSimMatrix[index1][index2] + 1))
                else:
                    indexI = data["prelimSection"].index(sI)
                    return 1.0 / (cosineSimMatrix[index1][indexI] + 1 + cosineSimMatrix[indexI][index2] + 1)
            predictedSection = sentInUcs.InsertSentences(list(data["sentences"]), list(data["section"]), cost1)
        elif algoNum == 2:
            corpus = []
            data["article"].pop(data["article"].index(data["sentences"][0]))
            for s1, s2 in zip(data["article"][:len(data["article"])-1], data["article"][1:]):
                sCat = s1 + s2
                corpus.append(sCat)
            lsa = getLsa(corpus, 10)
            corpus = list(data["prelimSection"])
            lsaMatrix = lsa.transform(corpus)
            cosineSimMatrix = linear_kernel(lsaMatrix, lsaMatrix)
            def cost2(s1, s2, sI):
                index1 = data["prelimSection"].index(s1)
                index2 = data["prelimSection"].index(s2)
                if sI is None:
                    return 1.0 / (2 * (cosineSimMatrix[index1][index2] + 1))
                else:
                    indexI = data["prelimSection"].index(sI)
                    return 1.0 / (cosineSimMatrix[index1][indexI] + 1 + cosineSimMatrix[indexI][index2] + 1)
            predictedSection = sentInUcs.InsertSentences(list(data["sentences"]), list(data["section"]), cost2)
        elif algoNum == 3:
            predictedSection = getPredicted(data, weights)
        elif algoNum == 4:
            corpus = list(data["article"])
            cosineSimMatrix = getTfIdfCosineSimMatrix(corpus)
            def cost1(s1, s2, sI):
                index1 = data["article"].index(s1)
                index2 = data["article"].index(s2)
                if sI is None:
                    return 1.0 / (2 * (cosineSimMatrix[index1][index2] + 1))
                else:
                    indexI = data["article"].index(sI)
                    return 1.0 / (cosineSimMatrix[index1][indexI] + 1 + cosineSimMatrix[indexI][index2] + 1)
            predictedSection = sentInUcs.InsertSentences(list(data["sentences"]), list(data["section"]), cost1)
        else:
            print "Algo {} not implemented".format(algoNum)
            sys.exit(1)

        # Update metrics.
        sumSentencesPerSection += len(data["section"])
        predictedIndexes = getInsertionIndexes(data["sentences"], predictedSection)
        sentencesAway = "N/A"
        for actualIndex, predictedIndex in zip(data["insertionIndexes"], predictedIndexes):
            numToInsert += 1
            if predictedIndex is None:
                continue

            numInserted += 1
            if predictedIndex == actualIndex:
                numInsertedCorrectly += 1
            sentencesAway = abs(predictedIndex - actualIndex)
            sumSentencesAway += sentencesAway

        # Show progress.
        if dataNum % 100 == 0:
            print "Ran {} sections".format(dataNum)
            sys.stdout.flush()

        # Save some info.
        savePredicted = False
        if savePredicted:
            data["predictedSection"] = predictedSection
            data["sentencesAway"] = sentencesAway
            del data["article"]
            predictedList.append(data)
            if dataNum % 100 == 99:
                with open("data/sentInPredicted2.json", 'ab') as outFile:
                    print "Dumping {} outputs".format(len(predictedList))
                    sys.stdout.flush()
                    outFile.write(json.dumps(predictedList, indent=4))
                    predictedList = []

    print (
        "sectionsTested: {0:.0f}, avgInsertionPoints: {1:.4f}, sentencesToInsert: {2:.0f}, " +
        "numActuallyInserted: {3:.0f}, avgInsertedCorrectly: {4:.4f}, avgSentencesAway: {5:.4f}"
    ).format(
        sectionsToTest,
        sumSentencesPerSection / sectionsToTest + 1,
        numToInsert,
        numInserted,
        numInsertedCorrectly / numInserted,
        sumSentencesAway / numInserted
    )
Esempio n. 52
0
import io

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

ds = pd.read_csv(io.BytesIO(upload_files['sample-data.csv']))

tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3),
                     min_df=0,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in ds.iterrows():
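    # argsort()[:-100:-1] yields the indices of the 99 most similar items in descending
    # order; the most similar entry is the item itself, hence similar_items[1:] below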
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['id'][i])
                     for i in similar_indices]

    results[row['id']] = similar_items[1:]

print('done!')


def item(id):
    return ds.loc[ds['id'] == id]['description'].tolist()[0].split(' - ')[1]


# Just reads the results out of the dictionary.
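# A minimal sketch of that reader (hedged: the original function body is not shown,
# so the name `recommend` and the output format are assumptions):
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    for score, rec_id in results[item_id][:num]:
        print("Recommended: " + item(rec_id) + " (score: " + str(score) + ")")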
Esempio n. 53
0
links_small = pd.read_csv('./static/data/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype(
    'int')
smd = pd.read_csv('smd.csv')
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0,
                     stop_words='english')
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')
tfidf_matrix = tf.fit_transform(smd['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


@app.route('/')
@app.route("/home")
def index():
    return render_template('index.html')


@app.route("/about")
def about():
    return render_template('about.html')


@app.route("/forme")
Esempio n. 54
0
def test_linear_kernel():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    K = linear_kernel(X, X)
    # the diagonal elements of a linear kernel are their squared norm
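    # K is 5x5, so K.flat[::6] (stride n + 1) walks the main diagonal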
    assert_array_almost_equal(K.flat[::6], [linalg.norm(x)**2 for x in X])
Esempio n. 55
0
import re

list(smd)  # inspect the column names
for i in range(0, 6000):
    # keep only letters and spaces in each row's combined_features
    smd.loc[i, "combined_features"] = re.sub('[^a-zA-Z ]', ' ', smd["combined_features"][i])

#---------------To compute TF-IDF weights for each word------------------------
    
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3),
                     min_df=0,
                     stop_words='english')
count_matrix = tf.fit_transform(smd["combined_features"])

#---------------To find Cosine Similarity--------------------------------------

from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(count_matrix,count_matrix)

#---Functions to get the title and index of a movie for recommendation---------

def get_title_from_index(index):
    return smd[smd.index == index]["title"].values[0]
def get_index_from_title(title):
    return smd[smd.title == title]["index"].values[0]

smd = smd.reset_index()
titles = smd['title']
# finding indices of every title
indices = pd.Series(smd.index, index=titles)

#-------Recommendation for the movie the client has watched most recently---------
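# The snippet stops here; a minimal sketch of the usual lookup (hedged: the function
# name and the choice of returning the top 10 titles are assumptions):
def get_recommendations(title):
    idx = indices[title]                       # row of the watched movie in cosine_sim
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]              # skip the movie itself, keep the next 10
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]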
Esempio n. 56
0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import linear_kernel
#twenty = fetch_20newsgroups()
twenty = [
    "hello there, I'm very happy", "I'm feeling really good",
    "everything is happy now", "whatever happened here", "go, go to the boom"
]
tfidf = TfidfVectorizer().fit_transform(twenty)
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
related_docs_indices = cosine_similarities.argsort()[:-3:-1]
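# A short usage check (hedged, not in the original snippet): the first index returned
# is the query document itself (similarity 1.0), the second is its nearest neighbour.
for i in related_docs_indices:
    print(twenty[i], cosine_similarities[i])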
Esempio n. 57
0
        if "answer" not in d:
            d["answer"] = "random"
        dev_question_answers.append(d["answer"])

len_dev = len(dev_questions)
"""
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectors = vectorizer.fit_transform(dev_questions + train_questions)
dev_vectors = vectors[0:len_dev]

train_vectors = vectors[len_dev:]

dev_predict = [None] * len_dev
for query_index in range(len_dev):
    query_vector = vectors[query_index,:]
    cosine_similarities = linear_kernel(query_vector, train_vectors).flatten()
    dev_predict[query_index] = train_question_answers[np.argmax(cosine_similarities)][0]


predict_output = [None] * len_dev
for i in range(len_dev):
    output_dict = {'question': dev_questions[i],
            'prediction': dev_predict[i]
            }
    predict_output[i] = output_dict


pred_file = os.path.join(filep, 'ef_dev_predict.json')
with open(pred_file, 'w') as output:
    output.write(json.dumps(predict_output, indent=4) + '\n')
Esempio n. 58
0
 'Cs': [
     0.0000001,
     0.000001,
     0.00001,
     0.0001,
     0.001,
     0.01,
     0.1,
     1.,
     10.,
     100.,
     1000.
 ],
 'lams': [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.],
 'kernel_funcs': {
     'linear': [lambda X, L=None: pairwise.linear_kernel(X, L)],
     'rbf': [
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.0000001),
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.000001),
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.00001),
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.0001),
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.001),
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.01),
         lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.1)
     ],
     'laplacian': [
         lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.0000001),
         lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.000001),
         lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.00001),
         lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.0001),
         lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.001),
Esempio n. 59
0
con_start_idx = len(pro_tweets)

tweets = pro_tweets + con_tweets

print 'Number of pro and con tweets: {0}, {1}'.format(len(pro_tweets), len(con_tweets))

vectorizer = TfidfVectorizer()
tfidf_vecs = vectorizer.fit_transform(tweets)

file_out = open('PrayAbortion_cosine_similarity_5.txt', 'w')
file_out1 = open('Prochoice_cosine_similarity_5.txt', 'w')

k = 5
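# collect the k most similar tweets from each side (pro and con) for every pro tweet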
for i in range(0, len(pro_tweets)):
#for i in random.sample(range(con_start_idx), 5):
    cosine_similarities = linear_kernel(tfidf_vecs[i:i+1], tfidf_vecs)
    cosine_similarities = cosine_similarities.flatten()
    sorted_idx = np.argsort(cosine_similarities)[::-1]
    topk_pro = []
    topk_con = []

    for idx in sorted_idx:
        if idx < con_start_idx and idx != i:
            topk_pro.append(tweets[idx])
            if len(topk_pro) == k:
                break

    for idx in sorted_idx:
        if idx >= con_start_idx and idx != i:
            topk_con.append(tweets[idx])
            if len(topk_con) == k:
Esempio n. 60
0
def predict(data, vect, sn):
    vector = vect.transform(data)

    # get list of the ids of the retweeted people
    most_retweet_ids = run_model(sn)

    client = MongoClient()
    twitter = client['twitter']
    new = twitter['new']

    handle_tweet_dict = defaultdict(list)
    id_handle_dict = defaultdict()

    for an_id in most_retweet_ids:
        docs = new.find({'user.id': an_id})
        for doc in docs:
            tweet = doc.get('text').encode('utf8', 'ignore')
            user_id = doc.get('user').get('id')
            handle = doc.get('user').get('screen_name')
            handle_tweet_dict[handle].append(tweet)
            id_handle_dict[user_id] = handle

    tweet_list = []
    handle_list = []

    for k, v in handle_tweet_dict.iteritems():
        tweet_list.extend(v)
        handle_list.extend([k] * len(v))

    new_word_counts = vect.transform(tweet_list)

    result_matrix = linear_kernel(vector, new_word_counts)

    indices_of_tweets = []

    # For each tweet by the client, keep the 31 most similar tweets
    # (so that 30 remain even if the client's own tweet is among them)
    for row in result_matrix:
        indices = row.argsort()[::-1]
        indices_of_tweets.append(indices[:31])

    # Look up the handles of the people who tweeted each of the most similar tweets
    handle_array = np.array(handle_list)
    persons_per_tweet = []

    for row in indices_of_tweets:
        persons_per_tweet.append(handle_array[row])

    # Count how many times each person shows up.
    # A person with many tweets similar to one client tweet is weighted the same as
    # a person whose single tweet is similar to many client tweets.
    persons_counter = Counter()

    for row in persons_per_tweet:
        persons_counter.update(row)

    # Take the 10 most common people, then drop the client's own handle
    top_people_and_count = persons_counter.most_common(10)

    top_people = [tup[0] for tup in top_people_and_count if tup[0] != sn]

    return top_people