def pilot_test(): """ """ users_vectors = [] vectorsums = [] for i, user in enumerate(sample_users): df = pd.read_pickle('./fc8_100imgs_{}.pkl'.format(user)) users_vectors.append(df) vectorsums.append(df.fc8.values.sum()) corpus = [] for vector in vectorsums: corpus.append(vector_to_document(vector)) tfidf = TfidfVectorizer() tfidf_vectorized = tfidf.fit_transform(corpus) cosine_similarities = linear_kernel(tfidf_vectorized, tfidf_vectorized) new_docs = [] for i, user in enumerate(sample_users): for j, img_vec in enumerate(users_vectors[i].fc8): doc = vector_to_document(img_vec) new_docs.append(doc) # vectorized = tfidf.transform([doc]) # sims = linear_kernel(vectorized, tfidf_vectorized)[0] # most_sims = np.argsort(sims)[::-1] # # print '{} img {} most similar to \n{}'.format(user, j, [(sample_users[i], sims[i]) for i in most_sims] ) new_docs_vectorized = tfidf.transform(new_docs) cosine_similarities = linear_kernel(new_docs_vectorized, tfidf_vectorized) for sim in cosine_similarities: print 'top score: {} top user: {}'.format(sim.max(), sample_users[np.argmax(sim)])
def plot_hist_d_to_centroid(self, min_w=0): ''' histograms of distance to centroids: overall vs. each cluster ''' self.assign_cluster(min_w) self.cal_centroid() n_clusters = np.max(self.clusters) #fig = plt.figure(figsize=(20,8)) X2_dense = self.X2.todense() centroid_overall = np.mean(X2_dense, axis=0) sim = linear_kernel(centroid_overall, X2_dense) max_sim = np.max(sim) min_sim = np.min(sim) # multiple plot, subplots ncols = 3 nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0) # subplot preferred way fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10)) axs = ax.flatten() i_plot = 0 axs[i_plot].hist(sim.flatten(), alpha=0.2) # , ax=axs[i_plot]) axs[i_plot].set_xlim(min_sim, max_sim) i_plot = i_plot + 1 for i in xrange(n_clusters): cond = self.clusters == i arr = X2_dense[cond] sim = linear_kernel(self.centroids[i], arr) axs[i_plot].hist(sim.flatten(), alpha=0.2) # , ax=axs[i_plot]) axs[i_plot].set_xlim(min_sim, max_sim) i_plot = i_plot + 1 fig.savefig(self.model_name + '_hist_dis_to_centroid.png') plt.close(fig)
def _build_similarity_matrix(self):
    """
    partitioned similarity matrix ('s' for source nodes and 't' for target nodes)
    S = [[S_ss, S_st],
         [S_ts, S_tt]]
    """
    normalize(self.source_features, norm='l2', copy=False)
    normalize(self.target_features, norm='l2', copy=False)
    self.ss = linear_kernel(self.source_features)
    self.st = linear_kernel(self.source_features, self.target_features)
    self.ts = self.st.T
    self.tt = linear_kernel(self.target_features)
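# A minimal sketch (not from the original class) of assembling the full partitioned
# matrix S = [[S_ss, S_st], [S_ts, S_tt]] from the four blocks computed above.
# The feature matrices here are assumed dense random data; the block names mirror
# the attributes used in _build_similarity_matrix.
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel

source_features = normalize(np.random.rand(4, 8), norm='l2')
target_features = normalize(np.random.rand(3, 8), norm='l2')

ss = linear_kernel(source_features)                    # (4, 4)
st = linear_kernel(source_features, target_features)   # (4, 3)
tt = linear_kernel(target_features)                    # (3, 3)
S = np.block([[ss, st],
              [st.T, tt]])                             # (7, 7) full similarity matrix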
def plot_hist_d_to_centroid(self, min_w=0): ''' plot histogram of distance to centroid, overall vs. per cluster - INPUT: self.X2 ''' self.assign_cluster(min_w) self.cal_centroid() n_clusters = np.max(self.clusters) #fig = plt.figure(figsize=(20,8)) # multiple plot, subplots ncols = 3 nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0) # subplot preferred way fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10)) axs = ax.flatten() centroid_overall = np.mean(self.X2, axis=0) sim = linear_kernel(centroid_overall, self.X2) max_sim = np.max(sim) min_sim = np.min(sim) print 'sim shape: %s X shape: %s centroid_overall shape: %s' % (sim.shape, self.X2.shape, centroid_overall.shape) print 'min %.2f max %.2f ' % (min_sim, max_sim) print sorted(sim.flatten(), reverse=True)[:5] print sorted(centroid_overall.getA().flatten(), reverse=True)[:5] max_sim = 1 min_sim = 0 i_plot = 0 axs[i_plot].hist(sim.flatten(), alpha=0.2) # , ax=axs[i_plot]) axs[i_plot].set_xlim(min_sim, max_sim) i_plot = i_plot + 1 for i in xrange(n_clusters + 1): cond = self.clusters == i arr = self.X2[cond] sim = linear_kernel(self.centroids[i], arr) print 'sim shape: %s arr shape: %s centroid shape: %s' % (sim.shape, arr.shape, self.centroids[i].shape) print sorted(sim.flatten(), reverse=True)[:5] print sorted(self.centroids[i].flatten(), reverse=True)[:5] axs[i_plot].hist(sim.flatten(), alpha=0.2) # , ax=axs[i_plot]) axs[i_plot].set_xlim(min_sim, max_sim) i_plot = i_plot + 1 plt.show() fig.savefig(self.model_name + '_hist_dis_to_centroid.png') plt.close(fig)
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print related_docs_indices
    print cosine_similarities[related_docs_indices]

    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    #     'This is the first document.',
    #     'This is the second second document.',
    #     'And the third one.',
    #     'Is this the first document?',
    # ]
    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())

    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print "Vocabulary:", count_vectorizer.vocabulary_
    # Vocabulary: {'blue': 0, 'sun': 1, 'bright': 2, 'sky': 3}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print "IDF:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print tf_idf_matrix.todense()
def __asyncable_similarity(tup): # bs, beer_id_ref, ref_vect, s_ids, b_ids, X_t, top = tup # bs: beer similarity object for db commit # ref_vects from one style # ref_b_ids: beer ids for ref vecs # s_ids, b_ids: style and beer indices of X_t # X_t for beers in other styles to be compared to # keep top similarities by style bs, b_refs, X_t_ref, b_comps, X_t_comp, top = tup start = dt.now() print "Beer ct %s vs ct %s: Compute Similarity" % (len(b_refs), len(b_comps)) try: for i in xrange(len(b_refs)): # compute similarity between beer_ref[i] and all b_comps lk = linear_kernel(X_t_ref.getrow(i), X_t_comp).flatten() # take #top of largest similarities n = len(lk) kp = min(top, n) m_ixs = lk.argsort()[-kp:] sims = [(b_refs[i], b_comps[j], lk[j]) for j in m_ixs if b_refs[i] != b_comps[j]] # bs.smooth_similarity(sims) bs.add_many(sims) print "Comparison Complete: %s" % (dt.now() - start) return (b_refs, None) except Exception as e: return (b_refs, e)
def __kernel_definition__(self):
    if self.Kf == 'rbf':
        return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
    if self.Kf == 'poly':
        return lambda X, Y: polynomial_kernel(X, Y, degree=self.poly_deg, gamma=None, coef0=self.poly_coeff)
    if self.Kf is None or self.Kf == 'linear':
        return lambda X, Y: linear_kernel(X, Y)
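# Usage sketch (an assumption, not from the original class): the method returns a
# two-argument callable, so the caller builds Gram matrices lazily. For Kf in
# (None, 'linear') the callable is exactly linear_kernel, i.e. X @ Y.T.
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

kernel = lambda X, Y: linear_kernel(X, Y)   # what the 'linear' branch returns
X_train = np.random.rand(6, 4)
X_test = np.random.rand(2, 4)
K_train = kernel(X_train, X_train)          # (6, 6)
K_test = kernel(X_test, X_train)            # (2, 6), equals X_test @ X_train.T
assert np.allclose(K_test, X_test.dot(X_train.T))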
def thread_diag_block(top_nbrs,dataM,job_ranges,r_offset, c_offset, n_nbr=100,verbose=False): ''' (cos,idx) Note in the min-heap, the first one is the smallest. ''' for job_bd in job_ranges: crossV = linear_kernel(dataM[job_bd[0]:job_bd[1],:],dataM) n_doc1, n_doc2 = crossV.shape for i_doc in range(n_doc1): i_offset = i_doc + job_bd[0] + r_offset L = top_nbrs[i_offset] for j in range(n_doc2): if i_offset == j+c_offset: continue if len(L)<n_nbr: heapq.heappush(L, (crossV[i_doc,j],j+c_offset)) elif crossV[i_doc,j] > L[0][0]: heapq.heapreplace(L, (crossV[i_doc,j],j+c_offset)) top_nbrs[i_offset] = L if verbose: print('process range (%d,%d)'%(job_bd[0],job_bd[1]))
def get_related_news(articles, base_art_index):
    if related_dict.get(base_art_index) is not None:
        return related_dict.get(base_art_index)
    corpus = []
    for art in articles:
        corpus.append(' '.join(jieba.cut(art.context)))
    ls = [w for w in WordCutLibs.stopwords.split('\n')]
    vectorizer = CountVectorizer(stop_words=ls)
    X = vectorizer.fit_transform(corpus)
    #word = vectorizer.get_feature_names()
    #stopword = vectorizer.get_stop_words()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    #weight = tfidf.toarray()

    target = base_art_index  # set the target article; index order matches the SQL result
    cosine_similarities = linear_kernel(tfidf[target], tfidf).flatten().argsort()
    max_len = len(cosine_similarities)
    bnd = -11 if max_len >= 10 else -(max_len)
    related_docs_indices = cosine_similarities[:bnd:-1]
    res = [articles[idx] for idx in related_docs_indices]
    related_dict[base_art_index] = res
    return res
def print_most_cos_sim(self, thresh=0.675): ''' Prints the two posts that have the highest cosine similarity ''' cos_sims = linear_kernel(self.word_vecs, self.word_vecs) # Initialize max_sim = 0, only consider cos sims under threshold # so we know we're not recording a post compared with itself (1.0) max_cos_sim = 0.0 thr = thresh # Find max_cos_sim for i, j in enumerate(cos_sims): for k, l in enumerate(j): if (float(l) >= max_cos_sim) and (float(l) < thr): max_cos_sim = float(l) # Find indices of max_cos_sim double_break = False for i, j in enumerate(cos_sims): for k, l in enumerate(j): if float(l) == max_cos_sim: ind1, ind2 = i, k double_break = True break if double_break: break print 'Posts with highest cosine similarity ({:.3f}):\n\nPost {}:\n{}\ \n\nPost {}:\n{}'.format(max_cos_sim, ind1, self.posts[ind1], ind2, self.posts[ind2])
def __init__(self, *args, **kwargs): super(QUIRE, self).__init__(*args, **kwargs) self.Uindex = [idx for idx, _ in self.dataset.get_unlabeled_entries()] self.Lindex = [idx for idx in range(len(self.dataset)) if idx not in self.Uindex] self.lmbda = kwargs.pop("lambda", 1.0) X, self.y = zip(*self.dataset.get_entries()) self.y = list(self.y) self.kernel = kwargs.pop("kernel", "rbf") if self.kernel == "rbf": self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop("gamma", 1.0)) elif self.kernel == "poly": self.K = polynomial_kernel( X=X, Y=X, coef0=kwargs.pop("coef0", 1), degree=kwargs.pop("degree", 3), gamma=kwargs.pop("gamma", 1.0) ) elif self.kernel == "linear": self.K = linear_kernel(X=X, Y=X) elif hasattr(self.kernel, "__call__"): self.K = self.kernel(X=np.array(X), Y=np.array(X)) else: raise NotImplementedError if not isinstance(self.K, np.ndarray): raise TypeError("K should be an ndarray") if self.K.shape != (len(X), len(X)): raise ValueError("kernel should have size (%d, %d)" % (len(X), len(X))) self.L = np.linalg.inv(self.K + self.lmbda * np.eye(len(X)))
def get(self): query = self.get_argument('q', None) if query is None: return queryTerms = query.split() queryVector = np.array([self._logIDF[term] for term in queryTerms]) docVectorDict = defaultdict(lambda: np.array([0] * len(queryTerms))) for i in range(len(queryTerms)): term = queryTerms[i].lower() newList = self._postingsLists[term] for item in newList: docVectorDict[item[0]][i] = item[1] * self._logIDF[term] docMatrix = np.zeros((len(docVectorDict), len(queryTerms))) docIx = 0 docIxToDocID = {} for docID in docVectorDict.keys(): docMatrix[docIx][:] = docVectorDict[docID][:] docIxToDocID[docIx] = docID docIx += 1 sims = linear_kernel(queryVector, docMatrix).flatten() bestDocIxes = sims.argsort()[::-1] bestDocSims = sims[bestDocIxes] bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes] postings = zip(bestDocIDs, bestDocSims) self.write(json.dumps({"postings": postings}))
def main(lists): # titles = ["overexpression expression overexpression inhibition overexpression association association interaction binds binds interaction affinity affinity", "expression inducing expression detected lacking expression inducing expression detected lacking expression"] stopwords = ["and", "edition", "for", "in", "little", "of", "the", "to"] ignorechars = """,:'!""" mylsa = LSA(stopwords, ignorechars) for x in lists: mylsa.parse(x) mylsa.build() mylsa.printA() mylsa.calc() mylsa.printSVD() tfidf = mylsa.Vt # print cosine_similarity(mylsa.Vt[0:1], mylsa.Vt) print tfidf, tfidf[0:1] cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten() print "cosine similarities", cosine_similarities angle_list = [] for a in cosine_similarities: try: angle_in_radians = math.acos(a) angle_in_degrees = math.degrees(angle_in_radians) angle_list.append(angle_in_degrees) except ValueError: angle_list.append(0) return_list = [] return_list.append(angle_list[1]) print return_list return return_list
def predict(data, vect, user_list, tweet_list, word_counts): vector = vect.transform(data) result_matrix = linear_kernel(vector, word_counts) indices_of_tweets = [] # For each tweet by the client, find the 30 most similar tweets # This list may include tweets by the client for row in result_matrix: indices = row.argsort()[:][::-1] indices_of_tweets.append(indices[2:51]) # Return the person that tweeted each of the 50 most similar tweets user_array = np.array(user_list) persons_per_tweet = [] for row in indices_of_tweets: persons_per_tweet.append(user_array[row]) # Count up how many times each person shows up. # Same weighting is given to people who have many tweets similar to one client tweet # and a tweet that matches a high number of client tweets. persons_counter = Counter() for row in persons_per_tweet: persons_counter.update(row) # return the top 25 people in this list top_people_and_count = persons_counter.most_common(25) top_people = [tup[0] for tup in top_people_and_count] return top_people
def _apply_kernel(self, x, y):
    """Apply the selected kernel function to the data."""
    if self.kernel == 'linear':
        phi = linear_kernel(x, y)
    elif self.kernel == 'rbf':
        phi = rbf_kernel(x, y, self.coef1)
    elif self.kernel == 'poly':
        phi = polynomial_kernel(x, y, self.degree, self.coef1, self.coef0)
    elif callable(self.kernel):
        phi = self.kernel(x, y)
        if len(phi.shape) != 2:
            raise ValueError(
                "Custom kernel function did not return 2D matrix"
            )
        if phi.shape[0] != x.shape[0]:
            raise ValueError(
                "Custom kernel function did not return matrix with rows"
                " equal to number of data points."
            )
    else:
        raise ValueError("Kernel selection is invalid.")

    if self.bias_used:
        phi = np.append(phi, np.ones((phi.shape[0], 1)), axis=1)

    return phi
def main(protein_dict,q1,q2): if not protein_dict: angle_list = ['9999'] elif len(protein_dict) < 2: angle_list = ['9999'] else: train_set = [' '.join(protein_dict[x]) for x in protein_dict] proteins = [x for x in protein_dict] tfidf_vectorizer = TfidfVectorizer() tfidf = tfidf_vectorizer.fit_transform(train_set) #finds the tfidf score with normalization # print 'tfidf[0:1]', tfidf[0:1] # print 'tfidf[0:2]', tfidf[0:2] # print 'tfidf', tfidf cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten() # print 'cosine_similarities', cosine_similarities related_docs_indices = cosine_similarities.argsort()[:-5:-1] degrees_list = [] for a in (cosine_similarities[related_docs_indices].tolist()): try: angle_list = [] angle_in_radians = math.acos(a) angle_in_degrees = math.degrees(angle_in_radians) degrees_list.append(angle_in_degrees) angle_list.append(angle_in_degrees) except ValueError: angle_list = ['9999'] if len(degrees_list) > 1: return_list = [degrees_list[1]] else: return_list = degrees_list return return_list
def sim_char10(text1, text2): vect = HashingVectorizer(analyzer='char_wb', tokenizer=normalize, stop_words='english', ngram_range=(10, 10)) texts = [text1, text2] matrix = vect.fit_transform(texts) cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten() simmax = max(cosine_similarities[1:]) return simmax
def sim_char5(text1, text2): vect = HashingVectorizer(analyzer='word', tokenizer=normalize, stop_words='english') texts = [text1, text2] matrix = vect.transform(texts) cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten() simmax = max(cosine_similarities[1:]) return simmax
def predict(data, vect, user_list, word_counts, sn): vector = vect.transform(data) result_matrix = linear_kernel(vector, word_counts) tweet_list = [] # For each tweet by the client, find the 30 most similar tweets # This list may include tweets by the client for row in result_matrix: indices = row.argsort()[-51:-1][::-1] tweet_list.append(indices) # Find the 50 tweets that showed up the most number of times in the tweet list # Now you have a list of the tweets that showed up the most number of times as # being similar to your other tweets tweet_indexes = Counter([idx for sublist in tweet_list for idx in sublist]) print tweet_indexes.most_common(50) top_indexes = [tup[0] for tup in tweet_indexes.most_common(50)] #find the users that wrote the tweets user_array = np.array(user_list) people = user_array[top_indexes] print people # remove for duplicate people unique_people = set(people) print unique_people return [x for x in unique_people if x != sn]
def _train(self, ds):
    """
    Train the engine.

    Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product.
    The 'stop_words' param tells the TF-IDF module to ignore common English
    words like 'the', etc. Then we compute similarity between all products
    using scikit-learn's linear_kernel (which in this case is equivalent to
    cosine similarity).

    Iterate through each item's similar items and store the 100 most similar.
    Stops at 100 because, well... how many similar products do you really need
    to show?

    Similarities and their scores are stored in redis as a Sorted Set, with one
    set for each item.

    :param ds: A pandas dataset containing two fields: description & id
    :return: Nothin!
    """
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(ds['content'])
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

    for idx, row in ds.iterrows():
        similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
        similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]

        # First item is the item itself, so remove it.
        # This 'sum' turns a list of tuples into a single tuple: [(1,2), (3,4)] -> (1,2,3,4)
        flattened = sum(similar_items[1:], ())
        self._r.zadd(self.SIMKEY % row['id'], *flattened)
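# A hedged companion sketch (an assumption, not shown in the original source):
# reading recommendations back out of the Sorted Set written by _train, assuming
# the same redis-py client `self._r` and SIMKEY pattern. zrevrange returns
# (item_id, score) pairs ordered from most to least similar.
def _predict(self, item_id, num=10):
    return self._r.zrevrange(self.SIMKEY % item_id, 0, num - 1, withscores=True)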
def getRelevantPassages(query, k): queryVector = allTextVectorizer.transform([query]) queryIndices = numpy.array([allTextVectorizer.vocabulary_.get(word) for word in allTextAnalyzer(query)]) queryIndices = [i for i in queryIndices if i is not None] querySimilarityScores = linear_kernel(queryVector[:,queryIndices], allTextIndex[:,queryIndices]).flatten() relatedDocIndices = querySimilarityScores.argsort()[:-k:-1] return [allTextLines[i] for i in relatedDocIndices]
def get_results(query): test = query response = tfidf.transform([test]) print 'response: ', response RESULTS_ARRAY = [] cosine_similarities = linear_kernel(response, tfs).flatten() related_docs_indices = cosine_similarities.argsort()[:-10:-1] for i in related_docs_indices: if cosine_similarities[i] > 0: file_name = token_dict.keys()[i].split('.')[0] + '.pdf.html.json' data = {} data = summary_dict[file_name] data.update({"candidate": token_dict.keys()[i].split('.')[0], "cosine": cosine_similarities[i]}) # data = {"candidate": token_dict.keys()[i].split('.')[0], # "cosine": cosine_similarities[i]} RESULTS_ARRAY.append(data) # print "%-50s %.4f" % (token_dict.keys()[i].split('.')[0],cosine_similarities[i]) # print RESULTS_ARRAY return RESULTS_ARRAY
def get(self):
    query = self.get_argument('q', None)
    if query is None:
        return
    queryTerms = query.split()
    # let's say we have N documents and M terms in the query
    # Apparently we assume each term in the query is unique
    # queryVector is a 1 * M dimension array
    queryVector = np.array([self._logIDF[term] for term in queryTerms])
    # docVectorDict maps each docID to an M-dimensional vector, with default value np.array([0] * M)
    docVectorDict = defaultdict(lambda: np.array([0] * len(queryTerms)))
    for i in range(len(queryTerms)):
        term = queryTerms[i].lower()
        newList = self._postingsList[term]
        for item in newList:
            # newList is [(docID, tf)]
            docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
    docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
    docIx = 0
    docIxToDocID = {}
    for docID in docVectorDict.keys():
        docMatrix[docIx][:] = docVectorDict[docID][:]
        docIxToDocID[docIx] = docID
        docIx += 1
    # linear_kernel is used to compute the similarity
    sims = linear_kernel(queryVector, docMatrix).flatten()
    # argsort returns the indices that would sort the array
    bestDocIxes = sims.argsort()[::-1]
    bestDocSims = sims[bestDocIxes]
    bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
    postings = zip(bestDocIDs, bestDocSims)
    self.write(json.dumps({"postings": postings}))
def _apply_kernel(self, X, y=None):
    """Apply the selected kernel function to the data."""
    if self.kernel == 'linear':
        phi = linear_kernel(X, y)
    elif self.kernel == 'rbf':
        phi = rbf_kernel(X, y, gamma=self.gamma)
    elif self.kernel == 'poly':
        phi = polynomial_kernel(X, y, degree=self.degree)
    elif callable(self.kernel):
        phi = self.kernel(X, y)
        if len(phi.shape) != 2:
            raise ValueError(
                "Custom kernel function did not return 2D matrix"
            )
        if phi.shape[0] != X.shape[0]:
            raise ValueError(
                "Custom kernel function did not return matrix with rows"
                " equal to number of data points."
            )
    else:
        raise ValueError("Kernel selection is invalid.")

    phi = phi.T

    if self.bias_used:
        phi = np.hstack((np.ones((phi.shape[0], 1)), phi))

    return phi
def pairwise_similarity(): import pickle import numpy as np import math import heapq from sklearn.metrics.pairwise import linear_kernel singular = 311363 tfidf = pickle.load(open("D:\\Users\\yutao\\eclipse1\\two_tfidfs_dump_")) x = pickle.load(open("D:\\Users\\yutao\\eclipse1\\profiles")) ids = x['ids'] sim_l = {} sim_l_index = {} flag = 0 for i in range(1): print i v1 = tfidf[i] t_v1 = np.transpose(v1) sim_a = [] for j in range(singular,len(ids)): if j%1000==0: print j if flag == 0: sim_l[j] = [] sim_l_index[j] = [] v2 = tfidf[j] t_v2 = np.transpose(v2) sim = np.dot(v1, t_v2)[0,0] / (math.sqrt(np.dot(v1,t_v1)[0,0]) * math.sqrt(np.dot(v2,t_v2)[0,0])) if sim > 0: if len(sim_a)<=100: heapq.heappush(sim_a,sim) else: heapq.heappushpop(sim_a,sim) if len(sim_l[j])<=100: heapq.heappush(sim_l[j],sim) else: heapq.heappushpop(sim_l[j],sim) lil_tfidf = tfidf.tolil() flag = 0 import pymongo col = pymongo.Connection("10.1.1.110",12345)['scrapy']['similarity'] for i in range(singular+((len(ids)-singular)/6)*4,singular+((len(ids)-singular)/6)*5): try: print i print ids[i] except Exception, e: print e sim = {} sim['_id'] = ids[i] sim['sim'] = [] lk = linear_kernel(tfidf[i], lil_tfidf[:singular]).flatten() for j in range(len(lk)): if lk[j]>0.1: sim['sim'].append({'id':ids[j], 'sim':lk[j]}) print len(sim['sim']) col.save(sim)
def calc_cos_sims(self):
    normalized = np.empty_like(self.X)
    for i, vec in enumerate(self.X):
        norm = np.linalg.norm(vec)
        for j, val in enumerate(vec):
            normalized[i, j] = val / norm
    sims = linear_kernel(normalized, normalized)
    return sims
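# Equivalent vectorized sketch (an assumption about intent, not from the original
# class): L2-normalizing the rows and then applying linear_kernel is exactly
# cosine similarity, so the explicit loops above can be replaced with sklearn helpers.
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

X = np.random.rand(10, 5)
sims = linear_kernel(normalize(X), normalize(X))
assert np.allclose(sims, cosine_similarity(X))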
def solve(): query = request.json doc = doc_remove_punc(query) doc = [lem(doc, porter)] query_vect = vectorizer.transform(doc) cos_sim = linear_kernel(vect, query_vect) top_sims = np.argsort(cos_sim, axis = None)[-1:-4:-1] top_posts = [docs[sim] for sim in top_sims] return jsonify(top_posts)
def singletextsimilarity(tfidf, index, listofstrings, printdeets=False):
    similarities = linear_kernel(tfidf[index], tfidf).flatten()
    if printdeets:
        most_related_docs_indices = similarities.argsort()[:-5:-1]
        most_related_similarities = similarities[most_related_docs_indices]
        print "docs most related to doc #%s are %s." % (
            index, ', '.join([str(el) for el in most_related_docs_indices]))
        print "their similarities are %s." % (', '.join([str(el) for el in most_related_similarities]))
    return similarities
def tfidf_matrix(X, **kwargs):
    # get the tf-idf counts
    count_vect = CountVectorizer(**kwargs)
    counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer().fit(counts)
    counts_tfidf = tf_transformer.transform(counts)
    # compute cosine similarity
    matrix = linear_kernel(counts_tfidf, counts_tfidf)
    return matrix
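# Minimal usage sketch (assumed corpus): tfidf_matrix() above returns a square
# document-by-document similarity matrix; because TfidfTransformer L2-normalizes
# rows, the dot products from linear_kernel are cosine similarities.
docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs play"]
sims = tfidf_matrix(docs, stop_words='english')
print(sims.shape)   # (3, 3); diagonal entries are 1.0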
def top_n_posts(vect, ri, n, users, posts): cos_sim = linear_kernel(vect, vect) sim_sort = np.argsort(cos_sim, axis = 1) sim_sort = sim_sort[:, 0:-1] top_n = list(range(-1,-n-1,-1)) doc = sim_sort[ri, :] user = users[ri] sim_users = list(doc[top_n]) return (user, posts[ri]), [(users[sim], posts[sim]) for sim in sim_users]
data_i = np.asarray(data_i) for batch_j in batches: data_j = [] for j in batch_j: data_j.append(np.load(output_data + listdir[j]).ravel()) data_j = np.asarray(data_j) # Compute the kernels euclidean_norm[batch_i[0]:batch_i[-1] + 1, batch_j[0]:batch_j[-1] + 1] = (pairwise_distances(data_i, data_j, metric='euclidean')**2) lin_kernel[batch_i[0]:batch_i[-1] + 1, batch_j[0]:batch_j[-1] + 1] = (linear_kernel( data_i, data_j)) # Save the kernels in CSV files linear_kernel_df = pd.DataFrame(lin_kernel, index=subjects, columns=subjects) linear_kernel_df.to_csv(output_kernels + 'linear_kernel.csv') euclidean_norm_df = pd.DataFrame(euclidean_norm, index=subjects, columns=subjects) euclidean_norm_df.to_csv(output_kernels + 'euclidean_norm.csv') # Save the target variable in a CSV file # Change this path df_y = pd.read_csv("/Volumes/dtlake01.aramis/users/clinica/pac2019/dataset/" "PAC2019_BrainAge_Training.csv")
def cosine_similarity(documentA, documentB):
    docs = [documentA, documentB]
    tfidf = TfidfVectorizer().fit_transform(docs)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    return cosine_similarities
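# Usage sketch (assumed example strings): element 0 compares the first document
# with itself (1.0); element 1 is the similarity between the two documents.
scores = cosine_similarity("the sky is blue", "the sun in the sky is bright")
print(scores)   # e.g. [1.0, 0.2...]; the exact value depends on the fitted vocabulary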
df_review_one_sentence = pd.read_csv(
    './hotel/onesentence_hotel_review_final.csv', index_col=0)
#print(df_review_one_sentence.head())
print(df_review_one_sentence.info())

# In[104]:

hotel_idx = df_review_one_sentence[df_review_one_sentence['hotel_name'] ==
                                   '제주 아름다운 리조트'].index[0]  # how to look up a hotel's index

# In[105]:

Tfidf = TfidfVectorizer()
Tfidf_matrix = Tfidf.fit_transform(
    df_review_one_sentence['review_one_sentence'])
#print(Tfidf_matrix.shape)
#print(Tfidf_matrix)

# In[106]:

cosine_sim = linear_kernel(Tfidf_matrix[hotel_idx],
                           Tfidf_matrix)  # similarity of all 588 hotels against this one hotel
#print(cosine_sim.shape)

# In[109]:

print(getRecommendation(cosine_sim))

# In[ ]:
df.genre #Based on Publisher df["Publisher"].isnull().sum() # Tio find NaN values from sklearn.feature_extraction.text import TfidfVectorizer tf = TfidfVectorizer(stop_words='english') tfidf_matrix = tf.fit_transform(df.Publisher) tfidf_matrix.shape from sklearn.metrics.pairwise import linear_kernel cos_similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix) print(cos_similarity_matrix) df_index = pd.Series(df.index, index=df['title']).drop_duplicates() def get_title_recommendations(title, topN): #topN = 10 # Getting the movie index using its title df_id = df_index[title] # Getting the pair wise similarity score for all the df's with that # df cosine_scores = list(enumerate(cos_similarity_matrix[df_id]))
# In[188]: book_data['train'] = book_data.apply(combine, axis=1) # In[190]: word_stopped = TfidfVectorizer(stop_words='english') book_data['train'] = book_data['train'].fillna('') matrix = word_stopped.fit_transform(book_data['train']) # In[192]: co_sim = linear_kernel(matrix, matrix) # In[194]: book_data = book_data.reset_index() # In[195]: indexing = pd.Series(book_data.index, index=book_data['title']).drop_duplicates() # In[ ]: indexing = pd.Series(book_data.index, index=book_data['title', 'publisher']).drop_duplicates()
print("There are " + str(num_docs) + " documents in the test data."); documents = [open(path+f, 'r').read() for f in doc_ids]; print("Calculating tfidf scores"); tfidf = vectorizer.fit_transform(documents); print("Finished calculating tfidf scores"); query_file = open('Scharrhud_Data/query/query.txt', "r"); query = vectorizer.transform(query_file); feature_names = vectorizer.get_feature_names(); print("Calculating cosine similarity values"); cosine_sim = linear_kernel(query, tfidf).flatten(); print("Finished calculating cosine similarity scores"); doc_sim_values = dict(zip(doc_ids, cosine_sim)); ranked = sorted(doc_sim_values.items(), key=operator.itemgetter(1), reverse=True); # Function to write results to an output file specified in the commandline arguments # As default this is off and results are printed to console. def write_to_output_file(ranked): output_file = open(config.output_file, "w"); if config.bySimilarity: x = 0; for item in ranked: if item[1] >= float(config.simValue): output_file.write(str(x + 1) + ": ");
def get_recommendation(request): #method put if request.method == "PUT": data = json.loads(request.body) id_movie = data["id"] #id_movie = abs(id_movie) ##content based: model = Word2Vec.load(model_path) #load model #load csv ratings_df = pd.read_csv(csv_path3) print(ratings_df) movies_df = pd.read_csv(csv_path2) metadata_df = pd.read_csv(csv_path) #function that returns similar movies #function that returns similar movies def most_similar_movie(movieId): print("Similar of " + ratings_df[ratings_df['tmdbId'] == int( movieId)].iloc[0]['title']) #return [(int(x[0]), ratings_df[ratings_df['tmdbId'] == int(x[0])].iloc[0]['title']) for x in model.wv.most_similar(movieId)] return [( int(x[0]), ratings_df[ratings_df['tmdbId'] == int(x[0])].iloc[0]['title']) for x in model.wv.most_similar(movieId)] def most_similar_gener(genres): count = 0 for genre in genres: if count == 0: vector = model[genre] count = count + 1 else: vector = model[genre] + vector print("Similar of ", list(genres)) #print(model.wv.most_similar([vector])) resp = [] for x in model.wv.most_similar([vector]): try: int(x[0]) resp.append((int(x[0]), ratings_df[ ratings_df['tmdbId'] == int(x[0])].iloc[0]['title'])) except: print(x) return resp #cosine description_df = metadata_df[['tmdbId', 'overview', 'title']] tfidf = TfidfVectorizer(stop_words='english') #tfidf instance #Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature overview_matrix = tfidf.fit_transform(description_df['overview']) similarity_matrix = linear_kernel(overview_matrix, overview_matrix) #movies index mapping mapping = pd.Series(description_df.index, index=description_df['tmdbId']) def most_similar_description(movie_input): movie_input = int(movie_input) movie_index = mapping[movie_input] #get similarity values with other movies #similarity_score is the list of index and similarity matrix similarity_score = list(enumerate(similarity_matrix[movie_index])) #sort in descending order the similarity score of movie inputted with all the other movies similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True) # Get the scores of the 15 most similar movies. Ignore the first movie. 
similarity_score = similarity_score[1:15] #return movie names using the mapping series movie_indices = [i[0] for i in similarity_score] movie_ids = description_df['tmdbId'].iloc[movie_indices].tolist() #return ([(int(movie),description_df[description_df['tmdbId'] == movie]['title'].to_string(index=False).strip()) for movie in movie_ids]) return ([ (int(movie), description_df[description_df['tmdbId'] == movie] ['title'].to_string(index=False).strip()) for movie in movie_ids ]) def get_genres(movie_id): return metadata_df[metadata_df['tmdbId'] == movie_id].genres.to_string(index=False).strip() def content_based(movie_id): #search by simmilar description description_sim = most_similar_description(movie_id) #search by simmilar movie name try: movie_sim = most_similar_movie(movie_id) except: movie_sim = [] set_1 = set(description_sim) set_2 = set(movie_sim) moviesSet2_notin_Set1 = list(set_2 - set_1) combined_sim = description_sim + moviesSet2_notin_Set1 #search by simmilar genres genres_string = get_genres( int(movie_id)) #take the genres of the movie genres = genres_string.split() #print(genres) #use the function try: genres_sim = most_similar_gener(genres) except: genres_sim = [] set_1 = set(combined_sim) set_2 = set(genres_sim) moviesSet2_notin_Set1 = list(set_2 - set_1) combined_sim = combined_sim + moviesSet2_notin_Set1 #don't repeat movies combined_similar_movies = list(set(combined_sim)) return combined_similar_movies ### Colaborative Filer prediction = content_based(id_movie) print(prediction) return JsonResponse({"recommendation": prediction}, status=201) else: return JsonResponse({"error": "PUT request required."}, status=400)
matrix=vectorizer.fit_transform(news2_list) matrix for i, feature in enumerate(vectorizer.get_feature_names()): print(i,feature) tfidf=TfidfVectorizer(preprocessor= lambda x: x, tokenizer= lambda x:x) tfidf_matrix=tfidf.fit_transform(news2_list) #convert list to np array news2x=np.asarray(news2) news5x=np.asarray(news_5_list) news2x.shape,news5x.shape news5x1=news5x[0:3217] news5x1.shape #convert array to list news2x1=news2x.tolist() news5x11=news5x1.tolist() #tfidf-vectorizer tfidf1=TfidfVectorizer().fit_transform(news2x1) tfidf2=TfidfVectorizer().fit_transform(news5x11) #cosine similarites (after tfidf) from sklearn.metrics.pairwise import linear_kernel cos_sim1=linear_kernel(tfidf1,tfidf2).flatten() cos_sim1 #cosine similarity score between the new york times and the washington post
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf(data):
    tfidf = TfidfVectorizer(stop_words='english', use_idf=True)
    tfidf_matrix = tfidf.fit_transform(data)
    return tfidf_matrix


tfidf_matrix = tfidf(meta_data['abstract'])
dir(tfidf_matrix)

# in order to explore which documents have a more similar representation, cosine similarity can be used
from sklearn.metrics.pairwise import linear_kernel

cosine_similarities = linear_kernel(tfidf_matrix[0:1], tfidf_matrix).flatten()

# indices of the 10 most related documents
related_docs_indices = cosine_similarities.argsort()[:-11:-1]
print("Related documents:", related_docs_indices)

# cosine similarities of related documents
print("Cosine similarities of related documents", cosine_similarities[related_docs_indices])

meta_data.iloc[1]['abstract']

from wordcloud import WordCloud
import matplotlib.pyplot as plt

meta_data['index'] = meta_data.index
book_description = pd.read_csv('description.csv', encoding='latin-1')

# checking if we have the right data
book_description.head()

# removing the stop words
books_tfidf = TfidfVectorizer(stop_words='english')

# replace NaN with empty strings
book_description['description'] = book_description['description'].fillna('')

# computing the TF-IDF matrix required for calculating cosine similarity
book_description_matrix = books_tfidf.fit_transform(book_description['description'])

# Let's check the shape of the computed matrix
book_description_matrix.shape

# computing the cosine similarity matrix using linear_kernel of sklearn
cosine_similarity = linear_kernel(book_description_matrix, book_description_matrix)

# Get the pairwise similarity scores of all books compared to the book passed by index,
# sorting them and getting the top 5; here 2 is the index of the book in the dataset
similarity_scores = list(enumerate(cosine_similarity[2]))
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
similarity_scores = similarity_scores[1:6]

# Get the indices of the similar books
books_index = [i[0] for i in similarity_scores]

# Return the top 5 most similar books using integer-location based indexing (iloc)
print(book_description['name'].iloc[books_index])
def find_similar(tfidf_matrix, index, top_n=5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]
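# Usage sketch (assumed corpus): build a TF-IDF matrix and ask for the documents
# most similar to document 0. The returned pairs are (doc_index, score).
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "machine learning with python",
    "deep learning and neural networks",
    "python for data analysis",
    "gardening tips for spring",
]
tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(corpus)
print(find_similar(tfidf_matrix, 0, top_n=2))   # e.g. [(2, 0.2...), (1, 0.1...)]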
def find_similar(tfidf_matrix, document): top_n = len(document) #change if need top_n index = 0 cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten() related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index] return [(document[index-1], cosine_similarities[index]) for index in related_docs_indices][0:top_n]
df = pd.read_csv('C:/Users/PGDM//Desktop/movies_metadata.csv') #content based recommendation #we use the overview column to extract words so that movies can be recommended from sklearn.feature_extraction.text import TfidfVectorizer df = df[:10000] tfidf = TfidfVectorizer(stop_words='english') df['overview'] = df['overview'].fillna('') tfidf_mat = tfidf.fit_transform(df['overview']) #since tfidf is used, cosine can be used for similarity score from sklearn.metrics.pairwise import linear_kernel # Compute the cosine similarity matrix cosine_sim = linear_kernel(tfidf_mat, tfidf_mat) #to identify index based on title idx = pd.Series(df.index, index=df['title']).drop_duplicates() #function to return recommendations def recommendations(title, cosine_sim=cosine_sim): ind = idx[title] scores = list(enumerate(cosine_sim[ind])) scores = sorted(scores, key=lambda x: x[1], reverse=True) scores = scores[1:16] movieind = [i[0] for i in scores] return df['title'].iloc[movieind]
def _train(path_input, path_output, numrow, numtop): tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english', encoding='utf-8') x = path_input['title'] + ' ' + path_input['genres'].str.replace( '|', ' ') + ' ' + path_input['directors'].str.replace( '|', ' ') + ' ' + path_input['writers'].str.replace('|', ' ') tfidf_matrix = tf.fit_transform(x.values.astype('U')) index = 0 totalRow = len(path_input.index) print(totalRow) if int(totalRow) < int(numrow): cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) i = 0 for idx in range(0, int(totalRow)): similar_indices = cosine_similarities[i].argsort( )[:-int(totalRow):-1] similar_items = [str(path_input['movieId'][idx])] for j in similar_indices: if (idx != j): similar_items.append( str(path_input['movieId'][j]) + "|" + str(cosine_similarities[i][j])) To_CSV(similar_items, path_output) i = i + 1 pass else: count = int(int(totalRow) / int(numrow)) print('--count: %s' % count) remain = int(totalRow) - int(count) * int(numrow) while (index < count + 1): print('--index: %s' % index) begin = index * int(numrow) # print('---begin: %s' %(begin)) if (index == count): if int(remain) == 0: end = begin + int(numrow) # print('---end:%s' %(end)) else: print('--remain: %s' % (remain)) end = begin + int(remain) else: end = begin + int(numrow) # print('---end:%s' %(end)) # print(tfidf_matrix[begin:end]) print('----begin: %s, end: %s' % (begin, end)) cosine_similarities = linear_kernel(tfidf_matrix[begin:end], tfidf_matrix) i = 0 for idx in range(begin, end): # print('---idx: %s----' %idx) similar_indices = cosine_similarities[i].argsort( )[:-int(numtop):-1] similar_items = [str(path_input['movieId'][idx])] for j in similar_indices: if (idx != j): similar_items.append( str(path_input['movieId'][j]) + "|" + str(cosine_similarities[i][j])) To_CSV(similar_items, path_output) i = i + 1 index = index + 1
# Instantiating tfidf vectorizer vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=lematize) # Getting vectors from podcast descriptions vectors = vectorizer.fit_transform(df['description']) # Changing vectors to a pandas dataframe vectors = pd.DataFrame(vectors.todense()) # Setting the tokens as the column names words = vectorizer.get_feature_names() vectors.columns = words df = pd.concat([df, vectors], axis=1) # Compare the documents to themselves; higher numbers are more similar # The diagonal is comparing a document to itself, so those are 1's (100% similar) cos_sims = linear_kernel(vectors, vectors) # Removing 1's on the diagonals np.place(cos_sims, cos_sims >= 0.99, 0) # let string lengths be as long as they need to be pd.set_option('display.max_colwidth', -1) # Getting the podcast that is most similar for each podcast most_similar = cos_sims.argsort(axis=1)[::-1] max_pods_to_recommend = 10 most_similar_in_order = [] for rankArr in most_similar: most_similar_this_pod_in_order = []
X_count = vectorizer.fit_transform(df_tag_strings_new.loc[:, 'tags'].values) #print(X_count) X_dense = X_count.todense() # For euclidean distances # TF-IDF tf = TfidfVectorizer(stop_words='english') tf_idf = tf.fit_transform(df_tag_strings_new['tags']) #print (tf_idf) #print (tf_idf.shape) #Similarity/ Distance measures cos_sim_count = cosine_similarity(X_count) # For count vectorizer cos_sim_tfidf = linear_kernel( tf_idf, tf_idf) # Dot Product because of TFIDF vectors and faster processing man_dist_count = manhattan_distances(X_count) euc_dist_count = euclidean_distances(X_dense) euc_dist_tfidf = euclidean_distances(tf_idf) #reverse lookup of title and movie indices df_movie_indices = pd.Series(df_movies.index, index=df_movies['title']).drop_duplicates() def recommend_content( title, similarity,
def main(): client = MongoClient() wCollection = client.cs229.wArticlesCleaned nCollection = client.cs229.nytArticles # Get references. nArticles = list(nCollection.find().sort([ ("wikipediaId", pymongo.ASCENDING) ]).limit(1000)) # Fetch all the linked articles. wArticles = [] wIdSet = {} for nArticle in nArticles: # Delete the id. del nArticle["_id"] # Fetch the wikipedia article(s) if necessary. for wikipediaId in nArticle["wikipediaId"]: if wikipediaId in wIdSet: continue wIdSet[wikipediaId] = 1 wArticles.append(wCollection.find_one({"_id": wikipediaId})) # Fetch distance 1 wikipedia article(s) if necessary. for wikipediaId in nArticle["wikipediaId1"][:10]: if wikipediaId in wIdSet: continue wIdSet[wikipediaId] = 1 wArticles.append(wCollection.find_one({"_id": wikipediaId})) print "Finished fetching data, nArticles: {}, wArticles: {}".format( len(nArticles), len(wArticles)) sys.stdout.flush() # Split into train, dev, and test. shuffle(nArticles) nArticlesTrain = nArticles[:600] nArticlesDev = nArticles[600:800] nArticlesTest = nArticles[800:] # Set up tfidf matrix on training data only. corpus = [] for article in wArticles: corpus.append(article["title"]) article["titleCorpusIndex"] = len(corpus) - 1 corpus.append(article["text"]) article["textCorpusIndex"] = len(corpus) - 1 for article in nArticlesTrain: corpus.append(article["scrapedTitle"]) article["titleCorpusIndex"] = len(corpus) - 1 corpus.append(article["scrapedText"]) article["textCorpusIndex"] = len(corpus) - 1 tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english', decode_error='ignore') tfidfMatrix = tf.fit_transform(corpus) cosineSimMatrix = linear_kernel(tfidfMatrix, tfidfMatrix) print "Finished tfidf for train" sys.stdout.flush() # Calculate cosine similarities for dev. cosineSimMatrixDev = calculateCosineSimilarities(nArticlesDev, tf, tfidfMatrix) print "Finished tfidf for dev" sys.stdout.flush() # Calculate cosine similarities for test. cosineSimMatrixTest = calculateCosineSimilarities(nArticlesTest, tf, tfidfMatrix) print "Finished tfidf for test" sys.stdout.flush() # Create the (w, n) pairs. wnPairsTrain = createWNPairs(nArticlesTrain, wArticles) wnPairsDev = createWNPairs(nArticlesDev, wArticles) wnPairsTest = createWNPairs(nArticlesTest, wArticles) print "Finished creating pairs" sys.stdout.flush() # Extract features for training data. XTrain, YTrain = extractFeatures(wnPairsTrain, cosineSimMatrix) np.savetxt("data/docMatchIITrainX.txt", XTrain) np.savetxt("data/docMatchIITrainY.txt", YTrain) print "Outputted training data, {}".format(len(YTrain)) sys.stdout.flush() # Extract features for dev data. XDev, YDev = extractFeatures(wnPairsDev, cosineSimMatrixDev) np.savetxt("data/docMatchIIDevX.txt", XDev) np.savetxt("data/docMatchIIDevY.txt", YDev) print "Outputted dev data, {}".format(len(YDev)) sys.stdout.flush() # Extract features for test data. XTest, YTest = extractFeatures(wnPairsTest, cosineSimMatrixTest) np.savetxt("data/docMatchIITestX.txt", XTest) np.savetxt("data/docMatchIITestY.txt", YTest) print "Outputted test data, {}".format(len(YTest)) sys.stdout.flush()
import pandas as pd from sklearn.metrics.pairwise import linear_kernel from sklearn.feature_extraction.text import TfidfVectorizer users = pd.read_csv('lucid.csv/lucid_table_users.csv', encoding='latin-1') #users.head() lucid_tfidf = TfidfVectorizer(stop_words='english') # filling the missing values with empty string users['short_bio'] = users['short_bio'].fillna('') # computing TF-IDF matrix required for calculating cosine similarity users_matrix = lucid_tfidf.fit_transform(users['short_bio']) #users_matrix.shape cosine_similarity = linear_kernel(users_matrix, users_matrix) indices = pd.Series(users['name'].index) def recommend(index, cosine_sim=cosine_similarity): id = indices[index] # Get the pairwsie similarity scores of all names # sorting them and getting top 5 similarity_scores = list(enumerate(cosine_sim[id])) similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True) similarity_scores = similarity_scores[1:6] # Get the names index lucid_index = [i[0] for i in similarity_scores]
sim_mat_content[np.isnan(sim_mat_content)] = 0 ''' 4. TF-IDF: - article title ''' articles = df_articles.copy() articles['title'] = articles['title'].fillna("").astype('str') tf_idf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english') tfidf_matrix = tf_idf.fit_transform(articles['title']) sim_mat_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix) del articles, tfidf_matrix ''' 5. NMF ''' dat = df_clicks.copy() dat['click'] = 1 R = dat.pivot(index='userId', columns='articleId', values='click').fillna(0) n_users = len(dat['userId'].unique()) n_items = len(dat['articleId'].unique()) R_shape = (n_users, n_items) # print(R_shape)
def chapterize(self, tokens, boundaries=[], language='en', visual=False): """segment a document into coherent parts, using a TextTiling-inspired method Args: tokens (TranscriptToken): tokens to segment boundaries (list, optional): list of integers, additional boundaries to refine segmentation. Defaults to []. language (str, optional): language (ISO 639-1 language code). Defaults to 'en'. visual (bool, optional): show graph. Defaults to False. Returns: [type]: [description] """ from chapterize.preprocessor_helper import lemma from chapterize.document_vectorizer import DocumentVectorizer from write_chapters import Chapter import nltk from sklearn.metrics.pairwise import linear_kernel import numpy as np from scipy.signal import savgol_filter from scipy.signal import argrelextrema from math import floor import json # preprocess: # lowercase, lemmatize, remove stopwords # segment transcript into segments of window_width processed = [] # segments of width window_width end_times = [] # end times of every segment # batch preprocess tokens chunk_tokens_lemma = lemma([token.token for token in tokens], language) for i, token in enumerate(tokens): token.token = chunk_tokens_lemma[i] chunks = list(divide_chunks(tokens, self.window_width)) for chunk in chunks: processed_section = '' for token in chunk: processed_section += ' ' + token.token last_end_time = token.time processed.append(processed_section) end_times.append(last_end_time) end_times.pop() # vectorize #dv = DocumentVectorizer('tfidf', tfidf_min_df=default_params.tfidf_min_df, tfidf_max_df=default_params.tfidf_max_df) dv = DocumentVectorizer(self.tfidf_min_df, self.tfidf_max_df) document_vectors = dv.vectorize_docs('ft_average', processed, language=language) print(document_vectors.shape[0]) # calculate cosine similarity score for adjacent segments cosine_similarities = [] print('\ncosine similarity scores:') for i, doc_vec in enumerate(document_vectors[:-1]): cosine_similarity = linear_kernel(doc_vec, document_vectors[i + 1])[0][0] cosine_similarities.append(cosine_similarity) print(cosine_similarity) # smooth curve with Savitzky-Golay filter if self.savgol_window_length == 0: self.savgol_window_length = min(9, len(cosine_similarities)) if self.savgol_window_length % 2 == 0: self.savgol_window_length -= 1 cosine_similarities_smooth = savgol_filter(cosine_similarities, self.savgol_window_length, self.savgol_polyorder) # calculate local minima minima = argrelextrema(cosine_similarities_smooth, np.less)[0] print('\nlocal minima found at {}\n'.format(minima)) self.max_utterance_delta = floor(self.window_width * .4) # concatinate tokens concat_segments = [] for i, minimum in enumerate(minima): concat_segment = '' if i == 0: concat_segment += " ".join(processed[0:minimum + 1]) else: concat_segment += " ".join(processed[minima[i - 1] + 1:minimum + 1]) concat_segments.append(concat_segment) concat_segments.append(" ".join( processed[minima[-1] + 1:])) # append last section (from last boundary to end) #find closest utterance boundary for each local minima segment_boundary_tokens = [] segment_boundary_times = [] for minimum in minima: closest = min(boundaries, key=lambda x: abs(x - ( (minimum + 1) * self.window_width))) print( 'for minimum at token {}, closest utterance boundary is at token {}' .format((minimum + 1) * self.window_width, closest)) if abs((minimum + 1) * self.window_width - closest) <= self.max_utterance_delta: segment_boundary_tokens.append(tokens[closest].token) segment_boundary_times.append(tokens[closest].time) else: print( ' closest 
utterance boundary is too far from minimum boundary (max_utterance_delta exceeded), topic boundary set to {}' .format(tokens[minimum * self.window_width].token)) segment_boundary_tokens.append(tokens[(minimum + 1) * self.window_width].token) segment_boundary_times.append(tokens[(minimum + 1) * self.window_width].time) # print("Segment boundary tokens:\n", segment_boundary_tokens) if visual: visualize(cosine_similarities_smooth, cosine_similarities, minima, segment_boundary_times, end_times) boundary_indices = [0] + [ minimum * self.window_width for minimum in minima ] return concat_segments, boundary_indices
def main(): # Switch for algorithms # 1 - ucs with tfidf # 2 - ucs with lsa (needs more data, probably) # 3 - Structured Perceptron with tfidf features # 4 - ucs with tfidf over whole article algoNum = 4 # Solve utf errors. reload(sys) sys.setdefaultencoding('utf8') # Train if necessary. if algoNum == 3: weights = train() print weights # LSA over entire corpus. # lsa = None # if algoNum == 2: # corpus = getCorpus("data/wArticlesCleaned/wArticlesCleaned.0.json") # lsa = getLsa(corpus, 100) # corpus = None # Load data. filename = "data/sentInDev.json" dataList = [] with open(filename, 'rb') as inFile: dataList = json.load(inFile) # Count bullets. killBullets = True if killBullets: tempDataList = [] for data in dataList: hasBullet = False for sentence in data["prelimSection"]: if sentence.startswith("*"): hasBullet = True break if not hasBullet: tempDataList.append(data) dataList = tempDataList # Go through data. predictedList = [] sectionsToTest = len(dataList) * 1.0 sumSentencesPerSection = 0.0 numToInsert = 0.0 numInserted = 0.0 numInsertedCorrectly = 0.0 sumSentencesAway = 0.0 for dataNum, data in enumerate(dataList): # Call the algorithm. predictedSection = None if algoNum == 1: corpus = list(data["prelimSection"]) cosineSimMatrix = getTfIdfCosineSimMatrix(corpus) def cost1(s1, s2, sI): index1 = data["prelimSection"].index(s1) index2 = data["prelimSection"].index(s2) if sI is None: return 1.0 / (2 * (cosineSimMatrix[index1][index2] + 1)) else: indexI = data["prelimSection"].index(sI) return 1.0 / (cosineSimMatrix[index1][indexI] + 1 + cosineSimMatrix[indexI][index2] + 1) predictedSection = sentInUcs.InsertSentences(list(data["sentences"]), list(data["section"]), cost1) elif algoNum == 2: corpus = [] data["article"].pop(data["article"].index(data["sentences"][0])) for s1, s2 in zip(data["article"][:len(data["article"])-1], data["article"][1:]): sCat = s1 + s2 corpus.append(sCat) lsa = getLsa(corpus, 10) corpus = list(data["prelimSection"]) lsaMatrix = lsa.transform(corpus) cosineSimMatrix = linear_kernel(lsaMatrix, lsaMatrix) def cost2(s1, s2, sI): index1 = data["prelimSection"].index(s1) index2 = data["prelimSection"].index(s2) if sI is None: return 1.0 / (2 * (cosineSimMatrix[index1][index2] + 1)) else: indexI = data["prelimSection"].index(sI) return 1.0 / (cosineSimMatrix[index1][indexI] + 1 + cosineSimMatrix[indexI][index2] + 1) predictedSection = sentInUcs.InsertSentences(list(data["sentences"]), list(data["section"]), cost2) elif algoNum == 3: predictedSection = getPredicted(data, weights) elif algoNum == 4: corpus = list(data["article"]) cosineSimMatrix = getTfIdfCosineSimMatrix(corpus) def cost1(s1, s2, sI): index1 = data["article"].index(s1) index2 = data["article"].index(s2) if sI is None: return 1.0 / (2 * (cosineSimMatrix[index1][index2] + 1)) else: indexI = data["article"].index(sI) return 1.0 / (cosineSimMatrix[index1][indexI] + 1 + cosineSimMatrix[indexI][index2] + 1) predictedSection = sentInUcs.InsertSentences(list(data["sentences"]), list(data["section"]), cost1) else: print "Algo {} not implemented".format(algoNum) sys.exit(1) # Update metrics. 
sumSentencesPerSection += len(data["section"]) predictedIndexes = getInsertionIndexes(data["sentences"], predictedSection) sentencesAway = "N/A" for actualIndex, predictedIndex in zip(data["insertionIndexes"], predictedIndexes): numToInsert += 1 if predictedIndex is None: continue numInserted += 1 if predictedIndex == actualIndex: numInsertedCorrectly += 1 sentencesAway = abs(predictedIndex - actualIndex) sumSentencesAway += sentencesAway # Show progress. if dataNum % 100 == 0: print "Ran {} sections".format(dataNum) sys.stdout.flush() # Save some info. savePredicted = False if savePredicted: data["predictedSection"] = predictedSection data["sentencesAway"] = sentencesAway del data["article"] predictedList.append(data) if dataNum % 100 == 99: with open("data/sentInPredicted2.json", 'ab') as outFile: print "Dumping {} outputs".format(len(predictedList)) sys.stdout.flush() outFile.write(json.dumps(predictedList, indent=4)) predictedList = [] print ( "sectionsTested: {0:.0f}, avgInsertionPoints: {1:.4f}, sentencesToInsert: {2:.0f}, " + "numActuallyInserted: {3:.0f}, avgInsertedCorrectly: {4:.4f}, avgSentencesAway: {5:.4f}" ).format( sectionsToTest, sumSentencesPerSection / sectionsToTest + 1, numToInsert, numInserted, numInsertedCorrectly / numInserted, sumSentencesAway / numInserted )
import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import linear_kernel ds = pd.read_csv(io.BytesIO(upload_files['sample-data.csv'])) tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english') tfidf_matrix = tf.fit_transform(ds['description']) cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) results = {} for idx, row in ds.iterrows(): similar_indices = cosine_similarities[idx].argsort()[:-100:-1] similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices] results[row['id']] = similar_items[1:] print('done!') def item(id): return ds.loc[ds['id'] == id]['description'].tolist()[0].split(' - ')[1] # Just reads the results out of the dictionary.
links_small = pd.read_csv('./static/data/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

smd = pd.read_csv('smd.csv')
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')
tfidf_matrix = tf.fit_transform(smd['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


@app.route('/')
@app.route("/home")
def index():
    return render_template('index.html')


@app.route("/about")
def about():
    return render_template('about.html')


@app.route("/forme")
def test_linear_kernel():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    K = linear_kernel(X, X)
    # the diagonal elements of a linear kernel are their squared norm
    assert_array_almost_equal(K.flat[::6], [linalg.norm(x) ** 2 for x in X])
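# A small companion sketch (not part of the original test suite): linear_kernel(X, Y)
# is just the matrix of dot products X @ Y.T, so it can be checked directly.
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
Y = rng.random_sample((3, 4))
assert np.allclose(linear_kernel(X, Y), X.dot(Y.T))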
list(smd)

# keep only letters and spaces in each row's combined_features
for i in range(0, 6000):
    smd.loc[i, "combined_features"] = re.sub('[^a-zA-Z ]', ' ', smd["combined_features"][i])

#---------------To count each word---------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
count_matrix = tf.fit_transform(smd["combined_features"])

#---------------To find Cosine Similarity--------------------------------------
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(count_matrix, count_matrix)

#---Functions to get title and index of movies for recommendation
def get_title_from_index(index):
    return smd[smd.index == index]["title"].values[0]

def get_index_from_title(title):
    return smd[smd.title == title]["index"].values[0]

smd = smd.reset_index()
titles = smd['title']
# finding indices of every title
indices = pd.Series(smd.index, index=titles)

#-------Recommendation for the movie which the client has watched recently---------
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.datasets import fetch_20newsgroups from sklearn.metrics.pairwise import linear_kernel #twenty = fetch_20newsgroups() twenty = [ "hello there, I'm very happy", "I'm feeling really good", "everything is happy now", "whatever happened here", "go, go to the boom" ] tfidf = TfidfVectorizer().fit_transform(twenty) cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten() related_docs_indices = cosine_similarities.argsort()[:-3:-1]
if "answer" not in d: d["answer"] = "random" dev_question_answers.append(d["answer"]) len_dev = len(dev_questions) """ vectorizer = TfidfVectorizer(stop_words=stop_words) vectors = vectorizer.fit_transform(dev_questions + train_questions) dev_vectors = vectors[0:len_dev] train_vectors = vectors[len_dev:] dev_predict = [None] * len_dev for query_index in range(len_dev): query_vector = vectors[query_index,:] cosine_similarities = linear_kernel(query_vector, train_vectors).flatten() dev_predict[query_index] = train_question_answers[np.argmax(cosine_similarities)][0] predict_output = [None] * len_dev for i in range(len_dev): output_dict = {'question': dev_questions[i], 'prediction': dev_predict[i] } predict_output[i] = output_dict pred_file = os.path.join(filep, 'ef_dev_predict.json') with open(pred_file, 'w') as output: output.write(json.dumps(predict_output, indent=4) + '\n')
'Cs': [ 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 10., 100., 1000. ], 'lams': [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.], 'kernel_funcs': { 'linear': [lambda X, L=None: pairwise.linear_kernel(X, L)], 'rbf': [ lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.0000001), lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.000001), lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.00001), lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.0001), lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.001), lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.01), lambda X, L=None: pairwise.rbf_kernel(X, L, gamma=0.1) ], 'laplacian': [ lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.0000001), lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.000001), lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.00001), lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.0001), lambda X, L=None: pairwise.laplacian_kernel(X, L, gamma=0.001),
con_start_idx = len(pro_tweets) tweets = pro_tweets + con_tweets print 'Number of pro and con tweets: {0}, {1}'.format(len(pro_tweets), len(con_tweets)) vectorizer = TfidfVectorizer() tfidf_vecs = vectorizer.fit_transform(tweets) file_out = open('PrayAbortion_cosine_similarity_5.txt', 'w') file_out1 = open('Prochoice_cosine_similarity_5.txt', 'w') k = 5 for i in range(0, len(pro_tweets)): #for i in random.sample(range(con_start_idx), 5): cosine_similarities = linear_kernel(tfidf_vecs[i:i+1], tfidf_vecs) cosine_similarities = cosine_similarities.flatten() sorted_idx = np.argsort(cosine_similarities)[::-1] topk_pro = [] topk_con = [] for idx in sorted_idx: if idx < con_start_idx and idx != i: topk_pro.append(tweets[idx]) if len(topk_pro) == k: break for idx in sorted_idx: if idx >= con_start_idx and idx != i: topk_con.append(tweets[idx]) if len(topk_con) == k:
def predict(data, vect, sn): vector = vect.transform(data) # get list of the ids of the retweeted people most_retweet_ids = run_model(sn) client = MongoClient() twitter = client['twitter'] new = twitter['new'] handle_tweet_dict = defaultdict(list) id_handle_dict = defaultdict() for an_id in most_retweet_ids: docs = new.find({'user.id': an_id}) for doc in docs: tweet = doc.get('text').encode('utf8', 'ignore') user_id = doc.get('user').get('id') handle = doc.get('user').get('screen_name') handle_tweet_dict[handle].append(tweet) id_handle_dict[user_id] = handle tweet_list = [] handle_list = [] for k, v in handle_tweet_dict.iteritems(): tweet_list.extend(v) handle_list.extend([k] * len(v)) vector = vect.transform(data) new_word_counts = vect.transform(tweet_list) result_matrix = linear_kernel(vector, new_word_counts) indices_of_tweets = [] # For each tweet by the client, find the 30 most similar tweets # This list may include tweets by the client for row in result_matrix: indices = row.argsort()[:][::-1] indices_of_tweets.append(indices[:31]) # Return the ids of persons that tweeted each of the 30 most similar tweets handle_array = np.array(handle_list) persons_per_tweet = [] for row in indices_of_tweets: persons_per_tweet.append(handle_array[row]) # Count up how many times each person shows up. # Same weighting is given to people who have many tweets similar to one client tweet # and a tweet that matches a high number of client tweets. persons_counter = Counter() for row in persons_per_tweet: persons_counter.update(row) # return the top 25 people in this list top_people_and_count = persons_counter.most_common(10) top_people = [tup[0] for tup in top_people_and_count if tup[0] != sn] return top_people