def RecommendUsers(username):
    """Return (as JSON) the 3 users whose aggregated messages are most
    similar to `username`'s messages."""
    # One document per user: all of that user's message texts joined together.
    messages = {}
    for user in db.users.find({}, {'_id': 0, 'user_name': 1}):
        texts = db.messages.find({"user_name": user["user_name"]},
                                 {"_id": 0, "message_text": 1})
        messages[user["user_name"]] = " ".join(t["message_text"] for t in texts)

    # Document-term matrix over every user's text.
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(messages.values()).todense()
    messages_df = pd.DataFrame(dtm,
                               columns=vectorizer.get_feature_names(),
                               index=messages.keys())

    # Pairwise similarity; position 0 is the user themself, so skip it.
    sim_df = pd.DataFrame(distance(messages_df, messages_df),
                          columns=messages.keys(),
                          index=messages.keys())
    top3 = sim_df[username].sort_values(ascending=False)[1:].head(3)
    return dumps(dict(top3))
def sentiment():
    """Recommend the 3 users closest to the posted `user_id`, ranked by the
    pairwise `distance` over each user's concatenated message texts.

    Reads `user_id` from the request form; returns {"recommendation": [...]}.
    """
    user_id = str(request.forms.get("user_id"))
    unique_users = collection.distinct("idUser")
    dict_users = {}
    for user in unique_users:
        try:
            dict_users[str(user)] = " ".join(
                e["text"]
                for e in collection.find({"idUser": user}, {"text": 1, "_id": 0}))
        except (KeyError, TypeError):
            # Skip users whose documents lack a usable "text" field.
            # (Narrowed from the original bare `except: pass`, which hid
            # every possible error.)
            continue
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(dict_users.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=dict_users.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=dict_users.keys(),
                          index=dict_users.keys())
    # NOTE(review): ascending sort keeps the LOWEST values first; confirm
    # `distance` really returns a distance (not a similarity) here.
    recommendation = list(sim_df.sort_values(by=[user_id]).index[0:3])
    return {"recommendation": recommendation}
def friend_recomm(all_data):
    '''
    Recommend a user as a friend of another, based on the words they
    mention in their comments. Returns {user: most_similar_other_user}.
    '''
    users_chats = flatten_json(all_data)
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(users_chats.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=users_chats.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=users_chats.keys(),
                          index=users_chats.keys())
    # Zero the diagonal so a user is never their own best match.
    np.fill_diagonal(sim_df.values, 0)
    # idxmax() per column = each user's most similar other user.
    # (dict() replaces the original element-by-element copy loop.)
    friend = dict(sim_df.idxmax())
    print(friend)
    return friend
def createMatrixSimilarity(dictionary_chat_mess, user_id):
    '''
    Create the similarity column of the selected user: every other user's
    similarity to `user_id`, sorted descending.
    '''
    # Aggregate, per username, all phrases said in every chat.
    df_quote = pd.DataFrame(dictionary_chat_mess)
    df_quote = df_quote.T.groupby('username').agg({'message': 'sum'})
    # username -> concatenated messages. The index already holds the
    # usernames, so zip it directly (the original rebuilt it element by
    # element via df_quote.T.columns[i], an O(n) transpose per item).
    new_dict = dict(zip(df_quote.index, df_quote['message']))
    # Word-count matrix over every user's text.
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(new_dict.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=new_dict.keys())
    # Pairwise similarity (cosine or whatever `distance` is bound to).
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=new_dict.keys(),
                          index=new_dict.keys())
    # Resolve the Mongo user id to a username and pull its column.
    username = db.user.find_one({'_id': ObjectId(user_id)})
    similarity_column = sim_df[username["username"]].sort_values(
        ascending=False)
    return similarity_column
def recommendingUsers(user_id):
    '''Return (as JSON) the users ranked by similarity to the given user,
    most similar first. (Original docstring, translated: "returns a list
    with the 3 users most similar to the inserted user".)'''
    user_message = CollectionInfo()
    if int(user_id) not in user_message["userId's"]:
        return f"{user_id}'s' sentiments aren't analyzed."
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r"\w+")
    # BUG FIX: the original stored the stop-word-cleaned text under
    # throwaway keys ("0", "1", ...) in the OUTER dict and then vectorized
    # the raw messages, discarding the cleaning entirely. Write the cleaned
    # text back onto each user's entry instead.
    msgs = user_message["user_message"]
    for key, m in msgs.items():
        tokens = tokenizer.tokenize(m)
        msgs[key] = " ".join(t for t in tokens if t not in stop_words)
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(msgs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=msgs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=msgs.keys(),
                          index=msgs.keys())
    # A user is never their own recommendation.
    np.fill_diagonal(sim_df.values, 0)
    ranked = sim_df[user_id].sort_values(ascending=False)
    return dumps(zip(list(ranked.index), list(ranked)))
def recommender(self, docs):
    '''
    Episode recommender: builds one text document per episode from the
    episodes collection and returns the 3 best column-wise matches.
    (Original docstring, translated: "Episode recommender based on the
    parameters described by the user.")
    '''
    # Every episode id in the episodes collection.
    c_id = self.col_episodes.distinct('_id')
    # BUG FIX: the original rebound `docs = {_id: all_text}` INSIDE the
    # loop, so only the last episode ever survived (and the `docs`
    # parameter was silently clobbered). Accumulate all episodes instead.
    docs = {}
    for _id in c_id:
        # Concatenate every extract's text (no separator, matching the
        # original `all_text += text` behaviour).
        docs[_id] = ''.join(
            extract['text'] for extract in self.alltext_episode(_id)['content'])
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    # An episode never matches itself.
    np.fill_diagonal(sim_df.values, 0)
    return sim_df.idxmax().head(3)
def recommendUser(self, user_id):
    """Return the 3 users whose (last stored) message is most similar to
    `user_id`'s, ranked by the pairwise `distance` matrix."""
    chats = list(
        self.collection.find({}, {
            "messages.text": 1,
            'messages.user': 1,
            '_id': 0
        }))
    info = {}
    for chat in chats:
        for m in chat['messages']:
            # NOTE(review): this keeps only each user's LAST message
            # (original behaviour, preserved); the original also had this
            # assignment duplicated on two consecutive lines — removed.
            info[m['user']] = m['text']
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(info.values())
    info_matrix = sparse_matrix.todense()
    df = pd.DataFrame(info_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=info.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=info.keys(),
                          index=info.keys())
    # Skip position 0 (the user themself) and return the next three.
    recom = sim_df[user_id].sort_values(ascending=False)[1:]
    users = recom.keys()
    return users[:3]
def get_chats_for_user(user_name):
    """Rank chats for `user_name` by how close each chat's sentiment
    profile (pos/neu/neg) is to the user's own, returning the top rows.

    A large commented-out earlier variant of this function (which
    concatenated users and chats into one frame) followed the original —
    removed as dead code.
    """
    chats = get_sentiment_analysis_of_chats()
    users = get_sentiment_analysis_of_users()
    # Each helper yields (name, [pos, neu, neg]) pairs.
    user_names = [user[0] for user in users]
    user_values = [user[1] for user in users]
    df_users = pd.DataFrame(user_values,
                            index=user_names,
                            columns=["pos", "neu", "neg"])
    chat_names = [chat[0] for chat in chats]
    chat_values = [chat[1] for chat in chats]
    df_chats = pd.DataFrame(chat_values,
                            index=chat_names,
                            columns=["pos", "neu", "neg"])
    # Rows = users, columns = chats.
    similarity_matrix = distance(df_users, df_chats)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=df_chats.index,
                          index=df_users.index)
    return sim_df.loc[user_name].head()
def userRecommend(user_id):
    """Return {'recommended_users': [...]} — the 3 users whose messages are
    most similar to those of the user with id `user_id`."""
    # SECURITY FIX: the original interpolated user_id straight into the SQL
    # string (injection risk); use a parameterized query instead.
    # NOTE(review): '%s' is the DB-API "format" paramstyle (psycopg2,
    # MySQLdb); switch to '?' if `cur` is sqlite3.
    cur.execute("select username from users where iduser=%s", (user_id,))
    name = cur.fetchone()[0]
    data = json.loads(selectTables("users"))
    # u[0] = user id, u[1] = username; one document of messages per user.
    docs = {u[1]: userMessages(u[0]) for u in data}
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    # Remove diagonal max values (self-similarity) by setting them to 0.
    np.fill_diagonal(sim_df.values, 0)
    res = {
        'recommended_users':
        list(sim_df[name].sort_values(ascending=False)[0:3].index)
    }
    return res
def recommender(userName):
    """Return {best_match: score}: the single user most similar to
    `userName`, with their similarity value."""
    # Collect (author, message) pairs from every chat document.
    authors, texts = [], []
    for chat in chatColl.find({}, {"_id": 0, "messages": 1}):
        for msg in chat['messages']:
            authors.append(msg['user'])
            texts.append(msg['message'])
    # Group all messages per user and join them into one document each.
    grouped = pd.DataFrame({"Users": authors, "Messages": texts})
    grouped = pd.DataFrame(grouped.groupby("Users")["Messages"].apply(list))
    grouped['Messages'] = grouped['Messages'].apply(" ".join)
    grouped = grouped.reset_index()
    data = dict(zip(list(grouped['Users']), list(grouped['Messages'])))
    # Word-count matrix (normalized corpus representation).
    count_vectorizer = CountVectorizer()
    dtm = count_vectorizer.fit_transform(data.values()).todense()
    counts = pd.DataFrame(dtm,
                          columns=count_vectorizer.get_feature_names(),
                          index=data.keys())
    # Pairwise proximity matrix.
    sim_df = pd.DataFrame(distance(counts, counts),
                          columns=data.keys(),
                          index=data.keys())
    # Drop the diagonal: comparing a user with themselves is meaningless.
    np.fill_diagonal(sim_df.values, 0)
    # Most similar user plus their score.
    return {sim_df.idxmax().loc[userName]: sim_df[userName].max()}
def getUserRecommendation(self, user_id):
    """Build a similarity ranking of users against `user_id` from their
    stored messages.

    NOTE(review): this function ends with `recommendations = {}` and has
    no return statement, so it always returns None and both `users` and
    `recommendations` are discarded — it looks unfinished; confirm the
    intended behaviour with the author before relying on it.
    """
    info = {}
    x = list(
        self.collection.find({}, {
            'message': 1,
            'user_id': 1,
            '_id': 0
        }))
    # Concatenate each user's messages into a single space-joined document.
    for i in x:
        if i['user_id'] not in info.keys():
            info[i['user_id']] = i['message']
        else:
            info[i['user_id']] = info[i['user_id']] + " " + i['message']
    # Word-count matrix over every user's document (English stop words
    # removed).
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(info.values())
    info_matrix = sparse_matrix.todense()
    df = pd.DataFrame(info_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=info.keys())
    # Pairwise similarity between users.
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=info.keys(),
                          index=info.keys())
    # Ranked others (position 0, the user themself, skipped).
    recom = sim_df[user_id].sort_values(ascending=False)[1:]
    users = recom.keys()
    recommendations = {}
def recomendator(user_id):
    """Return (JSON) a mapping {user_id: [u1, u2, u3]} with three
    recommended users, or an apology string when there are too few users.

    Cleanups vs. original: the unused `dbC = pickDB(method="Chats")` and
    the unused duplicate `m = sparse_matrix.todense()` were removed, and
    positional access uses `.iloc` instead of the deprecated integer-label
    fallback of `Series[int]`.
    """
    dbU = pickDB()
    # All chat participants, sorted for a stable ordering.
    allUsers = sorted(dbU.distinct("Position"))
    # Messages per user; findMessages returns None for users with none.
    fetched = [findMessages(element) for element in allUsers]
    docs = {}
    for entry in fetched:
        if entry is not None:
            for key, msgs in entry.items():
                docs[key] = " ".join(msgs)
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    # Self-similarity must never win.
    np.fill_diagonal(sim_df.values, 0)
    try:
        best = sim_df.idxmax()
        firstUser = best.iloc[1]
        secondUser = best.iloc[2]
        thirdUser = best.iloc[3]
    except IndexError:
        return json.dumps("No hay suficientes usuarios para recomendar")
    total = {user_id: [firstUser, secondUser, thirdUser]}
    return json.dumps(total)
def similarityDF(TokensDict):
    """Build the pairwise similarity DataFrame for `TokensDict`
    (key -> text document), with the diagonal zeroed so no entry is its
    own best match."""
    vectorizer = CountVectorizer()
    dense_counts = vectorizer.fit_transform(TokensDict.values()).todense()
    term_df = pd.DataFrame(dense_counts,
                           columns=vectorizer.get_feature_names(),
                           index=TokensDict.keys())
    sim = pd.DataFrame(distance(term_df, term_df),
                       columns=TokensDict.keys(),
                       index=TokensDict.keys())
    np.fill_diagonal(sim.values, 0)
    return sim
def character_friend_recommender(name):
    """Tell us who is the recommended friend for the character `name`,
    based on the similarity of what every character says."""
    characters = list(collection_con.find({}).distinct("c_name"))
    sentiment_text = {}
    # Hoisted out of the loop — the stop-word set never changes.
    stop_words = set(stopwords.words('english'))
    for character in characters:
        # All of the character's lines, concatenated into one text.
        match = list(collection_con.find({"c_name": character}))
        text = "".join(dictionary["line"] for dictionary in match)
        # Remove stop words to improve the analysis.
        words = nltk.word_tokenize(text)
        tokens_clean = [e for e in words if e not in stop_words]
        # BUG FIX: the original re-joined the tokens WITHOUT separators
        # (`text_clean += word`), collapsing the whole text into a single
        # giant token that CountVectorizer could never split into words.
        sentiment_text[character] = " ".join(tokens_clean)
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(sentiment_text.values())
    doc_term_matrix = sparse_matrix.todense()
    df_sentiment = pd.DataFrame(doc_term_matrix,
                                columns=count_vectorizer.get_feature_names(),
                                index=sentiment_text.keys())
    similarity_matrix = distance(df_sentiment, df_sentiment)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sentiment_text.keys(),
                          index=sentiment_text.keys())
    # Remove diagonal max values (self-similarity) by setting them to 0.
    np.fill_diagonal(sim_df.values, 0)
    sim_df_idmax = pd.DataFrame(sim_df.idxmax())
    return (f"The recommended friend for {name} is:" + " " +
            np.asarray(sim_df_idmax.loc[name])[0])
def getSimilarUsers(user_id):
    """Recommend (as JSON) the 3 users whose messages are content-wise
    closest to those of `user_id`."""
    # All chat ids.
    r = ast.literal_eval(getChatIds())
    chat_ids = list(r.keys())
    # All message documents across every chat.
    messages1 = {}
    for chat_id in chat_ids:
        messages1.update(ast.literal_eval(getMessages(chat_id)))
    # Concatenate all of each user's messages into a single string.
    users_messages = {}
    for k, v in messages1.items():
        mes_user = db.messages.find_one({'_id': ObjectId(k)})  # the message
        user = mes_user['user_id']  # its author
        # Every message by that author.
        user_texts = list(db.messages.find({"user_id": user}, {'text': 1}))
        parts = []  # joined once at the end — avoids quadratic `+=`
        for text in user_texts:
            try:
                # Some stored texts are string-encoded lists, some are not.
                text = ast.literal_eval(text['text'])
            except (ValueError, SyntaxError, TypeError):
                # Narrowed from the original bare `except:`; these are the
                # errors literal_eval raises on non-literal input.
                text = list(text)
            parts.append(' ' + ''.join(text))
        users_messages[user] = ''.join(parts)
    # k = user_id, v = text.
    sent1 = {k: str(v) for k, v in users_messages.items()}
    # Similarity matrix over the users' documents.
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(sent1.values())
    text_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(text_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=sent1.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sent1.keys(),
                          index=sent1.keys())

    def get3closest(sdf, target):
        # BUG FIX: the original body read the leaked loop variable `user`
        # (the LAST user processed above) instead of its parameter, so the
        # requested user_id was silently ignored.
        col = sdf[target].sort_values(ascending=False)[1:]
        return list(col[0:3].index)

    output = get3closest(sim_df, ObjectId(user_id))
    output = [str(el) for el in output]
    return json.dumps({'recommended': output})
def userRecom(user):
    """
    Recommends an user to another based on what is written by those users.

    Cleanups vs. original: the identical Mongo query was issued twice (once
    per if/else branch) — now fetched once; the unused duplicate
    `m = sparse_matrix.todense()` and the debug `print(lista)` were
    removed; the bare `except: pass` was narrowed.
    """
    idLista = list(chatCol.find({}, {"_id": 1}))
    userLista = list(userCol.find({}, {"_id": 0, "name": 1}))
    lista = {}
    for chat in idLista:
        # Highest "msgN" key present in this chat's Texts map.
        exText = list(
            chatCol.find({
                "_id": ObjectId(chat["_id"])
            }).sort([("Texts", 1)]).limit(1))[0]["Texts"].keys()
        exText = list(exText)[-1]
        match = re.findall(r"[^msg][0-9]*", exText)
        lastText = int(match[0])
        for use in userLista:
            for e in range(1, lastText + 1):
                try:
                    doc = list(
                        chatCol.find({
                            "$and": [{
                                "_id": ObjectId(chat["_id"])
                            }, {
                                f"Texts.msg{e}.name": use["name"]
                            }]
                        }))[0]
                    fragment = doc["Texts"][f"msg{e}"]["text"] + ". "
                    if use["name"] not in lista.keys():
                        lista[use["name"]] = fragment
                    else:
                        lista[use["name"]] += fragment
                except (IndexError, KeyError):
                    # Message `e` was not written by this user in this chat.
                    pass
    docs = lista
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    # Remove diagonal max values (self-similarity) by setting them to 0.
    np.fill_diagonal(sim_df.values, 0)
    return f"{user} is likely to be friends with {sim_df.idxmax()[user]}"
def recommender(user):
    """Recommend the 3 users most similar to `user`.

    Requires a `type` query parameter:
      - "similar":   word-count similarity over each user's messages.
      - "sentiment": closeness of VADER polarity-score profiles.

    Raises APIError when the parameter is missing or invalid.
    (Unused local `string = ''` from the original was removed.)
    """
    if not request.args:
        raise APIError(
            'This endpoint requires a paramether type= "similar" or type="sentiment"'
        )
    type_recom = request.args['type']
    if type_recom not in ['similar', 'sentiment']:
        raise APIError(
            'The type parameter must be either "similar" or "sentiment"')
    # Build a dictionary with all the messages of each user.
    messages = {}
    users = db.users.find({}, {'_id': 1, 'name': 1})
    for e in users:
        texts = db.messages.find({'user': e['_id']}, {'text': 1, '_id': 0})
        messages[e['name']] = ' '.join([t['text'] for t in texts])
    # Remove stopwords from the messages.
    trimmed = {}
    stpwrd = set(stopwords.words('english'))
    for k, v in messages.items():
        trimmed[k] = ' '.join([w for w in v.split(' ') if w not in stpwrd])
    if type_recom == 'similar':
        # Word-count matrix over each user's trimmed text.
        count_vectorizer = CountVectorizer()
        sparse_matrix = count_vectorizer.fit_transform(trimmed.values())
        matrix = sparse_matrix.todense()
        # Cosine distances between users (trimmed and messages share keys).
        similarity_matrix = distance(matrix, matrix)
        sim_df = pd.DataFrame(similarity_matrix,
                              columns=messages.keys(),
                              index=messages.keys())
        similars = sim_df[user].sort_values(ascending=False)[1:].head(3)
        return {'Similar users': list(similars.index)}
    elif type_recom == 'sentiment':
        # Inverse-euclidean closeness of polarity-score vectors.
        sia = SentimentIntensityAnalyzer()
        sentim = {}
        for k, v in trimmed.items():
            sentim[k] = sia.polarity_scores(v)
        simi = pd.DataFrame(sentim).T
        distances = pd.DataFrame(1 / (1 + squareform(pdist(simi, 'euclidean'))),
                                 index=simi.index,
                                 columns=simi.index)
        similars = distances[user].sort_values(ascending=False)[1:].head(3)
        return {'Similar users': list(similars.index)}
def recomendations(dic):
    """For each key of `dic` (key -> text), return its most similar other
    key as a Series; also draws an annotated similarity heatmap as a side
    effect."""
    vec = CountVectorizer()
    term_matrix = vec.fit_transform(dic.values()).todense()
    term_df = pd.DataFrame(term_matrix,
                           columns=vec.get_feature_names(),
                           index=dic.keys())
    sim_df = pd.DataFrame(distance(term_df, term_df),
                          columns=dic.keys(),
                          index=dic.keys())
    # Visual inspection aid (kept from the original).
    sns.heatmap(sim_df, annot=True)
    # An entry must never be its own best match.
    np.fill_diagonal(sim_df.values, 0)
    return sim_df.idxmax()
def similarityMatrix(users_messages):
    """Return the pairwise similarity DataFrame for `users_messages`
    (user -> text), English stop words excluded. Diagonal is left intact
    (self-similarity included) — callers zero it if needed."""
    vectorizer = CountVectorizer(stop_words="english")
    counts = vectorizer.fit_transform(users_messages.values()).todense()
    term_df = pd.DataFrame(counts,
                           columns=vectorizer.get_feature_names(),
                           index=users_messages.keys())
    # `distance` is expected to be sklearn's cosine_similarity (see the
    # original's commented import hint).
    return pd.DataFrame(distance(term_df, term_df),
                        columns=users_messages.keys(),
                        index=users_messages.keys())
def recommendator(name):
    """Return the user most similar to `name` based on their texts.

    PERF FIX: the original called recomDic() FOUR times (for values, index,
    columns, index again) — each call repeats whatever lookup it performs
    and risks inconsistent snapshots; fetch it once and reuse.
    """
    docs = recomDic()
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    # A user never matches themself.
    np.fill_diagonal(sim_df.values, 0)
    return sim_df.idxmax().loc[name]
def recomendaciones(name):
    """Return {name: message} where the message names the user `name`
    connects best with, according to text similarity."""
    docs = diccionarioGrande()
    vec = CountVectorizer()
    dense_counts = vec.fit_transform(docs.values()).todense()
    counts = pd.DataFrame(dense_counts,
                          columns=vec.get_feature_names(),
                          index=docs.keys())
    sim_df = pd.DataFrame(distance(counts, counts),
                          columns=docs.keys(),
                          index=docs.keys())
    # Never recommend someone to themself.
    np.fill_diagonal(sim_df.values, 0)
    best = sim_df.idxmax()
    return {name: f'Creo que conectas bastante bien con {best.loc[name]}'}
def recommending_user(userName):
    """Return (JSON) the 3 users whose sentences are most similar to
    `userName`'s, most similar first.

    Unused locals from the original (`recommendation_dict`, `final_matrix`)
    were removed.
    """
    database, collection = connectCollection('chats', 'chateo')
    query = list(collection.find({}, {'userName': 1, "text": 1, '_id': 0}))
    # user -> all of their sentences as one document.
    diccionario = getting_every_sentence(query)
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(diccionario.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=diccionario.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=diccionario.keys(),
                          index=diccionario.keys())
    # Zero self-similarity so a user never tops their own ranking.
    np.fill_diagonal(sim_df.values, 0)
    recommended = list(
        sim_df.sort_values(by=userName, ascending=False).index[0:3])
    return json.dumps(recommended)
def get_recommend_news_by_tfidf_sim():
    '''
    Based on the tfidf-generated user profile and each article's keywords
    (topN, set to 20), pick the TopN most similar items from the user's
    candidate articles and return them.
    (Translated from the original Chinese docstring.)

    NOTE(review): this is Python 2 code (print statements, `unicode2str`);
    results are written as CSV rows "userid,newsid" to recommend_res_path.
    '''
    topN = 20
    uids = os.listdir(user_keywords_by_tfidf)
    uid2can_newsids = get_user_candidate_newsids(user_candidate_newsids_path)
    user_recommend_res = []
    #recommend_res_path = recommend_res_path.replace('.csv', '_by_tfidf.csv')
    cnt = 0
    for uid in uids:
        cnt += 1
        if cnt % 100 == 0:
            print 'recommend %d user: %s' % (cnt, uid)
        user_terms = get_user_tfidf_terms(
            os.path.join(user_keywords_by_tfidf, uid), topN)
        # Users with no candidate articles are skipped entirely.
        candidate_newsids = uid2can_newsids.get(uid, [])
        if not candidate_newsids:
            continue
        candidate_news_top_terms = get_news_top_terms(candidate_newsids, topN)
        # can_news_vectors aligns one-to-one with the nids in
        # candidate_newsids.
        user_vector, can_news_vectors = generate_feature_vectors(
            user_terms, candidate_news_top_terms, topN)
        # One sklearn call computes the user's cosine distance to ALL news.
        # Note: the value is 1 - product(v1, v2), so a SMALLER value means
        # more similar (smaller distance).
        user_news_distances = distance(user_vector,
                                       Y=can_news_vectors,
                                       metric='cosine')
        user_news_distances = zip(candidate_newsids,
                                  user_news_distances.tolist()[0])
        # Ascending by distance: most similar first; keep REC_NUM ids.
        user_news_distances = sorted(user_news_distances, key=lambda d: d[1])
        user_recommend_res.append(
            (uid, [nid for nid, d in user_news_distances][:REC_NUM]))
    fw = open(recommend_res_path, 'w+')
    fw.write('userid,newsid\n')
    cnt = 0
    for uid, rec_news in user_recommend_res:
        #import pdb;pdb.set_trace()
        cnt += 1
        if cnt % 100 == 0:
            print 'finish %d user: %s, %s' % (cnt, uid, ' '.join(rec_news))
        # One "uid,nid" line per recommended news id.
        fw.write('\n'.join(
            [','.join((uid, unicode2str(nid))) for nid in rec_news]))
        fw.write('\n')
    fw.close()
    print 'finish recommending, res saved in %s' % recommend_res_path
def recommendator(name):
    """Return {name: message} naming the user most similar to `name`.

    PERF FIX: the original called makeDict() FOUR times — fetch the corpus
    once and reuse it. The meaningless local name `dict45825121` was also
    replaced.
    """
    docs = makeDict()
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    # A user never matches themself.
    np.fill_diagonal(sim_df.values, 0)
    best = sim_df.idxmax()
    return {name: 'Your best friend should be {}'.format(best.loc[name])}
def recommendations(user_name, chat_id):
    """Rank the other participants of chat `chat_id` by textual similarity
    to `user_name`, most similar first (the user themself, position 0, is
    dropped), and return the serialized ranking."""
    lista = getList(int(chat_id))
    lista = json.loads(lista)[0]['mensajes']
    data = pd.DataFrame(lista)
    # One document per author.
    # NOTE(review): ''.join fuses messages with NO separator, so the last
    # word of one message merges with the first word of the next —
    # ' '.join was probably intended; confirm before changing.
    df = data.groupby('autor').apply(lambda x: ''.join(x.texto))
    df = pd.DataFrame(df).reset_index()
    # Word-count matrix; the groupby result column is named 0.
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(df[0])
    doc_term_matrix = sparse_matrix.todense()
    letters_users = pd.DataFrame(doc_term_matrix,
                                 columns=count_vectorizer.get_feature_names(),
                                 index=df['autor'])
    # Pairwise similarity between the chat's authors.
    similarity_matrix = distance(letters_users, letters_users)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=df['autor'],
                          index=df['autor'])
    similarities = sim_df[user_name].sort_values(ascending=False)[1:]
    return dumps(similarities)
def house_friend_recommender(conversation):
    """Recommend the house most similar to `conversation`'s house, based on
    the similarity of all lines spoken per house."""
    houses = list(collection_con.find({}).distinct("house"))
    sentiment_text = {}
    # Hoisted out of the loop — the stop-word set never changes.
    stop_words = set(stopwords.words('english'))
    for house in houses:
        # All of the house's lines, concatenated into one text.
        match = list(collection_con.find({"house": house}))
        text = "".join(dictionary["line"] for dictionary in match)
        words = nltk.word_tokenize(text)
        tokens_clean = [e for e in words if e not in stop_words]
        # BUG FIX: the original re-joined tokens WITHOUT separators
        # (`text_clean += word`), collapsing the whole text into one giant
        # token that CountVectorizer could never split into words.
        sentiment_text[house] = " ".join(tokens_clean)
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(sentiment_text.values())
    doc_term_matrix = sparse_matrix.todense()
    df_sentiment = pd.DataFrame(doc_term_matrix,
                                columns=count_vectorizer.get_feature_names(),
                                index=sentiment_text.keys())
    similarity_matrix = distance(df_sentiment, df_sentiment)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sentiment_text.keys(),
                          index=sentiment_text.keys())
    # A house never matches itself.
    np.fill_diagonal(sim_df.values, 0)
    sim_df_idmax = pd.DataFrame(sim_df.idxmax())
    return (f"The recommended house for {conversation} is:" + " " +
            np.asarray(sim_df_idmax.loc[conversation])[0])
def recommendCharacter(username):
    """Return the character most similar to `username`, based on everything
    each character said in their group's conversation."""
    # One document per character: every message they wrote, concatenated.
    corpus = {}
    characters = [
        character['username']
        for character in list(db['Conversations'].distinct('Characters'))
    ]
    for character in characters:
        record = list(
            list(db['Users'].find({'username': character}, {
                '_id': 1,
                'Group': 1
            }))[0].values())
        conversation = list(db['Conversations'].find({'Group':
                                                      record[1]}))[0]['Message']
        spoken = [
            entry['message'] for entry in conversation
            if entry['username'] == character
        ]
        corpus[character] = ''.join(str(word) for word in spoken)
    # Word-count matrix over every character's document.
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(corpus.values()).todense()
    counts = pd.DataFrame(dtm,
                          columns=vectorizer.get_feature_names(),
                          index=corpus.keys())
    # Pairwise similarity; zero the diagonal so nobody matches themself.
    sim_df = pd.DataFrame(distance(counts, counts),
                          columns=corpus.keys(),
                          index=corpus.keys())
    np.fill_diagonal(sim_df.values, 0)
    return sim_df.loc[username].idxmax()
def recommending_user(user):
    """Return {user: best_match}: a recommendation of whom `user` should
    talk to, based on sentence similarity."""
    # Fetch the corpus ONCE (the original called getting_every_sentence()
    # three separate times).
    sentences = getting_every_sentence()
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(sentences.values())
    # BUG FIX: the original discarded the fitted `sparse_matrix` and used
    # `getting_sparse_matrix()` instead, pairing a possibly unrelated
    # matrix with THIS vectorizer's feature names and row order. Use the
    # matrix that was actually fitted.
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=sentences.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sentences.keys(),
                          index=sentences.keys())
    # A user never matches themself.
    np.fill_diagonal(sim_df.values, 0)
    final_matrix = sim_df.idxmax()
    return {user: final_matrix.loc[user]}
def search_title(query, return_size=20):
    """Return the `return_size` stored titles closest to `query`, as
    (title, distance) pairs sorted by ascending distance.

    Embeddings for all known titles were precomputed into the local
    'title_vectors' pickle.
    """
    with open('title_vectors', 'rb') as filein:
        title_vectors = pickle.load(filein)
    query_vec = generate_vector(query)
    scored = []
    for entry in title_vectors:
        # Some titles lack a valid embedding (non-English text etc.) —
        # skip those.
        if entry.vector:
            dist = distance([query_vec, entry.vector])[0].sum()
            scored.append((entry.title, dist))
    scored.sort(key=lambda item: item[-1])
    return scored[:return_size]
def recommender(name, dicc):
    '''
    Function to find your best friend: returns {name: best_match}, the
    entry of `dicc` (key -> text) whose text is most similar to `name`'s.
    '''
    vec = CountVectorizer(stop_words='english')
    dense_counts = vec.fit_transform(dicc.values()).todense()
    counts = pd.DataFrame(dense_counts,
                          columns=vec.get_feature_names(),
                          index=dicc.keys())
    sim_df = pd.DataFrame(distance(counts, counts),
                          columns=dicc.keys(),
                          index=dicc.keys())
    # Nobody is their own best friend.
    np.fill_diagonal(sim_df.values, 0)
    return {name: sim_df.idxmax().loc[name]}