def RecommendUsers(username):
    # build a dict mapping each user name to their concatenated messages
    messages = {}
    list_users = list(db.users.find({}, {'_id': 0, 'user_name': 1}))
    for user in list_users:
        user_message = list(
            db.messages.find({"user_name": user["user_name"]}, {
                "_id": 0,
                "message_text": 1
            }))
        messages[user["user_name"]] = " ".join(
            [text["message_text"] for text in user_message])
    # Create the Document Term Matrix
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(messages.values())
    matrix = sparse_matrix.todense()
    messages_df = pd.DataFrame(matrix,
                               columns=count_vectorizer.get_feature_names(),
                               index=messages.keys())
    # Identify the top 3 similar users
    similarity_matrix = distance(messages_df, messages_df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=messages.keys(),
                          index=messages.keys())
    recommend_user = sim_df[username].sort_values(ascending=False)[1:].head(3)
    return dumps(dict(recommend_user))
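Note: the snippets in this listing all rely on roughly the same preamble, which the source pages omit. A minimal sketch of the assumed imports follows (the commented-out import inside Example #19 confirms that distance is sklearn's cosine_similarity; dumps is assumed to be bson.json_util.dumps):

# Assumed shared preamble for these examples (not part of any single snippet).
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as distance
from bson.json_util import dumps  # assumption: used by the MongoDB-based examples

# Note: CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# on current versions use get_feature_names_out() instead.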
Example #2
def sentiment():
    user_id = str(request.forms.get("user_id"))
    unique_users = collection.distinct("idUser")
    dict_users = {}
    for user in unique_users:
        try:
            dict_users[str(user)] = " ".join([
                e["text"] for e in list(
                    collection.find({"idUser": user}, {
                        "text": 1,
                        "_id": 0
                    }))
            ])
        except Exception:
            # skip users whose messages cannot be joined
            pass
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(dict_users.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=dict_users.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=dict_users.keys(),
                          index=dict_users.keys())
    # sort by similarity to the requested user, descending, and skip the user themself
    recommendation = list(sim_df[user_id].sort_values(ascending=False).index[1:4])
    return {"recommendation": recommendation}
Example #3
def friend_recomm(all_data):
    '''
    Recommends a user as a friend of another, based on the words they mention in their comments.
    '''
    # all_data = all_data.json()
    users_chats = flatten_json(all_data)

    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(users_chats.values())

    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=users_chats.keys())

    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=users_chats.keys(),
                          index=users_chats.keys())
    np.fill_diagonal(sim_df.values, 0)

    final = sim_df.idxmax()
    friend = final.to_dict()
    print(friend)
    return friend
Example #4
def createMatrixSimilarity(dictionary_chat_mess, user_id):
    '''
    This function allows you to create a similarity matrix of the selected user.
    '''
    # create pandas df
    df_quote = pd.DataFrame(dictionary_chat_mess)
    # dataframe aggregated with all users with the phrases said in every chat
    df_quote = df_quote.T.groupby('username').agg({'message': 'sum'})
    # the same information but in a dictionary
    new_dict = df_quote['message'].to_dict()
    # create a sparse_matrix with the count of every word
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(new_dict.values())
    # Compute the cosine similarity matrix (or selected distance) and put it in a DataFrame
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=new_dict.keys())
    similarity_matrix = distance(df, df)
    # Similarity dataframe and Similarity heatmap
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=new_dict.keys(),
                          index=new_dict.keys())
    username = db.user.find_one({'_id': ObjectId(user_id)})
    similarity_column = sim_df[f'{username["username"]}'].sort_values(
        ascending=False)
    return similarity_column
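Every example here follows the same document-term-matrix pattern, so below is a self-contained toy run of that pattern (synthetic data, no database, modern scikit-learn API):

# Self-contained toy version of the shared pattern (illustrative data only).
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as distance

docs = {
    "alice": "coffee cats and more coffee",
    "bob": "dogs and football",
    "carol": "cats coffee and tea",
}
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(docs.values())
df = pd.DataFrame(sparse_matrix.toarray(),
                  columns=count_vectorizer.get_feature_names_out(),
                  index=docs.keys())
sim_df = pd.DataFrame(distance(df, df), columns=docs.keys(), index=docs.keys())
np.fill_diagonal(sim_df.values, 0)  # a user should not be their own best match
print(sim_df.idxmax())  # each user's closest match, e.g. alice -> carol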
Example #5
def recommendingUsers(user_id):
    '''Returns a list with the 3 users most similar to the given user.'''
    user_message = CollectionInfo()
    if int(user_id) not in user_message["userId's"]:
        return f"{user_id}'s sentiments aren't analyzed."
    else:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r"\w+")
        # strip stopwords from each user's messages before building the matrix
        for key, m in user_message["user_message"].items():
            tokens = tokenizer.tokenize(m)
            clean_text = [ct for ct in tokens if ct not in stop_words]
            user_message["user_message"][key] = " ".join(clean_text)
        count_vectorizer = CountVectorizer()
        sparse_matrix = count_vectorizer.fit_transform(
            user_message["user_message"].values())
        doc_term_matrix = sparse_matrix.todense()
        df = pd.DataFrame(doc_term_matrix,
                          columns=count_vectorizer.get_feature_names(),
                          index=user_message["user_message"].keys())
        similarity_matrix = distance(df, df)
        sim_df = pd.DataFrame(
            similarity_matrix, columns=user_message["user_message"].keys(), index=user_message["user_message"].keys())
        np.fill_diagonal(sim_df.values, 0)

        ranked = sim_df[user_id].sort_values(ascending=False)
        return dumps(list(zip(list(ranked.index), list(ranked))))
Example #6
    def recommender(self, docs):
        '''
        Recommends episodes based on the parameters described by the user.
        '''
        # list of all the episode ids in the episodes collection
        c_id = self.col_episodes.distinct('_id')
        for _id in c_id:
            all_text = ''
            for extract in self.alltext_episode(_id)['content']:
                all_text += extract['text']

            docs[_id] = all_text  # collect every episode's full text in docs

        count_vectorizer = CountVectorizer()
        sparse_matrix = count_vectorizer.fit_transform(docs.values())
        doc_term_matrix = sparse_matrix.todense()
        df = pd.DataFrame(doc_term_matrix,
                          columns=count_vectorizer.get_feature_names(),
                          index=docs.keys())
        similarity_matrix = distance(df, df)
        sim_df = pd.DataFrame(similarity_matrix,
                              columns=docs.keys(),
                              index=docs.keys())
        np.fill_diagonal(sim_df.values, 0)
        return sim_df.idxmax().head(3)
Example #7
 def recommendUser(self, user_id):
     x = list(
         self.collection.find({}, {
             "messages.text": 1,
             'messages.user': 1,
             '_id': 0
         }))
     info = {}
     for i in x:
         for m in i['messages']:
             if m['user'] in info:
                 info[m['user']] += " " + m['text']
             else:
                 info[m['user']] = m['text']
     count_vectorizer = CountVectorizer()
     sparse_matrix = count_vectorizer.fit_transform(info.values())
     info_matrix = sparse_matrix.todense()
     df = pd.DataFrame(info_matrix,
                       columns=count_vectorizer.get_feature_names(),
                       index=info.keys())
     similarity_matrix = distance(df, df)
     sim_df = pd.DataFrame(similarity_matrix,
                           columns=info.keys(),
                           index=info.keys())
     recom = sim_df[user_id].sort_values(ascending=False)[1:]
     users = recom.keys()
     return users[:3]
Example #8
def get_chats_for_user(user_name):
    chats = get_sentiment_analysis_of_chats()
    users = get_sentiment_analysis_of_users()
    user_names = [user[0] for user in users]
    user_values = [user[1] for user in users]
    df_users = pd.DataFrame(user_values,
                            index=user_names,
                            columns=["pos", "neu", "neg"])
    chat_names = [chat[0] for chat in chats]
    chat_values = [chat[1] for chat in chats]
    df_chats = pd.DataFrame(chat_values,
                            index=chat_names,
                            columns=["pos", "neu", "neg"])
    similarity_matrix = distance(df_users, df_chats)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=df_chats.index,
                          index=df_users.index)
    return sim_df.loc[user_name].sort_values(ascending=False).head()
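Unlike the square user-to-user matrices elsewhere in this listing, this example builds a rectangular users-by-chats similarity matrix from sentiment scores. A toy sketch of that shape (hypothetical data):

# Toy sketch of the rectangular users-x-chats similarity (hypothetical data).
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as distance

df_users = pd.DataFrame([[0.8, 0.1, 0.1]],
                        index=["ana"], columns=["pos", "neu", "neg"])
df_chats = pd.DataFrame([[0.7, 0.2, 0.1], [0.1, 0.2, 0.7]],
                        index=["chat1", "chat2"], columns=["pos", "neu", "neg"])
sim_df = pd.DataFrame(distance(df_users, df_chats),
                      index=df_users.index, columns=df_chats.index)
print(sim_df.loc["ana"].sort_values(ascending=False))  # chat1 ranks first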


Example #9
def userRecommend(user_id):
    # parameterized query; passing user_id separately avoids SQL injection
    # (the %s placeholder assumes a psycopg2/MySQLdb-style driver)
    query = """select username from users where iduser=%s"""
    cur.execute(query, (user_id,))
    name = cur.fetchone()[0]
    print(name, type(name))
    data = json.loads(selectTables("users"))
    docs = dict()
    for u in data:
        messages = userMessages(u[0])
        docs.update({u[1]: messages})
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    np.fill_diagonal(sim_df.values,
                     0)  # Remove diagonal max values and set those to 0
    res = {
        'recommended_users':
            list(sim_df[name].sort_values(ascending=False)[0:3].index)
    }
    return res
Example #10
def recommender(userName):
    # Build a DataFrame with all messages and users
    count_vectorizer = CountVectorizer()
    allMesgs = chatColl.find({}, {"_id": 0, "messages": 1})
    usr = []
    msg = []
    for e in allMesgs:
        for i in e['messages']:
            usr.append(i['user'])
            msg.append(i['message'])
    df = pd.DataFrame({"Users": usr, "Messages": msg})
    # one row per user, with all of that user's messages joined into one string
    df = df.groupby("Users")["Messages"].apply(" ".join).reset_index()
    data = {e: i for e, i in zip(list(df['Users']), list(df['Messages']))}
    # Build the document-term matrix
    sparse_matrix = count_vectorizer.fit_transform(data.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=data.keys())
    #Proximity matrix
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=data.keys(),
                          index=data.keys())
    # Zero the diagonal because each user trivially matches themselves
    np.fill_diagonal(sim_df.values, 0)
    # Get the most similar user
    similarities_to = {sim_df.idxmax().loc[userName]: sim_df[userName].max()}
    return similarities_to
Example #11
 def getUserRecommendation(self, user_id):
     info = {}
     x = list(
         self.collection.find({}, {
             'message': 1,
             'user_id': 1,
             '_id': 0
         }))
     for i in x:
         if i['user_id'] not in info.keys():
             info[i['user_id']] = i['message']
         else:
             info[i['user_id']] = info[i['user_id']] + " " + i['message']
     count_vectorizer = CountVectorizer(stop_words='english')
     sparse_matrix = count_vectorizer.fit_transform(info.values())
     info_matrix = sparse_matrix.todense()
     df = pd.DataFrame(info_matrix,
                       columns=count_vectorizer.get_feature_names(),
                       index=info.keys())
     similarity_matrix = distance(df, df)
     sim_df = pd.DataFrame(similarity_matrix,
                           columns=info.keys(),
                           index=info.keys())
     recom = sim_df[user_id].sort_values(ascending=False)[1:]
     users = recom.keys()
     recommendations = {}
     # assumed completion (the original snippet is truncated at this point):
     # return the three most similar users with their similarity scores
     for u in users[:3]:
         recommendations[u] = recom[u]
     return recommendations
Example #12
def recomendator(user_id):

    # Pick the databases
    dbC = pickDB(method="Chats")
    dbU = pickDB()

    # Fetch all the messages of a given user
    # mainUser = findMessages(user_id)

    # Get all the users who have taken part in chats, except the main one

    allUsers = dbU.distinct("Position")
    allUsers = sorted(allUsers)

    # Extract the messages of every user except the main one
    result = [findMessages(element) for element in allUsers]

    # keep only the users for whom messages were found
    result2 = [element for element in result if element is not None]

    # Merge the values: join each user's list of messages into one string
    newDict = {}

    for element in result2:
        for key, value in element.items():
            newDict[key] = " ".join(value)

    docs = newDict

    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())

    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())

    similarity_matrix = distance(df, df)

    sim_df = pd.DataFrame(similarity_matrix, columns=docs.keys(), index=docs.keys())

    np.fill_diagonal(sim_df.values, 0)
    try:
        best = sim_df.idxmax()
        firstUser = best.iloc[1]
        secondUser = best.iloc[2]
        thirdUser = best.iloc[3]
    except IndexError:
        return json.dumps("Not enough users to make a recommendation")

    total = {user_id: [firstUser, secondUser, thirdUser]}

    return json.dumps(total)
Example #13
def similarityDF(TokensDict):
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(TokensDict.values())
    Tokens_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(Tokens_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=TokensDict.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=TokensDict.keys(),
                          index=TokensDict.keys())
    # sns.heatmap(sim_df,annot=True)
    np.fill_diagonal(sim_df.values, 0)
    return sim_df
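A possible call site for similarityDF, querying the returned frame for one key's closest matches (the tokens dict here is invented for illustration; the diagonal is already zeroed inside the function):

# Hypothetical usage of similarityDF; the input dict is illustrative only.
tokens = {"ana": "coffee cats coffee", "ben": "cats tea", "cam": "dogs football"}
sim_df = similarityDF(tokens)
top_matches = sim_df["ana"].sort_values(ascending=False)
print(list(top_matches.index[:2]))  # the two other users, best match first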
Example #14
def character_friend_recommender(name):
    # tells us who the recommended friend for our character is

    characters = list(collection_con.find({}).distinct("c_name"))

    sentiment_text = {}  ### we start the sentiment analysis

    for character in characters:
        lines = []
        text = ""
        text_clean = ""

        match = list(collection_con.find(
            {"c_name": character}))  ### we get all the info of the character

        for dictionary in match:  ### then we collect the character's lines
            lines.append(dictionary["line"])

        text = " ".join(lines)  ### after that, we get a single string with all the text

        # removing stopwords from the text to improve the analysis.
        words = nltk.word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens_clean = [e for e in words if e not in stop_words]

        # join with spaces so the words stay separate for the vectorizer
        text_clean = " ".join(tokens_clean)

        sentiment_text[
            character] = text_clean  ### now we have a dictionary with character:text

    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(sentiment_text.values())

    doc_term_matrix = sparse_matrix.todense()
    df_sentiment = pd.DataFrame(
        doc_term_matrix,  ### we create our data frame
        columns=count_vectorizer.get_feature_names(),
        index=sentiment_text.keys())

    similarity_matrix = distance(df_sentiment, df_sentiment)

    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sentiment_text.keys(),
                          index=sentiment_text.keys())

    np.fill_diagonal(sim_df.values,
                     0)  # Remove diagonal max values and set those to 0

    sim_df_idmax = pd.DataFrame(
        sim_df.idxmax()
    )  ### now we have the similarity matrix and we can proceed to get the recommended friend
    return (f"The recommended friend for {name} is:" + " " +
            np.asarray(sim_df_idmax.loc[name])[0])
Example #15
def getSimilarUsers(user_id):
    # get all the chat ids
    #r = requests.get(f'http://localhost:3500//chat/ids')
    r = ast.literal_eval(getChatIds())
    chat_ids = list(r.keys())
    
    # get all the message texts for all the chats
    messages1 = {}
    for chat_id in chat_ids:
        #r = requests.get(f'http://localhost:3500//chat/{str(chat_id)}/list')
        r = ast.literal_eval(getMessages(chat_id))
        messages1.update(r)
        
    # concatenate all the messages of a user into a single string
    users_messages = {}
    for k, v in messages1.items():
        mes_user = db.messages.find_one({'_id': ObjectId(k)})  # fetch the message
        user = mes_user['user_id']  # fetch the user
        user_texts = list(db.messages.find({"user_id": user}, {'text': 1}))  # fetch all the user's messages
        user_text = ''
        for text in user_texts:
            try:
                # some of the texts are stored as list literals, some are plain strings
                text = ast.literal_eval(text['text'])
            except (ValueError, SyntaxError):
                text = [text['text']]
            if isinstance(text, str):
                text = [text]
            # put all the messages into a single string
            user_text = user_text + ' ' + ' '.join(text)
            users_messages[user] = user_text
        
        
    sent1 = {}  # key = user_id, value = text
    for k, v in users_messages.items():
        sent1[k] = str(v)

    #creation of the similarity matrix for the users
    count_vectorizer = CountVectorizer()

    sparse_matrix = count_vectorizer.fit_transform(sent1.values())
    text_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(text_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=sent1.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix, columns=sent1.keys(), index=sent1.keys())
    
    # recommend the 3 closest users content-wise
    def get3closest(sim_df, user_id):  # user_id is an ObjectId
        col = sim_df[user_id].sort_values(ascending=False)[1:]
        return list(col[0:3].index)
    output = get3closest(sim_df, ObjectId(user_id))
    output = [str(el) for el in output]
    return json.dumps({'recommended':output})
Example #16
def userRecom(user):
    """
    Recommends an user to another based on what is written by those users
    """
    idLista = list(chatCol.find({}, {"_id": 1}))
    userLista = list(userCol.find({}, {"_id": 0, "name": 1}))
    lista = {}
    for chat in idLista:
        exText = list(
            chatCol.find({
                "_id": ObjectId(chat["_id"])
            }).sort([("Texts", 1)]).limit(1))[0]["Texts"].keys()
        exText = list(exText)[-1]
        match = re.search(r"msg(\d+)", exText)  # number of the last message key
        lastText = int(match.group(1))
        for use in userLista:
            for e in range(1, lastText + 1):
                try:
                    if use["name"] not in lista.keys():
                        lista[use["name"]] = list(
                            chatCol.find({
                                "$and": [{
                                    "_id": ObjectId(chat["_id"])
                                }, {
                                    f"Texts.msg{e}.name": use["name"]
                                }]
                            }))[0]["Texts"][f"msg{e}"]["text"] + ". "
                    else:
                        lista[use["name"]] += list(
                            chatCol.find({
                                "$and": [{
                                    "_id": ObjectId(chat["_id"])
                                }, {
                                    f"Texts.msg{e}.name": use["name"]
                                }]
                            }))[0]["Texts"][f"msg{e}"]["text"] + ". "
                except (IndexError, KeyError):
                    # this user has no message with this index in this chat
                    pass
    print(lista)
    docs = lista
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    np.fill_diagonal(sim_df.values,
                     0)  # Remove diagonal max values and set those to 0
    return f"{user} is likely to be friends with {sim_df.idxmax()[user]}"
Example #17
def recommender(user):

    if not request.args:
        raise APIError(
            'This endpoint requires a parameter type="similar" or type="sentiment"'
        )

    type_recom = request.args['type']

    if type_recom not in ['similar', 'sentiment']:
        raise APIError(
            'The type parameter must be either "similar" or "sentiment"')

    #Create a dictionary with all the messages of each user
    messages = {}
    users = db.users.find({}, {'_id': 1, 'name': 1})
    for e in users:
        texts = db.messages.find({'user': e['_id']}, {'text': 1, '_id': 0})
        messages[e['name']] = ' '.join([t['text'] for t in texts])

    #Remove stopwords from the messages:
    trimmed = {}
    stpwrd = set(stopwords.words('english'))
    for k, v in messages.items():
        trimmed[k] = ' '.join([w for w in v.split(' ') if w not in stpwrd])

    if type_recom == 'similar':
        #Create a sparse_matrix with the counts of each word for each of the users
        count_vectorizer = CountVectorizer()
        sparse_matrix = count_vectorizer.fit_transform(trimmed.values())
        matrix = sparse_matrix.todense()

        #Calculate the cosine distances between users:
        similarity_matrix = distance(matrix, matrix)
        sim_df = pd.DataFrame(similarity_matrix,
                              columns=messages.keys(),
                              index=messages.keys())

        similars = sim_df[user].sort_values(ascending=False)[1:].head(3)
        return {'Similar users': list(similars.index)}

    elif type_recom == 'sentiment':
        sia = SentimentIntensityAnalyzer()
        sentim = {}
        for k, v in trimmed.items():
            sentim[k] = sia.polarity_scores(v)
        simi = pd.DataFrame(sentim).T
        distances = pd.DataFrame(1 /
                                 (1 + squareform(pdist(simi, 'euclidean'))),
                                 index=simi.index,
                                 columns=simi.index)
        similars = distances[user].sort_values(ascending=False)[1:].head(3)
        return {'Similar users': list(similars.index)}
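The sentiment branch above turns euclidean distances into similarities with 1 / (1 + d), which maps d = 0 to 1 and pushes large distances toward 0. A standalone check of that transform, with made-up scores:

# Standalone check of the 1 / (1 + d) similarity transform (made-up scores).
import pandas as pd
from scipy.spatial.distance import pdist, squareform

simi = pd.DataFrame({"pos": [0.9, 0.1], "neu": [0.1, 0.2], "neg": [0.0, 0.7]},
                    index=["happy", "grumpy"])
distances = pd.DataFrame(1 / (1 + squareform(pdist(simi, 'euclidean'))),
                         index=simi.index, columns=simi.index)
print(distances)  # diagonal is 1.0; dissimilar users fall below 1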
Example #18
def recomendations(dic):
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(dic.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=dic.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix, columns=dic.keys(), index=dic.keys())
    sns.heatmap(sim_df, annot=True)
    np.fill_diagonal(sim_df.values, 0)
    recomendations = sim_df.idxmax()
    return recomendations
Example #19
def similarityMatrix(users_messages):
    count_vectorizer = CountVectorizer(stop_words="english")
    sparse_matrix = count_vectorizer.fit_transform(users_messages.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=users_messages.keys())
    # from sklearn.metrics.pairwise import cosine_similarity as distance
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=users_messages.keys(),
                          index=users_messages.keys())
    return sim_df
Example #20
def recommendator(name):
    docs = recomDic()  # build the user->text dictionary once instead of three times
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    np.fill_diagonal(sim_df.values, 0)
    pepe = sim_df.idxmax()
    return pepe.loc[name]
Example #21
def recomendaciones(name):
    docs = diccionarioGrande()
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    m = sparse_matrix.todense()
    df = pd.DataFrame(m, columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df,df)
    sim_df = pd.DataFrame(similarity_matrix, columns=docs.keys(), index=docs.keys())
    np.fill_diagonal(sim_df.values, 0)
    nombre = sim_df.idxmax()
    respuesta = {}
    respuesta[name] = f'I think you connect quite well with {nombre.loc[name]}'
    return respuesta
Example #22
def recommending_user(userName):
    database, collection = connectCollection('chats', 'chateo')
    query = list(collection.find({}, {'userName': 1, "text": 1, '_id': 0}))
    diccionario = getting_every_sentence(query)
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(diccionario.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=diccionario.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix, columns=diccionario.keys(), index=diccionario.keys())
    np.fill_diagonal(sim_df.values, 0)
    recommended = list(sim_df.sort_values(by=userName, ascending=False).index[0:3])
    return json.dumps(recommended)
Example #23
def get_recommend_news_by_tfidf_sim():
    '''
        Using the tfidf-based user profile and each article's keywords (topN, set to 20),
        pick the topN most similar of the user's candidate articles and return them
    '''
    topN = 20
    uids = os.listdir(user_keywords_by_tfidf)
    uid2can_newsids = get_user_candidate_newsids(user_candidate_newsids_path)
    user_recommend_res = []
    #recommend_res_path = recommend_res_path.replace('.csv', '_by_tfidf.csv')
    cnt = 0
    for uid in uids:
        cnt += 1
        if cnt % 100 == 0:
            print('recommend %d user: %s' % (cnt, uid))
        user_terms = get_user_tfidf_terms(
            os.path.join(user_keywords_by_tfidf, uid), topN)
        candidate_newsids = uid2can_newsids.get(uid, [])
        if not candidate_newsids:
            continue
        candidate_news_top_terms = get_news_top_terms(candidate_newsids, topN)
        # the vectors in can_news_vectors correspond one-to-one with the nids in candidate_newsids
        user_vector, can_news_vectors = generate_feature_vectors(
            user_terms, candidate_news_top_terms, topN)
        # call the sklearn API to compute the cosine distance between the user and all news at once
        # note: this API returns 1 - product(v1, v2), so a smaller value means more similar (smaller distance)
        user_news_distances = distance(user_vector,
                                       Y=can_news_vectors,
                                       metric='cosine')
        user_news_distances = zip(candidate_newsids,
                                  user_news_distances.tolist()[0])
        user_news_distances = sorted(user_news_distances, key=lambda d: d[1])

        user_recommend_res.append(
            (uid, [nid for nid, d in user_news_distances][:REC_NUM]))

    fw = open(recommend_res_path, 'w+')
    fw.write('userid,newsid\n')
    cnt = 0
    for uid, rec_news in user_recommend_res:
        #import pdb;pdb.set_trace()
        cnt += 1
        if cnt % 100 == 0:
            print 'finish %d user: %s, %s' % (cnt, uid, ' '.join(rec_news))
        fw.write('\n'.join(
            [','.join((uid, unicode2str(nid))) for nid in rec_news]))
        fw.write('\n')
    fw.close()
    print('finish recommending, res saved in %s' % recommend_res_path)
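Note that this example, unlike the others, uses the pairwise-distances form of distance with metric='cosine', where a smaller value means more similar. For reference, sklearn defines cosine distance as 1 minus cosine similarity:

# Sanity check: sklearn's cosine distance equals 1 - cosine similarity.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

vectors = np.array([[1.0, 0.0], [1.0, 1.0]])
assert np.allclose(pairwise_distances(vectors, metric='cosine'),
                   1 - cosine_similarity(vectors))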
Example #24
def recommendator(name):
    docs = makeDict()  # build the dictionary once instead of re-querying it
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(docs.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=docs.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=docs.keys(),
                          index=docs.keys())
    np.fill_diagonal(sim_df.values, 0)
    pepe = sim_df.idxmax()
    result = {}
    result[name] = 'Your best friend should be {}'.format(pepe.loc[name])
    return result
Example #25
def recommendations(user_name, chat_id):
    lista = getList(int(chat_id))
    lista = json.loads(lista)[0]['mensajes']
    data = pd.DataFrame(lista)
    # join each author's messages with spaces so words stay separate
    df = data.groupby('autor').apply(lambda x: ' '.join(x.texto))
    df = pd.DataFrame(df).reset_index()
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(df[0])
    doc_term_matrix = sparse_matrix.todense()
    letters_users = pd.DataFrame(doc_term_matrix,
                                 columns=count_vectorizer.get_feature_names(),
                                 index=df['autor'])
    similarity_matrix = distance(letters_users, letters_users)
    sim_df = pd.DataFrame(similarity_matrix, columns=df['autor'], index=df['autor'])
    similarities = sim_df[user_name].sort_values(ascending=False)[1:]
    return dumps(similarities)
Example #26
def house_friend_recommender(conversation):

    houses = list(collection_con.find({}).distinct("house"))

    sentiment_text = {}

    for house in houses:
        lines = []
        text = ""
        text_clean = ""

        match = list(collection_con.find({"house": house}))

        for dictionary in match:
            lines.append(dictionary["line"])

        text = " ".join(lines)  # join with spaces so lines stay separate

        words = nltk.word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens_clean = [e for e in words if e not in stop_words]

        text_clean = " ".join(tokens_clean)  # join with spaces so words stay separate

        sentiment_text[house] = text_clean

    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(sentiment_text.values())

    doc_term_matrix = sparse_matrix.todense()
    df_sentiment = pd.DataFrame(doc_term_matrix,
                                columns=count_vectorizer.get_feature_names(),
                                index=sentiment_text.keys())

    similarity_matrix = distance(df_sentiment, df_sentiment)

    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sentiment_text.keys(),
                          index=sentiment_text.keys())

    np.fill_diagonal(sim_df.values, 0)

    sim_df_idmax = pd.DataFrame(sim_df.idxmax())
    return (f"The recommended house for {conversation} is:" + " " +
            np.asarray(sim_df_idmax.loc[conversation])[0])
Example #27
def recommendCharacter(username):
    #Dictionary with all conversations
    doc = {}
    characters = [
        character['username']
        for character in list(db['Conversations'].distinct('Characters'))
    ]
    for character in characters:
        i = list(
            list(db['Users'].find({'username': character}, {
                '_id': 1,
                'Group': 1
            }))[0].values())
        conversation = list(db['Conversations'].find({'Group':
                                                      i[1]}))[0]['Message']
        characConv = []
        for dic in conversation:
            if dic['username'] == character:
                characConv.append(dic['message'])
        phrases = ' '.join(str(word) for word in characConv)

        doc[character] = phrases

    #Vectorizer
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(doc.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=doc.keys())

    #Similarity
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=doc.keys(),
                          index=doc.keys())
    np.fill_diagonal(sim_df.values,
                     0)  # Remove diagonal max values and set those to 0

    #Recommendation
    rec = sim_df.loc[username].idxmax()

    return rec
Example #28
def recommending_user(user):
    # returns a recommendation for a user to talk to
    recommendation_dict = dict()
    sentences = getting_every_sentence()  # fetch the sentences once and reuse them
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(sentences.values())
    doc_term_matrix = sparse_matrix.todense()  # densify the matrix the vectorizer produced
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=sentences.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=sentences.keys(),
                          index=sentences.keys())
    np.fill_diagonal(sim_df.values, 0)
    final_matrix = sim_df.idxmax()
    recommendation_dict[user] = final_matrix.loc[user]
    return recommendation_dict
Example #29
def search_title(query, return_size=20):

    # we need to tap into an existing database that contains embeddings for all titles;
    # right now we can use ../title_vectors

    with open('title_vectors', 'rb') as filein:
        title_vectors = pickle.load(filein)

    title_distance = []
    vector = generate_vector(query)
    for vector_ins in title_vectors:
        # some titles do not have a valid embedding (being English, etc.)
        if vector_ins.vector:
            euclidean = distance([vector, vector_ins.vector])[0].sum()
            title_distance.append((vector_ins.title, euclidean))
    title_distance.sort(key=lambda item: item[-1])

    return title_distance[:return_size]
Example #30
def recommender(name, dicc):
    ''' Function to find your best friend '''

    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(dicc.values())
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix,
                      columns=count_vectorizer.get_feature_names(),
                      index=dicc.keys())
    similarity_matrix = distance(df, df)
    sim_df = pd.DataFrame(similarity_matrix,
                          columns=dicc.keys(),
                          index=dicc.keys())
    np.fill_diagonal(sim_df.values, 0)
    simil = sim_df.idxmax()
    dic = {}
    dic[name] = simil.loc[name]
    return dic