def calculate_left_right_similarityForpairing(left_vect,left_tfidf_part,left_part,left_extract,right_vect,right_tfidf_part,right_part,right_extract,titles):
   
    #print left_tfidf_part.shape
    #print right_tfidf_part.shape
    #transform the left side of extract into tfidf vector
    left_tfidf_extract = left_vect.transform([left_extract])
    #calculate the similarity scores for left side with all the left sides in database
    left_similarity_scores = cosine_similarity(left_tfidf_extract,left_tfidf_part)
    #transform the right side of extract into tfidf vector
    right_tfidf_extract = right_vect.transform([right_extract])
    #calculate the similarity scores for right side with all the right sides in database
    right_similarity_scores = cosine_similarity(right_tfidf_extract,right_tfidf_part)
   
    #calculate total similarity score corresponding to each entry in the database
    similarity_scores = left_similarity_scores + right_similarity_scores
    

    #get the indexes of top n scores. but not in sorted order
    ind = np.argpartition(similarity_scores, -5)[0][-5:]
    #get the top n scores. but not in sorted order
    selected_scores = similarity_scores[0][ind]
    #get the indexes in sorted order
    sorted_ind=ind[np.argsort(selected_scores)][::-1]
          
    titleAndScores = ([titles[sorted_ind[0]]] + [sorted_ind[0]]
                      + [int(left_similarity_scores[0][i]*100) for i in sorted_ind]
                      + [int(right_similarity_scores[0][i]*100) for i in sorted_ind])
    #print titleAndScores
    return titleAndScores
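A minimal setup sketch for the function above, assuming left_vect/right_vect are TfidfVectorizer objects fitted on the left and right text columns and left_tfidf_part/right_tfidf_part are the matching document-term matrices; the data and names below are illustrative only, not from the original project.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# hypothetical database of 5 templates split into left/right halves
left_part  = ["error opening file", "error reading file", "warning low memory",
              "error writing file", "info job started"]
right_part = ["on host alpha", "on host beta", "on host gamma",
              "on host delta", "on host epsilon"]
titles = ["open-error", "read-error", "memory-warning", "write-error", "job-info"]

left_vect  = TfidfVectorizer().fit(left_part)
right_vect = TfidfVectorizer().fit(right_part)
left_tfidf_part  = left_vect.transform(left_part)    # one row per database entry
right_tfidf_part = right_vect.transform(right_part)

result = calculate_left_right_similarityForpairing(
    left_vect, left_tfidf_part, left_part, "error opening file",
    right_vect, right_tfidf_part, right_part, "on host alpha", titles)
# result: [best title, its index, 5 left scores (0-100), 5 right scores (0-100)]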
Example #2
def print_recommendations_kmeans(df, km, svd_trans, album_idx, n=25, min_n=2000):
    '''
    Prints list of recommended albums with kmeans preselect

    Args:
        df: dataframe with Pitchfork reviews
        km: fitted sklearn KMeans object
        svd_trans: the low dimensional representation of each review
        album_idx: the iloc value of album for which to generate reccs
        n: number of albums to recommend
        min_n: min number of samples to preselect with kmeans
    Returns:
        None

    '''
    sims_clusters = cosine_similarity(svd_trans[album_idx, :].reshape(1, -1),
                                      km.cluster_centers_).flatten()
    cluster_assgns = km.predict(svd_trans)
    idx = []
    for cluster in np.argsort(sims_clusters)[::-1]:
        idx.extend(np.where(cluster_assgns == cluster)[0])
        if len(idx) > min_n:
            break
    sel = np.ones(len(svd_trans), dtype=bool)
    sel[idx] = 0

    sims = cosine_similarity(svd_trans[album_idx, :].reshape(1, -1), svd_trans)
    sims[:, sel] = -1
    df_temp = df.iloc[np.argsort(sims).flatten()[-n:]]
    df_temp['sim_scores'] = np.sort(sims.flatten())[-n:]
    print df_temp[['url', 'genres', 'sim_scores']][::-1]
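For context, a hedged sketch of how the inputs to print_recommendations_kmeans might be built: svd_trans as a TruncatedSVD projection of TF-IDF review vectors and km as a KMeans model fitted on that projection. The tiny dataframe and the 'review_text' column name are illustrative stand-ins, not the real Pitchfork data.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# toy stand-in for the Pitchfork reviews dataframe
df = pd.DataFrame({
    'url':    ['u0', 'u1', 'u2', 'u3'],
    'genres': ['rock', 'rock', 'jazz', 'jazz'],
    'review_text': ['loud guitars and drums', 'guitars drums and vocals',
                    'smooth saxophone solo', 'saxophone and piano trio'],
})

X = TfidfVectorizer().fit_transform(df['review_text'])
svd_trans = TruncatedSVD(n_components=2, random_state=0).fit_transform(X)  # low-dim review vectors
km = KMeans(n_clusters=2, random_state=0).fit(svd_trans)

print_recommendations_kmeans(df, km, svd_trans, album_idx=0, n=2, min_n=1)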
 def train(self, images, texts, K,lr_img,lr_txt, batch_size=16, verbose=False):
     assert images.shape[0] == texts.shape[0], "Must have same number of images and texts"
     n_train = images.shape[0]        
     n_batches = n_train/batch_size
     for batch_id in range(n_batches):
         if verbose:
             fwrite('Mini-batch : %2d/%2d\r' % (batch_id, n_batches))
             sys.stdout.flush()
         begin = batch_id*batch_size
         end = (batch_id + 1)*batch_size
         for idx_p in range(begin, end):
             self.forward_count = 0
             im = images[idx_p]
             txt = texts[idx_p]
             x = np.dot(im, self.W_img)
             y = np.dot(txt, self.W_txt)
             t_txt = np.tile(txt.reshape(-1,1), self.n_hid).T
             t_im = np.tile(im.reshape(-1,1), self.n_hid).T
             s = cosine_similarity(x.reshape(1,-1),y.reshape(1,-1))[0][0]
             n = 0
             while self.forward_count<K:
                 #security                    
                 n += 1
                 if n >= 100:
                     break
                 idx_n = np.random.randint(n_train)
                 txt_n = texts[idx_n]
                 y_n = np.dot(txt_n, self.W_txt)
                 s_n = cosine_similarity(x.reshape(1,-1), y_n.reshape(1,-1))[0][0]
                 J = 0.5 + s_n - s
                 if J > 0.:
                     t_txt_n = np.tile(txt_n.reshape(-1,1), self.n_hid).T
                     self.update_grads(x, y, s, y_n, s_n, t_im, t_txt, t_txt_n)
         self.backward(lr_img,lr_txt, batch_size)
     if verbose: fwrite('\n')
def create_tf_idf_sim_matrix( title_rev_log, desc_rev_log, cr_area_top_level, title_file_name):
    #print "Title- rev", title_rev_log
    #print "Desc-rev", desc_rev_log
    #print "cr_area_top_level", cr_area_top_level
    #print "title_file_name", title_file_name
    
   # tfidf_vectorizer = TfidfVectorizer(stop_words='english',decode_error='ignore')
    tfidf_vectorizer = TfidfVectorizer(decode_error='ignore')
    title_rev_log_tfidf_matrix     = tfidf_vectorizer.fit_transform(title_rev_log)
    desc_rev_log_tfidf_matrix      = tfidf_vectorizer.fit_transform(desc_rev_log)
    cr_area_top_level_tfidf_matrix = tfidf_vectorizer.fit_transform(cr_area_top_level)
    title_file_name_tfidf_matrix   = tfidf_vectorizer.fit_transform(title_file_name)
    
    #print  "size=", title_rev_log_tfidf_matrix.shape,  desc_rev_log_tfidf_matrix.shape,  cr_area_top_level_tfidf_matrix.shape, title_file_name_tfidf_matrix.shape         
    #print  "Title Rev Log=",  title_rev_log_tfidf_matrix
    #print "Desc rev log = ",  desc_rev_log_tfidf_matrix
    #print "cr area top level=", cr_area_top_level_tfidf_matrix
    #print  "title file name=", title_file_name_tfidf_matrix
                        
    title_rev_log_sim_matrix      = cosine_similarity(title_rev_log_tfidf_matrix[0:1], title_rev_log_tfidf_matrix)
    desc_rev_log_sim_matrix       = cosine_similarity(desc_rev_log_tfidf_matrix[0:1], desc_rev_log_tfidf_matrix)
    cr_area_top_level_sim_matrix  = cosine_similarity(cr_area_top_level_tfidf_matrix[0:1], cr_area_top_level_tfidf_matrix)
    title_file_name_sim_matrix    = cosine_similarity( title_file_name_tfidf_matrix[0:1],  title_file_name_tfidf_matrix)
    
    #print "sim title-rev log", title_rev_log_sim_matrix    
    #print "desc rev log", desc_rev_log_sim_matrix      
    #print "cr area top", cr_area_top_level_sim_matrix 
    #print "title file name", title_file_name_sim_matrix
    
    return   title_rev_log_sim_matrix, desc_rev_log_sim_matrix, cr_area_top_level_sim_matrix, title_file_name_sim_matrix
Example #5
def rank_tweets(tweets):
#    print tweets
    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(tweets)
    vectors = X.toarray()
    sumvectors = [0] * len(vectors[0])
    for v in vectors:
        for i,val in enumerate(v):
            sumvectors[i] += val
    centroid = [(x/len(vectors)) for x in sumvectors]
    #calculate cosines
    dists = []
    for i,vector in enumerate(vectors):
        dists.append([i,cosine_similarity(vector,centroid)])
    ranked_tweets = []
    ranked_vectors = []
    dists.sort(key = lambda x : x[1],reverse=True)
    for v in dists:
        vector = vectors[v[0]]
        sim = False
        for v2 in ranked_vectors:
            if cosine_similarity(vector,v2) == 1.0:
                sim = True
        if not sim:
            ranked_tweets.append(tweets[v[0]])
            ranked_vectors.append(vector)
        if len(ranked_tweets) == 10:
            break
    return ranked_tweets
Example #6
def find(request):
	if request.method == 'GET':
		query = request.GET.get('query')
		s = SessionStore()
		db.sessionHistory.update({'session_key': s.session_key},{'$push': {"query": [query]}}, upsert=True)
		from sklearn.metrics.pairwise import cosine_similarity
		from sklearn.feature_extraction.text import TfidfVectorizer

		dic = passQuery(query)
		docs = dic['docs']

		ids = dic['ids']
		print len(ids)
		print len(ids)
		all_url = []
		urls = db.crawledScienceCollection.find()
		for url in urls:
			all_url.append(url)
		selected_url = []
		for i in range(0, len(ids)):
			selected_url.append(all_url[ids[i]]['url'])
		print selected_url
		tfidf_vectorizer = TfidfVectorizer()
		tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
		print tfidf_matrix.shape
		cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
		template = loader.get_template('results.html')
		context = {'docs': docs, 'url': selected_url, 'zip': zip(docs, selected_url)}
		return HttpResponse(template.render(context, request))
Example #7
def gloveSolver_multiplication( ):
    count = 0
    global deviation
    deviation = 0.001
    for i, linet in enumerate(lineText):
        # linet = str(linet).lower()
        word = str(linet[2]).lower()
        if vectorDictionary.has_key(str(linet[0]).lower()):
            a = np.array(vectorDictionary[str(linet[0]).lower()])
        if vectorDictionary.has_key(str(linet[1]).lower()):
            b = np.array(vectorDictionary[str(linet[1]).lower()])
        if vectorDictionary.has_key(str(linet[2]).lower()):
            c = np.array(vectorDictionary[str(linet[2]).lower()])
            aresult=((cosine_similarity(a,vecMatrix[0:30000]))+1)/2
            bresult=((cosine_similarity(b,vecMatrix[0:30000]))+1)/2
            cresult=((cosine_similarity(c, vecMatrix[0:30000]))+1) / 2
            if not aresult.all():  # nudge zero entries to avoid division by zero
                aresult = aresult + deviation
            similarity = cresult * bresult / aresult
            indexOfSimilarity = np.argmax(similarity)
            if wordDictionary[indexOfSimilarity] ==str(linet[3]).lower():
                count = count + 1

    multiplicationModal = float(count)/float(len(lineText))
    print f,  multiplicationModal
Example #8
def sim_score(path_problem, lexicon_8gram, lexicon_3gram, lexicon_bigram, lexicon_unigram):
    sim_score = {}
    for path, subdirs, files in os.walk(path_problem):
        for name_dir in subdirs:
            print(name_dir)
            vec_feature = []
            sim_score_fw = []
            sim_score_stylo = []
            sim_score_8gram = []
            sim_score_3gram = []
            sim_score_bigram = []
            sim_score_unigram = []
            lang = name_dir[:2]
            if lang == 'EN':
                fw_file = './stopwords/english.txt'
            elif lang == 'DU':
                fw_file = './stopwords/dutch.txt'
            elif lang == 'GR':
                fw_file = './stopwords/greek.txt'
            elif lang == 'SP':
                fw_file = './stopwords/spanish.txt'

            dir_path = os.path.join(path_problem, name_dir)
            for name1 in glob.glob(dir_path + "/unknown.txt"):
                file_path_unknown = os.path.join(dir_path, name1)
                fw_unknown = feature_extractor.freq_function_word(file_path_unknown, fw_file)
                stylo_unknown = feature_extractor.stylometric_features(file_path_unknown, lang)
                eight_gr_unknown = feature_extractor.tfidf(file_path_unknown, 8, 'char', lexicon_8gram)
                three_gr_unknown = feature_extractor.tfidf(file_path_unknown, 3, 'char', lexicon_3gram)
                bigram_unknown = feature_extractor.tfidf(file_path_unknown, 2, 'word', lexicon_bigram)
                unigram_unknown = feature_extractor.tfidf(file_path_unknown, 1, 'word', lexicon_unigram)

                for name2 in glob.glob(dir_path + "/known??.txt"):
                    file_path_known = os.path.join(dir_path, name2)

                    fw_known = feature_extractor.freq_function_word(file_path_known, fw_file)
                    stylo_known = feature_extractor.stylometric_features(file_path_known, lang)
                    eight_gr_known = feature_extractor.tfidf(file_path_known, 8, 'char', lexicon_8gram)
                    three_gr_known = feature_extractor.tfidf(file_path_known, 3, 'char', lexicon_3gram)
                    bigram_known = feature_extractor.tfidf(file_path_known, 2, 'word', lexicon_bigram)
                    unigram_known = feature_extractor.tfidf(file_path_known, 1, 'word', lexicon_unigram)

                    sim_score_fw.append(minmax_sim(fw_unknown, fw_known))
                    sim_score_stylo.append(vec_diff(stylo_unknown, stylo_known))
                    sim_score_8gram.append(cosine_similarity(eight_gr_unknown, eight_gr_known))
                    sim_score_3gram.append(cosine_similarity(three_gr_unknown, three_gr_known))
                    sim_score_bigram.append(cosine_similarity(bigram_unknown, bigram_known))
                    sim_score_unigram.append(cosine_similarity(unigram_unknown, unigram_known))

                vec_feature.append(np.mean(sim_score_stylo))
                vec_feature.append(np.mean(sim_score_fw))
                vec_feature.append(np.mean(sim_score_8gram))
                vec_feature.append(np.mean(sim_score_3gram))
                vec_feature.append(np.mean(sim_score_bigram))
                vec_feature.append(np.mean(sim_score_unigram))

            sim_score[name_dir] = vec_feature
    sort = OrderedDict(sorted(sim_score.items(), key=lambda s: s[0]))
    return sort
Example #9
def main(output_dir, use_2015F, query_nums, sim_cutoff=.5, use_semsim=False):

    results = []
    for query_num in query_nums:
        event = [e for e in cuttsum.events.get_events() 
                 if e.query_num == query_num][0]
        print event

        gold_probs = False
        df = get_input_stream(
            event, gold_probs, use_2015F=use_2015F, truncate=1)
        df = df.loc[df["stems"].apply(len) >= 10]
        df = df.reset_index(drop=True)

        print df[["update id", "sent text"]]

        if use_semsim:
            semsims = get_all_semsim()
            X_l = semsims[event.type].transform(
                    df["stems"].apply(lambda x: ' '.join(x)).tolist())  
        
            K = cosine_similarity(X_l)
        else:
            Xtf = []
            for stems in df["stems"].tolist():
                sc = {}
                for stem in stems:
                    sc[stem] = sc.get(stem, 0) + 1
                Xtf.append(sc)
            dv = DictVectorizer()
            Xtf = dv.fit_transform(Xtf)
            print Xtf
            K = cosine_similarity(Xtf)

        print K
        S = [0]
        for s in range(1, len(df)):
            max_prev_sim = K[s,:s].max()
            if max_prev_sim < sim_cutoff:
                S.append(s)
        for sent_text in df.iloc[S]["pretty text"].tolist():
            print sent_text
        for _, row in df.iloc[S].iterrows():
            d = row.to_dict()
            d["query id"] = query_num
            d["conf"] = .5
            d["team"] = "CUNLP"
            d["run id"] = "{}.c{}".format(
                "sem" if use_semsim else "bow", sim_cutoff)
            results.append(d)
    df = pd.DataFrame(results, columns=["query id", "team", "run id",
        "stream id", "sent id", "timestamp", "conf"])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    o = os.path.join(output_dir, "{}.c{}.tsv".format(
        "sem" if use_semsim else "bow", sim_cutoff))
    df.to_csv(o, sep="\t", header=False, index=False)
 def cosine_sim(vec1, vec2):
     try:
         s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
     except:
         try:
             s = cosine_similarity(vec1, vec2)[0][0]
         except:
             s = MISSING_VALUE_NUMERIC
     return s
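A quick usage note on the helper above: it first tries to reshape 1-D inputs into single-row matrices, then falls back to passing the inputs through unchanged, and finally returns a sentinel if both attempts fail. A small sketch, assuming the helper is available at module level and that MISSING_VALUE_NUMERIC is defined alongside it (the value below is only a placeholder):
import numpy as np

MISSING_VALUE_NUMERIC = -1.0  # assumed sentinel; the real constant lives elsewhere in the project

v1 = np.array([1.0, 0.0, 1.0])
v2 = np.array([1.0, 1.0, 0.0])
print(cosine_sim(v1, v2))        # ~0.5, handled by the reshape branch
print(cosine_sim('oops', None))  # both branches fail, so the sentinel is returned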
Example #11
def getCos_topic(tokens):
    List = get_keyword()
    component = List[0] + List[1] + List[2] + List[3]
    print(component)
    function = List[4] + List[5]
    data = List[6]
    rootcause = List[7] + List[8] + List[9]
    type = []

    t1 = get_topicvector(component, tokens)
    t2 = get_topicvector(function, tokens)
    t3 = get_topicvector(data, tokens)
    t4 = get_topicvector(rootcause, tokens)

    a1 = createCaseArray(component)
    a2 = createCaseArray(function)
    a3 = createCaseArray(data)
    a4 = createCaseArray(rootcause)
    print("-------------" + "topic similarity -----------------------------")
    minnum = 0
    if t1 != []:
        num1 = cosine_similarity(t1, a1)
        print(num1)
    else:
        num1 = 0
    if t2 != []:
        num2 = cosine_similarity(t2, a2)
    else:
        num2 = 0
    if t3 != []:
        num3 = cosine_similarity(t3, a3)
    else:
        num3 = 0
    if t4 != []:
        num4 = cosine_similarity(t4, a4)
    else:
        num4 = 0
    sum = num1 + num2 + num3 + num4
    minnum = min(num1, num2, num3, num4)
    #num = [num1, num2, num3, num4]
    if (sum != 0):
        print(minnum)
        if (num1 == minnum):
            #minnum = num2
            type.append("component")
        if (num2 == minnum):
            #minnum = num3
            type.append("function")
        if (num3 == minnum):
            #minmun = num4
            type.append("data")
        if (num4 == minnum):
            type.append("rootcause")
    else:
        return ["other"]
    print(type)
    return type
Example #12
def checkTriangleSanity(keypoint1,keypoint2,keypoint3):    
    # CHECK 1: Compute distances between pair of points
    distX_12 = keypoint1.pt[0] - keypoint2.pt[0]
    distY_12 = keypoint1.pt[1] - keypoint2.pt[1]
    distX_23 = keypoint2.pt[0] - keypoint3.pt[0]
    distY_23 = keypoint2.pt[1] - keypoint3.pt[1]
    distX_31 = keypoint3.pt[0] - keypoint1.pt[0]    
    distY_31 = keypoint3.pt[1] - keypoint1.pt[1]

    dist_12 = abs(distX_12) + abs(distY_12)
    dist_23 = abs(distX_23) + abs(distY_23)
    dist_31 = abs(distX_31) + abs(distY_31)

    # temp variable
    _ = '_'

    if(dist_12 < 5 or dist_23 < 5 or dist_31 < 5):
        return False,_,_,_,_,_

    # CHECK 2: Compute distances ratio between pair of points
    ratio1 = dist_12/float(dist_31)
    ratio2 = dist_12/float(dist_23)
    if(ratio1<0.33 or ratio1>3 or ratio2<0.33 or ratio2>3):
        return False,_,_,_,_,_

    # CHECK 3: Compute angle between every two lines
    # delta_1 = angle between vec 1->2 and vec 1->3
    vec_12 = [-distX_12, -distY_12]
    vec_13 = [ distX_31,  distY_31]    
    cos_delta_1 = cosine_similarity([vec_12], [vec_13])[0][0]
    delta_1 = math.degrees(math.acos(cos_delta_1))
    if delta_1<15:
        return False,_,_,_,_,_

    # delta_2 = angle between vec 2->3 and vec 2->1
    vec_23 = [-distX_23, -distY_23]
    vec_21 = [ distX_12,  distY_12]    
    cos_delta_2 = cosine_similarity([vec_23], [vec_21])[0][0]
    delta_2 = math.degrees(math.acos(cos_delta_2))
    if delta_2<15:
        return False,_,_,_,_,_

    # the third angle of the triangle formed by three input points
    # delta_3 = angle between vec 3->1 and vec 3->2
    delta_3 = 180 - delta_1 - delta_2
    if delta_3<15:
        return False,_,_,_,_,_

    # compute 5-tuple representation for this triangle
    # atan2() -> range (-pi,pi)
    # keypoint.angle -> range (-pi,pi) OR (0,2*pi) (NOT SURE) !!!
    alpha = keypoint1.angle + delta_1 - math.degrees(math.atan2(distY_31, distX_31))
    beta  = keypoint2.angle + delta_2 - math.degrees(math.atan2(distY_12, distX_12))
    gamma = keypoint3.angle + delta_3 - math.degrees(math.atan2(distY_23, distX_23))

    return True,delta_1,delta_2,alpha,beta,gamma
Example #13
    def ReturnRank(self,Data_2Rank):
        """ Returns the Rank indexes [-1 to 1] of the data based on cosine similarity with Interested and Notinterested vectors 
        Input: Data_2Rank, is feedparser output of the data to rank
        Output: Rank indexes [-1 to +1] of the input data based on cosine similarity. -1 is highest rank, and +1 is the lowest rank. """
        Text_2Rank = (entry.title + entry.summary for entry in  Data_2Rank.entries)
        Vectors_2Rank = self.vectorizer.transform(Text_2Rank)
        InterestedCosineRank = cosine_similarity(self.InterestedVector,Vectors_2Rank)[0]
        NotInterestedCosineRank = cosine_similarity(self.NotInterestedVector,Vectors_2Rank)[0]

        return NotInterestedCosineRank - InterestedCosineRank
Example #14
    def __call__(self, X1, X2):
        rows = []
        for key_1, value_1 in X1.iteritems():
            if self.xstats == 1:
                x1 = np.array([centroid(e) for e in value_1]).flatten()
            elif self.xstats == 2:
                x1 = np.array([dispersion(e) for e in value_1]).flatten()
            else:
                x1_cen = np.array([centroid(e) for e in value_1]).flatten()
                x1_dis = np.array([dispersion(e) for e in value_1]).flatten()
            columns = []
            for key_2, value_2 in X2.iteritems():
                if self.xstats == 1:
                    x2 = np.array([centroid(e) for e in value_2]).flatten()
                elif self.xstats == 2:
                    x2 = np.array([dispersion(e) for e in value_2]).flatten()
                else:
                    x2_cen = np.array([centroid(e) for e in value_2]).flatten()
                    x2_dis = np.array([dispersion(e) for e in value_2]).flatten()

                if self.similarity == 1:
                    if self.xstats == 3:
                        value_cen = polynomial_kernel(x1_cen,x2_cen).flatten()[0]
                        value_dis = polynomial_kernel(x1_dis,x2_dis).flatten()[0]
                        value = (value_cen + value_dis)/2
                    else:
                        value = polynomial_kernel(x1,x2).flatten()[0]
                    if self.domain_adapt:
                        if (key_1 < 10500 and key_2 < 10500) or ((key_1 > 10500 and key_2 > 10500)):
                            columns.append(value)
                        else:
                            columns.append(2*value)
                    else:
                        columns.append(value)
                else:
                    if self.xstats == 3:
                        value_cen = cosine_similarity(x1_cen,x2_cen).flatten()[0]
                        value_dis = cosine_similarity(x1_dis,x2_dis).flatten()[0]
                        value = (value_cen + value_dis)/2
                    else:
                        value = cosine_similarity(x1,x2).flatten()[0]
                    if self.domain_adapt:
                        if (key_1 < 10500 and key_2 < 10500) or ((key_1 > 10500 and key_2 > 10500)):
                            columns.append(value)
                        else:
                            columns.append(2*value)
                    else:
                        columns.append(value)
            rows.append(columns)
        m = np.asarray(rows)
        print m.shape
        return m
def similarity(ratings, kind='user'):
    if kind == 'user':
        sim = cosine_similarity(ratings)
        #cosine_similarity, treat one row as one data point
        # sim is a N by N matrix
        assert(sim.shape[0] == ratings.shape[0])
    elif kind == 'item':
        sim = cosine_similarity(ratings.T)
        # sim is a p by p matrix
        assert(sim.shape[0] == ratings.shape[1])
    sim[np.isnan(sim)] = 0 # when could nan happen? think about it
    np.fill_diagonal(sim,0) # when compute neighbors, we don't need to compute itself
    return sim
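A small illustration of the shapes involved, using a made-up user-item ratings matrix:
import numpy as np

# toy ratings matrix: 3 users x 4 items (values invented for illustration)
ratings = np.array([[5., 3., 0., 1.],
                    [4., 0., 0., 1.],
                    [1., 1., 0., 5.]])

user_sim = similarity(ratings, kind='user')  # (3, 3) user-user cosine similarities, zero diagonal
item_sim = similarity(ratings, kind='item')  # (4, 4) item-item cosine similarities, zero diagonal
print(user_sim.shape)
print(item_sim.shape)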
def compute_maximum_similarity(input_user_tweets_file):
    similarity_values = []
    tweets_grouped_by_user.insert(0, input_user_tweets_file)
    user_tweets = [open(user_tweets) for user_tweets in tweets_grouped_by_user]
    tfidf_files = TfidfVectorizer(input='file').fit_transform(user_tweets)

    for i in range(1, tfidf_files.shape[0]):
        print cosine_similarity(tfidf_files[0], tfidf_files[i])
        similarity_values.append(cosine_similarity(tfidf_files[0], tfidf_files[i]))

    most_similar_measure = max(similarity_values)
    # most_similar_doc_index = similarity_values.index(most_similar_measure)
    return most_similar_measure
Example #17
def make_cosine_list(matrix):
    # Returns list: nth index in list is cosine sim of nth and nth + 1 utt in matrix (final utt cannot have cosine score)
    cosines = []
    if switch == 'tfidf': # matrix type: scipy.sparse.lil.lil_matrix
        n_vectors = matrix.shape[0]
        for i in xrange(n_vectors - 1):
            cosines.append(cosine_similarity(matrix[i:i+1], matrix[i+1:i+2])[0][0])
    elif switch == 'lda' or switch == 'doc2vec': # matrix type: list
        n_vectors = len(matrix)
        for i in xrange(n_vectors - 1):
            cosines.append(cosine_similarity(matrix[i].reshape(1, -1), matrix[i+1].reshape(1, -1))[0][0])
    # elif switch == 'doc2vec':
    #     print type(matrix)
    return cosines
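A brief usage sketch for the tfidf branch above, assuming the module-level switch flag is set to 'tfidf'; the utterances are invented, and since the snippet uses xrange the sketch follows its Python 2 style. fit_transform returns a CSR matrix, which supports the same row slicing the function relies on.
from sklearn.feature_extraction.text import TfidfVectorizer

switch = 'tfidf'  # module-level flag read by make_cosine_list
utterances = ["hello there", "hello there again", "a completely different topic"]
matrix = TfidfVectorizer().fit_transform(utterances)
adjacent_sims = make_cosine_list(matrix)  # one score per adjacent utterance pair, so len == 2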
def similarities_without_duplicates(tfidf_matrix, length):
    include = [1] * length
    similarities_base = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix).flatten()
    for i in range(length):
        if include[i] == 0:
            continue
        similarities = cosine_similarity(tfidf_matrix[i : i + 1], tfidf_matrix).flatten()
        for j in range(2, length):
            if similarities[j] > 0.98 and j != i:
                include[j] = 0
    for i in range(length):
        if include[i] == 0:
            similarities_base[i] = 0
    return similarities_base
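A minimal sketch of calling the deduplicating helper above on a small corpus where the first row is the query and one document is repeated; the duplicate at index 2 ends up zeroed out (documents are invented):
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["query text", "first doc", "first doc", "another document"]
tfidf_matrix = TfidfVectorizer().fit_transform(docs)
scores = similarities_without_duplicates(tfidf_matrix, len(docs))  # duplicate at index 2 scored as 0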
Example #19
def extract_numerical_feature(seg):
    
    list = []
    
    num_query = len(seg[11].strip().split("|"))
    num_keyword = len(seg[12].strip().split("|"))
    num_title = len(seg[13].strip().split("|"))
    num_description = len(seg[14].strip().split("|"))
    
    list.append(str(process_Id_Feature("num_query"," ")) + ":" + str(num_query))
    list.append(str(process_Id_Feature("num_keyword"," ")) + ":" + str(num_keyword))
    list.append(str(process_Id_Feature("num_description"," ")) + ":" + str(num_description))
    list.append(str(process_Id_Feature("num_title"," ")) + ":" + str(num_title))
    
    tfidf_vectorizer = TfidfVectorizer()
    corpus = seg[11:15]
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    #print tfidf_matrix.shape
    
    #tfidf = tf_idf(corpus)
    
    #query_similar_keyword = tfidf_similarity(tfidf[0],tfidf[1])
    query_similar_keyword = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    #print query_similar_keyword
    #query_similar_title = tfidf_similarity(tfidf[0],tfidf[2])
    query_similar_title = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[2:3])
    #print query_similar_title
    #query_similar_description = tfidf_similarity(tfidf[0],tfidf[3])
    query_similar_description = cosine_similarity(tfidf_matrix[0:1],tfidf_matrix[3:])
    #print query_similar_description
    #keyword_similar_title = tfidf_similarity(tfidf[1],tfidf[2])
    keyword_similar_title = cosine_similarity(tfidf_matrix[1:2],tfidf_matrix[2:3])
    #print keyword_similar_title
    #title_similar_description = tfidf_similarity(tfidf[2],tfidf[3])
    title_similar_description = cosine_similarity(tfidf_matrix[2:3],tfidf_matrix[3:])
    #print title_similar_description  
    
    
    list.append(str(process_Id_Feature(query_similar_keyword," ")) + ":" + str(query_similar_keyword))
    list.append(str(process_Id_Feature(query_similar_title," ")) + ":" + str(query_similar_title))
    list.append(str(process_Id_Feature(query_similar_description," ")) + ":" + str(query_similar_description))
    list.append(str(process_Id_Feature(keyword_similar_title," ")) + ":" + str(keyword_similar_title))
    list.append(str(process_Id_Feature(title_similar_description," ")) + ":" + str(title_similar_description))
    
    depth = float(seg[4])
    position = float(seg[5])
    relative_pos = float((depth - position)*10.0/depth)
    list.append(str(process_Id_Feature("relative_pos_num"," ")) + ":" + str(relative_pos))
    return list
Example #20
def make_cosine_list(matrix):
    # Returns list: nth index in list is cosine sim of nth and nth + 1 utt in matrix
    cosines = []
    if switch == 'doc2vec' or switch == 'lda':
        # Matrix is numpy array
        n_vectors = len(matrix)
        for i in xrange(n_vectors - 1):
            cosines.append(cosine_similarity(matrix[i].reshape(1, -1), matrix[i+1].reshape(1, -1)))
    elif switch == 'tfidf':
        n_vectors = matrix.shape[0]
        # Iterate over each utt in doc_matrix, score with subsequent utt (final utt cannot have cosine score)
        for i in xrange(n_vectors - 1):
        # Index the Sci-Py matrix by slice to extract vectors
            cosines.append(cosine_similarity(matrix[i:i+1], matrix[i+1:i+2])[0][0])
    return cosines
Example #21
def user_similarities_one_to_many(user_car_feat, df_cars_feat, df_cars_scraped, n_predict):
    cosines = []
    # parallelize if takes long
    for car_all in df_cars_feat.featurized:
        cosines_all_one_car = []
        for car in car_all:
            cosines_all_one_car.append(cosine_similarity(user_car_feat, car)[0][0])
        cosines.append(max(cosines_all_one_car))
    cosines = np.array(cosines)
    indexes = cosines.argsort()[::-1]
    df_cars_top = df_cars_feat.ix[indexes][:n_predict]
    df_cars_top = pd.merge(df_cars_top,df_cars_scraped, on='link')
    car_links = df_cars_top.link.values
    car_img_links = df_cars_top.img_x.apply(lambda x: x[0]).values
    car_model_year = df_cars_top.model_year.values.astype(int)
    car_make_and_model = df_cars_top.make_and_model.values
    car_price = df_cars_top.price.values
    car_price_clean = []
    for price in car_price:
        if np.isnan(price) == True:
            car_price_clean.append(0.0)
        else:
            car_price_clean.append(price)
    car_price = car_price_clean
    result = zip(car_links, car_img_links, car_model_year, car_make_and_model, car_price)
    return result
def readJson3(jpath, s, finalTerms, finalTermsIDF, queryTFIDFs):
    reviews = {}
    authors = {}
    dates = {}
    j = 0
    cosinesAll = {}
    for key0 in queryTFIDFs.keys():
        cosinesAll[key0] = []
    for f in os.listdir(jpath):
        fpath = os.path.join(jpath, f)
        if os.path.isfile(fpath):
            jfile = open(fpath).read()
            jsondata = json.loads(jfile)
            try:
                for k in range(len(jsondata['Reviews'])):
                    try:
                        reviews[s+str(j)] = jsondata['Reviews'][k]['Content']
                        authors[s+str(j)] = jsondata['Reviews'][k]['Author']
                        dates[s+str(j)] = jsondata['Reviews'][k]['Date']
                        tokens = tokenizer.tokenize(reviews[s+str(j)])
                        stemmedTokens = []
                        stemmedTokenF = []
                        bigram = []
                        for t in range(len(tokens)):
                            try:
                                tk = int(tokens[t])
                                tk = "NUM"
                            except ValueError:
                                tk = tokens[t]            
                            stemmedToken = stemmer.stem(tk.lower())
                            stemmedTokens.append(stemmedToken)
                            if stemmedToken in finalTerms:
                                stemmedTokenF.append(stemmedToken)
                        for m in range(len(stemmedTokens)-1):
                            tm = stemmedTokens[m] + '-' + stemmedTokens[m+1]
                            if tm in finalTerms:
                                bigram.append(tm)                        
                        unibigram = stemmedTokenF + bigram
                        c1 = Counter(unibigram)
                        tfidfEachReview = []
                        for x in range(len(finalTerms)):
                            if c1[finalTerms[x]] > 0:
                                tf = 1 + np.log(c1[finalTerms[x]])
                            else:
                                tf = 0
                            tfidf = tf * finalTermsIDF[x]
                            tfidfEachReview.append(tfidf) 
                        for key0, value0 in queryTFIDFs.iteritems():
                            cosine = cosine_similarity(value0, tfidfEachReview)
                            infoDoc = (cosine, reviews[s+str(j)], authors[s+str(j)], dates[s+str(j)])
                            cosinesAll[key0].append(infoDoc)
                        j += 1
                    except ValueError:
                        print 'Cannot find Review Content!'
            except ValueError:
                print 'Cannot find Review!'
    cosines = {}
    for k, value in cosinesAll.iteritems():
        cosines[k] = sorted(cosinesAll[k], key=itemgetter(0), reverse = True)[:3]
    return cosines
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
  """ Test if accuracy for estimators with given parameters is
      significantly better than that of the first estimator in the tuple
  """
  texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
  vectorizer = TfidfVectorizer()
  text_similarity = cosine_similarity(vectorizer.fit_transform(texts))

  accuracy_arrays = []
  for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
    stop_idx = votes_per_doc * len(texts)
    # Now get n_runs accuracies and put then into numpy arrays
    accuracies = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, 
        vote_lists, truths, text_similarity, idx, True, *args) for idx in xrange(n_runs) )
    accuracy_arrays.append( np.array( filter(lambda x: x is not None, accuracies) ) )

  # Baseline
  result_row = []
  result_row.append( "%0.2f" % np.mean(accuracy_arrays[0]) )
  # T-tests
  for accuracy_array in accuracy_arrays[1:]:
    _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False)
    significance_indicator = lambda p: "*" if p < 0.01 else " "
    is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " "
    result_row.append( "%0.2f %s %s" % (np.mean(accuracy_array), significance_indicator(pval), is_better))

  return "|".join(result_row)
Example #24
def MMR(docs, count):
    # Setup
    select_lst = [docs.pop(0)]
    candidates = []
    tfidf_vectorizer = TfidfVectorizer()
    relevance_weight = 0.9

    # Start recalculating scores
    while len(select_lst) != len(docs):
        select_sen = []
        for i in select_lst:
            select_sen.append(i.sentence)

        for candidate in docs:
            old_score = candidate.rating

            stemmed_sen = stemming([candidate])
            stemmed_lst = stemming(select_lst)
            tfidf_matrix = tfidf_vectorizer.fit_transform(stemmed_lst)
            target = tfidf_vectorizer.transform(stemmed_sen)
            similarities = cosine_similarity(target,tfidf_matrix).flatten()
            similarities.sort()
            similarity = similarities[-1]
            
            new_score = old_score * relevance_weight - similarity * (1 - relevance_weight)

            candidate.rating = new_score
            
        docs = sorted(docs, key=attrgetter("rating"), reverse=True)
        select_lst.append(docs.pop(0))

    return select_lst
 def classify(self, x):
     """Transforms and classifies x"""
     x = x.lower()
     x_matrix = self.input_transform_fnc([x])
     x = cosine_similarity(self.tfidf_matrix, x_matrix)
     idx = numpy.where(x == max(x))[0][0]
     return self.y[idx]
Example #26
def concept_to_concept_threshold_char(concept,name):
    #print "Concept_to_Concept ", name

    conList = []
    docs = [(name + concept)]

    for x in range(0,len(concept)):
        docs.append((name+concept[0:x]))

    tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

    matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    #print matrix

    for row in matrix:
        for x in row[1:]:
            conList.append(x)

    mean = statistics.mean(conList)
    stdev = statistics.pstdev(conList)

    thld = 1 - (Num_Deviations * stdev)
    #print abs(mean_confidence_interval(mean,stdev))
    #thld1 = thld - abs(mean_confidence_interval(mean,stdev))

    #if thld == 1:
        #return
    #print statistics.pstdev(conList)

    #print thld
    #out = [thld,thld1]
    #wr.writerow(out)
    return thld
 def create_similarity_matrix_text(self, features=None):
     if features == None:
         features = self.text_features
     if features == None:
         print 'You must provide the text features as argument or run extract_text_features() first'
     else:
         self.text_similarity_matrix = cosine_similarity(features)
def writeOutput(tfidf_matrix_train,relname,othername,othername1,realname,realname1,authorMap,listsize, outname):
    temp = []
    for j in range(tfidf_matrix_train.shape[0]):
        targetname = relname[j]
        targetname = targetname.strip()
        for n in range(len(othername1)):
            othername1[n] = othername1[n].strip()
            if (targetname == othername1[n]):
                print j,targetname
                temp.append(j)
                #print 'original one'
                result = cosine_similarity(tfidf_matrix_train[j:j+1], tfidf_matrix_train)
                index = [i[0] for i in sorted(enumerate(result[0]), key=lambda x:x[1],reverse=True)]
                recList = index[1:listsize+1]
                if(j in authorMap.keys()):
                    coauther = authorMap.get(j)
                    coauther = list(coauther)
                    recList = deleteCoauthor(coauther, recList, index)
                recomd = []
                similar = []
                for k in recList: 
                    recomd.append(relname[k])
                    similar.append(result[0][k]) 
                with open('../RecommendationResult/'+outname+'Model/'+relname[j]+'.txt','w') as fn:
                    fn.write(realname1[n]+" ID:"+str(j)+"\n\n")
                    for n1 in xrange(len(recomd)):
                        for m in range(len(othername)):
                            person = recomd[n1]
                            person = person.strip()
                            othername[m] = othername[m].strip()
                            if (person == othername[m]):
                                fn.write(str(n1+1)+". "+str(realname[m])+": "+str(similar[n1])+"\n")
    return temp
Example #29
File: utils.py Project: lopuhin/WSI
def print_cluster_sim(centers):
    sim_matrix = cosine_similarity(centers, centers)
    print('\t'.join('{}'.format(j) for j, _ in enumerate(sim_matrix)))
    for i, row in enumerate(sim_matrix):
        print('\t'.join(
            ('{:.2f}'.format(x) if i < j else ' ')
            for j, x in enumerate(row)), i, sep='\t')
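For reference, a hedged sketch of producing the centers argument; the function then prints the upper triangle of the pairwise cosine-similarity matrix between cluster centers. The random feature matrix below is a stand-in for the project's real sense vectors.
import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 30)  # stand-in feature matrix
centers = KMeans(n_clusters=5, random_state=0).fit(X).cluster_centers_
print_cluster_sim(centers)  # 5x5 upper-triangular similarity printout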
Example #30
def hCSimilarity(train_fn, test_fn, save_fn):
    '''
        calculate History and Candidate matrix similarity
        n x 100, m x 100 -> n x m
        #@TODO:
            there are a lot of ways to calculate similarity
    '''

    fp = open(train_fn, 'r')
    train = pickle.load(fp).toarray()
    fp = open(test_fn , 'r')
    test  = pickle.load(fp).toarray()
    result = np.zeros((len(train), len(test)))

    # for i in range(len(train)):
    #     for j in range(len(test)):
    #         na = np.linalg.norm(train[i,:])
    #         nb = np.linalg.norm(test[i,:])
    #         if na == 0 or nb == 0 :
    #             result[i, j] = 0
    #         else:
    #             #result[i][j] = np.dot(train[i,:], test[j,:])/na/nb
    #             # result[i][j] = cosine_similarity(train[i,:], test[j,:])
    #             result[i][j] = 1 - spatial.distance.cosine(train[i,:], test[j,:])
    result = cosine_similarity(train, test)
    saveItem(result, save_fn)
Example #31
def first_level_grouping(feature_map_dict,
                         encoded_list_rearrange_concat,
                         mask_arr,
                         all_keys,
                         keys_1d,
                         keys_2d,
                         keys_3d=[]):
    height = 32
    width = 20
    relation_all_df = pd.DataFrame(0, columns=all_keys, index=all_keys)
    num_data = len(encoded_list_rearrange_concat[0])
    # num_data
    for n in range(num_data):
        print('n: ', n)
        for ds_name1 in all_keys:
            # 1D case
            if ds_name1 in keys_1d:
                temp_arr1 = feature_map_dict[ds_name1][n, :]  # (24, 1, 1, 1)
                # (24, 1) - > [32, 20, 24]
                temp_1d_dup = np.repeat(temp_arr1, 32, axis=1)
                temp_1d_dup = np.repeat(temp_1d_dup, 20,
                                        axis=2)  # 32, 20, 24, 1

                temp_1d_dup = np.squeeze(temp_1d_dup, axis=-1)  #[24, 32, 20,]
                temp_1d_dup = np.moveaxis(temp_1d_dup, 0, -1)  # (32, 20, 24)
                dim1 = temp_arr1.shape[0]  # number of layers in the 2d data
                #         dim1 = temp_arr1.shape[-1]  # number of layers in the 2d data
                for ds_name2 in all_keys:
                    # 1D VS 1D
                    if ds_name2 in keys_1d:
                        ave_SR = 0
                        #                 print(ds_name1, ds_name2)
                        temp_arr2 = feature_map_dict[ds_name2][n, :]
                        sim_sparse = cosine_similarity(
                            temp_arr1.reshape(1, -1), temp_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR

                    # 2D VS 1D
                    # 2D:  32, 20, 1
                    # 1D duplicate: 32, 20, 3. This means that there is no spatial variations for 1D
                    # duplicate 2D to 32, 20, 3. This means that there is no temporal variations for 2D
                    # then flatten and compare
                    # This means that there is no temporal variations for 2D
                    if ds_name2 in keys_2d:
                        # temp_arr2 = feature_map_dict[ds_name2][n,:,:,:] # 32, 20, 1
                        # # duplicate to [32, 20, 24]
                        # temp_arr2_mean_dup = np.repeat(temp_arr2, dim1, axis = -1)
                        #
                        # compress_arr2 = remove_outside_cells(temp_arr2_mean_dup, mask_arr) # [32, 20, 24]
                        # compress_arr1 = remove_outside_cells( temp_1d_dup, mask_arr) # [32, 20, 24]
                        #
                        # ave_SR = 0
                        # sim_sparse = cosine_similarity(compress_arr2.reshape(1, -1),
                        #                                            compress_arr1.reshape(1, -1))
                        #
                        # ave_SR = sim_sparse[0][0]
                        # relation_all_df.loc[ds_name1, ds_name2]  += ave_SR
                        relation_all_df.loc[ds_name1, ds_name2] += 0

                    # 3D VS 1D
                    # duplicate 1D to 3D, flatten and compare
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][
                            n, :, :, :, :]  # 3d, e.g. [24, 32, 20, 1]
                        temp_arr2 = np.squeeze(temp_arr2,
                                               axis=-1)  #[24, 32, 20]
                        temp_arr2 = np.moveaxis(temp_arr2, 0,
                                                -1)  # (32, 20, 24)

                        ave_SR = 0  # average spearman correlation

                        compress_arr2 = remove_outside_cells(
                            temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(
                            temp_1d_dup, mask_arr)

                        sim_sparse = cosine_similarity(
                            compress_arr1.reshape(1, -1),
                            compress_arr2.reshape(1, -1))

                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR

            # 2D case
            if ds_name1 in keys_2d:
                temp_arr1 = feature_map_dict[ds_name1][
                    n, :, :, :]  # [32, 20, 1]
                # print('temp_arr1_mean.shape: ', temp_arr1_mean.shape)
                # temp_arr1_mean_dup = np.repeat(temp_arr1_mean_dup, temp_arr2.shape[-1], axis = 0)

                for ds_name2 in all_keys:
                    # 2D Vs 1D
                    if ds_name2 in keys_1d:
                        relation_all_df.loc[
                            ds_name1, ds_name2] = relation_all_df.loc[ds_name2,
                                                                      ds_name1]
                    # 2D Vs 2D
                    # take mean along 3rd dimension and compare
                    if ds_name2 in keys_2d:
                        ave_SR = 0  # average spearman correlation
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :]

                        compress_arr2 = remove_outside_cells(
                            temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(
                            temp_arr1, mask_arr)

                        sim_sparse = cosine_similarity(
                            compress_arr1.reshape(1, -1),
                            compress_arr2.reshape(1, -1))
                        #                             pearson_coef, p_value = stats.pearsonr(temp_arr1[ :, :, i].ravel(), temp_arr2[ :, :, j].ravel())

                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR

                    # 2D VS 3D
                    # for 2D feature maps, output 3rd dimension of feature map is 1.
                    # for 3D feature maps, output 3rd dimension is 3
                    # average 3D feature map by 3rd dimension
                    # flatten and compare
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][
                            n, :, :, :, :]  #[24, 32, 20, 1]
                        temp_arr2 = np.squeeze(temp_arr2,
                                               axis=-1)  #[24, 32, 20]
                        temp_arr2 = np.moveaxis(temp_arr2, 0,
                                                -1)  # (32, 20, 24)

                        # average along third dimension
                        temp_arr2_mean = np.mean(temp_arr2, axis=-1)
                        temp_arr2_mean_dup = np.expand_dims(
                            temp_arr2_mean, axis=-1)  #[32, 20, 1]

                        compress_arr2 = remove_outside_cells(
                            temp_arr2_mean_dup, mask_arr)
                        compress_arr1 = remove_outside_cells(
                            temp_arr1, mask_arr)

                        ave_SR = 0  # average spearman correlation
                        sim_sparse = cosine_similarity(
                            compress_arr1.reshape(1, -1),
                            compress_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR

            # 3D
            if ds_name1 in keys_3d:
                temp_arr1 = feature_map_dict[ds_name1][
                    n, :, :, :, :]  # [24, 32, 20, 1]
                temp_arr1 = np.squeeze(temp_arr1, axis=-1)  #[24, 32, 20]
                temp_arr1 = np.moveaxis(temp_arr1, 0, -1)  # (32, 20, 24)

                for ds_name2 in all_keys:
                    # 1D
                    if ds_name2 in keys_1d:
                        relation_all_df.loc[
                            ds_name1, ds_name2] = relation_all_df.loc[ds_name2,
                                                                      ds_name1]
                    # 3D VS 2D
                    if ds_name2 in keys_2d:
                        temp_arr2 = feature_map_dict[ds_name2]

                        relation_all_df.loc[
                            ds_name1, ds_name2] = relation_all_df.loc[ds_name2,
                                                                      ds_name1]

                    # 3D VS 3D
                    # flatten and compare. Because 3rd dimension contains
                    # temporal information
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :, :]
                        temp_arr2 = np.squeeze(temp_arr2,
                                               axis=-1)  #[24, 32, 20]
                        temp_arr2 = np.moveaxis(temp_arr2, 0,
                                                -1)  # (32, 20, 24)

                        ave_SR = 0  # average spearman correlation
                        compress_arr2 = remove_outside_cells(
                            temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(
                            temp_arr1, mask_arr)

                        sim_sparse = cosine_similarity(
                            compress_arr1.reshape(1, -1),
                            compress_arr2.reshape(1, -1))

                        ave_SR = float(sim_sparse[0][0])
                        relation_all_df.loc[ds_name1, ds_name2] += ave_SR
    relation_all_df = relation_all_df / num_data
    return relation_all_df
Example #32
def run(test, n_songs, n_tags, spr_list, tag_tid_id):
    start = time.time()
    train_user_songs_A,train_user_tags_A,\
        test_title,title_sp,gnr_sp,test_gnr_sp,\
        title_gnr,test_title_gnr = spr_list

    res = []
    for i in range(len(test)):
        dat = test.iloc[i]
        pid = i
        songs_already = dat["songs"]
        tags_already = dat["tags_id"]

        if len(dat['songs']) != 0 and len(dat['tags_id']) != 0:
            p = np.zeros((n_songs, 1))
            p[dat['songs']] = 1
            val_song = cosine_similarity(train_user_songs_A, p.T)

            pp = np.zeros((n_tags, 1))
            pp[dat['tags_id']] = 1
            val_tag = cosine_similarity(train_user_tags_A, pp.T)

            val_title_genre = cosine_similarity(title_gnr,
                                                test_title_gnr[i:(i + 1)])
            val = val_song * val_tag * val_title_genre

        elif len(dat['songs']) != 0:
            p = np.zeros((n_songs, 1))
            p[dat['songs']] = 1
            val_song = cosine_similarity(train_user_songs_A, p.T)

            val_title_genre = cosine_similarity(title_gnr,
                                                test_title_gnr[i:(i + 1)])
            val = val_song * val_title_genre

        elif len(dat['tags_id']) != 0:
            p = np.zeros((n_tags, 1))
            p[dat['tags_id']] = 1

            val = cosine_similarity(train_user_tags_A, p.T)

            if len(dat['plylst_title']) != 0:
                val_title = cosine_similarity(title_sp, test_title[i:(i + 1)])
                val = val * val_title

        else:
            val = cosine_similarity(title_sp, test_title[i:(i + 1)])

        cand_song = train_user_songs_A.T.tocsr().dot(
            val)  # rows are songs, columns are users; similar users give similar songs high scores
        cand_song_idx = cand_song.reshape(
            -1).argsort()[-300:][::-1]  # take the 300 highest-scoring songs
        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) ==
                                      False][:100]  # drop songs already in the playlist, keep 100

        cand_tag = train_user_tags_A.T.tocsr().dot(val)  # repeat the same procedure for tags
        cand_tag_idx = cand_tag.reshape(-1).argsort()[-30:][::-1]
        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) ==
                                    False][:10]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": test.loc[pid, 'id'],
            "songs": list(cand_song_idx),
            "tags": rec_tag_idx
        })
        if i % 1000 == 0:
            print("{} time :".format(i), time.time() - start)

    write_json(res, "pre_tag.json")
	def getCosineSimilarity(self,vec_Pair):
		sim_array = cosine_similarity(vec_Pair[0],vec_Pair[1])
		return sim_array[0][0]
def find_similarity(test_description_modified):
    ## make sparse array of filtered test description
    test_document_vector = vectorizer.transform([test_description_modified])
    test_document_encoded = (test_document_vector.toarray())

    #print (test_document_encoded)

    ## Cosine Similarity:

    all_documents_similarity = []
    ## all_documents_similarity is an array in which we save the similarity and the primary key/index together; we will have to sort the list to select the top similar descriptions, so we need to keep the indexes as well
    for i in range(len(all_documents_encoded)):
        all_documents_similarity.append([
            cosine_similarity(all_documents_encoded[i], test_document_encoded),
            i
        ])


    #     ## Sort all similarities in desc order
    #     all_documents_similarity_sorted = sorted(all_documents_similarity, reverse = True)

    ## Select similar documents:
    ## Select Top X% of the sorted values
    ## NOTE - Is there a better way to decide what percentage to select, other than trial and error on percentages?
    Xpercent = 0.15  ##Top 10 documents
    topXpercent = int(len(all_documents_similarity) * (Xpercent / 100))

    #     all_documents_similarity_sorted_topXpercent = all_documents_similarity_sorted[:topXpercent]
    #     #print (all_documents_similarity)
    #     #print (all_documents_similarity_sorted)
    #     #print (topXpercent)
    #     #print (all_documents_similarity_sorted_topXpercent)

    ## Sort only as many similarities as needed - don't sort all 7000 or so entries of all_documents_similarity; sort just enough to get the topXpercent similarity values
    all_documents_similarity_sorted_topXpercent = sortTopXpercent(
        all_documents_similarity, topXpercent)

    print("\nRating of most similar app:",
          all_documents_similarity_sorted_topXpercent[0][0][0][0])

    ## If the highest similarity is < say 0.2, tell the user to add more description for better results - so as to handle single-word or single-line descriptions. To handle persistent users, add a button if they want analytics with only that much description?
    if (all_documents_similarity_sorted_topXpercent[0][0][0][0] < 0.35):
        print(
            "For better analytics, enter more description specific to your app idea"
        )

    ## Print index of the description found to be similar
    #print (all_documents_similarity_sorted_topXpercent[0][1])

    ## Link Datasets and Find Weighted Average of Ratings and other details
    total_weight = 0
    total_weighted_rating = 0
    users_by_rating_dict = {
        "1.0": 0,
        "1.5": 0,
        "2.0": 0,
        "2.5": 0,
        "3.0": 0,
        "3.5": 0,
        "4.0": 0,
        "4.5": 0,
        "5.0": 0
    }
    ## For equalized 'number of users' - apple app store lets you rate with integers from 1 - 5
    users_by_rating_equalized_dict = {
        "One": 0,
        "Two": 0,
        "Three": 0,
        "Four": 0,
        "Five": 0
    }
    users_by_ageGroup_dict = {
        "Children_5": 0,
        "Teenager_13": 0,
        "Adult_18": 0,
        "Elderly_50": 0
    }
    total_users_that_rated = 0
    ## Arbitrary installs factor value, chosen through intuition - the Apple dataset does not contain information about total installs, so we calculate an average value assuming that one person among every 'total_users_that_rated/installs_factor' persons rates the app
    installs_factor = 250  ## depends on topXpercent

    for i in range(len(all_documents_similarity_sorted_topXpercent)):
        #document_rating = data_full.iloc[(all_documents_similarity_sorted_topXpercent[i][1])]['user_rating']
        document_rating = rating_array[(
            all_documents_similarity_sorted_topXpercent[i][1])]
        document_rating_count = rating_count_array[(
            all_documents_similarity_sorted_topXpercent[i][1])]
        if document_rating_count == 0:
            continue
        if document_rating == 0:
            continue
        document_name = track_name_array[(
            all_documents_similarity_sorted_topXpercent[i][1])]
        document_id = all_documents_similarity_sorted_topXpercent[i][1]
        print("id:", document_id, "name:", document_name, "rating:",
              document_rating, "rating_count:", document_rating_count,
              "similarity:",
              all_documents_similarity_sorted_topXpercent[i][0][0][0])

        ## Find the final Average rating - Weighted average of ratings of topXpercent similar documents
        ## Considering Document weight = Similarity Score multiplied by document_rating_count
        document_weight = all_documents_similarity_sorted_topXpercent[i][0][0][
            0] * document_rating_count
        document_weighted_rating = document_weight * document_rating
        total_weighted_rating = total_weighted_rating + document_weighted_rating
        total_weight = total_weight + document_weight

        ## For the actual graph of "number of users" by "Rating Given"
        this_rating = str(document_rating)
        users_at_this_rating = users_by_rating_dict[this_rating]
        users_by_rating_dict[this_rating] = int(users_at_this_rating +
                                                document_weight)
        ## For the equalized graph of "number of users" by "Rating Given"
        ## Equalizing the number of user per rating:
        ## According to the 5 different ratings, we make 5 different dicts of usage percentages for the 5 ratings
        ## Arbitrary average percentage values, chosen through intuition - the Apple dataset does not contain the number of users per rating given, for each app, so we use arbitrary average values
        users_by_rating_equalized_dict = users_by_rating_equalized_dict_modify_by_percentage(
            this_rating, users_by_rating_equalized_dict, document_weight)

        ## Keeping count of total_users_that_rated a particular app
        total_users_that_rated = total_users_that_rated + document_weight

        ## For the graph of "number of users" by "Age Group" -
        ## According to the 4 different content ratings, we make 4 different dicts of usage percentages for the 4 age groups
        ## Arbitrary average percentage values, chosen through intuition - the Apple dataset does not contain user ratings or installs per age group, for each app, so we use arbitrary average values
        age_group = age_group_array[(
            all_documents_similarity_sorted_topXpercent[i][1])]
        users_by_ageGroup_dict = users_by_ageGroup_dict_modify_by_percentage(
            age_group, users_by_ageGroup_dict, document_weight)

        #print ("users:", document_weight, "rating:", document_rating, "previous users at this rating:", users_at_this_rating, "new users at this rating", users_by_rating_dict[this_rating])
        print("users:", document_weight, "rating:", document_rating,
              "Content rating:", age_group)

        #print ("\nCurrent users_by_ageGroup_dict:", users_by_ageGroup_dict)

    ## Final Average rating
    final_rating = total_weighted_rating / total_weight

    ## Total users by rating - actual (output not modified):
    print("\nUsers by rating - actual:", users_by_rating_dict)
    ## total_users_that_rated as the total number of ratings given
    print("\nTotal users that are likely to rate: ",
          int(total_users_that_rated))
    ## Total users by rating - equalized (output modified):
    ## The users_by_rating_dict dictionary is not equalized, i.e. it could happen that:
    ## Consider the following final users_by_rating_dict dictionary - Users by rating: {'1.0': 0, '1.5': 0, '2.0': 0, '2.5': 0, '3.0': 503946, '3.5': 0, '4.0': 27265, '4.5': 9559, '5.0': 0}
    ## There are no users at ratings 1.0, 2.0, and so on, which will not give a distributed graph
    ## Therefore we equalize the graph to some extent so that the peaks get distributed and the bar graph is smoother (this is arguably manipulation of the dataset, but the Kaggle dataset does not have a user count for each rating increment, which is why we have to normalize the graph)
    ## NOTE - Could not find a library for this, so write a function for equalization?
    print("\nUsers by rating - equalized:", users_by_rating_equalized_dict)

    ## Total predicted installs
    factor = total_users_that_rated / installs_factor
    print("\nTotal installs: ", int(total_users_that_rated * factor))

    ## Total predicted "users that rated" ordered by age
    print("\nUsers that rated, ordered by age:", users_by_ageGroup_dict)

    ## Total predicted "installs" ordered by age
    for x in users_by_ageGroup_dict:
        temp4 = users_by_ageGroup_dict[x]
        users_by_ageGroup_dict[x] = int(temp4 * factor)
    print("\nInstalls, ordered by age:", users_by_ageGroup_dict)

    ## Print genre as the genre of the description with the highest similarity
    prime_genre = genre[all_documents_similarity_sorted_topXpercent[0][1]]
    print("\nPrime Genre: ", prime_genre)

    ## Print rounded off final rating
    print("\nPredicted rating: ", round(final_rating, 2))  ## 2 decimal places

    if (final_rating >= 4):
        selling_ability = '"Selling_Ability" : "Excellent"'
    elif (final_rating >= 3):
        selling_ability = '"Selling_Ability" : "Good"'
    elif (final_rating >= 2):
        selling_ability = '"Selling_Ability" : "Average"'
    else:
        ## covers everything below 2 as well, so selling_ability is always defined
        selling_ability = '"Selling_Ability" : "Poor"'

    frontend_json = ""

    ## Build a string containing all the output data in JSON form
    frontend_json += '{ "Predicted_Rating" : ' + '"' + str(
        round(final_rating, 2)) + '",'
    frontend_json += ' ' + selling_ability
    frontend_json += ', "Detected_Genre" : ' + '"' + prime_genre + '",'
    frontend_json += ' "Total_Installs" : ' + '"' + str(
        int(total_users_that_rated * factor)) + '",'
    frontend_json += ' "Total_Users_That_Rated" : ' + '"' + str(
        int(total_users_that_rated)) + '",'

    ## changing single quotes in users_by_rating_equalized_dict keys to double quotes
    json_graph_dict_ratings = json.dumps(users_by_rating_equalized_dict)
    json_graph_dict_age_group = json.dumps(users_by_ageGroup_dict)
    frontend_json += ' "Graph_Users_By_Ratings" : ' + '[ ' + json_graph_dict_ratings + ' ],'
    frontend_json += ' "Graph_Installs_By_Age_Group" : ' + '[ ' + json_graph_dict_age_group + ' ],'

    top_3_string_concat = "{ "
    for k in range(0, 3):
        top_3_string_concat += '"' + 'One' + str(k + 1) + '" : [ '
        #print (top_3_string_concat)
        top_3_document_name = track_name_array[(
            all_documents_similarity_sorted_topXpercent[k][1])]
        top_3_document_rating = rating_array[(
            all_documents_similarity_sorted_topXpercent[k][1])]
        top_3_document_rating_count = rating_count_array[(
            all_documents_similarity_sorted_topXpercent[k][1])]
        top_3_this_document_installs = top_3_document_rating_count * (
            top_3_document_rating_count / installs_factor)
        this_description_trunc = (unmodified_description_array[
            (all_documents_similarity_sorted_topXpercent[k][1])][0:350]
                                  ).replace("\n", " ") + "..."
        top_3_dict_concat = '{ "Name" : "Name: ' + top_3_document_name + '",  "Rating" : "Rating: ' + str(
            top_3_document_rating
        ) + '", "Similarity_Score" : "Similarity Score: ' + str(
            round(
                all_documents_similarity_sorted_topXpercent[k][0][0][0] * 100)
        ) + '%", "This_Description" : "Description: ' + this_description_trunc + '" }'
        top_3_string_concat += top_3_dict_concat + ' ]'
        if (k != 2):
            top_3_string_concat += ', '
    top_3_string_concat += " }"

    frontend_json += ' "Top_3_Similar_Apps" : ' + '[ ' + top_3_string_concat + ' ]'
    frontend_json += ' }'

    print("FRONTEND\n")

    return frontend_json
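

## Hedged illustration (not part of the original code) of the weighted-average
## rating used above: each similar app contributes its rating weighted by
## similarity * rating_count. Function and argument names are illustrative.
import numpy as np


def weighted_average_rating_sketch(similarities, ratings, rating_counts):
    ## weight each document by its similarity score times its rating count
    weights = np.asarray(similarities, dtype=float) * np.asarray(rating_counts, dtype=float)
    return float(np.sum(weights * np.asarray(ratings, dtype=float)) / np.sum(weights))


## e.g. weighted_average_rating_sketch([0.9, 0.5], [4.5, 3.0], [1000, 200]) ~= 4.35
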
def task1cFunc(userid):
    # read mltags, mlrating, mlmovies, movie-actor
    mltagsFile = pd.read_csv('mltags.csv')
    mlratingsFile = pd.read_csv('mlratings.csv')
    genomeFile = pd.read_csv('genome-tags.csv')
    movieFile = pd.read_csv('smallmlmovies.csv')

    # Extract tag from tagid
    genomeFile['tagid'] = genomeFile['tagId']
    del genomeFile['tagId']
    mltagsFile = pd.merge(mltagsFile, genomeFile, on='tagid')

    s = movieFile["genres"].str.split('|', expand=True).stack()
    i = s.index.get_level_values(0)
    movieFile = movieFile.loc[i].copy()
    movieFile["genres"] = s.values

    # Extract movie from movieid
    del movieFile['year']
    mlratingsFile = pd.merge(mlratingsFile, movieFile, on='movieid')
    mltagsFile = pd.merge(mltagsFile, movieFile, on='movieid')

    mltagsFileUser = mltagsFile.loc[mltagsFile['userid'] == userid]
    tagUserMovies = mltagsFileUser['moviename'].values
    mlratingsFileUser = mlratingsFile.loc[mlratingsFile['userid'] == userid]
    ratingUserMovies = mlratingsFileUser['moviename'].values
    tagRatingUserMovies = list(set(tagUserMovies) | set(ratingUserMovies))

    mltagsFileUser['timestamp'] = pd.to_datetime(mltagsFileUser['timestamp'])
    mltagsFileUser['timestamp'] = (mltagsFileUser['timestamp'] -
                                   dt.datetime(1970, 1, 1)).dt.total_seconds()
    mltagsFileUser['timestamp'] = \
        ((mltagsFileUser['timestamp'] - mltagsFileUser['timestamp'].min()) / (mltagsFileUser['timestamp'].max() - mltagsFileUser['timestamp'].min()+1))+1

    mlratingsFileUser['timestamp'] = pd.to_datetime(
        mlratingsFileUser['timestamp'])
    mlratingsFileUser['timestamp'] = (
        mlratingsFileUser['timestamp'] -
        dt.datetime(1970, 1, 1)).dt.total_seconds()
    mlratingsFileUser['timestamp'] = \
        ((mlratingsFileUser['timestamp'] - mlratingsFileUser['timestamp'].min()) / (mlratingsFileUser['timestamp'].max() - mlratingsFileUser['timestamp'].min()+1))+1

    commonTagRating = list(set(tagUserMovies) & set(ratingUserMovies))
    uncommonTag = list(set(tagUserMovies) ^ set(commonTagRating))
    uncommonRating = list(set(ratingUserMovies) ^ set(commonTagRating))

    timeWeights = {}
    for i in range(len(commonTagRating)):
        tag = mltagsFileUser.loc[mltagsFileUser['moviename'] ==
                                 commonTagRating[i]]['timestamp'].values[0]
        rating = mlratingsFileUser.loc[
            mlratingsFileUser['moviename'] ==
            commonTagRating[i]]['timestamp'].values[0]
        if tag > rating:
            timeWeights[commonTagRating[i]] = tag
        else:
            timeWeights[commonTagRating[i]] = rating

    for i in range(len(uncommonRating)):
        rating = mlratingsFileUser.loc[
            mlratingsFileUser['moviename'] ==
            uncommonRating[i]]['timestamp'].values[0]
        timeWeights[uncommonRating[i]] = rating

    for i in range(len(uncommonTag)):
        tag = mltagsFileUser.loc[mltagsFileUser['moviename'] ==
                                 uncommonTag[i]]['timestamp'].values[0]
        timeWeights[uncommonTag[i]] = tag
    #
    # deleting columns that are not required
    del mlratingsFile['timestamp']
    del mlratingsFile['imdbid']
    del mlratingsFile['userid']
    del mltagsFile['timestamp']
    del mltagsFile['userid']
    del mltagsFile['tagid']

    # creating a dictionary with movie name as key and a list of all genres associated with that movie, removing duplicates
    movieGenreDict = {
        k: g['genres'].tolist()
        for k, g in movieFile.groupby('moviename')
    }
    movieGenreDict = {k: list(set(j)) for k, j in movieGenreDict.items()}

    # creating a dictionary with movie name as key and a list of all ratings given for that particular movie, removing duplicates
    movieRatingDict = {
        k: g['rating'].tolist()
        for k, g in mlratingsFile.groupby('moviename')
    }
    movieRatingDict = {k: list(set(j)) for k, j in movieRatingDict.items()}

    # computing the average rating for all movies and storing in a dictionary
    avgRating = mlratingsFile.groupby('moviename').mean().reset_index()
    avgRatingDict = {
        k: g['rating'].tolist()
        for k, g in avgRating.groupby('moviename')
    }

    # List of unique movies, genres and ratings
    movieList = mlratingsFile.moviename.unique()
    movieList = np.asarray(movieList)
    movieListDict = dict(enumerate(movieList))
    genreList = movieFile.genres.unique()
    genreList = np.asarray(genreList)
    genreListDict = dict(enumerate(genreList))
    ratingList = mlratingsFile.rating.unique()
    ratingList = np.asarray(ratingList)
    ratingListDict = dict(enumerate(ratingList))

    movieListDictInverse = invertDictionary(movieListDict)
    genreListDictInverse = invertDictionary(genreListDict)
    ratingListDictInverse = invertDictionary(ratingListDict)

    movieNotWatched = list(set(movieList) ^ set(tagRatingUserMovies))

    # declaring a tensor with three modes - with movie, tags and ratings
    T = np.zeros((movieList.shape[0], genreList.shape[0], ratingList.shape[0]))
    arrayofvalues = []

    for i in movieList:
        if i in movieRatingDict:
            if i in movieGenreDict:
                movieTags = movieGenreDict[i]
                rList = movieRatingDict[i]
                for j in movieTags:
                    for k in rList:
                        mIndex = movieListDictInverse[i]
                        gIndex = genreListDictInverse[j]
                        rIndex = ratingListDictInverse[k]
                        avgRatingValue = avgRatingDict[i][0]
                        if k >= avgRatingValue:
                            T[mIndex, gIndex, rIndex] = 1
                            arrayofvalues.append([mIndex, gIndex, rIndex])
                        else:
                            T[mIndex, gIndex, rIndex] = 0

    # building the tensor using sktensor
    tensor = dtensor(T)

    # applying CP-decomposition with ALS(Alternating Least Squares)
    U, fit, itr, exectimes, P = cp_als(tensor, 5, init='random')

    latent_semantics_movie = pd.DataFrame(
        columns=['movie', 'ls1', 'ls2', 'ls3', 'ls4', 'ls5'])
    latent_semantics_movie['movie'] = movieList
    latent_semantics_movie['ls1'] = U[0][:, 0]
    latent_semantics_movie['ls2'] = U[0][:, 1]
    latent_semantics_movie['ls3'] = U[0][:, 2]
    latent_semantics_movie['ls4'] = U[0][:, 3]
    latent_semantics_movie['ls5'] = U[0][:, 4]

    x = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin(
        tagRatingUserMovies)].values
    for i in range(len(x)):
        for j in range(1, len(x[0])):
            x[i][j] = x[i][j] * timeWeights.get(x[i][0])
    y = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin(
        movieNotWatched)].values

    cossim = cosine_similarity(x[:, 1:], y[:, 1:])
    simDF = pd.DataFrame(cossim,
                         index=tagRatingUserMovies,
                         columns=movieNotWatched)
    simDF.to_csv('cos.csv')

    temp = simDF.values.tolist()
    sorted_movies_for_each_watched_movieDict = []
    for i in range(len(temp)):
        sorted_movies_for_each_watched_movie = np.argsort(temp[i])
        sorted_movies_for_each_watched_movieDict.append(
            sorted_movies_for_each_watched_movie.tolist()[:10])

    sortedMoviesRavel = [
        item for sublist in sorted_movies_for_each_watched_movieDict
        for item in sublist
    ]
    freq = {}
    for i in range(len(sorted_movies_for_each_watched_movieDict)):
        for j in range(len(sorted_movies_for_each_watched_movieDict[0])):
            freq[sorted_movies_for_each_watched_movieDict[i][j]] = 0

    for i in range(len(sorted_movies_for_each_watched_movieDict)):
        for j in range(len(sorted_movies_for_each_watched_movieDict[0])):
            freq[sorted_movies_for_each_watched_movieDict[i][j]] += (10 - j)

    freq = OrderedDict(sorted(freq.items(), reverse=True, key=lambda t: t[1]))
    freq = freq.items()

    recommendedMovies = []
    for i in range(10):
        index = freq[i][0]
        recommendedMovies.append(y[index][0])

    relevant = []
    notRelevant = []

    choice = 'y'
    while choice != 'n':
        rel_dict = {}
        selected_dict = {}
        N = 5
        R = 0
        for i in range(len(recommendedMovies)):
            print "If ", recommendedMovies[
                i], " is relevant, enter 1. If it is not relevant, enter 0"
            relevant.append(int(raw_input()))
            rel_dict[recommendedMovies[i]] = relevant[i]
            if relevant[i] == 1:
                R = R + 1
            else:
                notRelevant.append(recommendedMovies[i])

        genreset = set()
        for movie in recommendedMovies:
            genres_list = movieGenreDict[movie]
            selected_dict[movie] = genres_list
            genreset = genreset.union(set(genres_list))

        genreTop5 = list(genreset)
        ri = []
        ni = []
        for i in range(0, len(genreTop5)):
            ri.append(0)
            ni.append(0)
        for m in recommendedMovies:
            for i in range(0, len(genreTop5)):
                l1 = selected_dict[m]
                rval = rel_dict[m]
                if genreTop5[i] in l1:
                    ni[i] = ni[i] + 1
                    if rval == 1:
                        ri[i] = ri[i] + 1

        pr_feedback = {}

        for i in range(0, len(genreTop5)):
            try:
                numerator = ri[i] / (R - ri[i])
                denominator = (ni[i] - ri[i]) / (N - R - ni[i] + ri[i])
                pr = math.log((numerator / denominator), 2)
            except:
                numerator = (ri[i] + 0.5) / (R - ri[i] + 1)
                denominator = (ni[i] - ri[i] + 0.5) / (N - R - ni[i] + ri[i] +
                                                       1)
                pr = math.log((numerator / denominator), 2)

            pr_feedback[genreTop5[i]] = pr

        for key, value in pr_feedback.iteritems():
            pr_feedback[key] = (pr_feedback[key] - min(
                pr_feedback.values())) / max(pr_feedback.values())

        pr_dict = {}
        for i in movieList:
            if i in movieRatingDict:
                if i in movieGenreDict:
                    movieTags = movieGenreDict[i]
                    rList = movieRatingDict[i]
                    for j in movieTags:
                        for k in rList:
                            mIndex = movieListDictInverse[i]
                            tIndex = genreListDictInverse[j]
                            rIndex = ratingListDictInverse[k]
                            avgRatingValue = avgRatingDict[i][0]
                            if k >= avgRatingValue:
                                if j in genreTop5:
                                    T[mIndex, tIndex, rIndex] *= pr_feedback[j]

        tensor = dtensor(T)

        # applying CP-decomposition with ALS(Alternating Least Squares)
        U, fit, itr, exectimes, P = cp_als(tensor, 5, init='random')

        latent_semantics_movie = pd.DataFrame(
            columns=['movie', 'ls1', 'ls2', 'ls3', 'ls4', 'ls5'])
        latent_semantics_movie['movie'] = movieList
        latent_semantics_movie['ls1'] = U[0][:, 0]
        latent_semantics_movie['ls2'] = U[0][:, 1]
        latent_semantics_movie['ls3'] = U[0][:, 2]
        latent_semantics_movie['ls4'] = U[0][:, 3]
        latent_semantics_movie['ls5'] = U[0][:, 4]

        x = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin(
            tagRatingUserMovies)].values
        for i in range(len(x)):
            for j in range(1, len(x[0])):
                x[i][j] = x[i][j] * timeWeights.get(x[i][0])
        y = latent_semantics_movie.loc[latent_semantics_movie['movie'].isin(
            movieNotWatched)].values
        cossim = cosine_similarity(x[:, 1:], y[:, 1:])
        simDF = pd.DataFrame(cossim,
                             index=tagRatingUserMovies,
                             columns=movieNotWatched)

        temp = simDF.values.tolist()
        sorted_movies_for_each_watched_movieDict = []
        for i in range(len(temp)):
            sorted_movies_for_each_watched_movie = np.argsort(temp[i])
            sorted_movies_for_each_watched_movieDict.append(
                sorted_movies_for_each_watched_movie.tolist()[:10])

        sortedMoviesRavel = [
            item for sublist in sorted_movies_for_each_watched_movieDict
            for item in sublist
        ]
        freq = {}
        for i in range(len(sorted_movies_for_each_watched_movieDict)):
            for j in range(len(sorted_movies_for_each_watched_movieDict[0])):
                freq[sorted_movies_for_each_watched_movieDict[i][j]] = 0

        for i in range(len(sorted_movies_for_each_watched_movieDict)):
            for j in range(len(sorted_movies_for_each_watched_movieDict[0])):
                freq[sorted_movies_for_each_watched_movieDict[i][j]] += (10 -
                                                                         j)

        freq = OrderedDict(
            sorted(freq.items(), reverse=True, key=lambda t: t[1]))
        freq = freq.items()

        recommendedMovies = []
        for i in range(10):
            index = freq[i][0]
            recommendedMovies.append(y[index][0])
        print recommendedMovies
        relevant = []

        print('Do you want to continue? Enter Y for yes and N for No')
        choice = raw_input()
        while choice not in ['y', 'n']:
            print('invalid input')
            choice = raw_input()
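

# Hedged sketch (not from the original project) of the probabilistic relevance
# feedback weight computed above for each genre, in its smoothed fallback form:
#   pr_i = log2( ((r_i + 0.5) / (R - r_i + 1)) / ((n_i - r_i + 0.5) / (N - R - n_i + r_i + 1)) )
# where r_i = relevant recommended movies containing genre i, n_i = recommended
# movies containing genre i, R = total relevant movies, N = total recommended.
import math


def feedback_weight_sketch(ri, ni, R, N):
    numerator = (ri + 0.5) / float(R - ri + 1)
    denominator = (ni - ri + 0.5) / float(N - R - ni + ri + 1)
    return math.log(numerator / denominator, 2)
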
Example #36
0
def buildSimilarityMatrix():

    #### Ideas for Optimization:
    # DONE: if you come across a duplicate pair, for ex. (m1,m2) and (m2,m1), look up
    # the previously computed value instead of recomputing it

    WatchedBoth = (
        "MATCH (U:USER) WHERE (U:USER)-[:Has_rated]->(:MOVIE{id:{A}}) "
        "AND (U:USER)-[:Has_rated]->(:MOVIE{id:{B}})  RETURN U.id")

    findRating = "MATCH (USER {id:{user_id}})-[r:Has_rated]->(MOVIE{id:{movie_id}}) RETURN r.rating"

    numMovies = graph.evaluate("MATCH (m:MOVIE) RETURN COUNT(m)")

    m1_ratings = []
    m2_ratings = []
    angle_in_degrees = 0
    Row = []
    matrix = []

    pr = cProfile.Profile()

    # timing the queries

    for m1 in range(1, 3):  #numMovies+1):
        pr.enable()
        for m2 in range(1, numMovies + 1):

            # reuse the symmetric value already computed for (m2, m1)
            if m2 < m1:
                angle_in_degrees = matrix[m2 - 1][m1 - 1]

            # on diagonal
            elif m1 == m2:
                angle_in_degrees = 0

            else:
                # Find 'users' who've watched both m1 and m2
                users = graph.run(WatchedBoth, {
                    "A": m1,
                    "B": m2
                }).data()  #[0]['U.id']

                if len(users) == 0:
                    angle_in_degrees = 90

                else:

                    # create arrays of m1's and m2's ratings
                    for u in users:
                        m1rating = graph.evaluate(findRating, {
                            "user_id": u['U.id'],
                            "movie_id": m1
                        })
                        m1_ratings.append(m1rating)

                        m2rating = graph.evaluate(findRating, {
                            "user_id": u['U.id'],
                            "movie_id": m2
                        })
                        m2_ratings.append(m2rating)

                    # create vectors v1 and v2
                    v1 = np.array(m1_ratings).reshape(1, -1)
                    v2 = np.array(m2_ratings).reshape(1, -1)

                    # calculate cosine similarity
                    similarity = cosine_similarity(v1, v2)
                    similarity = np.clip(similarity, -1, 1)
                    angle_in_radians = math.acos(similarity)
                    angle_in_degrees = math.degrees(angle_in_radians)

                    m1_ratings = []
                    m2_ratings = []
                    users = []

            Row.append(angle_in_degrees)
        pr.disable()
        pr.print_stats()
        matrix.append(Row)
        Row = []

    df = pd.DataFrame(matrix)

    print(df)

    return
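

# Hedged helper sketch (not part of the original code): the similarity-to-angle
# conversion used above, with the cosine value clamped to [-1, 1] before acos.
import math


def cosine_to_degrees_sketch(sim):
    return math.degrees(math.acos(max(-1.0, min(1.0, sim))))

# cosine_to_degrees_sketch(1.0) == 0.0 and cosine_to_degrees_sketch(0.0) == 90.0
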
Example #37
0
def docSimilarity(invIndex, query1, query2):
    documentsMatrix = sparseMatrix(invIndex, query1)
    queryVector = sparseMatrix(invIndex, query2)
    similarity = cosine_similarity(documentsMatrix, queryVector)
    return similarity
def return_top_ranked_sentences(news_content):
    import numpy as np
    import pandas as pd
    from nltk import tokenize

    def process_text(news_content):

        #inputfilepath = inputfilepath = "C:\\Users\\ekrigos\\Desktop\\DataS\\REVA\\finalyearProj\\pdf_analysis\\inputs_healthHazard\\"
        #filename = 'server_noise_health_hazard.pdf.txt'
        #        fileName1 = inputfilepath + filename
        #
        #        file = open(fileName1,"r")
        #
        #        fullText = file.read()
        #        file.close()

        tokens_lst = tokenize.sent_tokenize(news_content)
        #print(tokens_lst[:1])

        return tokens_lst

    sentences = process_text(news_content)
    #print(sentences[:1])

    #clean the data
    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]

    #remove stopwords
    #nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')

    # function to remove stopwords
    def remove_stopwords(sen):
        sen_new = " ".join([i for i in sen if i not in stop_words])
        return sen_new

    # remove stopwords from the sentences
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

    #extract the word embeddings
    word_embeddings = {}
    f = open('c:\\datasets\\glove\\glove.6B.100d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()

    len(word_embeddings)

    #let's create vectors for our sentences
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum(
                [word_embeddings.get(w, np.zeros((100, )))
                 for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100, ))
        sentence_vectors.append(v)

    #similarity matrix representation
    #Let’s first define a zero matrix of dimensions (n * n).
    #We will initialize this matrix with cosine similarity scores of the sentences.
    #Here, n is the number of sentences.

    # similarity matrix
    sim_mat = np.zeros([len(sentences), len(sentences)])

    #We will use Cosine Similarity to compute the similarity between a pair of sentences.
    from sklearn.metrics.pairwise import cosine_similarity

    #And initialize the matrix with cosine similarity scores.
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 100),
                    sentence_vectors[j].reshape(1, 100))[0, 0]

    # =============================================================================
    #  let’s convert the similarity matrix sim_mat into a graph.
    #  The nodes of this graph will represent the sentences and
    #  the edges will represent the similarity scores between the sentences.
    #  On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings.
    # =============================================================================

    import networkx as nx

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    #Summary Extraction
    #extract the top N sentences based on their rankings for summary generation
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    top_ranked_sentences = []

    # Extract top 10 sentences as the summary
    for i in range(10):
        #print(ranked_sentences[i][1])
        top_ranked_sentences.append(ranked_sentences[i][1])

    print("Inside Summary function")
    print(top_ranked_sentences)
    return top_ranked_sentences


#dump into a file
#with open(output_file_name_top_ranked, 'w') as filehandle:
#    for listitem in top_ranked_sentences:
#        filehandle.write('%s\n' % listitem)

##########
#https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/
#########
Example #39
0
def get_loadings(agg_doc_vecs_path, agg_dic_vecs_path, out_path, num_features, delimiter='\t'):
    """Get loadings between each document vector in agg-doc_vecs_path and each dictionary dimension in
    agg_dic_vecs_path"""


    n_docs = float(file_length.file_len(agg_doc_vecs_path))
    prog_counter = 0
    counter = 0
    dic_vecs = pd.read_csv(agg_dic_vecs_path, sep=delimiter)
    dic_vecs = dic_vecs.to_dict(orient='list')

    with open(agg_doc_vecs_path, 'rb') as doc_vecs, open(out_path, 'wb') as out_file:

        doc_vecs_reader = csv.reader(doc_vecs, delimiter='\t')
        doc_vecs_reader.next()

        writer = csv.writer(out_file, delimiter='\t')
        fieldnames_out = ['ID'] + dic_vecs.keys()

        writer.writerow(fieldnames_out)

        for doc_vec in doc_vecs_reader:

            prog_counter += 1
            counter += 1
            doc_id = doc_vec[0]
            out_row = [doc_id]

            for dic_vec in dic_vecs.keys():
                doc_vec = [float(x) for x in doc_vec[-num_features:]]
                dic_similarity = cosine_similarity(doc_vec, dic_vecs[dic_vec])[0][0]
                out_row.append(dic_similarity)

            writer.writerow(out_row)

            if prog_counter >= 0.05 * n_docs:
                prog_counter = 0
                update_progress(counter / (n_docs - 1))

        print 'Finished calculating document loadings'


#get_loadings('out_test.txt', 'dic_vecs_out_test.tsv', 'hope.tsv')
#
# if __name__ == "__main__":
#
# # This is not finished.
#
#     if sys.argv[1] is 'make_dic_vecs':
#
#         model, num_features, model_word_set = load_model(model_path=sys.argv[2])
#         dic_terms = getDicTerms(sys.argv[3])
#         dic_vecs = getAggDicVec(dic_terms)
#         writeDicVecs(dic_vecs=dic_vecs, out_path=sys.argv[4])
#
#     elif sys.argv[2] is 'make_doc_vecs':
#
#         model, num_features, model_word_set = load_model(model_path=sys.argv[2])
#         getAggDocVecs(docs_path=sys.argv[3], out_path=sys.argv[4], text_col=sys.argv[5])
#
#
#     elif sys.argv[3] is 'get_loadings':
#
#         model, num_features, model_word_set = load_model(model_path=sys.argv[2])
#         get_loadings(agg_doc_vecs=sys.argv[3], agg_dic_vecs=sys.argv[4], out_path=sys.argv[5])
Example #40
0
def eval_emb_metrics(hypothesis, references, emb=None):
    from sklearn.metrics.pairwise import cosine_similarity
    from nltk.tokenize import word_tokenize
    import numpy as np
    if emb is None:
        emb = Embedding()

    emb_hyps = []
    avg_emb_hyps = []
    extreme_emb_hyps = []
    for hyp in hypothesis:
        embs = [emb.vec(word) for word in word_tokenize(hyp)]

        avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0))
        assert not np.any(np.isnan(avg_emb))

        maxemb = np.max(embs, axis=0)
        minemb = np.min(embs, axis=0)
        extreme_emb = list(
            map(
                lambda x, y: x if ((x > y or x < -y) and y > 0) or (
                    (x < y or x > -y) and y < 0) else y, maxemb, minemb))

        emb_hyps.append(embs)
        avg_emb_hyps.append(avg_emb)
        extreme_emb_hyps.append(extreme_emb)

    emb_refs = []
    avg_emb_refs = []
    extreme_emb_refs = []
    for refsource in references:
        emb_refsource = []
        avg_emb_refsource = []
        extreme_emb_refsource = []
        for ref in refsource:
            embs = [emb.vec(word) for word in word_tokenize(ref)]

            avg_emb = np.sum(embs, axis=0) / np.linalg.norm(
                np.sum(embs, axis=0))
            assert not np.any(np.isnan(avg_emb))

            maxemb = np.max(embs, axis=0)
            minemb = np.min(embs, axis=0)
            extreme_emb = list(
                map(
                    lambda x, y: x if ((x > y or x < -y) and y > 0) or (
                        (x < y or x > -y) and y < 0) else y, maxemb, minemb))

            emb_refsource.append(embs)
            avg_emb_refsource.append(avg_emb)
            extreme_emb_refsource.append(extreme_emb)
        emb_refs.append(emb_refsource)
        avg_emb_refs.append(avg_emb_refsource)
        extreme_emb_refs.append(extreme_emb_refsource)

    cos_similarity = list(
        map(lambda refv: cosine_similarity(refv, avg_emb_hyps).diagonal(),
            avg_emb_refs))
    cos_similarity = np.max(cos_similarity, axis=0).mean()
    average = "EmbeddingAverageCosineSimilairty: %0.6f" % (cos_similarity)

    cos_similarity = list(
        map(lambda refv: cosine_similarity(refv, extreme_emb_hyps).diagonal(),
            extreme_emb_refs))
    cos_similarity = np.max(cos_similarity, axis=0).mean()
    extrema = "VectorExtremaCosineSimilarity: %0.6f" % (cos_similarity)

    scores = []
    for emb_refsource in emb_refs:
        score_source = []
        for emb_ref, emb_hyp in zip(emb_refsource, emb_hyps):
            simi_matrix = cosine_similarity(emb_ref, emb_hyp)
            dir1 = simi_matrix.max(axis=0).mean()
            dir2 = simi_matrix.max(axis=1).mean()
            score_source.append((dir1 + dir2) / 2)
        scores.append(score_source)
    scores = np.max(scores, axis=0).mean()
    greedy = "GreedyMatchingScore: %0.6f" % (scores)

    rval = "\n".join([average, extrema, greedy])
    return rval
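
# Hedged illustration (not from the original): a common NumPy formulation of
# vector extrema, close to what the lambdas above compute - per dimension, keep
# whichever of the column-wise max/min has the larger magnitude.
import numpy as np

def vector_extrema_sketch(embs):
    embs = np.asarray(embs)
    maxemb, minemb = embs.max(axis=0), embs.min(axis=0)
    return np.where(np.abs(maxemb) >= np.abs(minemb), maxemb, minemb)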
Example #41
0
def main(output_dir, sim_threshold, bucket_size):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dev_qids = set([
        19,
        23,
        27,
        34,
        35,
    ] + [7, 24])

    summary_data = []

    K_data = []
    for event in cuttsum.events.get_events():
        if event.query_num in dev_qids: continue

        print event

        semsim = event2semsim(event)
        istream = get_input_stream(event,
                                   False,
                                   extractor="goose",
                                   thresh=.8,
                                   delay=None,
                                   topk=20)
        prev_time = 0
        cache = None

        clusters = []

        max_h = len(event.list_event_hours()) - 1

        for h, hour in enumerate(event.list_event_hours()):
            if h % bucket_size != 0 and h != max_h:
                continue

            current_time = epoch(hour)
            input_sents = istream[
                (istream["timestamp"] < current_time) & \
                (istream["timestamp"] >= prev_time)]
            len_select = input_sents["lemmas stopped"].apply(len) > 10
            input_sents = input_sents[len_select]

            if len(input_sents) <= 1: continue

            stems = input_sents["stems"].apply(lambda x: ' '.join(x)).tolist()
            X = semsim.transform(stems)
            K = -(1 - cosine_similarity(X))
            K_ma = np.ma.masked_array(K, np.eye(K.shape[0]))
            Kmin = np.ma.min(K_ma)
            Kmax = np.ma.max(K_ma)
            median = np.ma.median(K_ma)[0]
            print "SYS TIME:", hour, "# SENTS:", K.shape[0],
            print "min/median/max pref: {}/{}/{}".format(Kmin, median, Kmax)

            #
            ap = AffinityPropagation(affinity="precomputed",
                                     verbose=True,
                                     max_iter=1000)
            ap.fit(K)
            labels = ap.labels_
            if ap.cluster_centers_indices_ is not None:
                for c in ap.cluster_centers_indices_:
                    if cache is None:
                        cache = X[c]
                        updates_df = \
                            input_sents.reset_index(drop=True).iloc[c]
                        updates_df["query id"] = event.query_num
                        updates_df["system timestamp"] = current_time
                        summary_data.append(updates_df[[
                            "query id", "stream id", "sent id",
                            "system timestamp", "sent text"
                        ]].to_frame().T)

                    else:
                        Ksum = cosine_similarity(cache, X[c])
                        if Ksum.max() < sim_threshold:
                            cache = np.vstack([cache, X[c]])
                            updates_df = \
                                input_sents.reset_index(drop=True).iloc[c]
                            updates_df["query id"] = event.query_num
                            updates_df["system timestamp"] = current_time
                            summary_data.append(updates_df[[
                                "query id", "stream id", "sent id",
                                "system timestamp", "sent text"
                            ]].to_frame().T)

            prev_time = current_time

    df = pd.DataFrame(K_data, columns=["min", "max", "median"])
    print df
    print df.mean()
    print df.std()
    print df.max()
    df = pd.concat(summary_data)
    df["conf"] = .5
    df["team id"] = "AP"
    df["run id"] = "sim{}_bs{}".format(sim_threshold, bucket_size)
    print df
    of = os.path.join(
        output_dir,
        "ap." + "sim{}_bs{}.tsv".format(sim_threshold, bucket_size))
    cols = [
        "query id", "team id", "run id", "stream id", "sent id",
        "system timestamp", "conf"
    ]
    df[cols].to_csv(of, sep="\t", header=False, index=False)

def combinedfeatures(row):
    return row['keywords'] + ' ' + row['cast'] + ' ' + row[
        'genres'] + ' ' + row['director']


for feature in features:
    df[feature] = df[feature].fillna('')
df['combinedfeature'] = df.apply(combinedfeatures, axis=1)

#print(df['combinedfeature'])
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combinedfeature'])

cosine_sim = cosine_similarity(count_matrix)


def get_index_from_title(title):
    return df[df.title == title]['index'].values[0]


def get_title_from_index(index):
    return df[df.index == index]['title'].values[0]


movie_user_liked = input('enter the movie name : ')
movie_index = get_index_from_title(movie_user_liked)
similar_movies = list(enumerate(cosine_sim[movie_index]))

sorted_similar_movies = sorted(similar_movies,
                               key=lambda x: x[1],
                               reverse=True)
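
# Hedged usage sketch (the original example is cut off here): print the five
# most similar titles, skipping the first entry, which is the queried movie itself.
for similar_index, similarity_score in sorted_similar_movies[1:6]:
    print(get_title_from_index(similar_index), round(similarity_score, 3))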
Example #43
0
def similarity(vec1, vec2):
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)

    return cosine_similarity(vec1, vec2)[0][0]
Example #44
0
def first_level_grouping_within_group(feature_map_dict,
                                      encoded_list_rearrange_concat,
                                      mask_arr,
                                      all_keys,
                                      keys_1d,
                                      keys_2d,
                                      keys_3d=[]):
    height = 32
    width = 20
    relation_1d_df = pd.DataFrame(0, columns=keys_1d, index=keys_1d)
    relation_2d_df = pd.DataFrame(0, columns=keys_2d, index=keys_2d)
    relation_3d_df = pd.DataFrame(0, columns=keys_3d, index=keys_3d)
    num_data = len(encoded_list_rearrange_concat[0])
    timestep = 24
    # num_data
    for n in range(num_data):
        print('n: ', n)
        for ds_name1 in all_keys:
            # 1D case
            if ds_name1 in keys_1d:
                temp_arr1 = feature_map_dict[ds_name1][n, :]  # (24, 1, 1, 1)
                # (24, 1) - > [32, 20, 24]
                dim1 = temp_arr1.shape[0]  # number of layers in the 2d data
                #         dim1 = temp_arr1.shape[-1]  # number of layers in the 2d data
                for ds_name2 in all_keys:
                    # 1D VS 1D
                    if ds_name2 in keys_1d:
                        temp_arr2 = feature_map_dict[ds_name2][n, :]
                        sim_sparse = cosine_similarity(
                            temp_arr1.reshape(1, -1), temp_arr2.reshape(1, -1))
                        ave_SR = sim_sparse[0][0]
                        relation_1d_df.loc[ds_name1, ds_name2] += ave_SR

            # 2D case
            if ds_name1 in keys_2d:
                temp_arr1 = feature_map_dict[ds_name1][
                    n, :, :, :]  # [32, 20, 1]
                # duplicate to [32, 20, 24]
                temp_arr1_mean_dup = np.repeat(temp_arr1, timestep, axis=-1)
                for ds_name2 in all_keys:
                    # 2D Vs 2D
                    # all duplicate to 3D
                    if ds_name2 in keys_2d:

                        ave_SR = 0  # average spearman correlation
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :]

                        compress_arr2 = remove_outside_cells(
                            temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(
                            temp_arr1, mask_arr)

                        sim_sparse = cosine_similarity(
                            compress_arr1.reshape(1, -1),
                            compress_arr2.reshape(1, -1))

                        ave_SR = sim_sparse[0][0]
                        relation_2d_df.loc[ds_name1, ds_name2] += ave_SR

            # 3D
            if ds_name1 in keys_3d:
                temp_arr1 = feature_map_dict[ds_name1][
                    n, :, :, :, :]  # [24, 32, 20, 1]
                temp_arr1 = np.squeeze(temp_arr1, axis=-1)  #[24, 32, 20]
                temp_arr1 = np.moveaxis(temp_arr1, 0, -1)  # (32, 20, 24)

                for ds_name2 in all_keys:

                    # 3D VS 3D
                    # flatten and compare. Because 3rd dimension contains
                    # temporal information
                    if ds_name2 in keys_3d:
                        temp_arr2 = feature_map_dict[ds_name2][n, :, :, :, :]
                        temp_arr2 = np.squeeze(temp_arr2,
                                               axis=-1)  #[24, 32, 20]
                        temp_arr2 = np.moveaxis(temp_arr2, 0,
                                                -1)  # (32, 20, 24)

                        ave_SR = 0  # average spearman correlation
                        compress_arr2 = remove_outside_cells(
                            temp_arr2, mask_arr)
                        compress_arr1 = remove_outside_cells(
                            temp_arr1, mask_arr)

                        sim_sparse = cosine_similarity(
                            compress_arr1.reshape(1, -1),
                            compress_arr2.reshape(1, -1))

                        ave_SR = float(sim_sparse[0][0])
                        relation_3d_df.loc[ds_name1, ds_name2] += ave_SR

    relation_1d_df = relation_1d_df / num_data
    relation_2d_df = relation_2d_df / num_data
    relation_3d_df = relation_3d_df / num_data
    return relation_1d_df, relation_2d_df, relation_3d_df
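
# Hedged sketch of the remove_outside_cells() helper assumed above (the real
# implementation is not shown here): keep only grid cells marked valid by mask_arr.
import numpy as np

def remove_outside_cells_sketch(arr, mask_arr):
    # arr: (height, width, channels); mask_arr: (height, width), nonzero = inside the study area
    return arr[np.asarray(mask_arr) > 0]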
def cosine_similarity_of_dataframes(dataframe_1, dataframe_2):
    # renamed from cosine_similarity: the original name shadowed sklearn's
    # cosine_similarity, so the wrapper called itself recursively
    return float(cosine_similarity(dataframe_1.values, dataframe_2.values))
Example #46
0
def get_cosine_sim(*strs):
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
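
# Hedged sketch of the get_vectors() helper this snippet relies on (not shown in
# the original): turn the raw strings into count vectors with a shared vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

def get_vectors_sketch(*strs):
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(list(strs)).toarray()
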
def topic_wtv(wtv_model, n_top_words, topic_model):
    ret_val = []
    for topic in topic_model.components_:
        ret_val.append(np.sum(np.array([wtv_model[item[0]]*item[1] for item in  [(tf_feature_names[i],topic[i]) for i \
                               in np.argsort(topic)[:-n_top_words - 1:-1]]]),axis=0)/np.sum(np.sort(topic)[:-n_top_words - 1:-1]))
    return np.array(ret_val)


#Word for which reviews are requested
category = 'entertainment'
#Column name for True column
category_label = 'show'
#Weighted average word2vec vector
topic_av_wtv = topic_wtv(tweet_w2v, lda.components_.shape[1], lda)
#Calculate similarity of word2vec vector of word with LDA topic
topic_sim = cosine_similarity(topic_av_wtv, tweet_w2v[category].reshape(1, -1))
#Taking only top 3 important topic
topic_sim[np.argsort(topic_sim.reshape((1, -1)))[0][::-1][3:]] = 0
#Checking for review similarity with LDA topic distribution of review
rewiew_category = cosine_similarity(topic_sim.reshape((1, -1)), BoW_lda)
rest1['cat_sim_unscale'] = rewiew_category[0]
#Scaling review importance between 0 and 1
rest1['cat_sim'] = (rest1.cat_sim_unscale - min(rest1.cat_sim_unscale)) / (
    max(rest1.cat_sim_unscale) - min(rest1.cat_sim_unscale))
#Creating column for ground truth
rest1['Pizza_Italian'] = rest1.categories.apply(lambda a: 1 if (\
     ('Pizza' in a)\
     |('Italian' in a)) else 0)
rest1['beverage'] = rest1.categories.apply(lambda a: 1 if (\
     ('Tea Rooms' in a)|\
     ('Wineries' in a)|\
Example #48
0
def recommendation_drink_of_contents_based(LiquorNum="", stopword="", top=6):
    # The pairwise similarities between items have already been computed and stored in Redis.
    # They are recomputed whenever a new liquor is added or an existing one is removed.

    ## This has to be fetched from the DB.
    drink_dataframe = pd.DataFrame(models.Liquor.objects.all().values())

    drink_dataframe.set_index(drink_dataframe['liquornumber'], inplace=True)
    drink_dataframe.drop(columns=['liquornumber'], inplace=True)

    indexList = list(drink_dataframe['liquorname'])
    target = int(LiquorNum) - 1
    keyword = indexList[target]
    print("Computing liquors similar to " + keyword + " .... ")
    # Exclude liquors containing ingredients the user cannot consume from the recommendations
    if len(stopword) > 0:
        drink_dataframe.drop(index=[
            i for i, item in enumerate(drink_dataframe["liquoringredient"])
            if len(list(set([stopword]) & set(item.split(",")))) > 0
        ],
                             inplace=True)

    # Compute similarity from the numeric data, excluding origin region, category, and ingredients
    exceptList = [
        "liquorname", "liquorarea", "liquoringredient", "url", 'liquorcategory'
    ]
    drink_dataframe_without_literal = drink_dataframe.drop(columns=exceptList)

    # Normalize the data with MinMax scaling
    scaleList = [
        item for item in drink_dataframe_without_literal.columns
        if item not in exceptList
    ]

    # Select the list of columns to normalize
    scaler = MinMaxScaler()
    drink_dataframe_without_literal[scaleList] = scaler.fit_transform(
        drink_dataframe_without_literal[scaleList])

    drink_datafrmae_with_normalization = drink_dataframe_without_literal[
        scaleList]

    # Dictionary for Pearson & cosine similarity results
    similarity_dict = dict()

    # After computing Pearson similarity, recommend the 5 most similar traditional liquors
    pearson_similarity_metrix = drink_datafrmae_with_normalization.T.corr(
        method="pearson").to_numpy()

    topid = sorted(range(len(pearson_similarity_metrix[target])),
                   key=lambda i: pearson_similarity_metrix[target][i])[-top:]

    return [i + 1 for i in reversed(topid)][1:]

    # For Testing
    # After computing cosine similarity, recommend the 5 most similar traditional liquors
    cosine_similarity_metrix = cosine_similarity(
        drink_datafrmae_with_normalization)

    # The item most similar to the query liquor will be the liquor itself
    index = LiquorNum

    topid = sorted(range(len(cosine_similarity_metrix[index])),
                   key=lambda i: cosine_similarity_metrix[index][i])[-top:]
    recommendation_drink_of_contents_based_top_five = []
    for i in range(top - 2, 0, -1):
        recommendation_drink_of_contents_based_top_five.append([
            np.array(indexList[2:])[topid][:-1][i],
            round(cosine_similarity_metrix[index][topid][:-1][i] * 100, 3)
        ])
    similarity_dict["cosine"] = recommendation_drink_of_contents_based_top_five
    return similarity_dict
Example #49
0
def pipeline_test(train, test, lim_unigram):
    """

    Process test set

    Returns:
        test_set: list, of numpy arrays

    """

    # Initialise
    heads = []
    heads_track = {}
    bodies = []
    bodies_track = {}
    body_ids = []
    test_heads = []
    test_heads_track = {}
    test_bodies = []
    test_bodies_track = {}
    test_body_ids = []

    # Identify unique heads and bodies
    for instance in train.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            heads.append(head)
            heads_track[head] = 1
        if body_id not in bodies_track:
            bodies.append(train.bodies[body_id])
            bodies_track[body_id] = 1
            body_ids.append(body_id)

    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in test_heads_track:
            test_heads.append(head)
            test_heads_track[head] = 1
        if body_id not in test_bodies_track:
            test_bodies.append(test.bodies[body_id])
            test_bodies_track[body_id] = 1
            test_body_ids.append(body_id)

    # Create vectorizers and BOW and TF arrays for train set
    bow_vectorizer = CountVectorizer(max_features=lim_unigram,
                                     stop_words=stop_words)
    bow = bow_vectorizer.fit_transform(heads + bodies)

    tfreq_vectorizer = TfidfTransformer(use_idf=False).fit(bow)

    tfidf_vectorizer = TfidfVectorizer(max_features=lim_unigram, stop_words=stop_words).\
        fit(heads + bodies + test_heads + test_bodies)

    # Initialise
    test_set = []
    heads_track = {}
    bodies_track = {}
    cos_track = {}

    # Process test set
    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            head_bow = bow_vectorizer.transform([head]).toarray()
            head_tf = tfreq_vectorizer.transform(
                head_bow).toarray()[0].reshape(1, -1)
            head_tfidf = tfidf_vectorizer.transform([head]).toarray().reshape(
                1, -1)
            heads_track[head] = (head_tf, head_tfidf)
        else:
            head_tf = heads_track[head][0]
            head_tfidf = heads_track[head][1]
        if body_id not in bodies_track:
            body_bow = bow_vectorizer.transform([test.bodies[body_id]
                                                 ]).toarray()
            body_tf = tfreq_vectorizer.transform(
                body_bow).toarray()[0].reshape(1, -1)
            body_tfidf = tfidf_vectorizer.transform(
                [test.bodies[body_id]]).toarray().reshape(1, -1)
            bodies_track[body_id] = (body_tf, body_tfidf)
        else:
            body_tf = bodies_track[body_id][0]
            body_tfidf = bodies_track[body_id][1]
        if (head, body_id) not in cos_track:
            tfidf_cos = cosine_similarity(head_tfidf,
                                          body_tfidf)[0].reshape(1, 1)
            cos_track[(head, body_id)] = tfidf_cos
        else:
            tfidf_cos = cos_track[(head, body_id)]
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos])
        test_set.append(feat_vec)

    return test_set
Example #50
0
#similary_sc.to_csv(direc + 'similarity_score') 
#%% 
similary_sc = pd.read_csv(join(direc, 'similarity_score'), names = ['score']) 
 
#%%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, KernelPCA

tv = TfidfVectorizer(min_df=5, use_idf=True)
tv_matrix = tv.fit_transform(sentences)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names()

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)

#%%


##topic modeling
#lda = LatentDirichletAllocation(n_components=2, max_iter=2, random_state=0)
#dt_matrix = lda.fit_transform(tv_matrix)
#features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
#tt_matrix = lda.components_
#
#for topic_weights in tt_matrix:
#    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
#    topic = sorted(topic, key=lambda x: -x[1])
#    topic = [item for item in topic if item[1] > 0.9]
Example #51
0
def visualize_heatmap(topic_model,
                      topics: List[int] = None,
                      top_n_topics: int = None,
                      n_clusters: int = None,
                      width: int = 800,
                      height: int = 800) -> go.Figure:
    """ Visualize a heatmap of the topic's similarity matrix

    Based on the cosine similarity matrix between topic embeddings,
    a heatmap is created showing the similarity between topics.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize.
        top_n_topics: Only select the top n most frequent topics.
        n_clusters: Create n clusters and order the similarity
                    matrix by those clusters.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        fig: A plotly figure

    Usage:

    To visualize the similarity matrix of
    topics simply run:

    ```python
    topic_model.visualize_heatmap()
    ```

    Or if you want to save the resulting figure:

    ```python
    fig = topic_model.visualize_heatmap()
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/heatmap.html"
    style="width:1000px; height: 720px; border: 0px;""></iframe>
    """

    # Select topic embeddings
    if topic_model.topic_embeddings is not None:
        embeddings = np.array(topic_model.topic_embeddings)
    else:
        embeddings = topic_model.c_tf_idf

    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Order heatmap by similar clusters of topics
    if n_clusters:
        if n_clusters >= len(set(topics)):
            raise ValueError("Make sure to set `n_clusters` lower than "
                             "the total number of unique topics.")

        embeddings = embeddings[[topic + 1 for topic in topics]]
        distance_matrix = cosine_similarity(embeddings)
        Z = linkage(distance_matrix, 'ward')
        clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

        # Extract new order of topics
        mapping = {cluster: [] for cluster in clusters}
        for topic, cluster in zip(topics, clusters):
            mapping[cluster].append(topic)
        mapping = [cluster for cluster in mapping.values()]
        sorted_topics = [topic for cluster in mapping for topic in cluster]
    else:
        sorted_topics = topics

    # Select embeddings
    indices = np.array([topics.index(topic) for topic in sorted_topics])
    embeddings = embeddings[indices]
    distance_matrix = cosine_similarity(embeddings)

    # Create nicer labels
    new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics]
    new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels]
    new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels]

    fig = px.imshow(distance_matrix,
                    labels=dict(color="Similarity Score"),
                    x=new_labels,
                    y=new_labels,
                    color_continuous_scale='GnBu'
                    )

    fig.update_layout(
        title={
            'text': "<b>Similarity Matrix",
            'y': .95,
            'x': 0.55,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )
    fig.update_layout(showlegend=True)
    fig.update_layout(legend_title_text='Trend')

    return fig
def Redundancy(xi, xj, count_vect, tfidf):
    #xi and xj are two sentences in the summary
    return cosine_similarity(getTfidf(xi, count_vect, tfidf),
                             getTfidf(xj, count_vect, tfidf)).flatten()[0]
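
# Hedged sketch of the getTfidf() helper assumed above (not shown in the original):
# transform a single sentence into its tf-idf row vector using already-fitted
# CountVectorizer / TfidfTransformer objects.
def getTfidf_sketch(sentence, count_vect, tfidf):
    counts = count_vect.transform([sentence])
    return tfidf.transform(counts)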
Example #53
0
import streamlit as st
from stream import rec_sim, simulate_matches2, recommendations3, match3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import *


df_scaled2 = pd.read_csv('df_scaled2.csv', index_col='Team')
team_list = df_scaled2.index.tolist()
df_stats_poss3 = pd.read_csv('df_stats_poss3.csv', index_col='Team')
cos_sim1 = cosine_similarity(df_scaled2)
df_leagues = pd.read_csv('df_stats_leagues.csv', index_col=0)
indices = pd.Series(df_stats_poss3.index)
df_merged2 = pd.read_csv('df_merged2.csv')
df_merged2 = df_merged2.drop(df_merged2.columns[0], axis=1)
col_list1 = df_merged2.columns[62:91]
col_list2 = df_merged2.columns[6:62]
col_list2 = col_list2.append(df_merged2.columns[1:3])
results = df_merged2.drop(['home_goals', 'away_goals', 'Home', 'Away'], 1)
results['winner'] = None
results.loc[results.score > 0, 'winner'] = 1
results.loc[results.score < 0, 'winner'] = 2
results.loc[results.score == 0, 'winner'] = 0
y = results['winner'].astype(int)
X = results[col_list1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)
lr = LogisticRegression(C=0.1)
lr.fit(X_train, y_train)
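
# Hedged usage note (not in the original snippet): check the fitted classifier
# on the held-out split before simulating matches.
print(lr.score(X_test, y_test))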

def simulate_matches2(team, team2, n_matches=50):
Example #54
0
def occ_vec(u, v):

    return cosine_similarity(u, v)[0][0]
Example #55
0
def get_cos_sim(dset,
                n_cats,
                dtype,
                dset_name,
                version,
                sim_type,
                IPC_dict=None):
    """
    This will take a dataset and calculate the cosine similiarity within and
    between classes, producing a csv with results and updating a main doc.

    :param dset: data to be tested, csv, (pd or np array?)
    :param n_cats: number of classes (items per-class calculated as items/classes)
    :param dtype: binary, chan_dist or chanProp.  only needed for labelling
    :param dset_name: of dataset eg HBHW, HBLW, LBHW, LBLW
    :param version: number with 2 versions of each type
    :param sim_type: Describe the similarity e.g., HBHW or vary etc
    :param IPC_dict: defalt = None.  if the number of items per class is not
                    equal, enter a dict


    """
    print("\nrunning ** get_cos_sim()**")

    file_path = "/home/nm13850/Documents/PhD/python_v2/experiments/" \
                "within_between_dist_july2020/New_data/"
    if running_on_laptop():
        file_path = '/Users/nickmartin/Library/Mobile Documents/com~apple~CloudDocs/' \
                    'Documents/PhD/python_v2/experiments/' \
                    'within_between_dist_july2020/New_data/'

    save_path = os.path.join(file_path, 'similarity_details')

    # # enter either 'cos_sim, 'cos_dist' or 'taxi'
    distance = 'cos_sim'

    dataset = np.asarray(dset)
    items, features = np.shape(dataset)
    print(f'\ndataset: {dataset}')
    print(f'items, features: {items}, {features}')

    # add IPC dict here if class_sizes are not equal
    if IPC_dict is None:
        cat_size = int(items / n_cats)
        IPC_dict = {i: cat_size for i in range(n_cats)}
        print(f'\nequal size IPC dict\n{IPC_dict}')
    else:
        print("using IPC dict")

    # separate out the individual classes
    # start with a class indices list containing zero, the index of the first class
    class_indices = [0]
    IPC_vals = list(IPC_dict.values())
    print(f'\nIPC_vals: {IPC_vals}')
    for i in range(n_cats):
        next_val = class_indices[-1] + IPC_vals[i]
        class_indices.append(next_val)

    # list of item indices at which each class starts
    start_indices = class_indices[:n_cats]
    # print(f'\nstart_indices: {start_indices}')

    # list of indices to end each class
    end_indices = class_indices[1:]
    # print(f'end_indices: {end_indices}')

    # 1. define classes as slices of dataset array
    class_list = []
    names_list = []

    for cat in range(n_cats):
        this_name = f'class_{cat}'
        names_list.append(this_name)

        this_class = dataset[start_indices[cat]:end_indices[cat], :]
        class_list.append(this_class)

        # print(f'\n{this_name}\n{this_class}\n')

    # within class similarities
    # 3. make empty list to store results.
    within_list = []

    for index, this_cat in enumerate(class_list):
        # print(f'\ngetting within class cos_sim for {names_list[index]}')

        # will do all pairwise comparisons within the given category
        if distance in [
                'cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity'
        ]:
            within_cat = cosine_similarity(this_cat)
            # the SIMILARITY between two identical vectors will be 1
        elif distance in [
                'cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance'
        ]:
            within_cat = cosine_distances(this_cat)
            # this DISTANCE between two identical vectors will be 0
            # Cosine_distance = 1 - cosine_similarity
        elif distance in ['manhattan', 'taxi']:
            within_cat = manhattan_distances(this_cat)
        else:
            raise ValueError('must input a valid distance name')

        # print(within_cat)

        # just take the upper triangle to exclude self-comparisons and duplicate pairs
        triangle_indices = np.triu_indices(IPC_dict[index], 1)
        values_for_descriptives = (within_cat[triangle_indices])
        # print(values_for_descriptives)

        data_similarity_descriptives = scipy.stats.describe(
            values_for_descriptives, axis=None)
        mean_sim = str(np.round(data_similarity_descriptives.mean, decimals=2))
        print(
            f"\nWithin group mean {distance} for {names_list[index]}: {mean_sim}"
        )

        within_list.append(mean_sim)

    print(f'\nwithin_list ({distance}): {within_list}\n')

    # between class similarities.
    print('\nbetween class similarities')
    '''
    For each pair of classes:
    - get the similarities of each item in one class to each item in the other class.
    - take the average of the whole matrix (not just the triangle) to get the
      mean similarity between these two classes.

    These mean between-class similarities go into an n_cats x (n_cats-1) matrix
    (n_cats-1 because there are no diagonals comparing a class with itself).
    Each row shows a class's similarity to all other classes.
    - Take the average of each row to get a class's mean between-class similarity.
    
    Example below shows 4 classes (rows) and the values show which other class is being compared.
    e.g., class1 is compared with classes 2, 3, 4.  Class2 is compared with classes 1, 3, 4.
           compA   compB   compC
    class1: 2       3       4
    class2: 1       3       4
    class3: 1       2       4
    class4: 1       2       3
    '''

    class_pairs_list = list(combinations(class_list, 2))
    class_names_list = list(combinations(names_list, 2))
    class_index_list = list(combinations(range(n_cats), 2))
    print(
        f'running {len(class_index_list)} between-class comparisons.\n{class_index_list}'
    )
    between_array = np.zeros(shape=(n_cats, n_cats - 1))

    for index, cat_pair in enumerate(class_pairs_list):
        cat_a = cat_pair[0]
        cat_name_a = class_names_list[index][0]

        cat_b = cat_pair[1]
        cat_name_b = class_names_list[index][1]

        print(f'\nbetween class {distance} for: {cat_name_a} and {cat_name_b}')

        # # do all pairwise comparisons between the classes
        if distance in [
                'cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity'
        ]:
            between_pairs_matrix = cosine_similarity(X=cat_a, Y=cat_b)
        elif distance in [
                'cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance'
        ]:
            between_pairs_matrix = cosine_distances(X=cat_a, Y=cat_b)
        elif distance in ['manhattan', 'taxi']:
            between_pairs_matrix = manhattan_distances(X=cat_a, Y=cat_b)
        else:
            raise ValueError('must input a valid distance name')

        print(f'{between_pairs_matrix}')
        mean_between_pair = np.mean(between_pairs_matrix)
        print(f'mean_between_pair: {mean_between_pair}')

        # write the mean to the between array in both (offset) positions
        idxA, idxB = class_index_list[index]
        print(f'add to matrix position: {idxA}, {idxB}')
        between_array[idxA, idxB - 1] = mean_between_pair
        between_array[idxB, idxA] = mean_between_pair

    print(f"\nbetween_array:\n{between_array}")

    print(f'\nmean between class {distance}')
    between_list = []
    for index in range(n_cats):
        this_row = between_array[index]
        this_mean = np.mean(this_row)
        between_list.append(this_mean)
        print(index, this_mean)

    print("I want to get the mean of the between list and the within list")
    dset_between_mean = np.mean(between_list)
    dset_between_sd = np.std(between_list)
    print(
        f"dataset mean between class distance: {dset_between_mean} std.dev: {dset_between_sd}"
    )

    print(f"check within list:\n{within_list}")
    within_list_num = [float(i) for i in within_list]
    print(f"check within_list_num:\n{within_list_num}")

    dset_within_mean = np.mean(within_list_num)
    dset_within_sd = np.std(within_list_num)
    print(
        f"dataset mean within class distance: {dset_within_mean} std.dev: {dset_within_sd}"
    )

    # # save output.
    '''for each class:
       mean within
       mean between
       paired between 
    '''
    names_list.append('Dset_means')
    names_list.append('Dset_sd')
    within_list.append(dset_within_mean)
    within_list.append(dset_within_sd)
    between_list.append(dset_between_mean)
    between_list.append(dset_between_sd)

    class_sim_dict = {
        'class': names_list,
        'between': between_list,
        'within': within_list
    }
    class_sim_df = pd.DataFrame(class_sim_dict)
    print(class_sim_df)
    csv_name = f'{dset_name}_{distance}.csv'
    csv_path = os.path.join(save_path, csv_name)
    class_sim_df.to_csv(
        csv_path,
        index_label='class',
    )

    # assemble the summary row to add to the similarity summary
    similarity_info = [
        dtype, dset_name, sim_type, version, n_cats, dset_between_mean,
        dset_between_sd, dset_within_mean, dset_within_sd
    ]
    print(f"similarity_info:\n{similarity_info}")

    # check if similarity_summary.csv exists; create it with headers if not
    summary_name = 'similarity_summary.csv'
    print(f"\nlooking for file:\n{os.path.join(save_path, summary_name)}")
    if not os.path.isfile(os.path.join(save_path, summary_name)):
        print("making summary page")
        headers = [
            "dtype", "dset_name", 'sim_type', "version", "n_cats", "mean_b",
            "sd_b", "mean_w", "sd_w"
        ]

        similarity_overview = open(os.path.join(save_path, summary_name), 'w')
        mywriter = csv.writer(similarity_overview)
        mywriter.writerow(headers)
    else:
        print("appending to summary page")
        similarity_overview = open(os.path.join(save_path, summary_name), 'a')
        mywriter = csv.writer(similarity_overview)

    mywriter.writerow(similarity_info)
    similarity_overview.close()

    return_dict = {
        "dtype": dtype,
        "dset_name": dset_name,
        'sim_type': sim_type,
        "version": version,
        "n_cats": n_cats,
        "dset_between_mean": dset_between_mean,
        "dset_between_sd": dset_between_sd,
        "dset_within_mean": dset_within_mean,
        "dset_within_sd": dset_within_sd
    }

    return return_dict
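# A condensed, self-contained sketch of the within/between computation that
# get_cos_sim() performs (no file I/O; the synthetic data, 3 classes and 4
# items per class are assumptions for illustration only):
import numpy as np
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
n_cats, ipc, features = 3, 4, 10
data = rng.random((n_cats * ipc, features))
classes = [data[c * ipc:(c + 1) * ipc] for c in range(n_cats)]

# within-class: mean of the upper triangle of each class's similarity matrix
within = [cosine_similarity(c)[np.triu_indices(ipc, 1)].mean() for c in classes]

# between-class: mean pairwise similarity for each class pair, stored in an
# n_cats x (n_cats - 1) array exactly as in the function above
between = np.zeros((n_cats, n_cats - 1))
for a, b in combinations(range(n_cats), 2):
    pair_mean = cosine_similarity(classes[a], classes[b]).mean()
    between[a, b - 1] = pair_mean
    between[b, a] = pair_mean

print('within means: ', within)
print('between means:', between.mean(axis=1))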
def Relevant(xi, count_vect, tfidf, documents):
    #xi is a sentence in the summary
    return cosine_similarity(getTfidf(xi, count_vect, tfidf),
                             getTfidf(' '.join(documents), count_vect,
                                      tfidf)).flatten()[0] + getPosition(
                                          xi, documents)
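# Hedged note: getPosition is not defined in this excerpt. One simple
# assumption consistent with its use above (rewarding sentences that appear
# early in the source documents) would be:
def getPosition(xi, documents):
    # hypothetical helper: 1/(rank+1) positional weight, 0 if the sentence never appears
    for rank, doc in enumerate(documents):
        if xi in doc:
            return 1.0 / (rank + 1)
    return 0.0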
Example #57
0
    # update weights after example
    for e in range(0, len(input_feed)):  #for all items in input
        x = sess.run(optimizer,
                     feed_dict={
                         train_inputs: [input_feed[e]],
                         train_labels: [output_feed[e]]
                     })

    # collect vectors
    inp_vectors = {}
    for v in range(0, len(vocab)):
        inp_vectors[vocab[v]] = sess.run(
            embed, feed_dict={train_inputs: [word_to_index[vocab[v]]]})

    # calculate similarities
    for v in inp_vectors:
        for vv in inp_vectors:
            sim_dict[v][vv].append(
                cosine_similarity(inp_vectors[v], inp_vectors[vv])[0][0])

print('Bass - Acoustic: ', np.mean(sim_dict['bass']['acoustic']))
print('Bass - Trout: ', np.mean(sim_dict['bass']['trout']))

print('Bass - Acoustic Std: ', np.std(sim_dict['bass']['acoustic']))
print('Bass - Trout Std: ', np.std(sim_dict['bass']['trout']))

#dframe = pd.DataFrame(sim_dict)
#dframe.to_pickle('Random_Isub_100runs.pkl')

sess.close()
Example #58
0
def get_response(q):
    my_q = vectorizer.transform([q])
    cs = cosine_similarity(my_q, vec)
    rs = pd.Series(cs[0]).sort_values(ascending=0)
    rsi = rs.index[0]
    return convo_frame.iloc[rsi]['a']
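# Hedged setup sketch: get_response() above relies on module-level globals
# (vectorizer, vec, convo_frame) that are not shown here. One way they might be
# built (the toy Q/A frame is an assumption for illustration only):
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

convo_frame = pd.DataFrame({'q': ['hello there', 'how are you', 'what is your name'],
                            'a': ['hi!', 'doing well, thanks', 'I am a bot']})
vectorizer = TfidfVectorizer()
vec = vectorizer.fit_transform(convo_frame['q'])   # TF-IDF matrix of the known questions
print(get_response('how are you doing today'))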
def main():
	synopses = []
	Id = []
	title = []
	tags = []
	for questions in question_cursor:
		synopses.append(questions["Body"])
		Id.append(questions["Id"])
		title.append(questions["Title"])
		tags.append(questions["Tags"])


	#use extend so it's a big flat list of vocab
	totalvocab_stemmed = []
	totalvocab_tokenized = []
	for i in synopses:
		i = text_preprocess(i)
		i = re.sub(r'python|Python|[^A-Za-z0-9. ]+',' ',i)
		allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
		totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list

		allwords_tokenized = tokenize_only(i)
		totalvocab_tokenized.extend(allwords_tokenized)

	vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
	print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
	print(vocab_frame.head())
	print()

	from sklearn.feature_extraction.text import TfidfVectorizer

	#define vectorizer parameters
	tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
		min_df=0.2, stop_words='english',
		use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
	
	tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses
	print(tfidf_matrix.shape)
	

	terms = tfidf_vectorizer.get_feature_names_out()

	from sklearn.metrics.pairwise import cosine_similarity
	dist = 1 - cosine_similarity(tfidf_matrix)
	print()
	print()
	
	from sklearn.cluster import KMeans
	num_clusters = 5
	km = KMeans(n_clusters=num_clusters)
	km.fit(tfidf_matrix)
	clusters = km.labels_.tolist()

	#from sklearn.externals import joblib

	#uncomment the below to save your model;
	#since I've already run my model, I am loading from the pickle
	# joblib.dump(km, 'doc_cluster.pkl')
	# km = joblib.load('doc_cluster.pkl')
	# clusters = km.labels_.tolist()


	#posts = {'Title': title, "Id": Id, 'synopsis': synopses, 'cluster': clusters}
	posts = {"Id": Id, 'synopsis': synopses, 'cluster': clusters}
	#frame = pd.DataFrame(posts, index = [clusters] , columns = ['Title', 'Id', 'cluster'])
	frame = pd.DataFrame(posts, index = [clusters] , columns = ['Id', 'cluster'])
	print(frame['cluster'].value_counts()) #number of posts per cluster (clusters from 0 to 4)
			
	print("Top terms per cluster:")
	print()
	#for each cluster, sort term indices by descending weight in the centroid
	order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

	for i in range(num_clusters):
		print("Cluster %d words: " % i, end='')
		
		for ind in order_centroids[i, :6]: #top 6 terms; change 6 for more or fewer words per cluster
			print(vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
			#print(' %s' % frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')

		print() #add whitespace
		print() #add whitespace

		print("Cluster %d ids:" % i, end='')
		for id in frame.loc[i]['Id'].values.tolist():
			print(' %s,' % id, end='  ')

		print() #add whitespace
		print() #add whitespace
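# Hedged note: question_cursor, text_preprocess, tokenize_and_stem and
# tokenize_only are assumed to be defined elsewhere in the original script;
# a standard entry point would then be:
# if __name__ == '__main__':
#     main()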
Example #60
0
        else:
            return ''

# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) #.replace(u'\xa0', u'')

metadata['soup'] = metadata.apply(create_soup, axis=1)

# import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

# compute the cosine similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
# reset index of your main dataframe and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])
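# Hedged sketch: get_recommendations is not shown in this excerpt. A typical
# definition consistent with the indices mapping and the cosine_sim2 matrix
# built above (the 10-item cutoff is an assumption) would be:
def get_recommendations(title, cosine_sim):
    idx = indices[title]                                   # row of this title in the matrix
    sim_scores = list(enumerate(cosine_sim[idx]))          # (index, score) pairs
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # skip the title itself
    movie_indices = [i for i, _ in sim_scores]
    return metadata['title'].iloc[movie_indices]           # the 10 most similar titles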

print(get_recommendations('The Dark Knight Rises', cosine_sim2))

print(get_recommendations('The Godfather', cosine_sim2))