コード例 #1
0
 def graph_communities(self,
                       data,
                       min_value_=0.6,
                       topn_=400,
                       k1=30,
                       expected_density=0.1,
                       graph_communities_df=None):
     """Cluster ads in ``data`` into product communities.

     Parameters
     ----------
     data : pandas.DataFrame
         Ads to cluster; passed straight to ``product_finder``.
     min_value_ : float
         Minimum similarity for an edge in the product graph.
     topn_ : int
         Maximum number of neighbours considered per node.
     k1 : int
         ``product_finder`` tuning parameter — TODO confirm semantics.
     expected_density : float
         Expected graph density used by ``product_finder``.
     graph_communities_df : pandas.DataFrame, optional
         Pre-computed communities; when given, stored and returned
         unchanged, skipping all clustering work.

     Returns
     -------
     pandas.DataFrame
         Clustered ads with word embeddings, date columns and a
         ``counter`` column; also stored on ``self.graph_clusters``.
     """
     # Short-circuit: caller already has the communities.
     if graph_communities_df is not None:
         self.graph_clusters = graph_communities_df
         return graph_communities_df

     clusters = product_finder(data,
                               min_value_=min_value_,
                               topn_=topn_,
                               k1=k1,
                               expected_density=expected_density)

     # Embed once — the original duplicated this call in both the try
     # and the except branch.
     embedded = apply_word_embedings(clusters['clustered_data'],
                                     model_name=self.model_name)
     try:
         starting = clusters['clustered_data'].date_min
         last_modified = clusters['clustered_data'].date_max
     except AttributeError:
         # Input has no date_min/date_max columns: fall back to today.
         # (Original used a bare except; AttributeError is the case the
         # fallback was written for.)
         today = datetime.datetime.today().strftime('%Y-%m-%d')
         starting = last_modified = today
     graph_clusters = embedded.assign(starting_date=starting,
                                      last_modified_date=last_modified)

     # Guarantee a counter column so downstream group-bys can sum it.
     if 'counter' not in graph_clusters.columns:
         graph_clusters = graph_clusters.assign(counter=1)
     self.graph_clusters = graph_clusters
     return self.graph_clusters
コード例 #2
0
    def linkage(self, title_clusters, method='ward'):
        """Build a hierarchical linkage matrix from title word vectors.

        Falls back to computing word embeddings first when
        ``title_clusters`` has no ``word_vector`` column yet.
        """
        def _vectors(frame):
            # Each word_vector cell is nested; the payload sits at [0][0].
            return np.array([cell[0][0] for cell in frame.word_vector])

        try:
            return fastcluster.linkage_vector(_vectors(title_clusters),
                                              method=method)
        except AttributeError:
            # No word_vector column: embed the titles, then retry.
            title_clusters = apply_word_embedings(title_clusters)
            return fastcluster.linkage_vector(_vectors(title_clusters),
                                              method=method)
コード例 #3
0
    def linkage(self, title_clusters, method='ward', linkage_matrix=None):
        """Return a hierarchical-clustering linkage matrix for the titles.

        Parameters
        ----------
        title_clusters : pandas.DataFrame
            Expected to carry a ``word_vector`` column; when it does not,
            embeddings are computed first with ``self.model_name``.
        method : str
            Linkage method forwarded to ``fastcluster.linkage_vector``.
        linkage_matrix : numpy.ndarray, optional
            Pre-computed linkage matrix; stored on the instance and
            returned as-is, skipping all computation.

        Returns
        -------
        numpy.ndarray
            SciPy-style linkage matrix.
        """
        # Short-circuit: caller supplies a pre-computed matrix.
        if linkage_matrix is not None:
            self.linkage_matrix = linkage_matrix
            return linkage_matrix

        try:
            data = np.array([i[0][0] for i in title_clusters.word_vector])
            Z = fastcluster.linkage_vector(data, method=method)
        except AttributeError:
            # word_vector column missing: embed the titles, then retry.
            title_clusters = apply_word_embedings(title_clusters,
                                                  model_name=self.model_name)
            data = np.array([i[0][0] for i in title_clusters.word_vector])
            Z = fastcluster.linkage_vector(data, method=method)

        return Z
コード例 #4
0
def search_engine_fasttext(data_, productsDB, min_sim=0.9, topn=1,
                           model_name='model_fast_text_sg_40',
                           column_name_db='word_vector',
                           column_name_data='word_vector',
                           model_folder_path=r'C:\ProductClustering\ProductsDB\fasttext_models\\',
                           metric='cosine',
                           pre_computed_word_vectors=True):
    """Match every ad in ``data_`` to its most similar product in ``productsDB``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Ads to label. Must carry ``column_name_data`` vectors unless
        ``pre_computed_word_vectors`` is False.
    productsDB : pandas.DataFrame
        Reference products with ``column_name_db`` vectors and a
        ``product_id`` column.
    min_sim : float
        Similarity threshold; rows below it get product id -1.
    topn : int
        Neighbours kept per row by the sparse similarity computation.
    model_name, model_folder_path : str
        Location of the pickled fastText model on disk.
    column_name_db, column_name_data : str
        Vector column names in the two frames.
    metric : str
        Only used in the log message; similarity is always cosine here.
    pre_computed_word_vectors : bool
        When False, embeddings for ``data_`` are computed first.

    Returns
    -------
    pandas.DataFrame
        The ads (embedded copy when vectors were computed here) with a
        ``product_id_fasttext`` column: matched product id or -1.
    """
    model_path = os.path.join(os.path.dirname(model_folder_path), model_name)
    # Bug fix: the file handle was previously leaked (open() never closed).
    # NOTE(review): the loaded model is never used below — TODO confirm
    # whether this load can be removed entirely.
    # Security: pickle.load must only ever see trusted local model files.
    with open(model_path, 'rb') as model_file:
        model_fast_text = pickle.load(model_file)

    print('generating reference csr_matrix from data')
    reference_cv_matrix = csr_matrix(
        np.array([i[0][0] for i in productsDB[column_name_db]]))
    data_ = data_.assign(product_id_fasttext=0)
    reference_product_id = productsDB['product_id']

    if not pre_computed_word_vectors:
        print('applying word embedings to data')
        data = apply_word_embedings(data_)
    else:
        data = data_

    cv_matrix = csr_matrix(
        np.array([i[0][0] for i in data[column_name_data]]))
    print('computing similarity matrix. metric = {}'.format(metric))
    sim_matrix = pairwise_cosine_sparse_sim(cv_matrix, reference_cv_matrix,
                                            topn=topn, min_value=min_sim)

    # One labelling loop replaces the three near-identical copies in the
    # original. The threshold is now consistently inclusive (>= min_sim);
    # the pre-computed branch used a strict > by accident.
    labels = []
    for ad in range(sim_matrix.shape[0]):
        row = sim_matrix[ad].A
        if np.max(row) >= min_sim:
            # With topn == 1 the row holds a single nonzero entry, so
            # argmax selects the same column the old nonzero()[1][0] did.
            labels.append(reference_product_id.iloc[int(np.argmax(row))])
        else:
            labels.append(-1)  # no product similar enough
        print(ad)  # progress output, kept from the original

    return data.assign(product_id_fasttext=labels)
コード例 #5
0
    def get_similar_products(self,
                             product_id=123,
                             top_n=10,
                             title=None,
                             column_name='word_vector'):
        """Find the ``top_n`` most similar products for ids or free-text titles.

        Parameters
        ----------
        product_id : int or list of int
            Product id(s) looked up in ``self.products_db``. Ignored
            when ``title`` is given.
        top_n : int
            Number of neighbours kept per query row.
        title : str or list of str, optional
            Free-text title(s); embedded with ``self.model_name``.
        column_name : str
            Column of ``self.products_db`` holding the word vectors.

        Returns
        -------
        dict
            Maps each query (id or title) to a DataFrame of similar
            products with a ``similarity`` column, sorted descending.
        """
        if title is None:
            # Bug fix: the original tested ``type(title) == None`` which is
            # always False, so the product-id path was unreachable.
            try:
                # product_id may already be list-like.
                csr1 = csr_matrix(
                    np.array([
                        i[0][0] for i in self.products_db[column_name][
                            self.products_db['product_id'].isin(product_id)]
                    ]))
                ads = product_id
            except TypeError:
                # Scalar id: Series.isin needs a list-like, so wrap it.
                csr1 = csr_matrix(
                    np.array([
                        i[0][0] for i in self.products_db[column_name][
                            self.products_db['product_id'].isin([product_id])]
                    ]))
                ads = [product_id]
        else:
            if isinstance(title, str):
                title = [title]
            titles = apply_word_embedings(pd.DataFrame({'ad_title': title}),
                                          model_name=self.model_name)
            csr1 = csr_matrix(
                np.array([i[0][0] for i in titles['word_vector']]))
            ads = title

        csr2 = csr_matrix(
            np.array([i[0][0] for i in self.products_db[column_name]]))
        sim_matrix = pairwise_cosine_sparse_sim(csr1,
                                                csr2,
                                                topn=top_n,
                                                min_value=0,
                                                word_embedings=True)

        labels = {}
        for index, ad in enumerate(ads):
            # Keep only similarities above the 0.01 noise floor; densify
            # the row once instead of three times as in the original.
            row = sim_matrix[index].A
            matches = np.where(row > 0.01)
            labels[ad] = self.products_db.iloc[matches[1]].assign(
                similarity=row[matches]).sort_values(by='similarity',
                                                     ascending=False)

        return labels
コード例 #6
0
 def graph_communities(self,
                       sample,
                       min_value_=0.6,
                       topn_=400,
                       k1=30,
                       expected_density=0.1):
     """Cluster ads in ``sample`` into product communities.

     Parameters
     ----------
     sample : pandas.DataFrame
         Ads to cluster; passed straight to ``product_finder``.
     min_value_ : float
         Minimum similarity for an edge in the product graph.
     topn_ : int
         Maximum number of neighbours considered per node.
     k1 : int
         ``product_finder`` tuning parameter — TODO confirm semantics.
     expected_density : float
         Expected graph density used by ``product_finder``.

     Returns
     -------
     pandas.DataFrame
         Clustered ads with embeddings, today's date stamps and a
         ``counter`` column; the same object stored on
         ``self.graph_clusters`` (the original returned a second,
         freshly-built copy from a duplicate ``assign`` call).
     """
     clusters = product_finder(sample,
                               min_value_=min_value_,
                               topn_=topn_,
                               k1=k1,
                               expected_density=expected_density)
     # Compute today's stamp once instead of twice.
     today = datetime.datetime.today().strftime('%Y-%m-%d')
     clusters = apply_word_embedings(clusters['clustered_data']).assign(
         starting_date=today, last_modified_date=today)
     self.graph_clusters = clusters.assign(counter=1)
     return self.graph_clusters
コード例 #7
0
    def handle_unlabeled(self,
                         data,
                         max_product_id,
                         clustering_algorithm='agglomerative'):
        """Assign product ids to ads that matched no known product.

        Parameters
        ----------
        data : pandas.DataFrame
            Unlabeled ads; embedded with ``self.model_name`` first.
        max_product_id : int
            Highest product id already in use.
            NOTE(review): currently unused — cluster labels are NOT
            offset by it and may collide with existing ids. TODO confirm
            whether callers expect an offset here.
        clustering_algorithm : str
            'agglomerative' (fastcluster + fcluster) or 'community'
            (graph communities). Any other value leaves the ads without
            a product_id column.

        Returns
        -------
        pandas.DataFrame
            Embedded ads with cluster labels (bug fix: the original
            computed this result and then implicitly returned None).
        """
        unknown_products = apply_word_embedings(data,
                                                model_name=self.model_name)

        if clustering_algorithm == 'agglomerative':
            vectors = np.array(
                [i[0][0] for i in unknown_products.word_vector])
            linkage_ = fastcluster.linkage_vector(vectors, method='ward')
            # 0.2 is the flat-cluster threshold — TODO confirm units; it
            # depends on fcluster's default 'inconsistent' criterion.
            cluster_labels = Cluster.hierarchy.fcluster(linkage_, 0.2)
            unknown_products = unknown_products.assign(
                product_id=cluster_labels)

        elif clustering_algorithm == 'community':
            unknown_products = self.graph_communities(
                unknown_products,
                min_value_=0.8,
                topn_=400,
                k1=50,
                expected_density=0.1,
                graph_communities_df=None)

        return unknown_products
コード例 #8
0
# Exploratory script: build the products DB end to end, then export seeds.
a = products_db_finder(model_name=model_name)

# Pull the last 360 days of ads via the categories-metrics SQL query.
data = get_data(
    period=360,
    final_date_string='today',
    date_format="%Y-%m-%d",
    query_path='C:\ProductClustering\sql_queries\categories_metrics.txt')
# Graph-community clustering on the raw ads, then collapse to products
# and compute the hierarchical linkage.
cluster_graph = a.graph_communities(
    data[['ad_id', 'ad_title', 'category_id', 'date_min', 'date_max']])
g1 = a.group_by_product(cluster_graph)
link = a.linkage(g1)
g1.word_vector  # NOTE(review): no-op expression — likely a REPL leftover

# Alternative path: load previously exported clusters from CSV and a
# pre-computed linkage matrix from disk (overwrites g1 and link above).
g1 = a.group_by_product(
    apply_word_embedings(
        pd.read_csv(r'C:\ProductClustering\productsDB\classtreste.csv')))
link = pickle.load(
    open(r'C:\ProductClustering\productsDB\model_fast_text_sg_40_linkage',
         'rb'))

# Cut the dendrogram at threshold 0.3 into flat product ids, re-group,
# and keep only products seen more than twice.
g1 = a.group_by_product(cluster_graph)
hierarchy = g1.assign(
    product_id=a.hierarchycal_clustering(link, threshold=0.3))
hierarchy[hierarchy.counter > 2].product_id.nunique()  # REPL inspection only
g2 = a.group_by_product(hierarchy)
g3 = g2[g2.counter > 2]

# Export the resulting seed database objects to disk.
a.export_db_dic('C:\ProductClustering\productsDB\products_db_objects\seeds\\',
                seed_name_export)

teste = a.get_similar_products(product_id=123,