def compare_different_method(self, name, args, sampling=False, times=1):
     """Run every clustering configuration in ``args`` and record the scores.

     Each of the ``times`` rounds fetches articles through
     ``_get_test_articles(sampling)``.  Every configuration then re-runs
     feature extraction on those articles, clusters them with ``HAC`` and
     validates the outcome against ``self._labeled_clusters``.  The elapsed
     wall-clock time, measured from just before feature extraction until
     validation has finished, is stored in the result under ``'time'``.

     Results are grouped per configuration, printed, and saved as CSV under
     the ``'compare all'`` label.

     Args:
         name: Text placed at the start of the output file name.
         args: Collection of ``(feature, linkage, threshold, sim, quick,
             use_idf)`` tuples; it is iterated once per round.
         sampling: Forwarded to ``_get_test_articles``.
         times: Number of rounds to run.
     """
     file_name = '{} {} sampling={} times={}'.format(
         name, self._number_article_per_test_cluster, sampling, times)
     print(file_name)
     result_table = {}
     for round_index in range(times):
         print(round_index)
         articles = self._get_test_articles(sampling)
         for config in args:
             feature, linkage, threshold, sim, quick, use_idf = config
             started_at = time.time()
             self._feature_extraction(feature, articles, use_idf=use_idf)
             model = HAC(threshold, linkage=linkage, similarity=sim)
             # A truthy ``quick`` selects quick_fit, anything else fit.
             run_clustering = model.quick_fit if quick else model.fit
             clusters = run_clustering(articles)
             result = validate_clustering(self._labeled_clusters, clusters)
             result['time'] = time.time() - started_at
             key = '{} {} {} {} {} {}'.format(feature, linkage, threshold,
                                              sim, quick, use_idf)
             result_table.setdefault(key, []).append(result)
     self._print_test_result(result_table)
     self._save_as_csv(result_table, 'compare all', file_name)
    def compare_extraction(self, args, sampling=False, times=1):
        """Compare feature-extraction settings by clustering their output.

        For every round and every configuration the feature extractor's
        ``fit_with_extraction`` is run on the test articles.  It returns a
        list of article ids; for each id, the first article carrying it is
        removed from ``self._labeled_clusters``.  The articles are then
        clustered with ``HAC.quick_fit`` using ``HAC.SIMILARITY_DOT`` and
        validated against the labeled clusters.

        Note that the labeled clusters are edited in place, so a removal
        stays in effect for all later configurations and rounds.

        Args:
            args: Collection of ``(method, k, linkage, threshold,
                with_weight)`` tuples; it is iterated once per round.
            sampling: Forwarded to ``_get_test_articles``.
            times: Number of rounds to run.
        """
        file_name = 'extraction {} sampling={} times={}'.format(
            self._number_article_per_test_cluster, sampling, times)
        print(file_name)
        result_table = {}
        for round_index in range(times):
            print(round_index)
            articles = self._get_test_articles(sampling)
            for method, k, linkage, threshold, with_weight in args:
                started_at = time.time()
                invalid_id_list = self._feature_extractor.fit_with_extraction(
                    articles, method, k, with_weight=with_weight)
                for invalid_id in invalid_id_list:
                    for cluster in self._labeled_clusters:
                        for article in cluster['articles']:
                            if article.id == invalid_id:
                                cluster['articles'].remove(article)
                                break
                        else:
                            # No match in this cluster: try the next one.
                            continue
                        # One article was removed; stop searching this id.
                        break

                model = HAC(threshold, linkage=linkage,
                            similarity=HAC.SIMILARITY_DOT)
                clusters = model.quick_fit(articles)
                result = validate_clustering(self._labeled_clusters, clusters)
                result['time'] = time.time() - started_at
                key = 'method{} k={} {} {} weight={}'.format(
                    method, k, linkage, threshold, with_weight)
                result_table.setdefault(key, []).append(result)
        self._print_test_result(result_table)
        self._save_as_csv(result_table, 'compare all', file_name)
 def find_ratio_threshold(self,
                          method,
                          k,
                          t,
                          c,
                          start_th=0.3,
                          end_th=0.8,
                          step=0.05,
                          sampling=True,
                          times=1):
     """Sweep the clustering threshold for one ratio-extraction setting.

     Each round fetches articles through ``_get_test_articles(sampling)``,
     runs ``fit_with_extraction_ratio(articles, method, k, t, c)`` on the
     feature extractor once, and then clusters the articles with
     ``HAC.quick_fit`` (centroid linkage, dot similarity) at every threshold
     of the sweep.  Each clustering is validated against
     ``self._labeled_clusters``; results are grouped per threshold, printed
     and saved as CSV under ``self._feature_mode``.

     The sweep covers ``start_th, start_th + step, ...`` up to and including
     ``end_th`` (never beyond it).  It is empty when ``end_th < start_th``.

     Args:
         method, k, t, c: Forwarded to ``fit_with_extraction_ratio`` and
             embedded in the result keys and file name.
         start_th: First threshold of the sweep.
         end_th: Upper bound of the sweep (inclusive).
         step: Distance between consecutive thresholds; must be positive.
         sampling: Forwarded to ``_get_test_articles``.
         times: Number of rounds to run.

     Raises:
         ValueError: If ``step`` is not positive (the sweep could never
             terminate).
     """
     if step <= 0:
         raise ValueError('step must be positive, got {}'.format(step))
     file_name = 'ratio th method{} k={} t={} c={} sampling={} times={}'.format(
         method, k, t, c, sampling, times)
     print(file_name)

     # Derive every threshold from its index instead of repeatedly adding
     # ``step``: accumulated float error would otherwise leak into the
     # result keys (e.g. 0.39999999999999997) and can push the sweep one
     # step past ``end_th``.  The small epsilon keeps ``end_th`` itself in
     # the sweep when the division lands just below a whole number.
     if end_th < start_th:
         threshold_count = 0
     else:
         threshold_count = int((end_th - start_th) / step + 1e-9) + 1
     thresholds = [round(start_th + index * step, 10)
                   for index in range(threshold_count)]

     result_table = {}
     for time_counter in range(times):
         print(time_counter)
         articles = self._get_test_articles(sampling)
         self._feature_extractor.fit_with_extraction_ratio(
             articles, method, k, t, c)
         for threshold in thresholds:
             print('threshold', threshold)
             clusters = HAC(
                 threshold,
                 linkage=HAC.LINKAGE_CENTROID,
                 similarity=HAC.SIMILARITY_DOT).quick_fit(articles)
             result = validate_clustering(self._labeled_clusters, clusters)
             key = 'th{} method{} k{} t{} c{}'.format(
                 threshold, method, k, t, c)
             result_table.setdefault(key, []).append(result)
     self._print_test_result(result_table)
     self._save_as_csv(result_table, self._feature_mode, file_name)
 def find_best_threshold(self,
                         linkage,
                         sim,
                         quick,
                         start_th=0.3,
                         end_th=0.8,
                         step=0.05,
                         sampling=True,
                         times=1):
     """Sweep the clustering threshold for one linkage/similarity pair.

     Each round fetches articles through ``_get_test_articles(sampling)``
     and clusters them with ``HAC`` at every threshold of the sweep, using
     ``quick_fit`` when ``quick`` is exactly ``True`` and ``fit`` otherwise.
     Each clustering is validated against ``self._labeled_clusters``;
     results are grouped per threshold (formatted with two decimals),
     printed and saved as CSV under ``self._feature_mode``.

     The sweep covers ``start_th, start_th + step, ...`` up to and including
     ``end_th`` (never beyond it).  It is empty when ``end_th < start_th``.

     Args:
         linkage: Linkage passed to ``HAC``.
         sim: Similarity passed to ``HAC``.
         quick: ``True`` selects ``quick_fit``; any other value selects
             ``fit``.
         start_th: First threshold of the sweep.
         end_th: Upper bound of the sweep (inclusive).
         step: Distance between consecutive thresholds; must be positive.
         sampling: Forwarded to ``_get_test_articles``.
         times: Number of rounds to run.

     Raises:
         ValueError: If ``step`` is not positive (the sweep could never
             terminate).
     """
     if step <= 0:
         raise ValueError('step must be positive, got {}'.format(step))
     file_name = 'threshold {} {} quick={} idf={} sampling={} times={}'.format(
         linkage, sim, quick, self.use_idf, sampling, times)
     print(file_name)

     # Derive every threshold from its index instead of repeatedly adding
     # ``step``: accumulated float error can otherwise push the sweep one
     # step past ``end_th`` and feeds slightly-off values to the clusterer.
     # The small epsilon keeps ``end_th`` itself in the sweep when the
     # division lands just below a whole number.
     if end_th < start_th:
         threshold_count = 0
     else:
         threshold_count = int((end_th - start_th) / step + 1e-9) + 1
     thresholds = [round(start_th + index * step, 10)
                   for index in range(threshold_count)]

     result_table = {}
     for time_counter in range(times):
         print(time_counter)
         articles = self._get_test_articles(sampling)
         for threshold in thresholds:
             print('threshold', threshold)
             model = HAC(threshold, linkage=linkage, similarity=sim)
             # Identity check on purpose: only the literal True picks the
             # quick variant, matching the existing callers' contract.
             if quick is True:
                 clusters = model.quick_fit(articles)
             else:
                 clusters = model.fit(articles)
             result = validate_clustering(self._labeled_clusters, clusters)
             key = '{0:.2f}'.format(threshold)
             result_table.setdefault(key, []).append(result)
     self._print_test_result(result_table)
     self._save_as_csv(result_table, self._feature_mode, file_name)
    def stable_test(self, times=3):
        """Check how clustering results vary when the article order changes.

        Each round fetches the test articles with
        ``_get_test_articles(False)``, shuffles them in place, and clusters
        the shuffled list with both ``HAC(0.55).quick_fit`` and
        ``HAC(0.55).fit``.  Every clustering is validated against
        ``self._labeled_clusters``; results are grouped per algorithm,
        printed and saved as CSV under ``self._feature_mode``.

        Args:
            times: Number of shuffle-and-cluster rounds.
        """
        file_name = 'stable_test times={}'.format(times)
        result_table = {}

        for round_index in range(times):
            articles = self._get_test_articles(False)
            random.shuffle(articles)
            print('time counter', round_index)
            # Both clusterers are created before either one runs.
            algorithms = (HAC(0.55).quick_fit, HAC(0.55).fit)
            for algorithm in algorithms:
                clusters = algorithm(articles)
                result = validate_clustering(self._labeled_clusters, clusters)
                # The table key is the third space-separated token of the
                # bound method's string form (presumably "HAC.quick_fit" /
                # "HAC.fit" -- depends on the method's repr).
                algorithm_name = str(algorithm).split(' ')[2]
                print(result)
                result_table.setdefault(algorithm_name, []).append(result)
        self._print_test_result(result_table)
        self._save_as_csv(result_table, self._feature_mode, file_name)
    def compare_time_feature(self,
                             name,
                             threshold,
                             linkage,
                             sim,
                             sampling=False,
                             times=1):
        """Compare ``fit`` with ``quick_fit`` in both ``time_order`` modes.

        Each round fetches articles through ``_get_test_articles(sampling)``
        and clusters them three times with a fresh ``HAC`` instance:
        ``fit``, ``quick_fit(time_order=True)`` and
        ``quick_fit(time_order=False)``.  Every clustering is validated
        against ``self._labeled_clusters`` and the elapsed wall-clock time
        (construction through validation) is stored under ``'time'``.
        Results are grouped per variant, printed and saved as CSV under the
        ``'compare all'`` label.

        Args:
            name: Text placed at the start of the output file name.
            threshold: Threshold passed to ``HAC``.
            linkage: Linkage passed to ``HAC``.
            sim: Similarity passed to ``HAC``.
            sampling: Forwarded to ``_get_test_articles``.
            times: Number of rounds to run.
        """
        file_name = '{} {} sampling={} times={}'.format(
            name, self._number_article_per_test_cluster, sampling, times)
        print(file_name)

        # (result-key template, how to run the clusterer) in execution order.
        variants = (
            ('normal {} {} {}',
             lambda model, data: model.fit(data)),
            ('time_order {} {} {}',
             lambda model, data: model.quick_fit(data, time_order=True)),
            ('random {} {} {}',
             lambda model, data: model.quick_fit(data, time_order=False)),
        )

        result_table = {}
        for round_index in range(times):
            print(round_index)
            articles = self._get_test_articles(sampling)
            for key_template, run_clustering in variants:
                started_at = time.time()
                model = HAC(threshold, linkage=linkage, similarity=sim)
                clusters = run_clustering(model, articles)
                key = key_template.format(linkage, threshold, sim)

                result = validate_clustering(self._labeled_clusters, clusters)
                result['time'] = time.time() - started_at
                result_table.setdefault(key, []).append(result)
        self._print_test_result(result_table)
        self._save_as_csv(result_table, 'compare all', file_name)
    def compare(self, sim, quick, args, sampling=False, times=1):
        """Compare linkage/threshold pairs for a fixed similarity measure.

        Each round fetches articles through ``_get_test_articles(sampling)``
        and clusters them once per ``(linkage, threshold)`` pair, using
        ``quick_fit`` when ``quick`` is exactly ``True`` and ``fit``
        otherwise.  Every clustering is validated against
        ``self._labeled_clusters``; results are grouped per pair, printed
        and saved as CSV under ``self._feature_mode``.

        Args:
            sim: Similarity passed to ``HAC``.
            quick: ``True`` selects ``quick_fit``; any other value selects
                ``fit``.
            args: Collection of ``(linkage, threshold)`` pairs; it is
                iterated once per round.
            sampling: Forwarded to ``_get_test_articles``.
            times: Number of rounds to run.
        """
        file_name = 'compare {} quick={} sampling={} times={}'.format(
            sim, quick, sampling, times)
        print(file_name)
        result_table = {}
        for round_index in range(times):
            print(round_index)
            articles = self._get_test_articles(sampling)
            for pair in args:
                linkage, threshold = pair
                model = HAC(threshold, linkage=linkage, similarity=sim)
                # Identity check: only the literal True picks quick_fit.
                run_clustering = model.quick_fit if quick is True else model.fit
                clusters = run_clustering(articles)
                result = validate_clustering(self._labeled_clusters, clusters)
                key = '{}-{}'.format(linkage, threshold)
                result_table.setdefault(key, []).append(result)

        self._print_test_result(result_table)
        self._save_as_csv(result_table, self._feature_mode, file_name)
 def compare_ratio(self, method, k, args, sampling=True, times=1):
     """Compare ``(t, c, threshold)`` settings for ratio-based extraction.

     Each round fetches articles through ``_get_test_articles(sampling)``.
     For every setting, ``fit_with_extraction_ratio(articles, method, k, t,
     c)`` is run on the feature extractor and the articles are clustered
     with ``HAC.fit`` (centroid linkage, dot similarity) at the given
     threshold.  Every clustering is validated against
     ``self._labeled_clusters``; results are grouped per setting, printed
     and saved as CSV under ``self._feature_mode``.

     Args:
         method: Forwarded to ``fit_with_extraction_ratio``.
         k: Forwarded to ``fit_with_extraction_ratio``.
         args: Collection of ``(t, c, threshold)`` tuples; it is iterated
             once per round.
         sampling: Forwarded to ``_get_test_articles``.
         times: Number of rounds to run.
     """
     file_name = 'compare ratio method{} k={} sampling={} times={}'.format(
         method, k, sampling, times)
     print(file_name)
     result_table = {}
     for round_index in range(times):
         print(round_index)
         articles = self._get_test_articles(sampling)
         for setting in args:
             t, c, threshold = setting
             print('t ratio', t)
             self._feature_extractor.fit_with_extraction_ratio(
                 articles, method, k, t, c)
             model = HAC(threshold,
                         linkage=HAC.LINKAGE_CENTROID,
                         similarity=HAC.SIMILARITY_DOT)
             clusters = model.fit(articles)
             result = validate_clustering(self._labeled_clusters, clusters)
             key = 't{} c{} th{} method{} k{} '.format(
                 t, c, threshold, method, k)
             result_table.setdefault(key, []).append(result)
     self._print_test_result(result_table)
     self._save_as_csv(result_table, self._feature_mode, file_name)
def print_validation_result(labeled_clusters, clusters):
    """Print the clustering validation metrics, one per line.

    A banner is printed first, then ``validate_clustering`` is called and
    each entry of its result is printed in sorted key order with the value
    formatted to two decimals.

    Args:
        labeled_clusters: Reference clusters passed to
            ``validate_clustering``.
        clusters: Clusters to evaluate, passed to ``validate_clustering``.
    """
    print("\n===============clustering validation===============")
    metrics = validate_clustering(labeled_clusters, clusters)
    ordered_names = sorted(metrics)
    for metric_name in ordered_names:
        formatted_value = "{0:.2f}".format(metrics[metric_name])
        print(metric_name, formatted_value)