Example #1
    def sample_for_test(self, parameter):
        self.__parameter_theta = parameter["FAVOR"]
        size_ = self.__data_size
        matrix_l = np.zeros([size_, size_])
        for row_i in range(size_):
            for col_j in range(row_i, size_):
                num = self.__calculate_matrix_element(row_i, col_j)
                matrix_l[row_i][col_j] = num
                matrix_l[col_j][row_i] = num
        tmp_ans = dict()
        list_y, eigenvalue = ds.sample(matrix_l)
        tmp_ans["FAVOR"] = list_y

        self.__parameter_theta = parameter["AGAINST"]
        tmp = range(size_)
        diff = list(set(tmp) - set(list_y))
        self.__f_x_test = self.__f_x_test[diff][:]
        self.__similarity = self.__similarity[diff][:, diff]
        size_ = len(diff)
        log.debug(self.__f_x_test)
        for row_i in range(size_):
            for col_j in range(row_i, size_):
                num = self.__calculate_matrix_element(row_i, col_j)
                matrix_l[row_i][col_j] = num
                matrix_l[col_j][row_i] = num
        list_y, eigenvalue = ds.sample(matrix_l)

        tmp_ans["AGAINST"] = []
        tmp_ans["NONE"] = []
        for i in range(size_):
            if i in list_y:
                tmp_ans["AGAINST"].append(diff[i])
            else:
                tmp_ans["NONE"].append(diff[i])
        return tmp_ans
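
The nested loops above fill the upper triangle of the DPP kernel and mirror it to keep matrix_l symmetric; the same pattern reappears in Examples #11 and #12. A minimal standalone sketch of that step, where the callable element(i, j) stands in for __calculate_matrix_element:

import numpy as np

def build_symmetric_l(n, element):
    # fill the upper triangle and mirror it, as sample_for_test does above
    matrix_l = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            matrix_l[i, j] = matrix_l[j, i] = element(i, j)
    return matrix_l

# e.g. build_symmetric_l(4, lambda i, j: 1.0 if i == j else 0.1)
# yields a 4x4 kernel with unit diagonal and constant off-diagonal.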
Example #2
 def __get_doc2vec_matrix(self, path):
     log.info('use word2vec')
     self.quality_method__ = "word2vec"
     self.distance_method__ = "100"
     word2vec_matrix = read_file(path)
     word2vec_matrix = word2vec_matrix[2:len(word2vec_matrix) - 1]
     self.__key_word = [vec_.split(u" ")[0] for vec_ in word2vec_matrix]
     log.debug("word2vec key words: \n" + "\t".join(self.__key_word))
     word2vec = np.array([(vec_.encode("utf-8")).split(" ")[1:]
                          for vec_ in word2vec_matrix])
     word2vec = word2vec.astype(np.float64)
     return word2vec.dot(word2vec.transpose()) * 1000
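
The method is essentially: parse a word2vec-style text file (one "word v1 ... vk" line per entry, header lines stripped) and return the scaled Gram matrix of the vectors. A self-contained sketch of the same computation, assuming clean space-separated lines:

import numpy as np

def gram_from_word2vec_lines(lines):
    # lines: iterable of "word v1 v2 ... vk" strings
    key_words = [line.split(" ")[0] for line in lines]
    vectors = np.array([line.split(" ")[1:] for line in lines],
                       dtype=np.float64)
    # similarity kernel: scaled inner products of the word vectors
    return key_words, vectors.dot(vectors.T) * 1000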
Example #3
 def __get_word_frequency(self, key_word):
     frequency = []
     orig_doc_ = " ".join(self.__paper).split(" ")
     union_word = set(orig_doc_)
     for word in key_word:
         frequency.append(word + '\t' + str(orig_doc_.count(word)))
     frequency.append("not in word")
     for word in union_word:
         if word in key_word:
             continue
         frequency.append(word + '\t' + str(orig_doc_.count(word)))
     log.debug("word frequency: \n" + "\n".join(frequency))
     return frequency
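
Note that orig_doc_.count(word) rescans the full token list once per word, making the loop O(n·m). A collections.Counter produces the same counts in a single pass; a minimal sketch:

from collections import Counter

def word_frequency(paper_lines, key_words):
    tokens = " ".join(paper_lines).split(" ")
    counts = Counter(tokens)
    # key words first, then the remaining vocabulary, as above
    freq = [word + "\t" + str(counts[word]) for word in key_words]
    freq.append("not in word")
    freq += [word + "\t" + str(counts[word])
             for word in set(tokens) if word not in key_words]
    return freq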
Example #4
    def __learning_single_attitude(self, attitude):
        """
        learning parameters __parameter_theta, which is the parameters of quality model
        :param input_x: input data of X
        :param label_y: label y
        :return: L-matrix
        """
        log.info('learning: ' + attitude)
        self.__parameter_theta = np.random.random(size=self.__feature_size)
        self.__sample = sp.DppSampling(100, self.__feature_size)
        self.__initiate_f_x()
        grad = self.calculate_gradient(attitude)
        best_f = 0.0
        iter_count = 0
        log.debug("grad")
        log.info(grad)
        while (not self.__whether_end(grad)) and iter_count < 1000:
            log.info("interation: " + str(iter_count))
            new_f, ignore, ignore_value = self.__sample.sampling(self.__parameter_theta, attitude)
            if new_f > best_f:
                best_f = new_f
                self.__final_parameter[attitude] = copy.deepcopy(self.__parameter_theta)
            log.debug("grad")
            log.info(grad)
            log.debug("parameter")
            log.debug(self.__parameter_theta)
            self.__parameter_theta = self.__parameter_theta - self.__step * grad
            grad = self.calculate_gradient(attitude)
            iter_count += 1

        self.__tmp_eigenvalue[attitude] = copy.deepcopy(self.__sample.get_best_eigenvalue())
        self.__tmp_answer[attitude] = copy.deepcopy(self.__sample.get_best_answer())
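
Stripped of the DPP sampling and bookkeeping, the training loop is plain gradient descent on theta with a gradient-based stopping test. A generic sketch, where grad_fn stands in for calculate_gradient and the norm test for __whether_end (both assumptions, since the real criterion isn't shown here):

import numpy as np

def learn_theta(theta, grad_fn, step, max_iter=1000, tol=1e-6):
    # grad_fn(theta) -> gradient vector; stop on a small gradient or max_iter
    grad = grad_fn(theta)
    for _ in range(max_iter):
        if np.linalg.norm(grad) < tol:
            break
        theta = theta - step * grad
        grad = grad_fn(theta)
    return theta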
Example #5
    def get_mss_paper_summary(self, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization
        :param lang:
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return:
        """
        # initialization
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""

        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]

        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        #print len(feature_subset)
        #        feature_subset = range(len(self.__paper_original))
        #        eig = []
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig)

        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE
            #print self.__rouge_path
            answer_path = self.__child_path
            write_file(summary,
                       os.path.join(answer_path, file_name + '_result.txt'),
                       False)

        return "".join(summary)
Example #6
    def __cal_candidate_set(self):
        matrix_l = self.__cal_matrix()
        subset_ = []
        eigenvalue = []
        #print self.candidate_method
        try:
            if self.candidate_method == "DR":
                subset_, eigenvalue = ds.sample(matrix_l)
                #print len(subset_)

            elif self.candidate_method == "CLU-DPP":
                cluster = hlda_analysis.sentence_cluster(
                    self.__child_path, "run000")
                # log hLDA debug info: total cluster count and the sentences in each cluster
                i = 0
                tmp = ""
                log.debug("cluster number: " + str(len(cluster)))
                for sen_list in cluster:
                    tmp += "\n cluster: " + str(
                        i) + "\tsentence_num is " + str(len(sen_list)) + "\n"
                    tmp += "\n".join(np.array(self.__paper_original)[sen_list])
                    i += 1
                log.debug(tmp)
                # begin calculate and get sentence
                for i in range(len(cluster) // 2):
                    sen_list = cluster[i]
                    tmp_matrix = matrix_l[sen_list][:, sen_list]
                    tmp_set, eig = ds.sample(tmp_matrix)
                    if len(sen_list) < 10:
                        subset_.append(sen_list)
                        eigenvalue.append(eig)
                        continue
                    subset_.append(np.array(sen_list)[tmp_set].tolist())
                    eigenvalue.append(np.array(eig)[tmp_set].tolist())
            elif self.candidate_method == "RANDOM":
                for i in range(20):
                    subset_.append(
                        np.random.randint(0, len(self.__paper_original)))
            else:
                raise RuntimeError("value error: " + self.candidate_method)
        except RuntimeError as e:
            log.error(e)
        finally:
            return subset_, eigenvalue
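
In the CLU-DPP branch, each hLDA cluster gets its own sub-kernel (the rows and columns of matrix_l restricted to the cluster) before sampling. A minimal sketch of that restriction, with sample standing in for ds.sample (returning indices and eigenvalues):

import numpy as np

def sample_per_cluster(matrix_l, clusters, sample):
    # restrict the kernel to each cluster and sample inside it
    subsets = []
    for sen_list in clusters:
        sub_kernel = matrix_l[np.ix_(sen_list, sen_list)]  # == matrix_l[sen_list][:, sen_list]
        picked, _ = sample(sub_kernel)
        subsets.append(np.array(sen_list)[picked].tolist())
    return subsets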
Example #7
    def calculate_gradient(self, attitude):
        # compute L(x; theta) as in equation (155)
        log.debug(self.__parameter_theta)
        log.info("computing matrix L")
        for row_i in range(self.__data_size):
            for col_j in range(self.__data_size):
                self.__matrix_l[row_i][col_j] = self.__calculate_matrix_element(row_i, col_j)

        # Eigendecompose L(x; theta)
        log.info("engendecomposing")
        log.debug("matrix value: " + str(np.linalg.det(self.__matrix_l)))
        (eigenvalue, feature_vector) = np.linalg.eig(self.__matrix_l)
        for i in range(len(eigenvalue)):
            eigenvalue[i] = float(eigenvalue[i])
            if np.abs(eigenvalue[i]) < 0.000000001:
                eigenvalue[i] = 0.0
        log.debug("eigenvalue")
        log.debug(eigenvalue)

        # calculate K_ii
        log.info("calculating Kii")
        vector_k = np.zeros(self.__data_size)
        log.debug("feature value matrix")
        for i in range(self.__data_size):
            for j in range(self.__data_size):
                vector_k[i] += ((eigenvalue[j] / (eigenvalue[j] + 1)) * (feature_vector[i][j] ** 2))

#        log.debug("Kii: " + str(vector_k))
        # calculate gradient
        log.info("calculating gradient")
        sigma_sub_f_x = np.zeros(self.__feature_size)
        for index in range(self.__data_size):
            if self.__train_label[index] == attitude:
                sigma_sub_f_x += self.__f_x[index]

        sigma_kii_f_x = np.zeros(self.__feature_size)
        for i in range(self.__data_size):
            sigma_kii_f_x += vector_k[i] * self.__f_x[i]

        return sigma_sub_f_x - sigma_kii_f_x
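
The three loops compute the DPP log-likelihood gradient sum_{i in Y} f(x_i) - sum_i K_ii f(x_i), with K_ii = sum_j (lambda_j / (lambda_j + 1)) v_j[i]^2. The same quantity vectorized in NumPy, assuming a symmetric kernel (so eigh applies) and a boolean mask over the labelled items:

import numpy as np

def dpp_gradient(f_x, matrix_l, selected):
    # f_x: (n, d) feature matrix; selected: boolean mask, True where
    # the training label matches the current attitude
    lam, vec = np.linalg.eigh(matrix_l)          # columns of vec are eigenvectors
    k_diag = (vec ** 2).dot(lam / (lam + 1.0))   # K_ii for every item i
    return f_x[selected].sum(axis=0) - k_diag.dot(f_x)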
Example #8
 def get_f_score(self, experiment_label, attitude, i, j):
     count = 0
     answer_count = 0
     exp_count = 0
     for idx in range(j):
         idx += i
         if self.__test_label[idx] == attitude:
             answer_count += 1
             if experiment_label[idx] == attitude:
                 count += 1
         if experiment_label[idx] == attitude:
             exp_count += 1
     if count == 0 or exp_count == 0 or answer_count == 0:
         return 0.0
     precision = float(count) / exp_count
     recall = float(count) / answer_count
     log.debug("precision is: " + str(precision))
     log.debug("recall is: " + str(recall))
     f = precision * recall * 2 / (recall + precision)
     log.debug("f is: " + str(f))
     return f
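
The loop tallies, over the window [i, i+j), the true positives (count), gold positives (answer_count) and predicted positives (exp_count) for one label, then combines them into F1. The same metric in compact form, assuming aligned gold and predicted label lists:

def f1_for_label(gold, pred, label):
    tp = sum(1 for g, p in zip(gold, pred) if g == label and p == label)
    gold_pos = sum(1 for g in gold if g == label)
    pred_pos = sum(1 for p in pred if p == label)
    if tp == 0 or gold_pos == 0 or pred_pos == 0:
        return 0.0
    precision = float(tp) / pred_pos
    recall = float(tp) / gold_pos
    return 2 * precision * recall / (precision + recall)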
Example #9
    def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization
        :param lang:
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return:
        """
        # initialization
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""
        '''
        if DATA == "mms2015":
            self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
        elif DATA == "mss2017":
            if lang in ["vi", "ka"]:
                self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
            else:
                self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
        self.__paper_original = self.__all_file.get_merged_paper()
        if self.stop_word_method == "remove_stop":
            self.__paper = self.__all_file.get_filtered_paper()
        elif self.stop_word_method == "with_stop":
            self.__paper = self.__all_file.get_merged_paper()
        self.__titles = self.__all_file.get_titles()
        # used for generate hLDA input file and calculate level method.
        if (not os.path.exists(self.__child_path + "model.temp")) or False:
            write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
            write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
            model_temp(self.__paper, self.__child_path)
            return ""
        '''
        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]
        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        #        feature_subset = range(len(self.__paper_original))
        #        eig = []
        log.error("results is: ")
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig, lang)
        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE
            answer_path = self.__rouge_path + lang + "/systems/"
            write_file(summary,
                       os.path.join(answer_path, file_name + ".txt"),
                       False)
            '''
            # generate gold summary split by CHAR
            gold_path = self.__rouge_path + lang + "/models/"
            if not os.path.exists(gold_path):
                os.makedirs(gold_path)
            tmp_name = lang + "/" + file_name + "_summary.txt"
            abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
            if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
                write_file([" ".join(api.tokenize("\n".join(abs_human)))],
                           gold_path + file_name + "_summary.txt", False)
            if lang == "vi":
                write_file(abs_human, gold_path + file_name + "_summary.txt", False)
            # generate configure file of each document for ROUGE
            conf_path = self.__rouge_path + lang + "/configure/"
            if not os.path.exists(conf_path):
                os.makedirs(conf_path)
            tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
            self.__all_conf.append(tmp_conf_)
            write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
            '''

        return "".join(summary)
Example #10
    def __construct_summary(self, sentence_subset, eig, lang="zh"):
        summary = []
        sum_length = 0
        if self.summary_method == "QD":
            for sentence_idx in sentence_subset:
                if sum_length < self.max_sum_len__:
                    tmp_sen = self.__paper_original[sentence_idx]
                    # if lang in ['ja', 'th', 'zh']:
                    #     summary.append(" ".join([i for i in tmp_sen]))
                    # else:
                    #     summary.append(tmp_sen)
                    summary.append(tmp_sen)
                else:
                    break
                sum_length += len(self.__paper_original[sentence_idx])
            quality = np.array(self.__quality)
            quality[sentence_subset] = -999
            while sum_length < self.max_sum_len__:
                max_quality = np.where(quality == np.max(quality))
                tmp_summary = np.array(self.__paper_original)[max_quality]
                tmp_sen = "\n".join(tmp_summary.tolist())
                summary.append(tmp_sen)
                sum_length += len(tmp_sen)
                quality[max_quality] = -999
            summary = [
                " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
            ]
        elif self.summary_method == "DM":
            log.debug(len(self.__key_word))
            key_word = set(np.array(self.__key_word)[sentence_subset])
            self.__get_word_frequency(key_word)
            common_number = np.zeros([len(self.__paper_original)])
            for i in range(len(self.__paper_original)):
                common_number[i] = len(
                    key_word.intersection(
                        set(self.__paper_original[i].split(" "))))
            while sum_length < self.max_sum_len__:
                b = np.where(common_number == np.max(common_number))
                common_number[b] = 0
                for sen in np.array(self.__paper_original)[b].tolist():
                    summary.append(sen)
                    sum_length = len(summary)
                    if sum_length > self.max_sum_len__:
                        break
            summary = [
                " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
            ]
        elif self.summary_method == "newDM":
            key_word = set(np.array(self.__key_word)[sentence_subset])
            self.__get_word_frequency(key_word)
            common_number = np.zeros([len(self.__paper_original)])
            selected_sen = []
            while sum_length < self.max_sum_len__ and len(key_word) > 0:
                for i in range(len(self.__paper_original)):
                    if i in selected_sen:
                        continue
                    common_number[i] = len(
                        key_word.intersection(
                            set(self.__paper_original[i].split(" "))))
                b = np.where(common_number == np.max(common_number))
                common_number[b] = 0
                key_word -= key_word.intersection(
                    set(self.__paper_original[b[0][0]].split(" ")))
                selected_sen += b[0].tolist()
                for sen in np.array(self.__paper_original)[b].tolist():
                    summary.append(sen)
                    sum_length = len(summary)
                    if sum_length > self.max_sum_len__:
                        break
            summary = [
                " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
            ]
        elif self.summary_method == "hDPP":
            tmp_summary = np.array(self.__paper_original)[sentence_subset]
            matrix_l = self.__doc_matrix_
            while len(
                ("".join(tmp_summary)).replace(" ", "")) > self.max_sum_len__:
                matrix_l = matrix_l[sentence_subset][:, sentence_subset]
                sentence_subset, eigenvalue = ds.sample(matrix_l)
                tmp_summary = tmp_summary[sentence_subset]
            # tmp_sen = ("".join(tmp_summary)).replace(" ", "")
            # sum_length = len(tmp_sen)
            # summary.append(" ".join([i for i in tmp_sen]))
            tmp_sen = (" ".join(tmp_summary))
            sum_length = len(tmp_sen.replace(" ", ""))
            summary.append(tmp_sen)
        elif self.summary_method == "OneInDoc":
            sen_sub = []
            tmp_b = set(range(len(
                self.__paper_original))) - set(sentence_subset)
            sentence_subset += np.sort(list(tmp_b)).tolist()
            for i in range(len(self.__sub_paper_len) - 1):
                idx = np.where(
                    np.array(sentence_subset) >= self.__sub_paper_len[i])
                tmp = np.array(sentence_subset)[idx]
                idx = np.where(tmp < self.__sub_paper_len[i + 1])
                if tmp[idx].size == 0:
                    continue
                sen_sub.append(list(tmp[idx]))
            log.debug("splited sentence idx is: " + str(sen_sub))
            if_stop = False
            while sum_length < self.max_sum_len__ and not if_stop:
                if_stop = True
                for li in sen_sub:
                    if len(li) == 0:
                        continue
                    if_stop = False
                    tmp_sen = self.__paper_original[li[0]].replace(" ", "")
                    sum_length += len(tmp_sen)
                    if sum_length > self.max_sum_len__:
                        # summary.append(" ".join([i for i in tmp_sen]))
                        summary.append(self.__paper_original[li[0]])
                        break
                    else:
                        # summary.append(" ".join([i for i in tmp_sen]))
                        summary.append(self.__paper_original[li[0]])
                        li.remove(li[0])
            summary = [
                " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
            ]
        elif self.summary_method == "quality":
            quality = np.array(self.__quality)
            while sum_length < self.max_sum_len__:
                max_quality = np.where(quality == np.max(quality))
                tmp_summary = np.array(self.__paper_original)[max_quality]
                # tmp_sen = "".join(tmp_summary.tolist()).replace(" ", "")
                # summary.append(" ".join([i for i in tmp_sen]))
                tmp_sen = "\n".join(tmp_summary.tolist())
                summary.append(tmp_sen)
                sum_length += len(tmp_sen)
                #                print max_quality
                #                print quality
                #                print summary
                quality[max_quality] = -999
#                print quality
            summary = ("\n".join(summary)[:self.max_sum_len__]).split("\n")
        elif self.summary_method == "CLU-DPP":
            while sum_length < self.max_sum_len__:
                tmp_len = 0
                for i in range(len(sentence_subset)):
                    # sen_num = int(float(len(subset)**2)/len(self.__paper_original))
                    subset = sentence_subset[i]
                    tmp_eig = eig[i]
                    tmp_len += len(subset)
                    if len(subset) == 0:
                        continue
                    idx = np.where(np.max(tmp_eig) == tmp_eig)[0][0]
                    tmp_sen = self.__paper_original[subset[idx]]
                    tmp_eig.remove(tmp_eig[idx])
                    subset.remove(subset[idx])
                    summary.append(tmp_sen)
                    sum_length += len(tmp_sen)
                    if sum_length > self.max_sum_len__:
                        break
                if tmp_len == 0:
                    break
            summary = [
                " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
            ]
        elif self.summary_method == "PathSumm":
            sentence_subset = range(len(self.__paper))
            path_sum = PathSum(self.__child_path, sentence_subset)
            while sum_length < self.max_sum_len__:
                sen_idx = path_sum.get_next_sentence()
                summary.append(self.__paper_original[sen_idx])
                sum_length += len(self.__paper_original[sen_idx])
            summary = [
                " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
            ]
        else:
            return ""

        log.debug("summary length is: " + str(sum_length))
        log.debug("generated summary: \n" + " ".join(summary))
        return summary
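
Every branch ends the same way: sentences are accumulated until max_sum_len__ characters are reached, then the joined text is cut to exactly that budget. That shared pattern as a standalone sketch, with ranked standing in for whichever sentence ordering the branch produced:

def assemble_summary(sentences, ranked, max_len):
    summary, length = [], 0
    for idx in ranked:
        if length >= max_len:
            break
        summary.append(sentences[idx])
        length += len(sentences[idx])
    # truncate to the character budget, as the branches above do
    return [" ".join("\n".join(summary)[:max_len].split("\n"))]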
Example #11
    def sampling(self, parameter, attitude):
        self.__parameter_theta = parameter
        matrix_l = np.zeros([self.__data_size, self.__data_size])
        for row_i in range(self.__data_size):
            for col_j in range(row_i, self.__data_size):
                num = self.__calculate_matrix_element(row_i, col_j)
                matrix_l[row_i][col_j] = num
                matrix_l[col_j][row_i] = num
        list_y, eigenvalue = ds.sample(matrix_l)
        '''
        (eigenvalue, feature_vector) = np.linalg.eig(matrix_l)

        j = list()
        for x in range(eigenvalue.size):
            random_a = random.randrange(1, 11)
            if (eigenvalue[x] / (eigenvalue[x] + 1)) * 10 > random_a:
                j.append(x)

        matrix_y = matrix_l[j][:, np.array(j)]
        det = np.linalg.det(matrix_y)
        list_y = j
        det = 0
        while np.abs(det) > 0.000001:
            log.debug("det: " + str(det))
            v_dem = np.sqrt(matrix_y.size)
            for i in range(int(v_dem)):
                prop = 0.0
                # column j of feature_vector is the feature of element j
                for j in range(int(v_dem)):
                    prop += feature_vector[j][i] ** 2
                prop /= det
                log.debug("prop: " + str(prop))
                random_a = random.randrange(1, 11)
                if prop * 10 > random_a:
                    list_y.append(i)
                    j.pop(i)
                    feature_vector = feature_vector[j][:, j]
                    log.info(feature_vector)
        '''
        ans = []
        new_f = 0.0
        for attitude in ['FAVOR', 'AGAINST', 'NONE']:
            ans = []
            for i in range(self.__data_size):
                if i in list_y:
                    ans.append(attitude)
                else:
                    ans.append("NONEs")
            new_f = self.get_f_score(ans, attitude)
            if self.__best_float <= new_f:
                self.__best_float = new_f
                self.__best_answer = ans
                self.__best_answer_eigenvalue = eigenvalue / np.sum(eigenvalue)
            log.debug("best_matrix function is: ")
            #            log.debug(matrix_l.tolist())
            #            log.debug("best_float is: " + str(ans))
            log.debug("best_answer is: " + str(ans))

        ans = []
        log.debug(len(list_y))
        #        list_y = list(set(range(self.__data_size)) - set(list_y))
        list_y = range(self.__data_size)
        for i in list_y:
            ans.append(self.__test_label[i])
        # for each gold label, log the (f_x_test[i, 0], f_x_test[i, 1],
        # eigenvalue[i]) triples of the test items carrying that label
        for label in ('FAVOR', 'AGAINST', 'NONE'):
            tmp = [np.array([self.__f_x_test[i, 0], self.__f_x_test[i, 1],
                             eigenvalue[i]]).tolist()
                   for i in range(len(list_y)) if ans[i] == label]
            log.debug(tmp)

        log.debug(list(eigenvalue))
        log.debug(ans)
        return new_f, list_y, eigenvalue
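
The triple-quoted block preserves an abandoned spectral sampler: keep eigenindex x with probability lambda_x / (lambda_x + 1), then draw items from the induced elementary DPP. Its first phase in a minimal NumPy sketch, assuming a symmetric positive semi-definite kernel:

import numpy as np

def select_elementary_dpp(matrix_l, rng=None):
    # phase one of spectral DPP sampling: keep eigenpair x with
    # probability lam_x / (lam_x + 1)
    if rng is None:
        rng = np.random.default_rng()
    lam, vec = np.linalg.eigh(matrix_l)
    mask = rng.random(lam.size) < lam / (lam + 1.0)
    return np.flatnonzero(mask), vec[:, mask]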
Example #12
    def sample_k_test(self, parameter, sentiment_test):
        self.__parameter_theta = parameter
        matrix_l = np.zeros([self.__data_size, self.__data_size])
        for row_i in range(self.__data_size):
            for col_j in range(row_i, self.__data_size):
                num = self.__calculate_matrix_element(row_i, col_j)
                log.debug(num)
                matrix_l[row_i][col_j] = num
                matrix_l[col_j][row_i] = num
        size_ = self.__data_size
        ans = range(size_)
        '''
        matrix_k = matrix_l
        diff = range(size_)
        for i in range(10):
            center_idx, vec = ds.sample_k(matrix_k, 2)
            tmp = range(size_)
            log.info("center_idx: " + str(diff[center_idx[0]]) + " " + str(diff[center_idx[1]]))
            if i > 2:
                ans.append(diff[center_idx[0]])
                ans.append(diff[center_idx[1]])
                log.info("".join(self.__test_data[diff[center_idx[0]]]))
                log.info(self.__test_label[diff[center_idx[0]]])
                log.info("".join(self.__test_data[diff[center_idx[1]]]))
                log.info(self.__test_label[diff[center_idx[1]]])
            diff = list(set(tmp)-set(center_idx))
            self.__f_x_test = self.__f_x_test[diff][:]
            matrix_k = matrix_k[diff][:, diff]
            size_ = len(diff)
        log.info(matrix_l.shape)
        '''
        center_idx, vec = ds.sample_k(matrix_l, 3)
        log.debug(vec)
        final_answer_ = np.array(ans)[center_idx]
        log.info(final_answer_)
        log.info("".join(self.__test_data[final_answer_[0]]))
        log.info(self.__test_label[final_answer_[0]])
        log.info("".join(self.__test_data[final_answer_[1]]))
        log.info(self.__test_label[final_answer_[1]])
        log.info("".join(self.__test_data[final_answer_[2]]))
        log.info(self.__test_label[final_answer_[2]])
        cluster_1 = []
        cluster_2 = []
        cluster_3 = []
        score_0 = sentiment_test[center_idx[0]]
        score_1 = sentiment_test[center_idx[1]]
        score_2 = sentiment_test[center_idx[2]]
        label_1 = ''
        label_0 = ''
        label_2 = ''
        if score_0 > score_1:
            if score_0 > score_2:
                label_0 = 'FAVOR'
                if score_1 > score_2:
                    label_1 = 'NONE'
                    label_2 = 'AGAINST'
                else:
                    label_1 = 'AGAINST'
                    label_2 = 'NONE'
            else:
                label_2 = 'FAVOR'
                label_0 = 'NONE'
                label_1 = 'AGAINST'
        else:
            if score_1 > score_2:
                label_1 = 'FAVOR'
                if score_0 > score_2:
                    label_0 = 'NONE'
                    label_2 = 'AGAINST'
                else:
                    label_0 = 'AGAINST'
                    label_2 = 'NONE'
            else:
                label_2 = 'FAVOR'
                label_0 = 'AGAINST'
                label_1 = 'NONE'

        cluster_1.append(center_idx[0])
        cluster_2.append(center_idx[1])
        cluster_3.append(center_idx[2])
        for i in range(self.__data_size):
            #            sim_1 = matrix_l[i][i] * matrix_l[center_idx[0]][center_idx[0]] - matrix_l[i][center_idx[0]] ** 2
            #            sim_2 = matrix_l[i][i] * matrix_l[center_idx[1]][center_idx[1]] - matrix_l[i][center_idx[1]] ** 2
            #            sim_3 = matrix_l[i][i] * matrix_l[center_idx[2]][center_idx[2]] - matrix_l[i][center_idx[2]] ** 2
            #            sim_1 = np.sum(np.square(vec[:][i] - vec[:][center_idx[0]]))
            #            sim_2 = np.sum(np.square(vec[:][i] - vec[:][center_idx[1]]))
            #            sim_3 = np.sum(np.square(vec[:][i] - vec[:][center_idx[2]]))
            sim_1 = self.__similarity[i][center_idx[0]]
            sim_2 = self.__similarity[i][center_idx[1]]
            sim_3 = self.__similarity[i][center_idx[2]]
            if sim_1 > sim_2:
                if sim_1 > sim_3:
                    cluster_1.append(i)
                else:
                    cluster_3.append(i)
            elif sim_2 > sim_3:
                cluster_2.append(i)
            else:
                cluster_3.append(i)
        ans = []
        for i in range(self.__data_size):
            print "i: " + str(i)
            if i in cluster_1:
                #                ans.append(self.__test_label[center_idx[0]])
                ans.append(label_0)
            elif i in cluster_2:
                ans.append(label_1)
            else:
                ans.append(label_2)

        log.info('FINAL' + str(ans))
        for label in ["FAVOR", "AGAINST", "NONE"]:
            self.get_f_score(ans, label)
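
The if/else ladder ranks the three centers by sentiment score (highest -> FAVOR, middle -> NONE, lowest -> AGAINST), and the cluster loop assigns each item to its most similar center. Both steps in a compact sketch (tie-breaking aside), assuming a dense similarity matrix:

import numpy as np

def label_by_centers(similarity, center_idx, scores):
    # rank the three centers by sentiment score
    order = np.argsort(scores)                     # ascending
    labels = [''] * 3
    labels[order[2]] = 'FAVOR'                     # highest score
    labels[order[1]] = 'NONE'
    labels[order[0]] = 'AGAINST'                   # lowest score
    # assign every item to its most similar center
    nearest = np.argmax(similarity[:, center_idx], axis=1)
    return [labels[c] for c in nearest]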