def sample_for_test(self, parameter):
    # First pass: sample the FAVOR subset with the FAVOR parameters.
    self.__parameter_theta = parameter["FAVOR"]
    size_ = self.__data_size
    matrix_l = np.zeros([size_, size_])
    for row_i in range(size_):
        for col_j in range(row_i, size_):
            num = self.__calculate_matrix_element(row_i, col_j)
            matrix_l[row_i][col_j] = num
            matrix_l[col_j][row_i] = num
    tmp_ans = dict()
    list_y, eigenvalue = ds.sample(matrix_l)
    tmp_ans["FAVOR"] = list_y
    # Second pass: restrict to the remaining items and sample AGAINST.
    self.__parameter_theta = parameter["AGAINST"]
    tmp = range(size_)
    diff = list(set(tmp) - set(list_y))
    self.__f_x_test = self.__f_x_test[diff][:]
    self.__similarity = self.__similarity[diff][:, diff]
    size_ = len(diff)
    log.debug(self.__f_x_test)
    # Rebuild the kernel at the reduced size; reusing the old full-size
    # matrix would leave stale entries outside the new block.
    matrix_l = np.zeros([size_, size_])
    for row_i in range(size_):
        for col_j in range(row_i, size_):
            num = self.__calculate_matrix_element(row_i, col_j)
            matrix_l[row_i][col_j] = num
            matrix_l[col_j][row_i] = num
    list_y, eigenvalue = ds.sample(matrix_l)
    tmp_ans["AGAINST"] = []
    tmp_ans["NONE"] = []
    for i in range(size_):
        if i in list_y:
            tmp_ans["AGAINST"].append(diff[i])
        else:
            tmp_ans["NONE"].append(diff[i])
    return tmp_ans
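# A toy illustration (made-up sizes, not from the pipeline) of the index
# bookkeeping used in sample_for_test: items picked in the first pass are
# removed, the kernel is restricted to the remainder, and second-pass
# indices are mapped back to original ids through `diff`.
def _demo_index_bookkeeping():
    import numpy as np
    n = 5
    first_pass = [1, 3]                             # e.g. sampled as FAVOR
    diff = sorted(set(range(n)) - set(first_pass))  # remaining items: [0, 2, 4]
    kernel = np.arange(n * n, dtype=float).reshape(n, n)
    sub_kernel = kernel[diff][:, diff]              # 3x3 restriction, as in the code above
    second_pass = [0, 2]                            # indices *within* the reduced matrix
    print [diff[i] for i in second_pass]            # original ids: [0, 4]
    print sub_kernel.shape                          # (3, 3)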
def __get_doc2vec_matrix(self, path):
    log.info('use word2vec')
    self.quality_method__ = "word2vec"
    self.distance_method__ = "100"
    word2vec_matrix = read_file(path)
    word2vec_matrix = word2vec_matrix[2:len(word2vec_matrix) - 1]
    self.__key_word = [vec_.split(u" ")[0] for vec_ in word2vec_matrix]
    log.debug("word2vec key words: \n" + "\t".join(self.__key_word))
    word2vec = np.array([(vec_.encode("utf-8")).split(" ")[1:]
                         for vec_ in word2vec_matrix])
    word2vec = word2vec.astype(np.float64)
    return word2vec.dot(word2vec.transpose()) * 1000
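# A toy check (made-up 2-d vectors) that W.dot(W.T), as returned by
# __get_doc2vec_matrix, holds all pairwise dot products: entry (i, j) is
# the dot product of vector i and vector j.
def _demo_gram_matrix():
    import numpy as np
    w = np.array([[1.0, 0.0],
                  [0.0, 2.0],
                  [1.0, 1.0]])
    gram = w.dot(w.transpose())
    assert gram[0][2] == np.dot(w[0], w[2])  # 1.0
    print gram * 1000  # same scaling as in __get_doc2vec_matrix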
def __get_word_frequency(self, key_word):
    frequency = []
    orig_doc_ = " ".join(self.__paper).split(" ")
    union_word = set(orig_doc_)
    # count the key words first, then the rest of the vocabulary
    for word in key_word:
        frequency.append(word + '\t' + str(orig_doc_.count(word)))
    frequency.append("not in key words")
    for word in union_word:
        if word in key_word:
            continue
        frequency.append(word + '\t' + str(orig_doc_.count(word)))
    log.debug("word frequency: \n" + "\n".join(frequency))
    return frequency
def __learning_single_attitude(self, attitude):
    """
    learn __parameter_theta, the parameters of the quality model,
    for a single attitude by gradient descent with DPP sampling
    :param attitude: the attitude label ('FAVOR', 'AGAINST' or 'NONE')
    :return: None; the best parameters are stored in __final_parameter
    """
    log.info('learning: ' + attitude)
    self.__parameter_theta = np.random.random(size=self.__feature_size)
    self.__sample = sp.DppSampling(100, self.__feature_size)
    self.__initiate_f_x()
    grad = self.calculate_gradient(attitude)
    best_f = 0.0
    iter_count = 0
    log.debug("grad")
    log.info(grad)
    while (not self.__whether_end(grad)) and iter_count < 1000:
        log.info("iteration: " + str(iter_count))
        new_f, ignore, ignore_value = self.__sample.sampling(self.__parameter_theta, attitude)
        # keep the parameters that achieved the best f-score so far
        if new_f > best_f:
            best_f = new_f
            self.__final_parameter[attitude] = copy.deepcopy(self.__parameter_theta)
        log.debug("grad")
        log.info(grad)
        log.debug("parameter")
        log.debug(self.__parameter_theta)
        self.__parameter_theta = self.__parameter_theta - self.__step * grad
        grad = self.calculate_gradient(attitude)
        iter_count += 1
    self.__tmp_eigenvalue[attitude] = copy.deepcopy(self.__sample.get_best_eigenvalue())
    self.__tmp_answer[attitude] = copy.deepcopy(self.__sample.get_best_answer())
def get_mss_paper_summary(self, file_name, if_write_file=True):
    """
    generate summary for one paper, single document summarization
    :param file_name: current file name, used to write the summary answer
    :param if_write_file: whether to write the generated summary to an answer file named file_name
    :return:
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentences
    feature_subset, eig = self.__cal_candidate_set()
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    log.info(feature_subset)
    log.debug(eig)
    # use the feature list to extract the summary
    summary = self.__construct_summary(feature_subset, eig)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write the answer to a file for ROUGE
        answer_path = self.__child_path
        write_file(
            summary,
            os.path.join('%s/%s.txt' % (answer_path, file_name + '_result')),
            False)
    return "".join(summary)
def __cal_candidate_set(self):
    matrix_l = self.__cal_matrix()
    subset_ = []
    eigenvalue = []
    try:
        if self.candidate_method == "DR":
            subset_, eigenvalue = ds.sample(matrix_l)
        elif self.candidate_method == "CLU-DPP":
            cluster = hlda_analysis.sentence_cluster(self.__child_path, "run000")
            # log hLDA debug info: total cluster count and each cluster's sentences
            i = 0
            tmp = ""
            log.debug("cluster number: " + str(len(cluster)))
            for sen_list in cluster:
                tmp += "\n cluster: " + str(i) + "\tsentence_num is " + str(len(sen_list)) + "\n"
                tmp += "\n".join(np.array(self.__paper_original)[sen_list])
                i += 1
            log.debug(tmp)
            # run a DPP inside each of the first half of the clusters
            for i in range(len(cluster) / 2):
                sen_list = cluster[i]
                tmp_matrix = matrix_l[sen_list][:, sen_list]
                tmp_set, eig = ds.sample(tmp_matrix)
                if len(sen_list) < 10:
                    # small clusters are kept whole
                    subset_.append(sen_list)
                    eigenvalue.append(eig)
                    continue
                subset_.append(np.array(sen_list)[tmp_set].tolist())
                eigenvalue.append(np.array(eig)[tmp_set].tolist())
        elif self.candidate_method == "RANDOM":
            for i in range(20):
                subset_.append(np.random.randint(0, len(self.__paper_original)))
        else:
            raise RuntimeError("value error: " + self.candidate_method)
    except RuntimeError as e:
        log.error(e)
    finally:
        return subset_, eigenvalue
def calculate_gradient(self, attitude):
    # compute L(x; theta) as in equation (155)
    log.debug(self.__parameter_theta)
    log.info("computing matrix L")
    for row_i in range(self.__data_size):
        for col_j in range(self.__data_size):
            self.__matrix_l[row_i][col_j] = self.__calculate_matrix_element(row_i, col_j)
    # eigendecompose L(x; theta)
    log.info("eigendecomposing")
    log.debug("matrix determinant: " + str(np.linalg.det(self.__matrix_l)))
    (eigenvalue, feature_vector) = np.linalg.eig(self.__matrix_l)
    # zero out numerically negligible eigenvalues
    for i in range(len(eigenvalue)):
        eigenvalue[i] = float(eigenvalue[i])
        if np.abs(eigenvalue[i]) < 1e-9:
            eigenvalue[i] = 0.0
    log.debug("eigenvalue")
    log.debug(eigenvalue)
    # calculate K_ii, the marginal probability of item i:
    # K_ii = sum_j (lambda_j / (lambda_j + 1)) * v_j[i]^2
    log.info("calculating Kii")
    vector_k = np.zeros(self.__data_size)
    log.debug("feature value matrix")
    for i in range(self.__data_size):
        for j in range(self.__data_size):
            # column j of feature_vector is the j-th eigenvector
            vector_k[i] += ((eigenvalue[j] / (eigenvalue[j] + 1)) *
                            (feature_vector[i][j] ** 2))
    # log.debug("Kii: " + str(vector_k))
    # gradient of the log-likelihood: feature sum over the labelled subset
    # minus the expected feature sum under the DPP
    log.info("calculating gradient")
    sigma_sub_f_x = np.zeros(self.__feature_size)
    for index in range(self.__data_size):
        if self.__train_label[index] == attitude:
            sigma_sub_f_x += self.__f_x[index]
    sigma_kii_f_x = np.zeros(self.__feature_size)
    for i in range(self.__data_size):
        sigma_kii_f_x += vector_k[i] * self.__f_x[i]
    return sigma_sub_f_x - sigma_kii_f_x
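# A minimal sketch (toy kernel, not from the pipeline) checking that the
# K_ii loop in calculate_gradient matches the closed form K = L(L + I)^-1
# of the DPP marginal kernel.
def _demo_marginal_kernel():
    import numpy as np
    rng = np.random.RandomState(0)
    b = rng.rand(3, 3)
    matrix_l = b.dot(b.T)  # a PSD kernel
    eigenvalue, feature_vector = np.linalg.eigh(matrix_l)
    k_diag = np.zeros(3)
    for i in range(3):
        for j in range(3):
            # column j of feature_vector is the j-th eigenvector
            k_diag[i] += (eigenvalue[j] / (eigenvalue[j] + 1)) * (feature_vector[i][j] ** 2)
    k_closed = matrix_l.dot(np.linalg.inv(matrix_l + np.eye(3)))
    assert np.allclose(k_diag, np.diag(k_closed))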
def get_f_score(self, experiment_label, attitude, i, j):
    # F1 for one attitude over the window of j examples starting at index i
    count = 0         # true positives
    answer_count = 0  # gold positives
    exp_count = 0     # predicted positives
    for idx in range(i, i + j):
        if self.__test_label[idx] == attitude:
            answer_count += 1
            if experiment_label[idx] == attitude:
                count += 1
        if experiment_label[idx] == attitude:
            exp_count += 1
    if count == 0 or exp_count == 0 or answer_count == 0:
        return 0.0
    precision = float(count) / exp_count
    recall = float(count) / answer_count
    log.debug("precision is: " + str(precision))
    log.debug("recall is: " + str(recall))
    f = precision * recall * 2 / (recall + precision)
    log.debug("f is: " + str(f))
    return f
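# A hypothetical worked example of get_f_score's arithmetic: with 4 true
# positives, 6 predicted positives and 8 gold positives,
# precision = 4/6, recall = 4/8, and F1 = 2PR/(P + R) ~= 0.571.
def _demo_f_score():
    count, exp_count, answer_count = 4, 6, 8  # illustrative values only
    precision = float(count) / exp_count
    recall = float(count) / answer_count
    print precision * recall * 2 / (recall + precision)  # 0.5714...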
def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
    """
    generate summary for one paper, single document summarization
    :param lang: language code of the document (e.g. "zh", "vi", "ka")
    :param file_name: current file name, used to write the summary answer
    :param if_write_file: whether to write the generated summary to an answer file named file_name
    :return:
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    '''
    if DATA == "mms2015":
        self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
    elif DATA == "mss2017":
        if lang in ["vi", "ka"]:
            self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
        else:
            self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
    self.__paper_original = self.__all_file.get_merged_paper()
    if self.stop_word_method == "remove_stop":
        self.__paper = self.__all_file.get_filtered_paper()
    elif self.stop_word_method == "with_stop":
        self.__paper = self.__all_file.get_merged_paper()
    self.__titles = self.__all_file.get_titles()
    # used to generate the hLDA input file and calculate the level method.
    if (not os.path.exists(self.__child_path + "model.temp")) or False:
        write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
        write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
        model_temp(self.__paper, self.__child_path)
        return ""
    '''
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentences
    feature_subset, eig = self.__cal_candidate_set()
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    log.info("results are: ")
    log.info(feature_subset)
    log.debug(eig)
    # use the feature list to extract the summary
    summary = self.__construct_summary(feature_subset, eig, lang)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write the answer to a file for ROUGE
        answer_path = self.__rouge_path + lang + "/systems/"
        write_file(summary, os.path.join('%s%s.txt' % (answer_path, file_name)), False)
        '''
        # generate the gold summary split by CHAR
        gold_path = self.__rouge_path + lang + "/models/"
        if not os.path.exists(gold_path):
            os.makedirs(gold_path)
        tmp_name = lang + "/" + file_name + "_summary.txt"
        abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
        if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
            write_file([" ".join(api.tokenize("\n".join(abs_human)))], gold_path + file_name + "_summary.txt", False)
        if lang == "vi":
            write_file(abs_human, gold_path + file_name + "_summary.txt", False)
        # generate the configure file of each document for ROUGE
        conf_path = self.__rouge_path + lang + "/configure/"
        if not os.path.exists(conf_path):
            os.makedirs(conf_path)
        tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
        self.__all_conf.append(tmp_conf_)
        write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
        '''
    return "".join(summary)
def __construct_summary(self, sentence_subset, eig, lang="zh"):
    summary = []
    sum_length = 0
    if self.summary_method == "QD":
        # take DPP-selected sentences first, then fill with high-quality ones
        for sentence_idx in sentence_subset:
            if sum_length < self.max_sum_len__:
                tmp_sen = self.__paper_original[sentence_idx]
                # if lang in ['ja', 'th', 'zh']:
                #     summary.append(" ".join([i for i in tmp_sen]))
                # else:
                #     summary.append(tmp_sen)
                summary.append(tmp_sen)
            else:
                break
            sum_length += len(self.__paper_original[sentence_idx])
        quality = np.array(self.__quality)
        quality[sentence_subset] = -999
        while sum_length < self.max_sum_len__:
            max_quality = np.where(quality == np.max(quality))
            tmp_summary = np.array(self.__paper_original)[max_quality]
            tmp_sen = "\n".join(tmp_summary.tolist())
            summary.append(tmp_sen)
            sum_length += len(tmp_sen)
            quality[max_quality] = -999
        summary = [
            " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
        ]
    elif self.summary_method == "DM":
        print len(self.__key_word)
        key_word = set(np.array(self.__key_word)[sentence_subset])
        self.__get_word_frequency(key_word)
        # count key words shared between each sentence and the selection
        common_number = np.zeros([len(self.__paper_original)])
        for i in range(len(self.__paper_original)):
            common_number[i] = len(
                key_word.intersection(set(self.__paper_original[i].split(" "))))
        while sum_length < self.max_sum_len__:
            b = np.where(common_number == np.max(common_number))
            common_number[b] = 0
            for sen in np.array(self.__paper_original)[b].tolist():
                summary.append(sen)
                sum_length = len(summary)
                if sum_length > self.max_sum_len__:
                    break
        summary = [
            " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
        ]
    elif self.summary_method == "newDM":
        key_word = set(np.array(self.__key_word)[sentence_subset])
        self.__get_word_frequency(key_word)
        common_number = np.zeros([len(self.__paper_original)])
        selected_sen = []
        # greedily pick sentences until the key words are covered
        while sum_length < self.max_sum_len__ and len(key_word) > 0:
            for i in range(len(self.__paper_original)):
                if i in selected_sen:
                    continue
                common_number[i] = len(
                    key_word.intersection(set(self.__paper_original[i].split(" "))))
            b = np.where(common_number == np.max(common_number))
            common_number[b] = 0
            key_word -= key_word.intersection(
                set(self.__paper_original[b[0][0]].split(" ")))
            selected_sen += b[0].tolist()
            for sen in np.array(self.__paper_original)[b].tolist():
                summary.append(sen)
                sum_length = len(summary)
                if sum_length > self.max_sum_len__:
                    break
        summary = [
            " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
        ]
    elif self.summary_method == "hDPP":
        tmp_summary = np.array(self.__paper_original)[sentence_subset]
        matrix_l = self.__doc_matrix_
        # resample until the summary fits within the length limit
        while len(("".join(tmp_summary)).replace(" ", "")) > self.max_sum_len__:
            matrix_l = matrix_l[sentence_subset][:, sentence_subset]
            sentence_subset, eigenvalue = ds.sample(matrix_l)
            tmp_summary = tmp_summary[sentence_subset]
        # tmp_sen = ("".join(tmp_summary)).replace(" ", "")
        # sum_length = len(tmp_sen)
        # summary.append(" ".join([i for i in tmp_sen]))
        tmp_sen = (" ".join(tmp_summary))
        sum_length = len(tmp_sen.replace(" ", ""))
        summary.append(tmp_sen)
    elif self.summary_method == "OneInDoc":
        sen_sub = []
        tmp_b = set(range(len(self.__paper_original))) - set(sentence_subset)
        sentence_subset += np.sort(list(tmp_b)).tolist()
        # split the sentence indices by sub-document boundaries
        for i in range(len(self.__sub_paper_len) - 1):
            idx = np.where(np.array(sentence_subset) >= self.__sub_paper_len[i])
            tmp = np.array(sentence_subset)[idx]
            idx = np.where(tmp < self.__sub_paper_len[i + 1])
            if tmp[idx].size == 0:
                continue
            sen_sub.append(list(tmp[idx]))
        log.debug("split sentence idx is: " + str(sen_sub))
        # round-robin: take one sentence from each sub-document in turn
        if_stop = False
        while sum_length < self.max_sum_len__ and not if_stop:
            if_stop = True
            for li in sen_sub:
                if len(li) == 0:
                    continue
                if_stop = False
                tmp_sen = self.__paper_original[li[0]].replace(" ", "")
                sum_length += len(tmp_sen)
                if sum_length > self.max_sum_len__:
                    # summary.append(" ".join([i for i in tmp_sen]))
                    summary.append(self.__paper_original[li[0]])
                    break
                else:
                    # summary.append(" ".join([i for i in tmp_sen]))
                    summary.append(self.__paper_original[li[0]])
                    li.remove(li[0])
        summary = [
            " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
        ]
    elif self.summary_method == "quality":
        quality = np.array(self.__quality)
        while sum_length < self.max_sum_len__:
            max_quality = np.where(quality == np.max(quality))
            tmp_summary = np.array(self.__paper_original)[max_quality]
            # tmp_sen = "".join(tmp_summary.tolist()).replace(" ", "")
            # summary.append(" ".join([i for i in tmp_sen]))
            tmp_sen = "\n".join(tmp_summary.tolist())
            summary.append(tmp_sen)
            sum_length += len(tmp_sen)
            quality[max_quality] = -999
        summary = ("\n".join(summary)[:self.max_sum_len__]).split("\n")
    elif self.summary_method == "CLU-DPP":
        # take the highest-eigenvalue sentence from each cluster in turn
        while sum_length < self.max_sum_len__:
            tmp_len = 0
            for i in range(len(sentence_subset)):
                # sen_num = int(float(len(subset)**2)/len(self.__paper_original))
                subset = sentence_subset[i]
                tmp_eig = eig[i]
                tmp_len += len(subset)
                if len(subset) == 0:
                    continue
                idx = np.where(np.max(tmp_eig) == tmp_eig)[0][0]
                tmp_sen = self.__paper_original[subset[idx]]
                tmp_eig.remove(tmp_eig[idx])
                subset.remove(subset[idx])
                summary.append(tmp_sen)
                sum_length += len(tmp_sen)
                if sum_length > self.max_sum_len__:
                    break
            if tmp_len == 0:
                break
        summary = [
            " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
        ]
    elif self.summary_method == "PathSumm":
        sentence_subset = range(len(self.__paper))
        path_sum = PathSum(self.__child_path, sentence_subset)
        while sum_length < self.max_sum_len__:
            sen_idx = path_sum.get_next_sentence()
            summary.append(self.__paper_original[sen_idx])
            sum_length += len(self.__paper_original[sen_idx])
        summary = [
            " ".join(("\n".join(summary)[:self.max_sum_len__]).split("\n"))
        ]
    else:
        return ""
    log.debug("summary length is: " + str(sum_length))
    log.debug("generated summary: \n" + " ".join(summary))
    return summary
def sampling(self, parameter, attitude):
    # note: the `attitude` argument is shadowed by the loop variable below
    self.__parameter_theta = parameter
    matrix_l = np.zeros([self.__data_size, self.__data_size])
    for row_i in range(self.__data_size):
        for col_j in range(row_i, self.__data_size):
            num = self.__calculate_matrix_element(row_i, col_j)
            matrix_l[row_i][col_j] = num
            matrix_l[col_j][row_i] = num
    list_y, eigenvalue = ds.sample(matrix_l)
    '''
    (eigenvalue, feature_vector) = np.linalg.eig(matrix_l)
    j = list()
    for x in range(eigenvalue.size):
        random_a = random.randrange(1, 11)
        if (eigenvalue[x] / (eigenvalue[x] + 1)) * 10 > random_a:
            j.append(x)
    matrix_y = matrix_l[j][:, np.array(j)]
    det = np.linalg.det(matrix_y)
    list_y = j
    det = 0
    while np.abs(det) > 0.000001:
        log.debug("det: " + str(det))
        v_dem = np.sqrt(matrix_y.size)
        for i in range(int(v_dem)):
            prop = 0.0
            # column j of feature_vector is the feature of element j
            for j in range(int(v_dem)):
                prop += feature_vector[j][i] ** 2
            prop /= det
            log.debug("prop: " + str(prop))
            random_a = random.randrange(1, 11)
            if prop * 10 > random_a:
                list_y.append(i)
                j.pop(i)
        feature_vector = feature_vector[j][:, j]
        log.info(feature_vector)
    '''
    ans = []
    new_f = 0.0
    for attitude in ['FAVOR', 'AGAINST', 'NONE']:
        ans = []
        for i in range(self.__data_size):
            if i in list_y:
                ans.append(attitude)
            else:
                # "NONEs" is a sentinel that matches none of the three
                # attitudes, so unselected items never count as hits
                ans.append("NONEs")
        new_f = self.get_f_score(ans, attitude)
        if self.__best_float <= new_f:
            self.__best_float = new_f
            self.__best_answer = ans
            self.__best_answer_eigenvalue = eigenvalue / np.sum(eigenvalue)
            log.debug("best_matrix function is: ")
            # log.debug(matrix_l.tolist())
            # log.debug("best_float is: " + str(ans))
            log.debug("best_answer is: " + str(ans))
    ans = []
    print len(list_y)
    # list_y = list(set(range(self.__data_size)) - set(list_y))
    print len(list_y)
    list_y = range(self.__data_size)
    for i in list_y:
        ans.append(self.__test_label[i])
    # log the (feature, eigenvalue) triples of each gold label for debugging
    for label in ['FAVOR', 'AGAINST', 'NONE']:
        tmp = []
        for i in range(len(list_y)):
            if ans[i] == label:
                tmp.append(i)
        tmp_ans = []
        for i in tmp:
            tmp_ans.append(
                np.array([
                    self.__f_x_test[i, 0], self.__f_x_test[i, 1],
                    eigenvalue[i]
                ]).tolist())
        # tmp = list(eigenvalue[tmp])
        # tmp = self.__f_x_test[tmp, :]
        log.debug(tmp_ans)
    log.debug(list(eigenvalue))
    log.debug(ans)
    return new_f, list_y, eigenvalue
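# A minimal sketch (toy input, not part of the pipeline) of the first phase
# of the standard DPP sampler that the commented-out block above attempts:
# each eigenvector index of the PSD kernel is kept independently with
# probability lambda / (lambda + 1). Only the selection step is shown, not
# the projection-DPP second phase that ds.sample presumably performs.
def _demo_select_eigenvectors(matrix_l):
    import random
    import numpy as np
    eigenvalue, _ = np.linalg.eigh(matrix_l)
    return [idx for idx, lam in enumerate(eigenvalue)
            if random.random() < lam / (lam + 1)]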
def sample_k_test(self, parameter, sentiment_test):
    self.__parameter_theta = parameter
    matrix_l = np.zeros([self.__data_size, self.__data_size])
    for row_i in range(self.__data_size):
        for col_j in range(row_i, self.__data_size):
            num = self.__calculate_matrix_element(row_i, col_j)
            print num
            matrix_l[row_i][col_j] = num
            matrix_l[col_j][row_i] = num
    size_ = self.__data_size
    ans = range(size_)
    '''
    matrix_k = matrix_l
    diff = range(size_)
    for i in range(10):
        center_idx, vec = ds.sample_k(matrix_k, 2)
        tmp = range(size_)
        log.info("center_idx: " + str(diff[center_idx[0]]) + " " + str(diff[center_idx[1]]))
        if i > 2:
            ans.append(diff[center_idx[0]])
            ans.append(diff[center_idx[1]])
        log.info("".join(self.__test_data[diff[center_idx[0]]]))
        log.info(self.__test_label[diff[center_idx[0]]])
        log.info("".join(self.__test_data[diff[center_idx[1]]]))
        log.info(self.__test_label[diff[center_idx[1]]])
        diff = list(set(tmp) - set(center_idx))
        self.__f_x_test = self.__f_x_test[diff][:]
        matrix_k = matrix_k[diff][:, diff]
        size_ = len(diff)
        log.info(matrix_l.shape)
    '''
    # sample three cluster centres with a k-DPP
    center_idx, vec = ds.sample_k(matrix_l, 3)
    log.debug(vec)
    final_answer_ = np.array(ans)[center_idx]
    log.info(final_answer_)
    log.info("".join(self.__test_data[final_answer_[0]]))
    log.info(self.__test_label[final_answer_[0]])
    log.info("".join(self.__test_data[final_answer_[1]]))
    log.info(self.__test_label[final_answer_[1]])
    log.info("".join(self.__test_data[final_answer_[2]]))
    log.info(self.__test_label[final_answer_[2]])
    cluster_1 = []
    cluster_2 = []
    cluster_3 = []
    # order the three centres by sentiment score:
    # highest -> FAVOR, middle -> NONE, lowest -> AGAINST
    score_0 = sentiment_test[center_idx[0]]
    score_1 = sentiment_test[center_idx[1]]
    score_2 = sentiment_test[center_idx[2]]
    label_0 = ''
    label_1 = ''
    label_2 = ''
    if score_0 > score_1:
        if score_0 > score_2:
            label_0 = 'FAVOR'
            if score_1 > score_2:
                label_1 = 'NONE'
                label_2 = 'AGAINST'
            else:
                label_1 = 'AGAINST'
                label_2 = 'NONE'
        else:
            label_2 = 'FAVOR'
            label_0 = 'NONE'
            label_1 = 'AGAINST'
    else:
        if score_1 > score_2:
            label_1 = 'FAVOR'
            if score_0 > score_2:
                label_0 = 'NONE'
                label_2 = 'AGAINST'
            else:
                label_0 = 'AGAINST'
                label_2 = 'NONE'
        else:
            # fixed: the original mistakenly assigned these labels to the
            # score_* variables instead of label_0/1/2
            label_2 = 'FAVOR'
            label_0 = 'AGAINST'
            label_1 = 'NONE'
    cluster_1.append(center_idx[0])
    cluster_2.append(center_idx[1])
    cluster_3.append(center_idx[2])
    # assign every item to the centre it is most similar to
    for i in range(self.__data_size):
        # sim_1 = matrix_l[i][i] * matrix_l[center_idx[0]][center_idx[0]] - matrix_l[i][center_idx[0]] ** 2
        # sim_2 = matrix_l[i][i] * matrix_l[center_idx[1]][center_idx[1]] - matrix_l[i][center_idx[1]] ** 2
        # sim_3 = matrix_l[i][i] * matrix_l[center_idx[2]][center_idx[2]] - matrix_l[i][center_idx[2]] ** 2
        # sim_1 = np.sum(np.square(vec[:][i] - vec[:][center_idx[0]]))
        # sim_2 = np.sum(np.square(vec[:][i] - vec[:][center_idx[1]]))
        # sim_3 = np.sum(np.square(vec[:][i] - vec[:][center_idx[2]]))
        sim_1 = self.__similarity[i][center_idx[0]]
        sim_2 = self.__similarity[i][center_idx[1]]
        sim_3 = self.__similarity[i][center_idx[2]]
        if sim_1 > sim_2:
            if sim_1 > sim_3:
                cluster_1.append(i)
            else:
                cluster_3.append(i)
        elif sim_2 > sim_3:
            cluster_2.append(i)
        else:
            cluster_3.append(i)
    ans = []
    for i in range(self.__data_size):
        print "i: " + str(i)
        if i in cluster_1:
            # ans.append(self.__test_label[center_idx[0]])
            ans.append(label_0)
        elif i in cluster_2:
            ans.append(label_1)
        else:
            ans.append(label_2)
    log.info('FINAL' + str(ans))
    for label in ["FAVOR", "AGAINST", "NONE"]:
        self.get_f_score(ans, label)
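# A hedged sketch of the nearest-centre assignment above, vectorised with
# numpy's argmax. `similarity` and `centers` stand in for self.__similarity
# and center_idx; tie-breaking may differ from the if/elif chain above.
def _demo_assign_clusters(similarity, centers):
    import numpy as np
    sims = np.asarray(similarity)[:, centers]  # similarity of each item to each of the 3 centres
    return np.argmax(sims, axis=1)             # 0, 1 or 2 per item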