def __cal_matrix(self, file_name=""): log.info("extract feature from pre-defined setting!") if self.__feature_method == "QD": paper_len = len(self.__paper) matrix_l = np.zeros([paper_len, paper_len]) for i in range(paper_len): for j in range(paper_len): # print "element %d, %d" % (i, j) num = self.__cal_matrix_element(i, j) matrix_l[i][j] = num matrix_l[j][i] = num elif self.__feature_method == "DM": # if file_name == "": # log.error("file name is empty, please check!") # return [] # file_path = os.path.join("./data/word2vec/remove_stop/%s.vec" % file_name) file_path = self.__child_path + "word_segment.vec" matrix_l = self.__get_doc2vec_matrix(file_path) else: log.error("self.__feature_method is " + self.__feature_method) return [] # matrix_l = self.__feature_normalization(matrix_l) if self.summary_method == "hDPP": self.__doc_matrix_ = matrix_l return matrix_l
def __quality_initial_coverage(self):
    """
    initialize the quality list using the sentence-coverage feature
    :return: None
    """
    if self.feature_merge.split("-")[3] == "0":
        return
    log.info("quality calculation: sentence coverage")
    if self.quality_method__ == "":
        self.quality_method__ += "cov"
    else:
        self.quality_method__ += "-cov"
    tmp_quality = np.zeros([len(self.__paper)])
    sen_num = len(self.__paper)
    # concatenate the distinct words of every sentence, so counting a word
    # here equals counting the number of sentences that contain it
    union_paper = " ".join(
        [" ".join(set(sen.split(" "))) for sen in self.__paper]).split(" ")
    for i in range(sen_num):
        word_list = self.__paper[i].split(" ")
        word_in_sen = [
            union_paper.count(cur_word) / float(sen_num)
            for cur_word in word_list
        ]
        tmp_quality[i] = np.sum(word_in_sen) / len(word_list)
    tmp_quality = self.__feature_normalization(tmp_quality)
    self.__quality += tmp_quality * float(self.feature_merge.split("-")[3])

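# A minimal standalone sketch of the coverage score above, assuming
# whitespace-tokenized sentences; the toy sentences and the _coverage_demo
# name are illustrative only, not part of the original module.
def _coverage_demo():
    import numpy as np
    paper = ["the cat sat", "the dog sat", "a bird flew"]
    sen_num = len(paper)
    # each word is counted once per sentence that contains it
    union_paper = " ".join(
        [" ".join(set(sen.split(" "))) for sen in paper]).split(" ")
    scores = np.zeros(sen_num)
    for i, sen in enumerate(paper):
        words = sen.split(" ")
        scores[i] = sum(union_paper.count(w) / float(sen_num)
                        for w in words) / len(words)
    # the two "the ... sat" sentences cover more frequent words, so they score higher
    print(scores)
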
def launch_multiling_single_summary(self, dic_path):
    self.__rouge_path = ini_rouge_data(
        name_suffix=self.feature_merge + "-" + "-" + self.summary_method)
    path_dir = os.listdir(dic_path)
    for cur_lang in path_dir:
        if cur_lang not in ["zh", "en"]:
            continue
        lang_dir = os.path.join("%s/%s" % (dic_path, cur_lang))
        self.__all_conf = []
        # get target length of current language
        self.__target_len = dict()
        for line in read_file(self.__target_len_dir + cur_lang + ".txt"):
            self.__target_len[line.split("_")[0]] = int(line.split(",")[1])
        # get summary of each file (cur_file)
        for cur_file in os.listdir(lang_dir):
            self.max_sum_len__ = self.__target_len[cur_file]
            child_path = os.path.join(
                '%s/%s/%s/' % (dic_path, cur_lang, cur_file))
            self.__child_path = child_path
            log.info(child_path)
            self.get_mss_paper_summary(cur_lang, cur_file)
        # write_file(self.__all_conf,
        #            self.__rouge_path + cur_lang + "/configure/.configure_all_" + cur_lang + ".txt",
        #            False)
        # if not os.path.exists(self.__rouge_path + cur_lang + "/output"):
        #     os.makedirs(self.__rouge_path + cur_lang + "/output")
    return self.__rouge_path

def __initial(self, start_idx):
    # skip the first start_idx vectors, then load __data_size of them
    doc2vec_file = open("data/input_data/qualityForTrain.vec", "r")
    i = 0
    for vector in doc2vec_file.readlines():
        if i < start_idx:
            i += 1
            continue
        list_vector = vector.strip().split(" ")
        self.__f_x_test[i - start_idx][:] = np.array(list_vector[:])
        i += 1
        if i - start_idx >= self.__data_size:
            break
    doc2vec_file.close()

    file = open("./data/input_data/taskAAForDoc2VecTrainData.txt", "r")
    # file = open("./data/input_data_test/all_test_data.txt", "r")
    datas = [
        sentence.strip().decode("utf-8").split(" ")
        for sentence in file.readlines()
    ]
    self.__test_data = [
        datas[i + start_idx] for i in range(self.__data_size)
    ]
    log.info(len(self.__test_data))
    # log.info(self.__test_data)
    file.close()

    file = open("./data/input_data/taskAAForDoc2VecTrainLabel.txt", "r")
    datas = [sentence.strip() for sentence in file.readlines()]
    self.__test_label = [
        datas[i + start_idx] for i in range(self.__data_size)
    ]
    file.close()

@staticmethod
def __feature_normalization(tmp_array):
    # normalize by the maximum value; the commented line below is the older
    # exponential variant
    log.info("feature normalization: divide by max")
    if isinstance(tmp_array, list):
        return (np.array(tmp_array) / np.max(tmp_array)).tolist()
        # return [np.exp(-1.0 * x) for x in tmp_array]
    else:
        return tmp_array / np.max(tmp_array)

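# A quick standalone check of the two normalizations above (max-division and
# the exponential variant kept commented out); _normalization_demo is an
# illustrative name only.
def _normalization_demo():
    import numpy as np
    scores = np.array([0.5, 2.0, 4.0])
    print(scores / np.max(scores))   # [0.125 0.5 1.], keeps ratios, max -> 1
    print(np.exp(-1.0 * scores))     # exponential decay: larger scores -> smaller values
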
def __similarity_calculating(self, idx_i, idx_j): if idx_i + idx_j <= 0: log.info("distance calculation: JACCARD") self.distance_method__ = "jaccard" inter_ = set(self.__paper[idx_i].split(" ")).intersection( self.__paper[idx_j].split(" ")) union_ = set(self.__paper[idx_i].split(" ")).union( self.__paper[idx_j].split(" ")) return float(len(inter_)) / float(len(union_))
def __get_doc2vec_matrix(self, path):
    log.info('use word2vec')
    self.quality_method__ = "word2vec"
    self.distance_method__ = "100"
    word2vec_matrix = read_file(path)
    # skip the leading header lines and the trailing line of the .vec file
    word2vec_matrix = word2vec_matrix[2:len(word2vec_matrix) - 1]
    self.__key_word = [vec_.split(u" ")[0] for vec_ in word2vec_matrix]
    log.debug("word2vec key words: \n" + "\t".join(self.__key_word))
    word2vec = np.array([(vec_.encode("utf-8")).split(" ")[1:]
                         for vec_ in word2vec_matrix])
    word2vec = word2vec.astype(np.float64)
    # Gram matrix of the embeddings, scaled up by a constant factor
    return word2vec.dot(word2vec.transpose()) * 1000

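# Minimal sketch of the kernel built above: stack the word vectors into V and
# take the Gram matrix V.dot(V.T), whose (i, j) entry is the inner product of
# words i and j. The vectors here are made up for illustration.
def _gram_demo():
    import numpy as np
    V = np.array([[1.0, 0.0], [0.7, 0.7], [0.0, 1.0]])  # three 2-d "word vectors"
    L = V.dot(V.T) * 1000
    print(L)  # symmetric, positive semi-definite similarity kernel
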
def __learning_single_attitude(self, attitude):
    """
    learn __parameter_theta, the parameters of the quality model,
    for one attitude label
    :param attitude: attitude label ("FAVOR", "AGAINST" or "NONE")
    :return: None
    """
    log.info('learning: ' + attitude)
    self.__parameter_theta = np.random.random(size=self.__feature_size)
    self.__sample = sp.DppSampling(100, self.__feature_size)
    self.__initiate_f_x()
    grad = self.calculate_gradient(attitude)
    best_f = 0.0
    iter_count = 0
    log.debug("grad")
    log.info(grad)
    # gradient descent until convergence or the iteration cap
    while (not self.__whether_end(grad)) and iter_count < 1000:
        log.info("iteration: " + str(iter_count))
        new_f, _, _ = self.__sample.sampling(self.__parameter_theta, attitude)
        # keep the parameters that achieved the best sampling score so far
        if new_f > best_f:
            best_f = new_f
            self.__final_parameter[attitude] = copy.deepcopy(self.__parameter_theta)
        log.debug("grad")
        log.info(grad)
        log.debug("parameter")
        log.debug(self.__parameter_theta)
        self.__parameter_theta = self.__parameter_theta - self.__step * grad
        grad = self.calculate_gradient(attitude)
        iter_count += 1
    self.__tmp_eigenvalue[attitude] = copy.deepcopy(self.__sample.get_best_eigenvalue())
    self.__tmp_answer[attitude] = copy.deepcopy(self.__sample.get_best_answer())

def __quality_initial_position(self):
    if self.feature_merge.split("-")[0] == "0":
        return
    log.info("quality calculation: position")
    if self.quality_method__ == "":
        self.quality_method__ += "pos"
    else:
        self.quality_method__ += "-pos"
    tmp_quality = np.zeros([len(self.__paper)])
    # earlier sentences get a higher score, decaying linearly
    for i in range(len(self.__paper)):
        tmp_quality[i] = 1.0 - float(i) / float(len(self.__paper))
    # tmp_quality = self.__feature_normalization(tmp_quality)
    self.__quality += tmp_quality * float(self.feature_merge.split("-")[0])

def __quality_initial_level(self):
    if self.feature_merge.split("-")[4] == "0":
        return
    log.info("quality calculation: hLDA level")
    log.info("get level score: " + self.__child_path)
    if self.quality_method__ == "":
        self.quality_method__ += "lev"
    else:
        self.quality_method__ += "-lev"
    # lazily construct the hLDA level scorer
    if self.__level_tmp is None:
        self.__level_tmp = LevelScore()
    tmp_level = self.__level_tmp.get_paper_level_score(self.__child_path)
    tmp_level = self.__feature_normalization(tmp_level)
    self.__quality += (np.array(tmp_level) *
                       float(self.feature_merge.split("-")[4]))

def __calculate_similarity(self, idx_i, idx_j):
    '''
    if self.__similarity is None:
        log.info("initiating similarity")
        self.__similarity = self.__f_x_test.dot(
            self.__f_x_test.transpose())
    return self.__similarity[idx_i][idx_j]
    '''
    # build the full symmetric similarity matrix once, then serve lookups
    if self.__similarity is None:
        log.info("initiating similarity")
        self.__similarity = np.zeros([self.__data_size, self.__data_size])
        for i in range(self.__data_size):
            for j in range(i, self.__data_size):
                self.__similarity[i][j] = self.__similarity__method(i, j)
                self.__similarity[j][i] = self.__similarity[i][j]
    return self.__similarity[idx_i][idx_j]

def launch_multiling_single_summary(self, dic_path):
    # MMS-2015
    # test = PreProcessing()
    # self.__rouge_path = test.ini_rouge_data(name_suffix=self.feature_merge)
    log.info(dic_path)
    path_dir = os.listdir(dic_path)
    log.info(path_dir)
    for cur_cluster in path_dir:
        log.info(cur_cluster)
        cluster_dir = os.path.join("%s/%s/" % (dic_path, cur_cluster))
        self.__all_conf = []
        # get summary of each cluster directory
        self.__child_path = cluster_dir
        log.info(cluster_dir)
        self.get_mss_paper_summary(cur_cluster)
    return ""

def __init__(self, data_size, feature_size):
    log.info("dpp learning module")
    self.__data_size = data_size
    self.__feature_size = feature_size
    self.__train_label = []
    self.__train_data = []
    # self.__parameter_theta = np.zeros(feature_size)
    self.__parameter_theta = None  # quality-model parameters, set during learning
    self.__matrix_l = np.zeros([data_size, data_size])  # DPP kernel L
    self.__similarity = None
    self.__f_x = np.zeros([data_size, self.__feature_size])  # per-item feature vectors
    self.__label = np.zeros(data_size)
    self.__step = 0.01  # gradient-descent step size
    self.__tmp_answer = dict()
    self.__final_parameter = dict()
    self.__tmp_eigenvalue = dict()
    self.__sample = None

def get_mss_paper_summary(self, file_name, if_write_file=True):
    """
    generate summary for one paper, single-document summarization
    :param file_name: current file name, used to write the summary answer
    :param if_write_file: whether to write the generated summary to an answer
                          file named file_name
    :return: the generated summary as a single string
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentence candidates
    feature_subset, eig = self.__cal_candidate_set()
    # print len(feature_subset)
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    log.info(feature_subset)
    log.debug(eig)
    # use the candidate list to construct the summary
    summary = self.__construct_summary(feature_subset, eig)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        # print self.__rouge_path
        answer_path = self.__child_path
        write_file(
            summary,
            os.path.join('%s/%s.txt' % (answer_path, file_name + '_result')),
            False)
    return "".join(summary)

def __initiate_f_x(self):
    log.info("initial f(x)")
    # load the first __data_size doc2vec feature vectors
    doc2vec_file = open("data/input_data/qualityForTrain.vec", "r")
    i = 0
    for vector in doc2vec_file.readlines():
        list_vector = vector.strip().split(" ")
        self.__f_x[i][:] = np.array(list_vector[:])
        i += 1
        if i >= self.__data_size:
            break
    doc2vec_file.close()

    sentence_file = open("data/input_data/taskAAForDoc2VecTrainData.txt")
    self.__train_data = [sentence.strip().decode("utf-8").split(" ")
                         for sentence in sentence_file.readlines()]
    sentence_file.close()

    train_label_file = open("data/input_data/taskAAForDoc2VecTrainLabel.txt")
    self.__train_label = [sentence.strip()
                          for sentence in train_label_file.readlines()]
    train_label_file.close()

def __quality_initial_length(self):
    if self.feature_merge.split("-")[1] == "0":
        return
    log.info("quality calculation: length")
    if self.quality_method__ == "":
        self.quality_method__ += "len"
    else:
        self.quality_method__ += "-len"
    tmp_quality = np.zeros([len(self.__paper)])
    for i in range(len(self.__paper)):
        tmp_quality[i] = len(self.__paper[i].replace(" ", ""))
    # Gaussian score: sentences near the mean length score highest
    mean = tmp_quality.sum() / float(len(self.__paper))
    var = np.cov(tmp_quality)
    for i in range(len(self.__paper)):
        tmp_quality[i] = np.exp(
            (-1 * np.square(tmp_quality[i] - mean)) / var)
    # tmp_quality = self.__feature_normalization(tmp_quality)
    self.__quality += tmp_quality * float(self.feature_merge.split("-")[1])

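# Standalone sketch of the Gaussian length score above: character lengths are
# mapped through exp(-(x - mean)^2 / var), so average-length sentences score
# near 1 while very short or very long ones decay toward 0. Toy lengths only.
def _length_score_demo():
    import numpy as np
    lengths = np.array([5.0, 20.0, 22.0, 60.0])
    mean, var = lengths.mean(), np.cov(lengths)  # np.cov of a 1-d array is its variance
    print(np.exp(-np.square(lengths - mean) / var))
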
def __cal_candidate_set(self):
    matrix_l = self.__cal_matrix()
    subset_ = []
    eigenvalue = []
    try:
        if self.candidate_method == "DR":
            subset_, eigenvalue = ds.sample(matrix_l)
        elif self.candidate_method == "CLU-DPP":
            cluster = hlda_analysis.sentence_cluster(
                self.__child_path, "run000")
            # debug hLDA message, including the total cluster number and
            # each cluster's sentences
            i = 0
            tmp = ""
            log.info("cluster number: " + str(len(cluster)))
            for sen_list in cluster:
                tmp += "\n cluster: " + str(i) + "\tsentence_num is " + str(len(sen_list)) + "\n"
                tmp += "\n".join(np.array(self.__paper_original)[sen_list])
                i += 1
            log.debug(tmp)
            # sample sentences cluster by cluster
            for i in range(len(cluster) / 2):
                sen_list = cluster[i]
                tmp_matrix = matrix_l[sen_list][:, sen_list]
                tmp_set, eig = ds.sample(tmp_matrix)
                # small clusters are kept whole
                if len(sen_list) < 10:
                    subset_.append(sen_list)
                    eigenvalue.append(eig)
                    continue
                subset_.append(np.array(sen_list)[tmp_set].tolist())
                eigenvalue.append(np.array(eig)[tmp_set].tolist())
        elif self.candidate_method == "RANDOM":
            for i in range(20):
                subset_.append(
                    np.random.randint(0, len(self.__paper_original)))
        else:
            raise RuntimeError("value error: " + self.candidate_method)
    except RuntimeError as e:
        log.error(e)
    finally:
        return subset_, eigenvalue

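# The fancy-indexing idiom used above, matrix_l[sen_list][:, sen_list], pulls
# out the principal submatrix, i.e. the kernel restricted to one cluster's
# sentences. A quick standalone illustration with a made-up 4x4 matrix:
def _submatrix_demo():
    import numpy as np
    M = np.arange(16).reshape(4, 4)
    sen_list = [0, 2]
    print(M[sen_list][:, sen_list])  # rows 0,2 then columns 0,2 -> [[0 2] [8 10]]
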
def __similarity_calculating_wordnet(self, idx_i, idx_j):
    if idx_i + idx_j <= 0:
        log.info("distance calculation: WordNet")
        self.distance_method__ = "WordNet"
    list1 = self.__paper[idx_i].split()
    list2 = self.__paper[idx_j].split()
    # list1 = ['RAM', 'keeps', 'things', 'being', 'worked', 'with']
    # list2 = ['The', 'CPU', 'uses', 'RAM', 'as', 'a', 'short-term', 'memory', 'store']
    vec = []
    bylist = [list1, list2]
    totallist = list(set(list1 + list2))
    # build one vector per sentence over the union vocabulary: 1.0 for an
    # exact word match, otherwise the best WordNet path similarity to any
    # word in the sentence
    for i in range(len(bylist)):
        vector = np.zeros(len(totallist))
        for j in range(len(totallist)):
            if totallist[j] in bylist[i]:
                vector[j] = 1
                continue
            tmp_vec = []
            synsets1 = wn.synsets(totallist[j])
            if len(synsets1) == 0:
                vector[j] = 0.0
                continue
            for word in bylist[i]:
                synsets2 = wn.synsets(word)
                if len(synsets2) == 0:
                    continue
                tmp_score = synsets1[0].path_similarity(synsets2[0])
                if tmp_score is not None:
                    tmp_vec.append(tmp_score)
            if len(tmp_vec) == 0:
                vector[j] = 0.0
            else:
                vector[j] = max(tmp_vec)
        vec.append(vector)
    # cosine similarity between the two sentence vectors
    l_1 = np.sqrt(vec[0].dot(vec[0]))
    l_2 = np.sqrt(vec[1].dot(vec[1]))
    cos_angle = vec[0].dot(vec[1]) / (l_1 * l_2)
    return cos_angle

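# Small illustration of the WordNet scoring used above (requires the NLTK
# wordnet corpus to be downloaded); the words are chosen only for demonstration.
def _wordnet_demo():
    from nltk.corpus import wordnet as wn
    s1 = wn.synsets('cat')[0]
    s2 = wn.synsets('dog')[0]
    # path_similarity lies in (0, 1]; identical synsets give 1.0
    print(s1.path_similarity(s2))
    print(s1.path_similarity(s1))
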
def __cal_matrix(self, file_name=""): log.info("extract feature from pre-defined setting!") if self.__feature_method == "QD": paper_len = len(self.__paper) matrix_l = np.zeros([paper_len, paper_len]) for i in range(paper_len): for j in range(paper_len): if i > j: continue num = self.__cal_matrix_element(i, j) matrix_l[i][j] = num matrix_l[j][i] = num elif self.__feature_method == "DM": file_path = self.__child_path + "word_segment.vec" matrix_l = self.__get_doc2vec_matrix(file_path) else: log.error("self.__feature_method is " + self.__feature_method) return [] # matrix_l = self.__feature_normalization(matrix_l) if self.summary_method == "hDPP": self.__doc_matrix_ = matrix_l return matrix_l
def __quality_initial_similarity(self):
    if self.feature_merge.split("-")[2] == "0":
        return
    log.info("quality calculation: similarity")
    if self.quality_method__ == "":
        self.quality_method__ += "sim"
    else:
        self.quality_method__ += "-sim"
    tmp_quality = np.zeros([len(self.__paper)])
    # calculate quality as the tf-idf similarity between each sentence and
    # the title
    title = " ".join(self.__titles)
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    corpus = [title]
    corpus.extend(self.__paper)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # word = vectorizer.get_feature_names()  # all words
    weight = tfidf.toarray()
    # row 0 is the title; each sentence scores by its dot product with it
    for i in range(len(self.__paper)):
        tmp_quality[i] = np.array(weight[i + 1]).dot(np.array(weight[0]))
    tmp_quality = self.__feature_normalization(tmp_quality)
    self.__quality += tmp_quality * float(self.feature_merge.split("-")[2])

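# Minimal standalone sketch of the title-similarity score above, using the
# same sklearn pipeline; the title and sentences are made up for illustration.
def _title_similarity_demo():
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    corpus = ["deep learning for summarization",      # title (row 0)
              "we study deep learning models",
              "the weather was pleasant today"]
    weight = TfidfTransformer().fit_transform(
        CountVectorizer().fit_transform(corpus)).toarray()
    for i in range(1, len(corpus)):
        # sentence 1 shares words with the title, sentence 2 barely overlaps
        print(np.dot(weight[i], weight[0]))
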
def calculate_gradient(self, attitude):
    # compute L(x; theta) as in equation (155)
    log.debug(self.__parameter_theta)
    log.info("computing matrix L")
    for row_i in range(self.__data_size):
        for col_j in range(self.__data_size):
            self.__matrix_l[row_i][col_j] = self.__calculate_matrix_element(row_i, col_j)
    # eigendecompose L(x; theta)
    log.info("eigendecomposing")
    log.debug("matrix value: " + str(np.linalg.det(self.__matrix_l)))
    (eigenvalue, feature_vector) = np.linalg.eig(self.__matrix_l)
    # zero out numerically negligible eigenvalues
    for i in range(len(eigenvalue)):
        eigenvalue[i] = float(eigenvalue[i])
        if np.abs(eigenvalue[i]) < 1e-9:
            eigenvalue[i] = 0.0
    log.debug("eigenvalue")
    log.debug(eigenvalue)
    # calculate the marginal inclusion probabilities:
    # K_ii = sum_j (lambda_j / (lambda_j + 1)) * v_j(i)^2
    log.info("calculating Kii")
    vector_k = np.zeros(self.__data_size)
    log.debug("feature value matrix")
    for i in range(self.__data_size):
        for j in range(self.__data_size):
            vector_k[i] += ((eigenvalue[j] / (eigenvalue[j] + 1)) *
                            (feature_vector[i][j] ** 2))
    # log.debug("Kii: " + str(vector_k))
    # gradient: sum of f(x_i) over items labeled with this attitude,
    # minus the K_ii-weighted sum over all items
    log.info("calculating gradient")
    sigma_sub_f_x = np.zeros(self.__feature_size)
    for index in range(self.__data_size):
        if self.__train_label[index] == attitude:
            sigma_sub_f_x += self.__f_x[index]
    sigma_kii_f_x = np.zeros(self.__feature_size)
    for i in range(self.__data_size):
        sigma_kii_f_x += vector_k[i] * self.__f_x[i]
    return sigma_sub_f_x - sigma_kii_f_x

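# Sanity-check sketch for the K_ii computation above: the marginal kernel can
# also be obtained directly as K = L (L + I)^(-1), and its diagonal should
# match the eigendecomposition formula. A random PSD kernel is used here
# purely for illustration.
def _kii_check():
    import numpy as np
    rng = np.random.RandomState(0)
    B = rng.rand(4, 3)
    L = B.dot(B.T)                                  # random PSD kernel
    eigenvalue, feature_vector = np.linalg.eigh(L)  # eigh, since L is symmetric
    k_diag = np.zeros(4)
    for i in range(4):
        for j in range(4):
            k_diag[i] += (eigenvalue[j] / (eigenvalue[j] + 1)) * feature_vector[i][j] ** 2
    K = L.dot(np.linalg.inv(L + np.eye(4)))
    print(np.allclose(k_diag, np.diag(K)))          # True
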
def __calculate_similarity(self, idx_i, idx_j): if self.__similarity is None: log.info("initiating similarity") self.__similarity = self.__f_x.dot(self.__f_x.transpose().reshape(self.__data_size, self.__feature_size)) return self.__similarity[idx_i][idx_j] '''
def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
    """
    generate summary for one paper, single-document summarization
    :param lang: language of the current file
    :param file_name: current file name, used to write the summary answer
    :param if_write_file: whether to write the generated summary to an answer
                          file named file_name
    :return: the generated summary as a single string
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    '''
    if DATA == "mms2015":
        self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
    elif DATA == "mss2017":
        if lang in ["vi", "ka"]:
            self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
        else:
            self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
    self.__paper_original = self.__all_file.get_merged_paper()
    if self.stop_word_method == "remove_stop":
        self.__paper = self.__all_file.get_filtered_paper()
    elif self.stop_word_method == "with_stop":
        self.__paper = self.__all_file.get_merged_paper()
    self.__titles = self.__all_file.get_titles()

    # used to generate the hLDA input file and the level-score method
    if (not os.path.exists(self.__child_path + "model.temp")) or False:
        write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
        write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
        model_temp(self.__paper, self.__child_path)
        return ""
    '''
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentence candidates
    feature_subset, eig = self.__cal_candidate_set()
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    log.info("results:")
    log.info(feature_subset)
    log.debug(eig)
    # use the candidate list to construct the summary
    summary = self.__construct_summary(feature_subset, eig, lang)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        answer_path = self.__rouge_path + lang + "/systems/"
        write_file(summary,
                   os.path.join('%s%s.txt' % (answer_path, file_name)),
                   False)
    '''
    # generate gold summary split by CHAR
    gold_path = self.__rouge_path + lang + "/models/"
    if not os.path.exists(gold_path):
        os.makedirs(gold_path)
    tmp_name = lang + "/" + file_name + "_summary.txt"
    abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
    if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
        write_file([" ".join(api.tokenize("\n".join(abs_human)))],
                   gold_path + file_name + "_summary.txt", False)
    if lang == "vi":
        write_file(abs_human, gold_path + file_name + "_summary.txt", False)

    # generate configure file of each document for ROUGE
    conf_path = self.__rouge_path + lang + "/configure/"
    if not os.path.exists(conf_path):
        os.makedirs(conf_path)
    tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
    self.__all_conf.append(tmp_conf_)
    write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
    '''
    return "".join(summary)

def sample_k_test(self, parameter, sentiment_test):
    self.__parameter_theta = parameter
    # build the symmetric kernel L from the given parameters
    matrix_l = np.zeros([self.__data_size, self.__data_size])
    for row_i in range(self.__data_size):
        for col_j in range(row_i, self.__data_size):
            num = self.__calculate_matrix_element(row_i, col_j)
            log.debug(num)
            matrix_l[row_i][col_j] = num
            matrix_l[col_j][row_i] = num
    size_ = self.__data_size
    ans = range(size_)
    '''
    matrix_k = matrix_l
    diff = range(size_)
    for i in range(10):
        center_idx, vec = ds.sample_k(matrix_k, 2)
        tmp = range(size_)
        log.info("center_idx: " + str(diff[center_idx[0]]) + " " + str(diff[center_idx[1]]))
        if i > 2:
            ans.append(diff[center_idx[0]])
            ans.append(diff[center_idx[1]])
        log.info("".join(self.__test_data[diff[center_idx[0]]]))
        log.info(self.__test_label[diff[center_idx[0]]])
        log.info("".join(self.__test_data[diff[center_idx[1]]]))
        log.info(self.__test_label[diff[center_idx[1]]])
        diff = list(set(tmp) - set(center_idx))
        self.__f_x_test = self.__f_x_test[diff][:]
        matrix_k = matrix_k[diff][:, diff]
        size_ = len(diff)
        log.info(matrix_l.shape)
    '''
    # sample three cluster centers with a k-DPP
    center_idx, vec = ds.sample_k(matrix_l, 3)
    log.debug(vec)
    final_answer_ = np.array(ans)[center_idx]
    log.info(final_answer_)
    log.info("".join(self.__test_data[final_answer_[0]]))
    log.info(self.__test_label[final_answer_[0]])
    log.info("".join(self.__test_data[final_answer_[1]]))
    log.info(self.__test_label[final_answer_[1]])
    log.info("".join(self.__test_data[final_answer_[2]]))
    log.info(self.__test_label[final_answer_[2]])
    cluster_1 = []
    cluster_2 = []
    cluster_3 = []
    # rank the three centers by sentiment score:
    # highest -> FAVOR, middle -> NONE, lowest -> AGAINST
    score_0 = sentiment_test[center_idx[0]]
    score_1 = sentiment_test[center_idx[1]]
    score_2 = sentiment_test[center_idx[2]]
    label_0 = ''
    label_1 = ''
    label_2 = ''
    if score_0 > score_1:
        if score_0 > score_2:
            label_0 = 'FAVOR'
            if score_1 > score_2:
                label_1 = 'NONE'
                label_2 = 'AGAINST'
            else:
                label_1 = 'AGAINST'
                label_2 = 'NONE'
        else:
            label_2 = 'FAVOR'
            label_0 = 'NONE'
            label_1 = 'AGAINST'
    else:
        if score_1 > score_2:
            label_1 = 'FAVOR'
            if score_0 > score_2:
                label_0 = 'NONE'
                label_2 = 'AGAINST'
            else:
                label_0 = 'AGAINST'
                label_2 = 'NONE'
        else:
            label_2 = 'FAVOR'
            label_0 = 'AGAINST'
            label_1 = 'NONE'
    cluster_1.append(center_idx[0])
    cluster_2.append(center_idx[1])
    cluster_3.append(center_idx[2])
    # assign every item to the most similar center
    for i in range(self.__data_size):
        # sim_1 = matrix_l[i][i] * matrix_l[center_idx[0]][center_idx[0]] - matrix_l[i][center_idx[0]] ** 2
        # sim_2 = matrix_l[i][i] * matrix_l[center_idx[1]][center_idx[1]] - matrix_l[i][center_idx[1]] ** 2
        # sim_3 = matrix_l[i][i] * matrix_l[center_idx[2]][center_idx[2]] - matrix_l[i][center_idx[2]] ** 2
        # sim_1 = np.sum(np.square(vec[:][i] - vec[:][center_idx[0]]))
        # sim_2 = np.sum(np.square(vec[:][i] - vec[:][center_idx[1]]))
        # sim_3 = np.sum(np.square(vec[:][i] - vec[:][center_idx[2]]))
        sim_1 = self.__similarity[i][center_idx[0]]
        sim_2 = self.__similarity[i][center_idx[1]]
        sim_3 = self.__similarity[i][center_idx[2]]
        if sim_1 > sim_2:
            if sim_1 > sim_3:
                cluster_1.append(i)
            else:
                cluster_3.append(i)
        elif sim_2 > sim_3:
            cluster_2.append(i)
        else:
            cluster_3.append(i)
    ans = []
    for i in range(self.__data_size):
        log.debug("i: " + str(i))
        if i in cluster_1:
            # ans.append(self.__test_label[center_idx[0]])
            ans.append(label_0)
        elif i in cluster_2:
            ans.append(label_1)
        else:
            ans.append(label_2)
    log.info('FINAL' + str(ans))
    for label in ["FAVOR", "AGAINST", "NONE"]:
        self.get_f_score(ans, label)

def learning(self):
    for label in ["FAVOR", "AGAINST", "NONE"]:
        self.__learning_single_attitude(label)
    right = 0.0
    ans = []
    log.info("answer")
    log.debug(self.__tmp_answer)
    log.info(self.__tmp_answer["FAVOR"])
    log.info(self.__tmp_eigenvalue["FAVOR"])
    log.info(self.__tmp_answer["AGAINST"])
    log.info(self.__tmp_eigenvalue["AGAINST"])
    log.info(self.__tmp_answer["NONE"])
    log.info(self.__tmp_eigenvalue["NONE"])
    tmp = []
    for i in range(100):
        a = list()
        a.append(self.__tmp_answer["FAVOR"][i])
        a.append(self.__tmp_answer["AGAINST"][i])
        a.append(self.__tmp_answer["NONE"][i])
        count = 0
        label = ""
        # if exactly two of the three per-attitude answers are "NONE",
        # the remaining answer decides the label
        for j in range(3):
            if a[j] == "NONE":
                count += 1
            else:
                label = a[j]
        tmp.append(str(self.__tmp_eigenvalue["FAVOR"][i]) + '\t' +
                   str(self.__tmp_eigenvalue["AGAINST"][i]) + '\t' +
                   str(self.__tmp_eigenvalue["NONE"][i]))
        if count == 2:
            right += 1
            ans.append(label)
        else:
            # otherwise fall back to the attitude with the largest eigenvalue
            favor_value = self.__tmp_eigenvalue["FAVOR"][i]
            against_value = self.__tmp_eigenvalue["AGAINST"][i]
            none_value = self.__tmp_eigenvalue["NONE"][i]
            if favor_value > against_value:
                if favor_value > none_value:
                    ans.append("FAVOR")
                else:
                    ans.append("NONE")
            else:
                if against_value > none_value:
                    ans.append("AGAINST")
                else:
                    ans.append("NONE")
    log.info(tmp)
    log.info("final_answer: " + str(ans))
    log.info("final parameters: ")
    log.info(self.__final_parameter)
    for label in ["FAVOR", "AGAINST", "NONE"]:
        self.__sample.get_f_score(ans, label)