def __cal_matrix(self, file_name=""):
    """Build the sentence-by-sentence feature matrix L.

    :param file_name: unused here; kept for interface compatibility
    :return: numpy matrix for feature methods "QD"/"DM", or [] when
        self.__feature_method is unrecognised
    """
    log.info("extract feature from pre-defined setting!")
    if self.__feature_method == "QD":
        paper_len = len(self.__paper)
        matrix_l = np.zeros([paper_len, paper_len])
        for i in range(paper_len):
            for j in range(paper_len):
                # BUG FIX: the matrix is symmetric and both [i][j] and
                # [j][i] are assigned below, so every element used to be
                # computed twice. Skip the lower triangle (matches the
                # other __cal_matrix variant in this file).
                if i > j:
                    continue
                num = self.__cal_matrix_element(i, j)
                matrix_l[i][j] = num
                matrix_l[j][i] = num
    elif self.__feature_method == "DM":
        # doc2vec feature matrix read from the pre-computed vector file
        file_path = self.__child_path + "word_segment.vec"
        matrix_l = self.__get_doc2vec_matrix(file_path)
    else:
        log.error("self.__feature_method is " + self.__feature_method)
        return []
    # matrix_l = self.__feature_normalization(matrix_l)
    if self.summary_method == "hDPP":
        # hDPP keeps the document matrix for later reuse
        self.__doc_matrix_ = matrix_l
    return matrix_l
def write_file(file_conent, file_path, if_convert=True):
    """Write a list of unicode lines to file_path as UTF-8, one per line.

    :param file_conent: iterable of unicode lines (without trailing newline)
    :param file_path: destination path
    :param if_convert: historical GB2312-conversion flag; currently unused
    """
    try:
        # BUG FIX: open in binary mode because encoded bytes are written
        # below (the original text-mode 'w' only worked on Python 2).
        output = open(file_path, 'wb')
    except IOError as e:
        log.error(e)
        exit()
    # BUG FIX: the write loop used to sit in a `finally:` clause, so after a
    # failed open (`output` never bound) it raised NameError and masked the
    # SystemExit from exit(). Writing now happens only on a successful open,
    # and `finally` is reserved for closing the handle.
    try:
        for cur_line in file_conent:
            # if if_convert:
            #     output.write(new_line.decode('utf-8').encode('GB2312', 'ignore'))
            output.write((cur_line + u"\n").encode("utf-8"))
    finally:
        output.close()
def get_mss_paper_summary(self, file_name, if_write_file=True):
    """Generate a single-document summary for one paper.

    :param file_name: current file name, used when writing the answer file
    :param if_write_file: when True, write the generated summary to
        "<file_name>_result.txt" under the working directory
    :return: the summary joined into one string, or "" when if_write_file
        is set but file_name is empty
    """
    # reset per-paper state
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""

    # pick the sentence source according to the stop-word setting
    paper_sources = {
        "remove_stop": "RemoveStop.temp",
        "with_stop": "word_segment.temp",
    }
    if self.stop_word_method in paper_sources:
        self.__paper = read_file(
            self.__child_path + paper_sources[self.stop_word_method])
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]

    # candidate sentence extraction
    feature_subset, eig = self.__cal_candidate_set()
    log.info(feature_subset)
    log.debug(eig)

    # assemble the final summary from the candidate list
    summary = self.__construct_summary(feature_subset, eig)

    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        answer_path = self.__child_path
        write_file(
            summary,
            os.path.join('%s/%s.txt' % (answer_path, file_name + '_result')),
            False)
    return "".join(summary)
def read_file(file_path):
    """Read file_path, normalise line breaks via the module pattern, and
    return its non-empty lines as stripped unicode strings.

    On an IOError the error is logged and an empty list is returned.
    """
    lines = list()
    try:
        handle = open(file_path, 'rb')
    except IOError as e:
        log.error(e)
        return lines
    try:
        # collapse whatever the pattern matches into plain newlines
        content = __pattern.sub('\n', handle.read())
        lines = [
            raw.strip().decode("utf-8")
            for raw in content.split('\n')
            if raw != ""
        ]
    finally:
        handle.close()
    return lines
def __cal_candidate_set(self):
    """Select the candidate sentence subset (and eigenvalues) according to
    self.candidate_method.

    Supported methods:
      * "DR"      -- one DPP sample over the whole feature matrix
      * "CLU-DPP" -- hLDA sentence clustering, then a DPP sample per cluster
                     (only the first half of the clusters is used)
      * "RANDOM"  -- 20 random sentence indices, no eigenvalues

    :return: (subset, eigenvalue) lists. The ``finally: return`` deliberately
        swallows the logged RuntimeError, so partial results may be returned.
    """
    matrix_l = self.__cal_matrix()
    subset_ = []
    eigenvalue = []
    try:
        if self.candidate_method == "DR":
            subset_, eigenvalue = ds.sample(matrix_l)
        elif self.candidate_method == "CLU-DPP":
            cluster = hlda_analysis.sentence_cluster(
                self.__child_path, "run000")
            # debug hLDA message: total cluster number and each cluster's
            # sentences
            i = 0
            tmp = ""
            log.debug("cluster number: " + str(len(cluster)))
            for sen_list in cluster:
                tmp += "\n cluster: " + str(
                    i) + "\tsentence_num is " + str(len(sen_list)) + "\n"
                tmp += "\n".join(np.array(self.__paper_original)[sen_list])
                i += 1
            log.debug(tmp)
            # BUG FIX: floor division so the loop also works under Python 3
            # (len(cluster) / 2 is a float there); value unchanged on
            # Python 2. Only the first half of the clusters is sampled.
            for i in range(len(cluster) // 2):
                sen_list = cluster[i]
                tmp_matrix = matrix_l[sen_list][:, sen_list]
                tmp_set, eig = ds.sample(tmp_matrix)
                if len(sen_list) < 10:
                    # small clusters are kept whole
                    # NOTE(review): eig comes from the DPP sample, so its
                    # length may not match sen_list here -- confirm intended.
                    subset_.append(sen_list)
                    eigenvalue.append(eig)
                    continue
                subset_.append(np.array(sen_list)[tmp_set].tolist())
                eigenvalue.append(np.array(eig)[tmp_set].tolist())
        elif self.candidate_method == "RANDOM":
            for i in range(20):
                subset_.append(
                    np.random.randint(0, len(self.__paper_original)))
        else:
            raise RuntimeError("value error: " + self.candidate_method)
    except RuntimeError as e:
        log.error(e)
    finally:
        return subset_, eigenvalue
def __quality_calculating(self, idx):
    """Return the quality score of sentence ``idx``.

    The per-sentence quality vector is computed lazily on the first call by
    running the feature initialisers (length, coverage, position, level,
    similarity) that accumulate into self.__quality, then cached.

    :param idx: index of the sentence (matrix element)
    :return: quality value for that sentence
    """
    if self.__paper is None:
        # BUG FIX: the original logged an empty string here, leaving no clue
        # why the process exited.
        log.error("paper is not loaded, cannot calculate quality")
        sys.exit()
    if self.__quality is not None:
        # already computed for this paper -- serve from cache
        return self.__quality[idx]
    self.__quality = np.zeros([len(self.__paper)])
    self.__quality_initial_length()
    self.__quality_initial_coverage()
    self.__quality_initial_position()
    self.__quality_initial_level()
    self.__quality_initial_similarity()
    # self.__quality /= 2.0
    return self.__quality[idx]
def __cal_matrix(self, file_name=""):
    """Build the sentence-by-sentence feature matrix for the configured
    feature method ("QD" or "DM"); returns [] for any other setting.

    :param file_name: unused here; kept for interface compatibility
    """
    log.info("extract feature from pre-defined setting!")
    if self.__feature_method == "QD":
        n = len(self.__paper)
        matrix_l = np.zeros([n, n])
        # symmetric matrix: fill the upper triangle and mirror each value
        for row in range(n):
            for col in range(row, n):
                value = self.__cal_matrix_element(row, col)
                matrix_l[row][col] = value
                matrix_l[col][row] = value
    elif self.__feature_method == "DM":
        matrix_l = self.__get_doc2vec_matrix(
            self.__child_path + "word_segment.vec")
    else:
        log.error("self.__feature_method is " + self.__feature_method)
        return []
    # matrix_l = self.__feature_normalization(matrix_l)
    if self.summary_method == "hDPP":
        self.__doc_matrix_ = matrix_l
    return matrix_l
def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
    """Generate a summary for one paper (single-document summarization).

    :param lang: language code, used to pick the ROUGE answer directory
    :param file_name: current file name, used for writing the summary answer
    :param if_write_file: whether to write the generated summary to the
        answer file named file_name
    :return: the summary joined into one string, or "" when if_write_file
        is set but file_name is empty
    """
    # reset per-paper state
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    '''
    if DATA == "mms2015":
        self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
    elif DATA == "mss2017":
        if lang in ["vi", "ka"]:
            self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
        else:
            self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
    self.__paper_original = self.__all_file.get_merged_paper()
    if self.stop_word_method == "remove_stop":
        self.__paper = self.__all_file.get_filtered_paper()
    elif self.stop_word_method == "with_stop":
        self.__paper = self.__all_file.get_merged_paper()
    self.__titles = self.__all_file.get_titles()
    # used for generate hLDA input file and calculate level method.
    if (not os.path.exists(self.__child_path + "model.temp")) or False:
        write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
        write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
        model_temp(self.__paper, self.__child_path)
        return ""
    '''
    # load the (optionally stop-word-filtered) sentence lists
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentence
    feature_subset, eig = self.__cal_candidate_set()
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    # BUG FIX: this is a normal progress message, not an error -- it was
    # previously logged at ERROR level.
    log.info("results is: ")
    log.info(feature_subset)
    log.debug(eig)
    # use feature list to extract summary
    summary = self.__construct_summary(feature_subset, eig, lang)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        answer_path = self.__rouge_path + lang + "/systems/"
        write_file(summary,
                   os.path.join('%s%s.txt' % (answer_path, file_name)),
                   False)
        '''
        # generate gold summary split by CHAR
        gold_path = self.__rouge_path + lang + "/models/"
        if not os.path.exists(gold_path):
            os.makedirs(gold_path)
        tmp_name = lang + "/" + file_name + "_summary.txt"
        abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
        if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
            write_file([" ".join(api.tokenize("\n".join(abs_human)))],
                       gold_path + file_name + "_summary.txt", False)
        if lang == "vi":
            write_file(abs_human, gold_path + file_name + "_summary.txt", False)
        # generate configure file of each document for ROUGE
        conf_path = self.__rouge_path + lang + "/configure/"
        if not os.path.exists(conf_path):
            os.makedirs(conf_path)
        tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
        self.__all_conf.append(tmp_conf_)
        write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
        '''
    return "".join(summary)