def merge_loop(double_set, root_name, file=None):
    """
    Merge the cliques repeatedly, looping until no further merge is possible.
    :param double_set: the strongly correlated pairwise combinations
    :param root_name: list of word roots
    :param file: target path for dumping the clustering result
    :return: the maximum clique size and the final cliques
    """
    best_set = set()
    old_set = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(old_set) > 0:  # old_set holds the cliques that still need to be merged
        print('members per clique:', len(list(old_set)[0]))  # size of the cliques in old_set
        print('number of cliques:', len(old_set))            # how many cliques old_set contains
        print(old_set)
        num_list.append(len(list(old_set)[0]))
        count_list.append(len(old_set))
        group_list.append(old_set)
        best_set = old_set
        # merge_group returns the newly formed cliques, which are merged again in the next pass;
        # once old_set comes back empty the clustering has converged and no larger clique can be built
        old_set = merge_group(old_set, double_set)
    if file is not None:
        group_list = index_2_word(root_name, group_list)
        write_csv(['members', 'count', 'clique'], file, [num_list, count_list, group_list])
        save_pickle(file + '.pkl', group_list)
    print(best_set)
    return len(list(best_set)[0]), best_set
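# Hedged sketch (not part of the original module): merge_loop treats each clique as a frozenset of
# root indices and relies on merge_group() to build the next-larger cliques. The helper below only
# illustrates the assumed contract -- two size-k cliques that differ in a single member are unioned
# into a size-(k + 1) candidate, which is kept when every pair inside it is also a strongly
# correlated combination -- the real merge_group implementation may differ.
import itertools


def merge_group_sketch(old_set, double_set):
    new_set = set()
    for a in old_set:
        for b in old_set:
            candidate = a | b
            if len(candidate) != len(a) + 1:
                continue  # only grow by exactly one member per pass
            pairs = {frozenset(p) for p in itertools.combinations(candidate, 2)}
            if pairs <= double_set:  # every pair must appear in the strongly correlated pair set
                new_set.add(candidate)
    return new_set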
def group_all():
    """
    Clean the clustering results obtained with the different relatives-group sizes by removing
    cliques that are contained in a larger clique, then combine the cleaned results and write them out.
    :return:
    """
    groups = []       # cleaned cliques for every relatives-group size
    group_names = []  # matching column names ("group<i>")
    for i in range(max_relatives_nums, 4, -1):
        group_path = os.path.join("data", "group" + str(i) + ".csv.pkl")
        group_names.append("group" + str(i))
        groups.append(group_clean(group_path))
    write_csv(group_names, group_all_path, groups)
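# Hedged sketch (assumption, not the original group_clean): the docstring above says that cliques
# contained in a larger clique are dropped before the results are combined. A minimal version of
# that filtering over an already-loaded list of cliques could look like this; the real group_clean
# also loads the pickled result from group_path, which is omitted here.
def drop_contained_groups(groups):
    groups = [set(g) for g in groups]
    kept = []
    for g in groups:
        # keep g only if no other, larger clique fully contains it
        if not any(g < other for other in groups):
            kept.append(g)
    return kept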
def search_relatives(self):
    """
    First compute the mutual information of every pairwise combination, then use the maximum
    relatives-group size to find the relatives group of each word root.
    :return:
    """
    correlation = calculate_correlation(self.combine_index, self.combine_fre, self.root_fre)
    # convert the individual roots and the indices inside each combination into words
    self.combine_name = index_2_word(self.root_name, self.combine_index)
    # sort the mutual-information scores in descending order, then write them to csv
    data = write_csv(['combination', 'correlation coefficient'], correlation_path,
                     [self.combine_name, correlation])
    # get the relatives-group list for every symptom
    self.relatives_list = create_relatives(self.root_name, data, max_relatives_nums)
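# Hedged sketch (assumption): the docstring above refers to the mutual information of each pair.
# calculate_correlation() is assumed to derive a PMI-style score from the pair frequency and the
# two single-root frequencies; the exact formula and normalisation used in the original code may
# differ.
import math


def pmi_score(pair_count, count_a, count_b, total):
    # PMI(a, b) = log( p(a, b) / (p(a) * p(b)) )
    p_ab = pair_count / float(total)
    p_a = count_a / float(total)
    p_b = count_b / float(total)
    return math.log(p_ab / (p_a * p_b))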
def main():
    parser = argparse.ArgumentParser(
        description='Split train.csv into train, dev, and test splits. Specify dev and '
        'validation set sizes with args; the remainder is used for training.')
    parser.add_argument(
        '--dataset-file',
        required=True,
        help='path to the train.csv file containing the quora training data')
    parser.add_argument('--ndev',
                        type=int,
                        default=10000,
                        help='size of dev set to create')
    parser.add_argument('--nvalid',
                        type=int,
                        default=50000,
                        help='size of validation set to create')
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory to which to write train.csv, dev.csv, and valid.csv')
    parser.add_argument(
        '--seed',
        type=int,
        help='optional random seed for reproducibility between multiple uses of this tool')
    args = parser.parse_args()

    data = du.load_csv(args.dataset_file)
    shuffled = du.shuffle(data, args.seed)
    ntrain = len(data) - args.ndev - args.nvalid
    train, dev, valid = du.split(shuffled, ntrain, args.ndev, args.nvalid)
    du.write_csv(train, os.path.join(args.output_dir, 'train.csv'))
    du.write_csv(dev, os.path.join(args.output_dir, 'dev.csv'))
    du.write_csv(valid, os.path.join(args.output_dir, 'valid.csv'))
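# Example invocation (the script name "split_dataset.py" and the paths are illustrative; the flags
# and defaults come from the argparse setup above):
#
#   python split_dataset.py --dataset-file data/train.csv --output-dir data/splits \
#       --ndev 10000 --nvalid 50000 --seed 42
#
# This writes train.csv, dev.csv, and valid.csv into data/splits, with the dev and validation rows
# drawn from the shuffled data and the remainder used for training.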
def analyze_oov(word_dict, quora_data, writer, output_dir, prefix):
    oov_word_freqs = defaultdict(return_0)
    all_words = defaultdict(return_0)
    oov_counts_per_question = defaultdict(return_0)
    num_tokens = 0
    for col_label in ['question1', 'question2']:
        print " ...now handling " + col_label + "..."
        raw_text = st.preprocess(quora_data[col_label])
        for i, q in enumerate(raw_text):
            num_oovs_in_cur_q = 0
            tokens = q.split()
            num_tokens += len(tokens)
            for w in tokens:
                all_words[w] = 1
                if w not in word_dict:
                    # Count the occurrence of this OOV word
                    oov_word_freqs[w] += 1
                    # Count the occurrence of an OOV word for this question
                    num_oovs_in_cur_q += 1
                    # Signal that the word is OOV with our special signal, which we checked
                    # beforehand is not in train.csv or test.csv
                    orig_text = quora_data[col_label][i]
                    quora_data.set_value(i, col_label,
                                         orig_text.replace(w, '%<' + w + '>%'))
            # Increment the histogram count for questions with this number of OOVs
            if num_oovs_in_cur_q:
                oov_counts_per_question[num_oovs_in_cur_q] += 1

    # Write everything to disk
    oov_word_fd = open(
        os.path.join(output_dir, prefix + '_oov_word_freqs.pkl'), 'w')
    pkl.dump(oov_word_freqs, oov_word_fd)
    oov_word_fd.close()
    counts_per_q_fd = open(
        os.path.join(output_dir, prefix + '_oov_per_q.pkl'), 'w')
    pkl.dump(oov_counts_per_question, counts_per_q_fd)
    counts_per_q_fd.close()
    edited_df_file = os.path.join(output_dir, prefix + '_oov_annot.csv')
    du.write_csv(quora_data, edited_df_file)

    # Some summary stats to humor the user
    num_oov = sum([v for v in oov_word_freqs.values()])
    writer.emit_line(
        "Frequency of oov tokens={0} (out of {1} tokens total, or {2:.4f}%)".format(
            num_oov, num_tokens, float(num_oov) / num_tokens * 100))
    writer.emit_line(
        "Num unique oov words={0} (out of {1}, or {2:.4f}%)".format(
            len(oov_word_freqs), len(all_words),
            len(oov_word_freqs) / float(len(all_words)) * 100))
    num_oov_qs = sum([v for v in oov_counts_per_question.values()])
    writer.emit_line(
        "Num of q's containing oov words={0} (out of {1}, or {2:.4f}%)".format(
            num_oov_qs, len(quora_data) * 2,
            float(num_oov_qs) / (len(quora_data) * 2) * 100))
    oovs_per_q = sum([
        count * float(num_qs) / (len(quora_data) * 2)
        for count, num_qs in oov_counts_per_question.items()
    ])
    writer.emit_line("Avg num OOV words per question: {0}".format(oovs_per_q))
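# Hedged usage sketch: one way the pickled OOV counts written above could be inspected afterwards.
# The file name mirrors the prefix + '_oov_word_freqs.pkl' path used in analyze_oov, and the
# function reuses this module's os/pkl imports; output_dir, prefix, and the helper name itself are
# illustrative only.
def show_top_oov(output_dir, prefix, topn=10):
    fd = open(os.path.join(output_dir, prefix + '_oov_word_freqs.pkl'), 'r')
    oov_word_freqs = pkl.load(fd)
    fd.close()
    # print the most frequent out-of-vocabulary tokens
    for word, freq in sorted(oov_word_freqs.items(), key=lambda kv: -kv[1])[:topn]:
        print "%s\t%d" % (word, freq)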