Example #1
def merge_loop(double_set, root_name, file=None):
    """
    进行团合并操作,循环直到不能合并
    :param double_set:强相关的两两组合
    :param root_name:词根列表
    :param file:对聚类结果进行dump的目标路径
    :return:团成员最大数,最终的团
    """
    best_set = set()
    old_set = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(old_set) > 0:
        # old_set holds the groups that still need to be merged
        print('members per group:', len(list(old_set)[0]))  # number of members in each group of old_set
        print('number of groups:', len(old_set))  # number of groups in old_set
        print(old_set)
        num_list.append(len(list(old_set)[0]))
        count_list.append(len(old_set))
        group_list.append(old_set)
        best_set = old_set
        old_set = merge_group(old_set, double_set)  # newly merged groups; keep merging these
    # If old_set is empty, the clustering has converged: the largest groups have been formed and no further merge is possible
    if file is not None:
        group_list = index_2_word(root_name, group_list)
        write_csv(['members', 'count', 'groups'], file, [num_list, count_list, group_list])
        save_pickle(file + '.pkl', group_list)
    print(best_set)
    return len(list(best_set)[0]), best_set
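
merge_loop depends on helpers defined elsewhere in this project (merge_group, index_2_word, write_csv, save_pickle). A minimal driver sketch, assuming double_set holds frozensets of strongly correlated root-index pairs and root_name maps indices to words (the sample values below are illustrative only):

# Hypothetical usage of merge_loop; the project's helper functions must be importable.
double_set = {frozenset({0, 1}), frozenset({1, 2}), frozenset({0, 2})}
root_name = ['cough', 'fever', 'headache']
max_size, best_groups = merge_loop(double_set, root_name, file='data/group3.csv')
print(max_size, best_groups)
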
def group_all():
    """
    Clean the clustering results obtained with different relatives-group sizes by
    removing groups that are contained in a larger group, then combine the
    cleaned results and write them out.
    :return:
    """
    group_all = []
    group_name = []
    for i in range(max_relatives_nums, 4, -1):
        group_path = os.path.join("data", "group" + str(i) + ".csv.pkl")
        group_name.append("group" + str(i))
        group_all.append(group_clean(group_path))
    write_csv(group_name, group_all_path, group_all)
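
group_clean itself is not shown in this example. A minimal sketch of the containment filter the docstring describes, assuming the groups can be treated as plain sets of words (the name remove_contained is hypothetical, not the project's API):

def remove_contained(groups):
    # Keep only groups that are not proper subsets of another (larger) group.
    groups = sorted((set(g) for g in groups), key=len, reverse=True)
    kept = []
    for g in groups:
        if not any(g < bigger for bigger in kept):
            kept.append(g)
    return kept
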
def search_relatives(self):
    """
    Compute the mutual information for every pairwise combination, then find each
    root word's relatives group according to the maximum relatives-group size.
    :return:
    """
    correlation = calculate_correlation(self.combine_index,
                                        self.combine_fre, self.root_fre)
    self.combine_name = index_2_word(
        self.root_name, self.combine_index)  # convert root and combination indices into words
    # Sort the mutual-information scores in descending order, then write them to CSV
    data = write_csv(['combination', 'correlation coefficient'], correlation_path,
                     [self.combine_name, correlation])
    # Get the relatives list for every symptom
    self.relatives_list = create_relatives(self.root_name, data,
                                           max_relatives_nums)
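
calculate_correlation is defined elsewhere in the project. A minimal sketch of a pointwise-mutual-information style score of the kind the docstring describes, assuming raw pair and single-root counts over n_docs records (the function pmi and its parameters are illustrative, not the project's API):

import math

def pmi(pair_count, count_a, count_b, n_docs):
    # PMI(a, b) = log( P(a, b) / (P(a) * P(b)) )
    p_ab = pair_count / n_docs
    p_a = count_a / n_docs
    p_b = count_b / n_docs
    return math.log(p_ab / (p_a * p_b))
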
Example #4
def main():
    parser = argparse.ArgumentParser(
        description=
        'Split train.csv into train, dev, and test splits. Specify dev and validation set sizes with args, the remainder is used for training.'
    )
    parser.add_argument(
        '--dataset-file',
        required=True,
        help='path to the train.csv file containing the quora training data')
    parser.add_argument('--ndev',
                        type=int,
                        default=10000,
                        help='size of dev set to create')
    parser.add_argument('--nvalid',
                        type=int,
                        default=50000,
                        help='size of validation set to create')
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory to which to write train.csv, dev.csv, and valid.csv')
    parser.add_argument(
        '--seed',
        type=int,
        help=
        'optional random seed to have reproducibility between multiple uses of this tool'
    )
    args = parser.parse_args()

    data = du.load_csv(args.dataset_file)
    shuffled = du.shuffle(data, args.seed)

    ntrain = len(data) - args.ndev - args.nvalid
    train, dev, valid = du.split(shuffled, ntrain, args.ndev, args.nvalid)

    du.write_csv(train, os.path.join(args.output_dir, 'train.csv'))
    du.write_csv(dev, os.path.join(args.output_dir, 'dev.csv'))
    du.write_csv(valid, os.path.join(args.output_dir, 'valid.csv'))
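
du here is the project's data-utility module, whose internals are not shown. A minimal sketch of the slicing du.split could perform on a shuffled pandas DataFrame, inferred only from the call above (the real implementation may differ):

def split(df, ntrain, ndev, nvalid):
    # Take consecutive, non-overlapping slices of the already-shuffled frame.
    train = df.iloc[:ntrain]
    dev = df.iloc[ntrain:ntrain + ndev]
    valid = df.iloc[ntrain + ndev:ntrain + ndev + nvalid]
    return train, dev, valid
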
Example #5
def analyze_oov(word_dict, quora_data, writer, output_dir, prefix):
    oov_word_freqs = defaultdict(return_0)
    all_words = defaultdict(return_0)
    oov_counts_per_question = defaultdict(return_0)
    num_tokens = 0

    for col_label in ['question1', 'question2']:
        print "  ...now handling " + col_label + "..."
        raw_text = st.preprocess(quora_data[col_label])
        for i, q in enumerate(raw_text):
            num_oovs_in_cur_q = 0
            tokens = q.split()
            num_tokens += len(tokens)
            for w in tokens:
                all_words[w] = 1
                if w not in word_dict:
                    # Count the occurrence of this OOV word
                    oov_word_freqs[w] += 1
                    # Count the occurrence of an OOV word for this question
                    num_oovs_in_cur_q += 1

                    # Signal that the word is OOV with our special signal, which we checked
                    # beforehand is not in train.csv or test.csv
                    orig_text = quora_data[col_label][i]
                    quora_data.at[i, col_label] = orig_text.replace(w, '%<' + w + '>%')

            # Increment the histogram count for questions with this number of OOVs
            if num_oovs_in_cur_q:
                oov_counts_per_question[num_oovs_in_cur_q] += 1

    # Write everything to disk (pickle files must be opened in binary mode)
    with open(os.path.join(output_dir, prefix + '_oov_word_freqs.pkl'), 'wb') as oov_word_fd:
        pkl.dump(oov_word_freqs, oov_word_fd)

    with open(os.path.join(output_dir, prefix + '_oov_per_q.pkl'), 'wb') as counts_per_q_fd:
        pkl.dump(oov_counts_per_question, counts_per_q_fd)

    edited_df_file = os.path.join(output_dir, prefix + '_oov_annot.csv')
    du.write_csv(quora_data, edited_df_file)

    # Some summary stats to humor the user
    num_oov = sum([v for v in oov_word_freqs.values()])
    writer.emit_line(
        "Frequency of oov tokens={0} (out of {1} tokens total, or {2:.4f}%)".
        format(num_oov, num_tokens,
               float(num_oov) / num_tokens * 100))
    writer.emit_line(
        "Num unique oov words={0} (out of {1}, or {2:.4f}%)".format(
            len(oov_word_freqs), len(all_words),
            len(oov_word_freqs) / float(len(all_words)) * 100))
    num_oov_qs = sum([v for v in oov_counts_per_question.values()])
    writer.emit_line(
        "Num of q's containing oov words={0} (out of {1}, or {2:.4f}%)".format(
            num_oov_qs,
            len(quora_data) * 2,
            float(num_oov_qs) / (len(quora_data) * 2) * 100))
    oovs_per_q = sum([
        count * float(num_qs) / (len(quora_data) * 2)
        for count, num_qs in oov_counts_per_question.items()
    ])
    writer.emit_line("Avg num OOV words per question: {0}".format(oovs_per_q))