Example #1
def analysis_0():
    import numpy as np

    data_iter = get_train()

    id_num = 0
    eq_len = []       # length of each positive (eq) sentence
    neq_len = []      # length of each negative (neq) sentence
    avg_eq_len = []   # number of positive sentences per record
    avg_neq_len = []  # number of negative sentences per record

    for _, eqs_list, neqs_list in data_iter:
        id_num += 1
        avg_eq_len.append(len(eqs_list))
        avg_neq_len.append(len(neqs_list))
        for eq in eqs_list:
            eq_len.append(len(eq))
        for neq in neqs_list:
            neq_len.append(len(neq))
        print('\rnum: {}'.format(id_num), end='    ')

    print('\nover.')
    eq_len = np.array(eq_len)
    neq_len = np.array(neq_len)
    avg_eq_len = np.array(avg_eq_len)
    avg_neq_len = np.array(avg_neq_len)
    print('eq_len      (mean/max/min):', eq_len.mean(), eq_len.max(), eq_len.min())
    print('neq_len     (mean/max/min):', neq_len.mean(), neq_len.max(), neq_len.min())
    print('avg_eq_len  (mean/max/min):', avg_eq_len.mean(), avg_eq_len.max(), avg_eq_len.min())
    print('avg_neq_len (mean/max/min):', avg_neq_len.mean(), avg_neq_len.max(), avg_neq_len.min())
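A possible companion to analysis_0, sketched here as an assumption rather than existing project code: summarizing the same sentence lengths with percentiles, which is often more informative than mean/max/min when picking a truncation length such as the max_seq_len used in Example #3.

def length_percentiles():
    import numpy as np

    lengths = []
    for _, eqs_list, neqs_list in get_train():
        lengths.extend(len(s) for s in eqs_list)
        lengths.extend(len(s) for s in neqs_list)
    lengths = np.array(lengths)
    for p in (50, 90, 95, 99):
        print('p{}: {}'.format(p, np.percentile(lengths, p)))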
    """
Example #2
def get_exist_words():
    import jieba
    from aa_cfg import join, DATA_PATH
    import json

    # load custom user dictionaries so jieba recognizes project-specific terms
    jieba.load_userdict(join(DATA_PATH, 'token_freq.txt'))
    jieba.load_userdict(join(DATA_PATH, 'law_word.txt'))

    chars = dict()  # word -> frequency (jieba words, despite the variable name)

    train_iter = get_train()
    for _d in train_iter:
        _id, eqs_list, neqs_list = _d
        d_list = eqs_list + neqs_list  # positives followed by negatives, without mutating eqs_list
        for s in d_list:
            s_list = jieba.lcut(s)
            for w in s_list:
                chars[w] = chars.get(w, 0) + 1
    # keep words seen at least 10 times and longer than one character, most frequent first
    chars = [(i, j) for i, j in chars.items() if j >= 10 and len(i) > 1]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]

    with open(join(DATA_PATH, 'chars.dict'), 'w', encoding='utf-8') as f:
        json.dump(chars, f, indent=4, ensure_ascii=False)
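A hedged usage sketch (not from the original code): the word list written by get_exist_words() can be reloaded with json.load, for example to build a word-to-id mapping.

def load_exist_words():
    import json
    from aa_cfg import join, DATA_PATH

    with open(join(DATA_PATH, 'chars.dict'), 'r', encoding='utf-8') as f:
        vocab_words = json.load(f)  # list of words, most frequent first
    return {w: i for i, w in enumerate(vocab_words)}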
Example #3
def get_line_text():
    train_iter = get_train()
    for _d in train_iter:
        _id, eqs_list, neqs_list = _d
        d_list = eqs_list + neqs_list  # positives followed by negatives
        for s in d_list:
            yield s[:ec_cfg.max_seq_len]  # truncate to the configured max sequence length
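One way get_line_text() might be consumed, sketched under the assumption that a plain-text corpus file is wanted; the file name corpus.txt is illustrative, not from the project.

def dump_corpus():
    from aa_cfg import join, DATA_PATH

    with open(join(DATA_PATH, 'corpus.txt'), 'w', encoding='utf-8') as f:
        for line in get_line_text():
            f.write(line + '\n')  # one truncated sentence per line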
Example #4
def simplify_vocab_dict():
    import os
    import json
    import joblib

    chars = dict()  # char -> frequency over the training sentences

    min_count = 1

    model_pre_save_path = join(MODEL_PATH, 'train_pre')
    if not os.path.isdir(model_pre_save_path):
        os.makedirs(model_pre_save_path)

    # count character frequencies over every positive and negative sentence
    data = get_train()
    for _, pos, neg in data:
        for sentence in pos:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1
        for sentence in neg:
            for w in sentence:
                chars[w] = chars.get(w, 0) + 1

    # keep characters seen at least min_count times, most frequent first
    chars = [(i, j) for i, j in chars.items() if j >= min_count]
    chars = sorted(chars, key=lambda c: -c[1])
    chars = [c[0] for c in chars]
    with open(join(model_pre_save_path, 'chars.dict'), 'w', encoding='utf-8') as f:
        json.dump(chars, f, indent=4, ensure_ascii=False)

    # checkpoint_path = os.path.join(main_path, 'model/bert/bert_model.ckpt')
    dict_path = os.path.join(DATA_PATH, 'bert_roberta/vocab.txt')

    _token_dict = load_vocab(dict_path)  # load the full BERT vocabulary
    token_dict, keep_words = {}, []  # keep_words: ids of the vocab entries kept from BERT

    # keep BERT's special tokens first, then every training character BERT already knows
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    print('len of keep_words: ', len(keep_words))
    joblib.dump(token_dict, join(model_pre_save_path, 'token_dict.joblib'))
    joblib.dump(keep_words, join(model_pre_save_path, 'keep_words.joblib'))
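A minimal sketch of reloading the artifacts dumped by simplify_vocab_dict() and mapping text to ids; it assumes join and MODEL_PATH come from the project config as in the function above, and it leaves out the model-specific step where keep_words is handed to the BERT loader.

def load_token_dict():
    import joblib

    model_pre_save_path = join(MODEL_PATH, 'train_pre')
    token_dict = joblib.load(join(model_pre_save_path, 'token_dict.joblib'))
    keep_words = joblib.load(join(model_pre_save_path, 'keep_words.joblib'))
    return token_dict, keep_words

def text_to_ids(text, token_dict):
    # map each character to its id in the shrunken vocabulary, falling back to [UNK]
    unk = token_dict['[UNK]']
    return ([token_dict['[CLS]']]
            + [token_dict.get(c, unk) for c in text]
            + [token_dict['[SEP]']])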
Example #5
def split_data(split_n=MAX_FOLD):
    # split the shuffled training data into split_n cross-validation folds
    print('read...')
    data = list(get_train())
    print('shuffle...')
    random.shuffle(data)
    val_len = int(len(data) / split_n)
    for i in range(split_n):
        # fold i is the validation slice; the last fold also takes the remainder,
        # which must not be appended back to the training split
        start = val_len * i
        end = len(data) if i == split_n - 1 else val_len * (i + 1)
        val = data[start:end]
        train = data[:start] + data[end:]
        val_final = []
        for d in val:
            val_final.append(trains_pairs(d, 2))
        random.shuffle(val_final)
        print('save {}'.format(i))
        joblib.dump(train, join(MID_PATH, 'train_{}.joblib'.format(i)))
        joblib.dump(val_final, join(MID_PATH, 'val_{}.joblib'.format(i)))
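A hedged sketch of reading one fold back, assuming join and MID_PATH from the project config as in split_data(); it only reports sizes, since the structure produced by trains_pairs is not shown in these examples.

def check_fold(fold=0):
    import joblib

    train = joblib.load(join(MID_PATH, 'train_{}.joblib'.format(fold)))
    val = joblib.load(join(MID_PATH, 'val_{}.joblib'.format(fold)))
    print('fold {}: {} train records, {} val items'.format(fold, len(train), len(val)))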