Code example #1
def preprocess(p, h, chars_per_word, preprocessor, save_dir, data_paths,
               word_vector_save_path, normalize_word_vectors,
               max_loaded_word_vectors=None, word_vectors_load_path=None,
               include_word_vectors=True, include_chars=True,
               include_syntactical_features=True, include_exact_match=True):

    preprocessor.get_all_words_with_parts_of_speech([data_path[1] for data_path in data_paths])
    print('Found', len(preprocessor.unique_words), 'unique words')
    print('Found', len(preprocessor.unique_parts_of_speech), 'unique parts of speech')

    # Init mappings of the preprocessor
    preprocessor.init_word_to_vectors(vectors_file_path=get_word2vec_file_path(word_vectors_load_path),
                                      needed_words=preprocessor.unique_words,
                                      normalize=normalize_word_vectors,
                                      max_loaded_word_vectors=max_loaded_word_vectors)
    preprocessor.init_chars(words=preprocessor.unique_words)
    preprocessor.init_parts_of_speech(parts_of_speech=preprocessor.unique_parts_of_speech)

    # Process and save the data
    preprocessor.save_word_vectors(word_vector_save_path)
    for dataset, input_path in data_paths:
        data = preprocessor.parse(input_file_path=input_path,
                                  max_words_p=p,
                                  max_words_h=h,
                                  chars_per_word=chars_per_word)

        # Determine which parts of the data we need to dump
        # (delete from the back so earlier slice indices stay valid)
        if not include_exact_match:             del data[6:8]  # Exact match feature
        if not include_syntactical_features:    del data[4:6]  # Syntactical POS tags
        if not include_chars:                   del data[2:4]  # Character features
        if not include_word_vectors:            del data[0:2]  # Word vectors

        data_saver = ChunkDataManager(save_data_path=os.path.join(save_dir, dataset))
        data_saver.save([np.array(item) for item in data])
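
All of the examples on this page persist and reload their feature arrays through ChunkDataManager (save_data_path=... / .save(list_of_arrays) when writing, load_data_path=... / .load(load_list=...) when reading). The class itself is not shown here; the sketch below is a minimal stand-in inferred from those call sites, assuming each array is stored as a numbered .npy file inside the target directory. It illustrates the interface only and is not the original implementation.

import os
import numpy as np


class ChunkDataManager(object):
    """Minimal sketch (an assumption, not the original class): persist a
    list of numpy arrays as numbered .npy chunks in one directory."""

    def __init__(self, save_data_path=None, load_data_path=None):
        self.save_data_path = save_data_path
        self.load_data_path = load_data_path

    def save(self, arrays):
        # Write one file per array: 0.npy, 1.npy, ...
        if not os.path.exists(self.save_data_path):
            os.makedirs(self.save_data_path)
        for i, arr in enumerate(arrays):
            np.save(os.path.join(self.save_data_path, str(i) + '.npy'), arr)

    def load(self, load_list=None):
        # Read the requested chunk indices, or every .npy chunk if none given.
        if load_list is None:
            load_list = sorted(int(f.split('.')[0])
                               for f in os.listdir(self.load_data_path)
                               if f.endswith('.npy'))
        return [np.load(os.path.join(self.load_data_path, str(i) + '.npy'))
                for i in load_list]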
Code example #2
def preprocess(p,
               h,
               chars_per_word,
               preprocessor,
               save_dir,
               data_paths,
               word_vector_save_path,
               normalize_word_vectors,
               max_loaded_word_vectors=None,
               word_vectors_load_path=None,
               include_word_vectors=True,
               include_chars=True,
               include_syntactical_features=True,
               include_exact_match=True):

    preprocessor.get_all_words_with_parts_of_speech(
        [data_path[1] for data_path in data_paths])
    print('Found', len(preprocessor.unique_words), 'unique words')
    print('Found', len(preprocessor.unique_parts_of_speech),
          'unique parts of speech')

    # Init mappings of the preprocessor
    preprocessor.init_word_to_vectors(
        vectors_file_path=word_vectors_load_path,
        needed_words=preprocessor.unique_words,
        normalize=normalize_word_vectors,
        max_loaded_word_vectors=max_loaded_word_vectors)
    # preprocessor.init_chars(words=preprocessor.unique_words)

    preprocessor.init_parts_of_speech(
        parts_of_speech=preprocessor.unique_parts_of_speech)

    # Process and save the data
    # Save the vectors of all needed words to file (words missing from the
    # embedding file get randomly initialized vectors)
    preprocessor.save_word_vectors(word_vector_save_path)
    for dataset, input_path in data_paths:
        # [('train', '/home/gswyhq/data/LCQMC/train.txt'),
        #  ('test', '/home/gswyhq/data/LCQMC/test.txt'),
        #  ('dev', '/home/gswyhq/data/LCQMC/dev.txt')]
        data = preprocessor.parse(input_file_path=input_path,
                                  max_words_p=p,
                                  max_words_h=h,
                                  chars_per_word=chars_per_word)
        # data layout: sentence-1 word-id list, sentence-2 word-id list,
        # sentence-1 char padding matrix, sentence-2 char padding matrix,
        # sentence-1 POS padding matrix, sentence-2 POS padding matrix,
        # sentence-1 exact-match flags (word appears in the other sentence),
        # sentence-2 exact-match flags, labels

        # Determine which part of data we need to dump
        if not include_exact_match:
            del data[6:8]  # Exact match feature
        if not include_syntactical_features:
            del data[4:6]  # Syntactical POS tags
        if not include_chars:
            del data[2:4]  # Character features
        if not include_word_vectors:
            del data[0:2]  # Word vectors

        data_saver = ChunkDataManager(
            save_data_path=os.path.join(save_dir, dataset))
        data_saver.save([np.array(item) for item in data])
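
For context, here is a hedged example of how this variant might be invoked, based on the LCQMC paths shown in the comment above; the preprocessor class name and the numeric limits are assumptions made for illustration, not values from the original project.

# Hypothetical invocation; LCQMCPreprocessor and the numeric limits are
# assumed for illustration only.
data_paths = [('train', '/home/gswyhq/data/LCQMC/train.txt'),
              ('test', '/home/gswyhq/data/LCQMC/test.txt'),
              ('dev', '/home/gswyhq/data/LCQMC/dev.txt')]
preprocess(p=32, h=32, chars_per_word=8,
           preprocessor=LCQMCPreprocessor(),  # assumed class name
           save_dir='data/lcqmc',
           data_paths=data_paths,
           word_vector_save_path='data/word-vectors.npy',
           normalize_word_vectors=False,
           word_vectors_load_path='data/chinese-word-vectors.txt')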
Code example #3
def test_DUCTAC(model, data_path, voca_name, re_exp, args):
    #    data_list = [0,1]
    data_path = data_path + '_' + voca_name
    data_list = [0, 1, 2, 3]

    folders = [
        os.path.join(data_path, d) for d in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, d)) and re.search(re_exp, d)
    ]
    print('{} folders are found from {}'.format(len(folders), data_path))

    time_takes = []
    for i, folder in enumerate(folders):
        x_test = ChunkDataManager(load_data_path=folder).load(
            load_list=data_list)

        start_test = time.time()
        y_pred = model.predict(x_test, batch_size=args.batch_size)
        finish_test = time.time() - start_test

        base_name = os.path.basename(folder)
        time_takes.append(finish_test)
        print('[{} {}] elapsed testing time: {:3.3f} secs'.format(
            i + 1, base_name, finish_test))

        matfile_name = os.path.join(folder, 'prediction.mat')
        sio.savemat(matfile_name, {'pred_siamese': y_pred})

    print('average elapsed testing time of 1 doc: {}'.format(
        np.mean(time_takes)))

    # copy prediction files
    dest_path = os.path.dirname(data_path)
    aux_name = '_' + voca_name
    dest_path = os.path.join(dest_path, 'predictions', 'model' + aux_name)
    copy_predfile(data_path, dest_path, re_exp)
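
The predictions written by sio.savemat above can be inspected afterwards with scipy.io.loadmat; a small sketch follows (the folder path is a placeholder, only the 'pred_siamese' key is taken from the code above).

import scipy.io as sio

# Hypothetical folder: one of the per-document directories processed above.
pred = sio.loadmat('data/DUC2002_voca50k_300d/d061j/prediction.mat')
y_pred = pred['pred_siamese']  # same key used by test_DUCTAC when saving
print(y_pred.shape)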
Code example #4
        os.makedirs(save_folder)

    params = vars(args)
    print(json.dumps(params, indent=2))
    config_fn = os.path.join(save_folder, 'config.json')
    if not args.testing:
        with open(config_fn, 'w') as outfile:
            json.dump(params, outfile)
    ''' Prepare data '''
    voca_name = 'voca' + str(args.voca_size / 1000) + 'k_' + str(
        args.voca_dim) + 'd'
    dataset_list = args.dataset_list
    for dataset in dataset_list:
        if dataset.startswith('cnn'):
            train_cnndm = ChunkDataManager(
                load_data_path=os.path.join(args.load_dir, 'cnn_dm', 'train_' +
                                            voca_name)).load(
                                                load_list=args.data_list)
            test_cnndm = ChunkDataManager(
                load_data_path=os.path.join(args.load_dir, 'cnn_dm', 'test_' +
                                            voca_name)).load(
                                                load_list=args.data_list)
            val_cnndm = ChunkDataManager(
                load_data_path=os.path.join(args.load_dir, 'cnn_dm', 'val_' +
                                            voca_name)).load(
                                                load_list=args.data_list)

            if not args.testing:
                test_cnndm = val_cnndm
            print('# cnn_dm samples: train:{}, test:{}'.format(
                train_cnndm[0].shape[0], test_cnndm[0].shape[0]))
Code example #5
                     default='logs',
                     help='Tensorboard logs dir',
                     type=str)
 parser.add_argument('--word_vec_path',
                     default='data/word-vectors.npy',
                     help='Save path word vectors',
                     type=str)
 parser.add_argument('--omit_word_vectors', action='store_true')
 parser.add_argument('--omit_chars', action='store_true')
 parser.add_argument('--omit_syntactical_features', action='store_true')
 parser.add_argument('--omit_exact_match', action='store_true')
 parser.add_argument('--train_word_embeddings', action='store_true')
 args = parser.parse_args()
 ''' Prepare data '''
 word_embedding_weights = np.load(args.word_vec_path)
 train_data = ChunkDataManager(
     load_data_path=os.path.join(args.load_dir, 'train')).load()
 test_data = ChunkDataManager(
     load_data_path=os.path.join(args.load_dir, 'test')).load()
 dev_data = ChunkDataManager(
     load_data_path=os.path.join(args.load_dir, 'dev')).load()
 ''' Getting dimensions of the input '''
 chars_per_word = train_data[3].shape[-1] if not args.omit_chars else 0
 syntactical_feature_size = (train_data[5].shape[-1]
                             if not args.omit_syntactical_features else 0)
 ''' Prepare the model and optimizers '''
 adam = L2Optimizer(Adam())
 adadelta = L2Optimizer(Adadelta(lr=0.5, rho=0.95, epsilon=1e-8))
 sgd = L2Optimizer(SGD(lr=3e-3))
 model = DIIN(
     p=train_data[0].shape[-1],  # or None
     h=train_data[1].shape[-1],  # or None
Code example #6
File: train.py  Project: xuzhuojia/DIIN4CCKS
                        default='logs',
                        help='Tensorboard logs dir',
                        type=str)
    parser.add_argument('--word_vec_path',
                        default='data/word-vectors.npy',
                        help='Save path word vectors',
                        type=str)
    parser.add_argument('--omit_word_vectors', default=0, type=int)
    parser.add_argument('--omit_chars', default=0, type=int)
    parser.add_argument('--omit_exact_match', default=0, type=int)
    parser.add_argument('--train_word_embeddings', default=True)
    parser.add_argument('--is_train', default=1, type=int)
    args = parser.parse_args()
    ''' Prepare data '''
    word_embedding_weights = np.load(args.word_vec_path)
    all_data = ChunkDataManager(
        load_data_path=os.path.join(args.load_dir, 'train')).load()
    predict_data = ChunkDataManager(
        load_data_path=os.path.join(args.load_dir, 'dev')).load()
    is_train = args.is_train

    if args.omit_exact_match == 1:
        del all_data[4:6]
        del predict_data[4:6]
    if args.omit_chars == 1:
        del all_data[2:4]
        del predict_data[2:4]  # Character features

    train_data = [[] for i in range(len(all_data))]
    test_data = [[] for i in range(len(all_data))]
    dev_data = [[] for i in range(len(all_data))]
    for i, all_data_i in enumerate(all_data):
Code example #7
def preprocess_unified(p,
                       h,
                       preprocessors,
                       save_dir,
                       data_paths,
                       dataset_to_save,
                       word_vectors_load_path=None,
                       normalize_word_vectors=False,
                       voca_size=[50000],
                       voca_dim=300,
                       data_root_dir='data',
                       word_vector_save_path=None,
                       word2id_save_path=None,
                       max_loaded_word_vectors=None,
                       include_word_vectors=True,
                       include_exact_match=False,
                       include_chars=False,
                       include_syntactical_features=False):

    unified_preprocessor = SNLIPreprocessor()

    # load word vector
    if voca_dim == 300:
        word2vec_func = get_word2vec_file_path
    elif voca_dim == 100:
        word2vec_func = get_word2vec_100d_file_path
    unified_preprocessor.call_load_word_vector(
        file_path=word2vec_func(word_vectors_load_path),
        normalize=normalize_word_vectors,
        max_words=max_loaded_word_vectors)

    if False and os.path.exists(word2id_save_path) and os.path.exists(
            word_vector_save_path):
        unified_preprocessor.load_word2id_dict(word2id_save_path)
        unified_preprocessor.vectors = np.load(word_vector_save_path)
    else:
        # get all tokenized words from all dataset
        for dataset, ppr in preprocessors.iteritems():
            data_path = data_paths[dataset]
            if dataset.endswith('nli'):
                if dataset.startswith('s'):
                    train_path = os.path.join(data_path,
                                              dataset + '_1.0_train.jsonl')
                    test_path = os.path.join(data_path,
                                             dataset + '_1.0_test.jsonl')
                    dev_path = os.path.join(data_path,
                                            dataset + '_1.0_dev.jsonl')
                    paths = [('train', train_path), ('test', test_path),
                             ('dev', dev_path)]
                elif dataset.startswith('m'):
                    train_path = os.path.join(data_path,
                                              dataset + '_1.0_train.jsonl')
                    test_matched_path = os.path.join(
                        data_path, dataset + '_1.0_dev_matched.jsonl')
                    test_mismatched_path = os.path.join(
                        data_path, dataset + '_1.0_dev_mismatched.jsonl')
                    paths = [('train', train_path),
                             ('test_matched', test_matched_path),
                             ('test_mismatched', test_mismatched_path)]

                ppr.get_all_words_with_parts_of_speech(
                    [path[1] for path in paths])
                print(
                    'Found {} unique words, {} unique parts of speech from {}'.
                    format(len(ppr.unique_words),
                           len(ppr.unique_parts_of_speech), dataset))
                unified_preprocessor.unique_parts_of_speech = unified_preprocessor.unique_parts_of_speech.union(
                    ppr.unique_parts_of_speech)
            elif dataset.startswith('DUC'):
                folders = [
                    os.path.join(data_path, d) for d in os.listdir(data_path)
                    if os.path.isdir(os.path.join(data_path, d))
                    and re.search('^d[0-9]+', d)
                ]
                ppr.get_all_words_DUC(folders)
                print('Found {} unique words from {}'.format(
                    len(ppr.unique_words), dataset))
            elif dataset.startswith('cnn'):
                paths = [('train', data_path), ('test', data_path),
                         ('val', data_path)]
                ppr.gen_all_pairs(paths)
                ppr.get_all_words()
                print('Found {} unique words from {}'.format(
                    len(ppr.unique_words), dataset))

            unified_preprocessor.unique_words = unified_preprocessor.unique_words.union(
                ppr.unique_words)

            temp_dict = dict()
            for d in (unified_preprocessor.unique_words_freq,
                      ppr.unique_words_freq):
                for word, freq in d.items():
                    if word in temp_dict:
                        temp_dict[word] += freq
                    else:
                        temp_dict[word] = freq
            unified_preprocessor.unique_words_freq = temp_dict

        # sort to get 50k frequent words
        sorted_dict = sorted(unified_preprocessor.unique_words_freq.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

    for v_size in voca_size:
        voca_name = 'voca' + str(v_size / 1000) + 'k_' + str(voca_dim) + 'd'

        selected_dict = sorted_dict[:v_size]  # list of tuples
        unified_preprocessor.unique_words_voca = [
            word[0] for word in selected_dict
        ]

        print('Found {} unique words, {} unique parts of speech from unified'.
              format(len(unified_preprocessor.unique_words),
                     len(unified_preprocessor.unique_parts_of_speech)))
        print('Found {} unique words freq(dict), {} {} words(list)'.format(
            len(unified_preprocessor.unique_words_freq),
            len(unified_preprocessor.unique_words_voca), voca_name))
        n_print = 20
        print('Top {} frequent words'.format(n_print))
        for it in range(n_print):
            print('{} - {}:{}'.format(it + 1, selected_dict[it][0],
                                      selected_dict[it][1]))

        # initialize w2v, word2id
        # ADD <START>, <END>, UNK, ZERO into the vocabulary
        unified_preprocessor.init_word_to_vectors(
            needed_words=unified_preprocessor.unique_words_voca,
            normalize=normalize_word_vectors)
        unified_preprocessor.init_chars(
            words=unified_preprocessor.unique_words)
        unified_preprocessor.init_parts_of_speech(
            parts_of_speech=unified_preprocessor.unique_parts_of_speech)

        #dirname = os.path.dirname(word2id_save_path)
        word2id_path = os.path.join(data_root_dir,
                                    'word2id_' + voca_name + '_unified.pkl')
        wordvec_path = os.path.join(
            data_root_dir, 'word-vectors_' + voca_name + '_unified.npy')
        unified_preprocessor.save_word2id_dict(word2id_path)
        unified_preprocessor.save_word_vectors(wordvec_path)

        # assign word id and save
        for dataset in dataset_to_save:
            print('***** [{}] data saving *****'.format(dataset))
            data_path = data_paths[dataset]

            if dataset.endswith('nli'):
                if dataset.startswith('s'):
                    train_path = os.path.join(data_path,
                                              dataset + '_1.0_train.jsonl')
                    test_path = os.path.join(data_path,
                                             dataset + '_1.0_test.jsonl')
                    dev_path = os.path.join(data_path,
                                            dataset + '_1.0_dev.jsonl')
                    paths = [('train_' + voca_name, train_path),
                             ('test_' + voca_name, test_path),
                             ('dev_' + voca_name, dev_path)]
                elif dataset.startswith('m'):
                    train_path = os.path.join(data_path,
                                              dataset + '_1.0_train.jsonl')
                    test_matched_path = os.path.join(
                        data_path, dataset + '_1.0_dev_matched.jsonl')
                    test_mismatched_path = os.path.join(
                        data_path, dataset + '_1.0_dev_mismatched.jsonl')
                    paths = [('train_' + voca_name, train_path),
                             ('test_matched_' + voca_name, test_matched_path),
                             ('test_mismatched_' + voca_name,
                              test_mismatched_path)]

                for dataset_var, input_path in paths:
                    data = unified_preprocessor.parse(
                        input_file_path=input_path,
                        max_words_p=p,
                        max_words_h=h)

                    # Determine which part of data we need to dump
                    if not include_exact_match:
                        del data[6:8]  # Exact match feature
                    if not include_syntactical_features:
                        del data[4:6]  # Syntactical POS tags
                    if not include_chars:
                        del data[2:4]  # Character features
                    if not include_word_vectors:
                        del data[0:2]  # Word vectors

                    data_saver = ChunkDataManager(
                        save_data_path=os.path.join(data_path, dataset_var))
                    data_saver.save([np.array(item) for item in data])

            elif dataset.startswith('cnn'):
                cnn_dm = preprocessors['cnn_dm']
                cnn_dm.assign_w2id(unified_preprocessor.word_to_id)

                data_path_1up = os.path.dirname(data_path)  # data/cnn_dm
                paths = [
                    'train_' + voca_name, 'test_' + voca_name,
                    'val_' + voca_name
                ]

                for i, path_name in enumerate(paths):
                    data_saver = ChunkDataManager(
                        save_data_path=os.path.join(data_path_1up, path_name))
                    data_saver.save(
                        [np.array(item) for item in cnn_dm.data_all[i]])

            elif dataset.startswith('DUC'):
                DUC_save_dir = 'data_' + voca_name
                save_path = os.path.join(data_path, DUC_save_dir)  #save_dir
                folders = [
                    os.path.join(data_path, d) for d in os.listdir(data_path)
                    if os.path.isdir(os.path.join(data_path, d))
                    and re.search('^d[0-9]+', d)
                ]
                for dir in folders:
                    file_name = os.path.basename(dir)
                    file_path = os.path.join(dir, file_name + '.txt')
                    sents = unified_preprocessor.load_txt_data(
                        file_path=file_path)

                    # list of list: processed words in sentences
                    sents_processed = []
                    sents_raw_words = []
                    for sent in sents:
                        sent_tmp = []
                        sent_tmp.append(
                            unified_preprocessor.word_to_id['<START>'])
                        word_tokens = word_tokenize(sent)

                        for word in word_tokens:
                            word = preprocess_word(word)
                            if word in unified_preprocessor.word_to_id:
                                sent_tmp.append(
                                    unified_preprocessor.word_to_id[word])
                            else:
                                sent_tmp.append(
                                    unified_preprocessor.word_to_id['<UNK>'])
                        sent_tmp.append(
                            unified_preprocessor.word_to_id['<END>'])
                        sents_processed.append(sent_tmp)
                        sents_raw_words.append(word_tokens)

                    # test pair for similarity measure
                    sents_pair_p = []
                    sents_pair_h = []
                    sents_exact_pair_p = []
                    sents_exact_pair_h = []
                    sent_len = len(sents_processed)
                    for i in range(sent_len):
                        for j in range(i + 1, sent_len):
                            sents_pair_p.append(sents_processed[i])
                            sents_pair_h.append(sents_processed[j])

                            # exact words
                            premise_exact_match = unified_preprocessor.calculate_exact_match(
                                sents_raw_words[i], sents_raw_words[j])
                            hypothesis_exact_match = unified_preprocessor.calculate_exact_match(
                                sents_raw_words[j], sents_raw_words[i])
                            sents_exact_pair_p.append(premise_exact_match)
                            sents_exact_pair_h.append(hypothesis_exact_match)

                    data = []
                    w2id_p = pad_sequences(sents_pair_p,
                                           maxlen=p + 2,
                                           padding='post',
                                           truncating='post',
                                           value=0.)
                    w2id_h = pad_sequences(sents_pair_h,
                                           maxlen=h + 2,
                                           padding='post',
                                           truncating='post',
                                           value=0.)
                    data.append(w2id_p)
                    data.append(w2id_h)

                    sents_exact_pair_p = pad_sequences(sents_exact_pair_p,
                                                       maxlen=p,
                                                       padding='post',
                                                       truncating='post',
                                                       value=0.)
                    sents_exact_pair_h = pad_sequences(sents_exact_pair_h,
                                                       maxlen=h,
                                                       padding='post',
                                                       truncating='post',
                                                       value=0.)
                    data.append(sents_exact_pair_p)
                    data.append(sents_exact_pair_h)

                    # save as npy
                    data_saver = ChunkDataManager(
                        save_data_path=os.path.join(save_path, file_name))
                    data_saver.save([np.array(item) for item in data])
Code example #8
def preprocess_DUC(p,
                   h,
                   preprocessor,
                   data_path,
                   save_dir,
                   word_vector_save_path,
                   word_vectors_load_path,
                   word2id_save_path,
                   normalize_word_vectors,
                   max_loaded_word_vectors=None):
    """    
    :param p:                       maximum number of words in text
    :param h:                       maximum number of words in hypothesis
    :param preprocessor:            preprocessor
    :param data_path:               root directory of dataset
    :param word_vector_save_path:   path to save a word_vector (only vectors)
    :param word_vectors_load_path:  path to load a Glove word_vector
    :param word2id_save_path:       path to save a word2id
    :param normalize_word_vectors:  normalize word_vector or not
    :param max_loaded_word_vectors: maximum limitation of number of words
    
    :return: (premise_word_ids, hypothesis_word_ids,
              premise_chars, hypothesis_chars,
              premise_syntactical_one_hot, hypothesis_syntactical_one_hot,
              premise_exact_match, hypothesis_exact_match)
    """

    #dirs = [x[0] for x in os.walk(data_path)]  # os.walk finds all sub-directoreis
    #folders = [d for d in dirs if os.path.basename(d).startswith('d')]
    folders = [
        os.path.join(data_path, d) for d in os.listdir(data_path) if
        os.path.isdir(os.path.join(data_path, d)) and re.search('^d[0-9]+', d)
    ]

    if os.path.exists(word2id_save_path) and os.path.exists(
            word_vector_save_path):
        preprocessor.load_word2id_dict(word2id_save_path)
        preprocessor.vectors = np.load(word_vector_save_path)
    else:

        preprocessor.get_all_words_DUC(folders)
        print('Found', len(preprocessor.unique_words), 'unique words from DUC')

        preprocessor.init_word_to_vectors(
            vectors_file_path=get_word2vec_file_path(word_vectors_load_path),
            needed_words=preprocessor.unique_words,
            normalize=normalize_word_vectors,
            max_loaded_word_vectors=max_loaded_word_vectors)
        preprocessor.save_word2id_dict(word2id_save_path)
        preprocessor.save_word_vectors(word_vector_save_path)

    save_path = os.path.join(data_path, save_dir)

    for dir in folders:
        file_name = os.path.basename(dir)
        file_path = os.path.join(dir, file_name + '.txt')
        sents = preprocessor.load_txt_data(file_path=file_path)

        # list of list: processed words in sentences
        sents_processed = []
        for sent in sents:
            sent_tmp = []
            for word in sent.split():
                w = preprocessor.word_to_id[word.translate(None, ',.')]
                sent_tmp.append(w)
            sents_processed.append(sent_tmp)

        sents_pair_p = []
        sents_pair_h = []
        sent_len = len(sents_processed)
        for i in range(sent_len):
            for j in range(i + 1, sent_len):
                sents_pair_p.append(sents_processed[i])
                sents_pair_h.append(sents_processed[j])

        data = [sents_pair_p, sents_pair_h]
        data[0] = pad_sequences(data[0],
                                maxlen=p,
                                padding='post',
                                truncating='post',
                                value=0.)
        data[1] = pad_sequences(data[1],
                                maxlen=h,
                                padding='post',
                                truncating='post',
                                value=0.)

        # save as npy
        data_saver = ChunkDataManager(
            save_data_path=os.path.join(save_path, file_name))
        data_saver.save([np.array(item) for item in data])
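
As a quick sanity check, the two arrays written for each DUC folder can be loaded straight back; a minimal sketch follows, assuming load() with no arguments returns every saved chunk in index order (the folder name is a placeholder).

import os

# Hypothetical folder: data_path/save_dir/<DUC document id>, as written above.
folder = os.path.join('data/DUC2002', 'test_data', 'd061j')
pairs_p, pairs_h = ChunkDataManager(load_data_path=folder).load()
print(pairs_p.shape, pairs_h.shape)  # (n_pairs, p) and (n_pairs, h)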