Example #1
    def generate(self):
        for padding in ['left']:
            params = dict()
            params['padding'] = padding

            samples, computed_params = load_dataset(params)
            embeddings = WordEmbeddings.load_word_vectors(
                wordchar2vector_path, word2vector_path)
            word_dims = embeddings.vector_size
            computed_params['embeddings'] = embeddings
            computed_params['word_dims'] = word_dims

            for net_arch in ['rnn(cnn)']:  # 'rnn', 'cnn'
                params['net_arch'] = net_arch

                if net_arch == 'rnn':
                    for rnn_size in [150, 200, 256]:
                        params['rnn_size'] = rnn_size

                        for units1 in [16]:
                            params['units1'] = units1

                            for activation1 in ['relu']:
                                params['activation1'] = activation1

                                for optimizer in ['nadam']:
                                    params['optimizer'] = optimizer

                                    for batch_size in [150]:
                                        params['batch_size'] = batch_size

                                        yield params, computed_params, samples

                if net_arch == 'rnn(cnn)':
                    for rnn_size in [450, 500, 550]:
                        params['rnn_size'] = rnn_size

                        for nb_filters in [130, 140, 150]:
                            params['nb_filters'] = nb_filters

                            for min_kernel_size in [1]:
                                params['min_kernel_size'] = min_kernel_size

                                for max_kernel_size in [2]:
                                    params['max_kernel_size'] = max_kernel_size

                                    for pooling in ['max']:
                                        params['pooling'] = pooling

                                        for units1 in [15, 20, 25]:
                                            params['units1'] = units1

                                            for activation1 in ['relu']:
                                                params['activation1'] = activation1

                                                for optimizer in ['nadam']:
                                                    params['optimizer'] = optimizer

                                                    for batch_size in [150]:
                                                        params['batch_size'] = batch_size

                                                        yield params, computed_params, samples
                if net_arch == 'cnn':
                    for nb_filters in [100]:
                        params['nb_filters'] = nb_filters

                        for min_kernel_size in [1]:
                            params['min_kernel_size'] = min_kernel_size

                            for max_kernel_size in [2]:
                                params['max_kernel_size'] = max_kernel_size

                                for pooling in ['max']:
                                    params['pooling'] = pooling

                                    for units1 in [16]:
                                        params['units1'] = units1

                                        for activation1 in ['relu']:
                                            params['activation1'] = activation1

                                            for optimizer in ['nadam']:
                                                params['optimizer'] = optimizer

                                                for batch_size in [150]:
                                                    params['batch_size'] = batch_size

                                                    yield params, computed_params, samples
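
A minimal, self-contained sketch (not part of the original code; the helper name generate_rnn_cnn_grid is illustrative) of how the same 'rnn(cnn)' grid could be expressed with itertools.product instead of one nested loop per parameter:

import itertools

def generate_rnn_cnn_grid():
    # Same value ranges as the 'rnn(cnn)' branch above.
    grid = {
        'rnn_size': [450, 500, 550],
        'nb_filters': [130, 140, 150],
        'min_kernel_size': [1],
        'max_kernel_size': [2],
        'pooling': ['max'],
        'units1': [15, 20, 25],
        'activation1': ['relu'],
        'optimizer': ['nadam'],
        'batch_size': [150],
    }
    keys = list(grid.keys())
    for values in itertools.product(*(grid[k] for k in keys)):
        params = dict(zip(keys, values))
        params['padding'] = 'left'
        params['net_arch'] = 'rnn(cnn)'
        yield params

if __name__ == '__main__':
    print(sum(1 for _ in generate_rnn_cnn_grid()))  # 3 * 3 * 3 = 27 combinations

Unlike the generator above, each yielded params here is a fresh dict, so combinations already handed to the caller are not mutated by later iterations.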
Example #2
            best_score_wrt.flush()

    logging.info('Grid search complete, best_score=%f best_params=%s',
                 best_score, get_params_str(best_params))
    best_score_wrt.close()

if run_mode == 'train':
    logging.info('Start with run_mode==train')

    params = dict()

    params['padding'] = 'left'

    samples, computed_params = load_dataset(params)

    embeddings = WordEmbeddings.load_word_vectors(wordchar2vector_path,
                                                  word2vector_path)
    word_dims = embeddings.vector_size
    computed_params['embeddings'] = embeddings
    computed_params['word_dims'] = word_dims

    params['net_arch'] = 'rnn(cnn)'
    params['rnn_size'] = 200  # 500
    params['units1'] = 15
    params['activation1'] = 'relu'
    params['nb_filters'] = 150
    params['min_kernel_size'] = 1
    params['max_kernel_size'] = 2
    params['pooling'] = 'max'
    params['optimizer'] = 'nadam'
    params['batch_size'] = 250  # 150
Example #3
def prepare_data(input_path, params, max_samples):
    logging.info('prepare_data for "%s"', get_params_str(params))
    samples3 = []
    df = pd.read_csv(input_path, encoding='utf-8', delimiter='\t', quoting=3)
    for anchor, positive, negative in zip(df['anchor'].values,
                                          df['positive'].values,
                                          df['negative'].values):
        samples3.append(Sample3(anchor, positive, negative))

    if len(samples3) > max_samples:
        samples3 = random.sample(samples3, max_samples)

    computed_params = dict()

    if params['repres'] == 'words':
        embeddings = WordEmbeddings.load_word_vectors(
            params['wordchar2vector_path'], params['word2vector_path'])
        computed_params['embeddings'] = embeddings
        computed_params['word_dims'] = embeddings.vector_size

        tokenizer = Tokenizer()
        tokenizer.load()
        computed_params['tokenizer'] = tokenizer

        max_wordseq_len = 0
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                words = tokenizer.tokenize(phrase)
                max_wordseq_len = max(max_wordseq_len, len(words))

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        # Pad all phrases to a common length
        # `padding` is assumed to be available here (e.g. a module-level setting or params['padding'])
        pad_func = lpad_wordseq if padding == 'left' else rpad_wordseq
        computed_params['pad_func'] = pad_func
        for sample in samples3:
            sample.anchor_words = pad_func(tokenizer.tokenize(sample.anchor),
                                           max_wordseq_len)
            sample.positive_words = pad_func(
                tokenizer.tokenize(sample.positive), max_wordseq_len)
            sample.negative_words = pad_func(
                tokenizer.tokenize(sample.negative), max_wordseq_len)
    elif params['repres'] == 'pieces':
        spm_name = 'spm_synonymy({})'.format(params['spm_items'])
        computed_params['spm_name'] = spm_name

        if not os.path.exists(os.path.join(tmp_folder, spm_name + '.model')):
            # Training a SentencePiece model requires a text corpus; build one
            # from the sentence variants available in the training set.
            all_texts = set()
            for sample in samples3:
                all_texts.add(sample.anchor)
                all_texts.add(sample.positive)
                all_texts.add(sample.negative)

            sentencepiece_corpus = os.path.join(tmp_folder,
                                                'sentencepiece_corpus.txt')
            with io.open(sentencepiece_corpus, 'w', encoding='utf-8') as wrt:
                for text in all_texts:
                    wrt.write(text)
                    wrt.write(u'\n')

            # The corpus is ready; train the segmenter.
            logging.info('Train SentencePiece model on {}...'.format(
                sentencepiece_corpus))
            spm.SentencePieceTrainer.Train(
                '--input={} --model_prefix={} --vocab_size={} --character_coverage=1.0 --model_type=bpe --input_sentence_size=10000000'
                .format(sentencepiece_corpus, spm_name, params['spm_items']))
            os.rename(spm_name + '.vocab',
                      os.path.join(tmp_folder, spm_name + '.vocab'))
            os.rename(spm_name + '.model',
                      os.path.join(tmp_folder, spm_name + '.model'))

        splitter = spm.SentencePieceProcessor()
        splitter.Load(os.path.join(tmp_folder, spm_name + '.model'))
        computed_params['splitter'] = splitter

        max_wordseq_len = 0
        all_tokens = set([PAD_TOKEN])
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                tokens = splitter.EncodeAsPieces(phrase)
                max_wordseq_len = max(max_wordseq_len, len(tokens))
                all_tokens.update(tokens)

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        token2index = {PAD_TOKEN: 0}
        for token in all_tokens:
            if token != PAD_TOKEN:
                token2index[token] = len(token2index)

        computed_params['token2index'] = token2index

        for sample in samples3:
            sample.anchor_words = spm2tokens(splitter, sample.anchor,
                                             max_wordseq_len, token2index)
            sample.positive_words = spm2tokens(splitter, sample.positive,
                                               max_wordseq_len, token2index)
            sample.negative_words = spm2tokens(splitter, sample.negative,
                                               max_wordseq_len, token2index)

    else:
        raise NotImplementedError()

    return samples3, computed_params
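
For the 'pieces' representation, prepare_data trains a BPE segmenter with SentencePiece and encodes every phrase into sub-word pieces. A minimal, self-contained sketch of the same train/load/encode cycle (assuming the sentencepiece package is installed; corpus.txt and spm_demo are placeholder names):

import sentencepiece as spm

# Train a small BPE model on a plain-text corpus with one sentence per line.
spm.SentencePieceTrainer.Train(
    '--input=corpus.txt --model_prefix=spm_demo --vocab_size=200 '
    '--character_coverage=1.0 --model_type=bpe')

# Load the trained model and split a phrase into sub-word pieces.
splitter = spm.SentencePieceProcessor()
splitter.Load('spm_demo.model')
print(splitter.EncodeAsPieces('пример фразы для сегментации'))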
Example #4
    answers_train, answers_test = train_test_split(premises, questions, answers,
                                                   test_size=TEST_SHARE,
                                                   random_state=SEED)

    print('Generating training samples...')
    train_inputs, train_targets = generate_samples(premises_train, questions_train, answers_train, max_answer_len)
    nb_train = len(train_inputs)
    print('nb_train={}'.format(nb_train))

    print('Generating test samples...')
    test_inputs, test_targets = generate_samples(premises_test, questions_test, answers_test, max_answer_len)
    nb_test = len(test_inputs)
    print('nb_test={}'.format(nb_test))

    wc2v_path = os.path.join(data_folder, 'wordchar2vector.dat')
    word2vec = WordEmbeddings.load_word_vectors(wc2v_path, w2v_path)
    word_dims = word2vec.vector_size
    print('word_dims={0}'.format(word_dims))

    model_config = {
                    'engine': 'nn',
                    'max_inputseq_len': max_phrase_len,
                    'max_outseq_len': max_answer_len,
                    'w2v_path': w2v_path,
                    'wordchar2vector_path': wc2v_path,
                    'PAD_WORD': PAD_WORD,
                    'model_folder': tmp_folder,
                    'word_dims': word_dims,
                    'char2index': char2index,
                    'arch_filepath': arch_filepath,
                    'weights_filepath': weights_path
Example #5
    # Load the model configuration, weights, etc.
    with open(config_path, 'r') as f:
        model_config = json.load(f)
        repres = model_config['repres']
        max_wordseq_len = model_config['max_wordseq_len']
        net_arch = model_config['net_arch']
        padding = model_config['padding']

    with open(arch_filepath, 'r') as f:
        model = model_from_json(f.read())

    model.load_weights(weights_path)

    if repres == 'words':
        embeddings = WordEmbeddings.load_word_vectors(
            model_config['wordchar2vector_path'], model_config['w2v_path'])
        word_dims = embeddings.vector_size
        pad_func = lpad_wordseq if padding == 'left' else rpad_wordseq
        tokenizer = Tokenizer()
        tokenizer.load()
    elif repres == 'pieces':
        splitter = spm.SentencePieceProcessor()
        splitter.Load(
            os.path.join(tmp_folder, model_config['spm_name'] + '.model'))

    # Load the reference sentences whose similarity to the sentence entered
    # in the console will be evaluated.
    phrases2 = set()
    if True:
        for phrase in load_strings_from_yaml(
                os.path.join(data_folder, 'rules.yaml')):