def transform_lines_to_nn_input(tokenized_dialog_lines, token_to_index):
    """
    Splits lines (IterableSentences) and generates numpy arrays of token ids suitable for training.
    Doesn't store all lines in memory.
    """
    x_data_iterator, y_data_iterator, iterator_for_len_calc = file_buffered_tee(tokenized_dialog_lines, 3)

    _logger.info('Iterating through lines to get number of elements in the dataset')
    n_dialogs = sum(1 for _ in iterator_for_len_calc)

    x_data_iterator = islice(x_data_iterator, 0, None, 2)
    y_data_iterator = islice(y_data_iterator, 1, None, 2)
    n_dialogs /= 2

    y_data_iterator, y_data_iterator_for_context = file_buffered_tee(y_data_iterator)
    x_data_iterator = _get_x_data_iterator_with_context(x_data_iterator, y_data_iterator_for_context)

    _logger.info('Iterating through lines to get input matrix')
    x_ids = transform_contexts_to_token_ids(
        x_data_iterator, token_to_index, INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE, max_contexts_num=n_dialogs)

    _logger.info('Iterating through lines to get output matrix')
    y_ids = transform_lines_to_token_ids(
        y_data_iterator, token_to_index, OUTPUT_SEQUENCE_LENGTH, n_dialogs, add_start_end=True)

    return x_ids, y_ids, n_dialogs
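# Illustrative sketch (not part of the original pipeline): transform_lines_to_nn_input
# above relies on the corpus being an alternating stream of context/response lines,
# which it splits with islice using a step of 2. This minimal version shows the same
# split with plain itertools.tee standing in for the project-specific file_buffered_tee,
# so unlike the real code it keeps everything in memory.
def _sketch_split_alternating_lines(tokenized_dialog_lines):
    from itertools import islice, tee

    x_copy, y_copy = tee(tokenized_dialog_lines, 2)
    x_lines = islice(x_copy, 0, None, 2)   # even positions: context lines
    y_lines = islice(y_copy, 1, None, 2)   # odd positions: response lines
    return list(x_lines), list(y_lines)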
def _train_model(tokenized_lines, voc_size, vec_size, window_size, skip_gram):
    """
    Trains a gensim Word2Vec model on the tokenized lines in two streaming passes:
    one to build the vocabulary and one to learn the vectors.
    """
    _logger.info('Word2Vec model will be trained now. It can take a while, so relax and have fun.')

    params_str = get_w2v_params_str(voc_size, vec_size, window_size, skip_gram)
    _logger.info('Parameters for training: %s' % params_str)

    model = Word2Vec(
        window=window_size,
        size=vec_size,
        max_vocab_size=voc_size,
        min_count=MIN_WORD_FREQ,
        workers=_WORKERS_NUM,
        sg=skip_gram)

    tokenized_lines_for_voc, tokenized_lines_for_train = file_buffered_tee(tokenized_lines)

    model.build_vocab(tokenized_lines_for_voc)
    model.train(tokenized_lines_for_train)

    # forget the original vectors and only keep the normalized ones = saves lots of memory
    # https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec.init_sims
    model.init_sims(replace=True)

    return model
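# Illustrative sketch (an assumption, not from the original code): the same gensim calls
# as in _train_model above, run on a tiny in-memory corpus. Plain lists stand in for the
# two file_buffered_tee copies, and the hyperparameter values are arbitrary. It assumes
# the same old gensim API the code above uses (size=..., train() without epochs), which
# newer gensim releases have changed.
def _sketch_train_toy_w2v_model():
    from gensim.models import Word2Vec

    toy_lines = [['hello', 'there'], ['hi', 'how', 'are', 'you'], ['hello', 'again']]
    model = Word2Vec(window=5, size=32, max_vocab_size=None, min_count=1, workers=1, sg=1)
    model.build_vocab(toy_lines)   # first pass over the corpus: collect the vocabulary
    model.train(toy_lines)         # second pass: learn the embeddings
    model.init_sims(replace=True)  # keep only the normalized vectors to save memory
    return model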
def get_dialog_lines_and_conditions(dialog_lines, text_field_name, condition_field_name):
    """
    Splits one dialog_lines generator into two generators - one for conditions and one for dialog lines
    """
    conditions_iter, dialog_lines_iter = file_buffered_tee(
        imap(lambda line: [line[condition_field_name], line[text_field_name]], dialog_lines))

    conditions_iter = imap(itemgetter(0), conditions_iter)
    dialog_lines_iter = imap(itemgetter(1), dialog_lines_iter)

    return dialog_lines_iter, conditions_iter
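# Illustrative usage sketch (the field names and toy records are hypothetical): each
# dialog line passed to get_dialog_lines_and_conditions is expected to be a dict-like
# record carrying both the tokenized utterance and its condition label.
def _sketch_split_lines_and_conditions():
    toy_dialog_lines = iter([
        {'text': ['hello', 'there'], 'condition': 'neutral'},
        {'text': ['great', 'to', 'see', 'you'], 'condition': 'joy'},
    ])
    lines_iter, conditions_iter = get_dialog_lines_and_conditions(
        toy_dialog_lines, text_field_name='text', condition_field_name='condition')
    # -> ([['hello', 'there'], ['great', 'to', 'see', 'you']], ['neutral', 'joy'])
    return list(lines_iter), list(conditions_iter)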