Example 1
            # (snippet starts mid-statement; the truncated line opened the file
            # that stores the maximum sentence length, presumably
            # '{0}max_length'.format(config['save_path']), matching the read
            # side in Example 2)
            max_length_f = open('{0}max_length'.format(config['save_path']),
                                'w')
            max_length_f.write(u'{0}\n'.format(total_length))
            max_length_f.close()

        elif 'rescore' in config or 'predict_next' in config:
            # use trained model for rescoring
            print('For rescoring: sentence per sentence')
            data = lm_data.charSentenceDataRescore(config, eval_config, TRAIN,
                                                   VALID, TEST)
            all_data, vocab_size, _ = data.get_data()
        elif 'debug2' in config:
            raise NotImplementedError(
                "Generating a debug2 file with a character-level "
                "model is not implemented.")
        else:
            data = lm_data.charData(config, eval_config, TRAIN, VALID, TEST)
            all_data, vocab_size, _ = data.get_data()

    # word-level training, on sentence level (sentences are padded to the maximum sentence length)
    elif 'per_sentence' in config:

        if 'char_ngram' in config:
            raise NotImplementedError(
                "Models with character n-gram input are only "
                "implemented on discourse level.")
        elif 'word_char_concat' in config:
            raise NotImplementedError(
                "Models with concatenated word and character embeddings "
                "as input are only implemented at discourse level.")

        # do not read all data at once (for large datasets/small memory)
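The first snippet breaks off at the comment above. As a rough illustration of the idea it names (not the repository's actual code), a generator lets a large corpus be read one sentence at a time instead of being loaded into memory at once:

def iter_sentences(path):
    """Yield one whitespace-tokenized sentence at a time (hypothetical helper)."""
    with open(path, 'r') as f:
        for line in f:
            yield line.strip().split()

# usage: memory stays flat no matter how large the corpus is
# ('train.txt' is a placeholder path)
for sentence in iter_sentences('train.txt'):
    pass  # map words to indices, build batches, etc.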
Example 2
	Args:
		config: dictionary containing configuration options (for training and validation)
		eval_config: dictionary containing configuration options (for testing)
		(TRAIN, VALID, TEST): tuple of booleans indicating whether we should train, validate and/or test
	Returns:
		config: dictionary containing configuration options (for training and validation)
		eval_config: dictionary containing configuration options (for testing)
		data: data object
		train_data: training data mapped to indices (can be single list or tuple of lists depending on the type of model)
		valid_data: validation data mapped to indices
		test_data: test data mapped to indices
		(TRAIN, VALID, TEST): tuple of booleans indicating whether we should train, validate and/or test
	'''
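    # A call site would unpack the return tuple described in the docstring
    # above; hypothetical sketch (the function name is not visible, since this
    # snippet starts mid-docstring):
    #     config, eval_config, data, train_data, valid_data, test_data, \
    #         (TRAIN, VALID, TEST) = prepare_data(config, eval_config,
    #                                             (TRAIN, VALID, TEST))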

    # character-level training, in batches (cross sentence boundaries)
    if 'char' in config:
        data = lm_data.charData(config, eval_config)
        all_data, vocab_size, _ = data.get_data()
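        # "in batches (cross sentence boundaries)" means the corpus is treated
        # as one long character stream, reshaped into batch_size parallel
        # streams; rough sketch with hypothetical names (np = numpy), not this
        # module's actual API:
        #     stream = [char_to_id[c] for c in corpus_text]
        #     cols = len(stream) // batch_size
        #     batches = np.reshape(stream[:batch_size * cols],
        #                          (batch_size, cols))
        #     # each training step then consumes num_steps consecutive columns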

    # word-level training, on sentence level (sentences are padded to the maximum sentence length)
    elif 'per_sentence' in config:

        if 'rescore' in config:
            # read the maximum sentence length stored with the trained model
            with open('{0}max_length'.format(
                    config['trained_model'])) as max_length_f:
                max_length = int(max_length_f.readline().strip())
            # set num_steps = total length of each (padded) sentence
            config['num_steps'] = max_length

            data = lm_data.wordSentenceDataRescore(config, eval_config)
            all_data, vocab_size, _ = data.get_data()
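Since num_steps is set to the stored maximum length, every sentence in the rescoring data ends up padded to that same length. A minimal sketch of such a padding step (hypothetical names, not lm_data's actual implementation):

def pad_sentence(ids, max_length, pad_id=0):
    """Right-pad a list of word indices to a fixed length."""
    assert len(ids) <= max_length
    return ids + [pad_id] * (max_length - len(ids))

# e.g. with max_length = 5:
print(pad_sentence([12, 7, 3], 5))  # [12, 7, 3, 0, 0]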