def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self.vocabulary = vocabulary
    self._vocabulary_size = len(self.vocabulary)
    self.characters_positions_in_vocabulary = get_positions_in_vocabulary(self.vocabulary)
    self._num_unrollings = num_unrollings
    # Cut the text into batch_size equal segments; each stream in the
    # batch reads from its own segment, starting at these cursor offsets.
    segment = self._text_size // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._start_batch()
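
Every variant below leans on get_positions_in_vocabulary, whose implementation is not shown on this page. A minimal sketch consistent with its call sites (a character-to-index lookup table) might be:

# Hedged sketch, not the original helper: map each vocabulary entry
# to its index so encoding a character becomes a dict lookup.
def get_positions_in_vocabulary(vocabulary):
    return {char: idx for idx, char in enumerate(vocabulary)}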
Example #2
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    self._text = text
    self._pairs = self.make_pairs(self._text, None)
    self._number_of_pairs = len(self._pairs)
    self._text_size = len(text)
    self._batch_size = batch_size
    self.vocabulary = vocabulary
    self._vocabulary_size = len(self.vocabulary)
    self.character_positions_in_vocabulary = get_positions_in_vocabulary(self.vocabulary)
    self._ids = self._create_id_array(self._pairs, self.character_positions_in_vocabulary)
    self._num_unrollings = num_unrollings
    # Same cursor scheme, but stepping over pairs rather than raw characters.
    segment = self._number_of_pairs // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._start_batch()
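
The cursor arithmetic is identical in every variant: the data is split into batch_size equal segments and one read position is kept per segment. A toy illustration (the numbers are invented for the example):

number_of_pairs, batch_size = 10000, 4
segment = number_of_pairs // batch_size        # 2500
cursor = [offset * segment for offset in range(batch_size)]
print(cursor)                                  # [0, 2500, 5000, 7500]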
Example #3
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self.vocabularies = vocabulary
    self._vocabulary_sizes = [len(voc) for voc in self.vocabularies]
    self.character_positions_in_vocabulary = [
        get_positions_in_vocabulary(voc) for voc in self.vocabularies
    ]

    self._pairs = self.make_pairs(
        self._text, {'punctuation_marks': self.vocabularies[1]})
    self._number_of_pairs = len(self._pairs)

    self._num_unrollings = num_unrollings
    segment = self._number_of_pairs // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._start_batch()
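
Example #3 keeps several vocabularies at once (words and punctuation marks) with one position table per vocabulary. With toy vocabularies, and assuming the sketch of get_positions_in_vocabulary above, the tables come out as:

# Toy illustration, not from the source.
vocabularies = [['a', 'b', 'c'], ['.', ',']]
position_tables = [get_positions_in_vocabulary(voc) for voc in vocabularies]
# position_tables[0] == {'a': 0, 'b': 1, 'c': 2}
# position_tables[1] == {'.': 0, ',': 1}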
Example #4
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    tmp_output = process_input_text(text)
    [
        self._text, self._eod_flags, self._speaker_flags,
        self._bot_answer_flags, self._number_of_speakers
    ] = tmp_output
    # print('self._speaker_flags:', self._speaker_flags[:5000])
    # print('self._eod_flags:', self._eod_flags[:5000])
    # print('self._bot_answer_flags:', self._bot_answer_flags[:5000])
    # print('self._text:', self._text[:5000])
    self._text_size = len(self._text)
    self._batch_size = batch_size
    self._vocabulary = vocabulary
    self._vocabulary_size = len(self._vocabulary)
    self._character_positions_in_vocabulary = get_positions_in_vocabulary(
        self._vocabulary)
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_inputs, _ = self._start_batch()
    print('self._number_of_speakers:', self._number_of_speakers)
Example #5

def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):

    # tmp_output = process_input_text_reg(text)
    tmp_output = process_input_text(text)
    [self._text, self._speaker_flags, self._bot_speaks_flags] = tmp_output
    # print('self._speaker_flags:', self._speaker_flags[:5000])
    # print('self._bot_speaks_flags:', self._bot_speaks_flags[:5000])
    # print('self._text:', self._text[:5000])
    # print('(__init__)len(self._text):', len(self._text))
    # print('len(self._bot_speaks_flags):', len(self._bot_speaks_flags))
    # print('sum(self._bot_speaks_flags):', sum(self._bot_speaks_flags))
    self._text_size = len(self._text)
    self._batch_size = batch_size
    self._vocabulary = vocabulary
    self._vocabulary_size = len(self._vocabulary)
    self._number_of_speakers = 2
    self._character_positions_in_vocabulary = get_positions_in_vocabulary(
        self._vocabulary)
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._counter = 0  # to swap flags once the whole training set is processed
    self._last_inputs, _ = self._start_batch()
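
None of the snippets show construction. A hypothetical usage, assuming the enclosing class is the BatchGenerator handed to Environment further down and that create_vocabulary lives in the same module:

# Hypothetical call; class and helper names are assumed from the script below.
vocabulary = create_vocabulary(text)
batches = BatchGenerator(text, batch_size=64, num_unrollings=10,
                         vocabulary=vocabulary)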
Example #6

f.close()

# different
offset = 10000
valid_size = 1000
valid_text = text[offset:offset + valid_size]
train_text = text[offset + valid_size:]
train_size = len(train_text)

vocabulary = create_vocabulary(text)
vocabulary_size = len(vocabulary)

env = Environment(Lstm, LstmBatchGenerator)
cpiv = get_positions_in_vocabulary(vocabulary)

evaluation = dict(
    save_path='residuals_no_authors_no_sampling/parameter_tuning/just_lstm_go',
    result_types=['perplexity', 'loss', 'bpc', 'accuracy'],
    datasets={
        'train': None,
        'default_1': [valid_text, 'default_1']
    },
    batch_gen_class=LstmBatchGenerator,
    batch_kwargs={'vocabulary': vocabulary},
    batch_size=1,
    additional_feed_dict=[{
        'placeholder': 'dropout',
        'value': 1.
    }])
Example #7
with open('datasets/all_scipop_word_voc.txt', 'w') as f:
    for w_idx, w in enumerate(word_voc):
        f.write(w)
        if w_idx < len(word_voc) - 1:
            f.write('\t')

with open('datasets/all_scipop_punc_voc.txt', 'w') as f:
    for p_idx, p in enumerate(punc_voc):
        f.write(p)
        if p_idx < len(punc_voc) - 1:
            f.write('\t')
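
The vocabularies are written tab-separated, so reading them back (a mirror-image sketch, not shown on this page) is a single split:

# Assumed counterpart of the writer above.
with open('datasets/all_scipop_word_voc.txt') as f:
    word_voc = f.read().split('\t')
with open('datasets/all_scipop_punc_voc.txt') as f:
    punc_voc = f.read().split('\t')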

# print('word_voc:', word_voc)
print('punc_voc:', punc_voc)
word_cpiv = get_positions_in_vocabulary(word_voc)
punc_cpiv = get_positions_in_vocabulary(punc_voc)
# env = Environment(Lstm, BatchGenerator, vocabulary=vocabulary)
env = Environment(Lstm, BatchGenerator, vocabulary=[word_voc, punc_voc])

add_feed = [{'placeholder': 'dropout', 'value': 0.8}]
valid_add_feed = [{'placeholder': 'dropout', 'value': 1.}]

print('reached build')
env.build(batch_size=1,
          embeddings_in_batch=False,
          num_layers=2,
          num_nodes=[1300, 1300],
          num_output_layers=2,
          num_output_nodes=[2048],
          vocabulary_size=vocabulary_sizes[0],