Example no. 1
from keras.layers import Embedding


def pretrained_word_emb(vocab, emb_dim):
    # Initialize a Keras Embedding layer with pretrained word vectors.
    word2emb = vocab['word'].load_word2emb()
    word_emb = Embedding(len(vocab['word']), emb_dim)
    W = word_emb.get_weights()[0]
    # Copy each pretrained vector into the corresponding row of the weight
    # matrix; this assumes the iteration order of word2emb matches the
    # integer indices used by vocab['word'].
    for i, word in enumerate(word2emb.keys()):
        W[i] = word2emb[word]
    word_emb.set_weights([W])
    return word_emb
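
A minimal usage sketch, assuming a hypothetical vocab object whose 'word' entry supports len() and load_word2emb(); the returned layer plugs into a model like any other Embedding layer:

from keras.models import Sequential
from keras.layers import LSTM, Dense

# Hypothetical usage; `vocab` and the 100-dimensional setting are assumptions.
emb = pretrained_word_emb(vocab, emb_dim=100)
clf = Sequential()
clf.add(emb)
clf.add(LSTM(64))
clf.add(Dense(1, activation='sigmoid'))
clf.compile(loss='binary_crossentropy', optimizer='adam')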
Example no. 3
    def run_model(self):
        vocab_size = len(self.word2id)
        input_target = Input((1, ))
        input_context = Input((1, ))

        embedding = Embedding(vocab_size,
                              self.vector_dim,
                              input_length=1,
                              name='embedding')

        target = embedding(input_target)
        target = Reshape((self.vector_dim, 1))(target)

        context = embedding(input_context)
        context = Reshape((self.vector_dim, 1))(context)

        dot_product = dot([target, context], axes=1, normalize=False)
        dot_product = Reshape((1, ))(dot_product)
        #output = Dense(1, activation='sigmoid')(dot_product)

        model = Model(inputs=[input_target, input_context], outputs=dot_product)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')

        target_arr = np.zeros((1, ))
        context_arr = np.zeros((1, ))
        label_arr = np.zeros((1, ))
        word_target, word_context, labels = zip(*self.train_data)
        for cnt in range(self.epochs):
            idx = np.random.randint(0, len(labels))  # high bound is exclusive, so this reaches every pair
            target_arr[0, ] = word_target[idx]
            context_arr[0, ] = word_context[idx]
            label_arr[0, ] = labels[idx]
            loss = model.train_on_batch([target_arr, context_arr], label_arr)
            if cnt % 100 == 0:
                print("Iteration {}, loss={}".format(cnt, loss))

        weights = embedding.get_weights()[0]
        words_embeddings = {w: weights[idx] for w, idx in self.word2id.items()}
        return words_embeddings
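
The returned words_embeddings dict maps each word to its learned vector; a small cosine-similarity lookup (a sketch that assumes only that dict and numpy) can be used to inspect the result:

import numpy as np

def most_similar(words_embeddings, query, topn=5):
    # Rank all other words by cosine similarity to the query word's vector.
    q = words_embeddings[query]
    q = q / np.linalg.norm(q)
    scores = {}
    for w, v in words_embeddings.items():
        if w == query:
            continue
        scores[w] = float(np.dot(q, v / np.linalg.norm(v)))
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:topn]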
Example no. 4
loss_plot = TensorBoard(log_dir=train_name + '_logs',
                        write_graph=False,
                        embeddings_freq=10)
earlystopping = EarlyStopping(monitor='loss',
                              min_delta=0.0001,
                              patience=1,
                              verbose=1,
                              mode='auto')

steps = no_train_pairs / batch_size  # How many times per epoch we will ask the batch generator to yield a batch

# Let's start training!
start = time.time()
history = keras_model.fit_generator(
    batch_generator(wordpairs, vocabulary, vocab_size, negative, batch_size),
    callbacks=[sim_cb, loss_plot, earlystopping],
    steps_per_epoch=steps,
    epochs=10,
    workers=cores,
    verbose=1)

end = time.time()
print('Training took:', int(end - start), 'seconds', file=sys.stderr)

# Saving the resulting vectors:
filename = train_name + '.vec.gz'
save_word2vec_format(filename, vocabulary,
                     word_embedding_layer.get_weights()[0])

backend.clear_session()
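
batch_generator and save_word2vec_format are project helpers defined elsewhere; as a rough sketch (not the actual helper), a skip-gram generator with negative sampling that yields ([targets, contexts], labels) batches for fit_generator might look like this:

import numpy as np

def toy_batch_generator(wordpairs, vocab_size, negative, batch_size):
    # Hypothetical stand-in: one positive pair plus `negative` random
    # negative samples per target word, yielded in batches forever.
    targets, contexts, labels = [], [], []
    while True:
        for target, context in wordpairs:
            targets.append(target)
            contexts.append(context)
            labels.append(1)
            for _ in range(negative):
                targets.append(target)
                contexts.append(np.random.randint(1, vocab_size))
                labels.append(0)
            if len(labels) >= batch_size:
                yield [np.array(targets), np.array(contexts)], np.array(labels)
                targets, contexts, labels = [], [], []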
Example no. 5
                                 concat_axis=-1)
# The dot products are outputted
model = Model(input=[word_index, context, negative_samples],
              output=[word_context_product, negative_context_product])
# binary crossentropy is applied on the output
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
print(model.summary())

# model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary), samples_per_epoch=G.train_words, nb_epoch=1)
model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary,
                                                      reverse_vocabulary),
                    samples_per_epoch=10,
                    nb_epoch=1)
# Save the trained embedding
S.save_embeddings("embedding.txt",
                  shared_embedding_layer.get_weights()[0], vocabulary)

# input_context = np.random.randint(10, size=(1, context_size))
# input_word = np.random.randint(10, size=(1,))
# input_negative = np.random.randint(10, size=(1, G.negative))

# print "word, context, negative samples"
# print input_word.shape, input_word
# print input_context.shape, input_context
# print input_negative.shape, input_negative

# output_dot_product, output_negative_product = model.predict([input_word, input_context, input_negative])
# print "word cbow dot product"
# print output_dot_product.shape, output_dot_product
# print "cbow negative dot product"
# print output_negative_product.shape, output_negative_product
Example no. 6
                 + str(args.regularize)

    # create a secondary validation model to run our similarity checks during training
    similarity = dot([word_embedding, context_embedding], axes=1, normalize=True)
    validation_model = Model(inputs=[word_index, context_index], outputs=[similarity])
    sim_cb = helpers.SimilarityCallback(validation_model=validation_model)

    loss_plot = TensorBoard(log_dir=train_name + '_logs', write_graph=False)
    earlystopping = EarlyStopping(monitor='loss', min_delta=0.0001, patience=1, verbose=1,
                                  mode='auto')

    # How many times per epoch we will ask the batch generator to yield a batch?
    steps = no_train_pairs / batch_size

    # Let's start training!
    start = time.time()
    history = keras_model.fit_generator(
        helpers.batch_generator(wordpairs, vocab_dict, vocab_size, negative, batch_size,
                                args.use_neighbors, neighbors_count),
        callbacks=[sim_cb, loss_plot, earlystopping], steps_per_epoch=steps, epochs=args.epochs,
        workers=cores, verbose=2)

    end = time.time()
    print('Training took:', int(end - start), 'seconds', file=sys.stderr)

    # Saving the resulting vectors:
    filename = train_name + '_' + run_name + '.vec.gz'
    helpers.save_word2vec_format(filename, vocab_dict, word_embedding_layer.get_weights()[0])

    backend.clear_session()
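
helpers.SimilarityCallback is defined elsewhere; a minimal sketch of such a callback, assuming a list of probe word-id pairs and an id-to-word mapping, could print the model's predicted similarities after each epoch:

import numpy as np
from keras.callbacks import Callback

class ToySimilarityCallback(Callback):
    # Hypothetical stand-in for helpers.SimilarityCallback.
    def __init__(self, validation_model, probe_pairs, id2word):
        super(ToySimilarityCallback, self).__init__()
        self.validation_model = validation_model
        self.probe_pairs = probe_pairs  # list of (word_id, context_id) tuples
        self.id2word = id2word

    def on_epoch_end(self, epoch, logs=None):
        for w, c in self.probe_pairs:
            sim = self.validation_model.predict([np.array([w]), np.array([c])])
            print('{} ~ {}: {:.3f}'.format(
                self.id2word[w], self.id2word[c], float(np.squeeze(sim))))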
Example no. 7
negative_words_embedding = shared_embedding_layer(negative_samples)
# Now the context words are averaged to get the CBOW vector
cbow = Lambda(lambda x: K.mean(x, axis=1), output_shape=(G.embedding_dimension,))(context_embeddings)
# The context is multiplied (dot product) with current word and negative sampled words
word_context_product = merge([word_embedding, cbow], mode='dot')
negative_context_product = merge([negative_words_embedding, cbow], mode='dot', concat_axis=-1)
# The dot products are outputted
model = Model(input=[word_index, context, negative_samples], output=[word_context_product, negative_context_product])
# binary crossentropy is applied on the output
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
print(model.summary())

# model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary), samples_per_epoch=G.train_words, nb_epoch=1)
model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary), samples_per_epoch=10000, nb_epoch=10)
# Save the trained embedding
S.save_embeddings("word2vec_50.txt", shared_embedding_layer.get_weights()[0], vocabulary)

# input_context = np.random.randint(10, size=(1, context_size))
# input_word = np.random.randint(10, size=(1,))
# input_negative = np.random.randint(10, size=(1, G.negative))

# print "word, context, negative samples"
# print input_word.shape, input_word
# print input_context.shape, input_context
# print input_negative.shape, input_negative

# output_dot_product, output_negative_product = model.predict([input_word, input_context, input_negative])
# print "word cbow dot product"
# print output_dot_product.shape, output_dot_product
# print "cbow negative dot product"
# print output_negative_product.shape, output_negative_product
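
S.save_embeddings is defined elsewhere; a plain-text word2vec-format writer might look roughly like this, under the assumption that vocabulary maps each word to its row index in the weight matrix:

def save_embeddings_sketch(path, weights, vocabulary):
    # Hypothetical stand-in: write "<word> <v1> <v2> ..." per line,
    # preceded by the usual "<vocab_size> <dim>" header.
    with open(path, 'w') as f:
        f.write('{} {}\n'.format(len(vocabulary), weights.shape[1]))
        for word, idx in vocabulary.items():
            f.write('{} {}\n'.format(
                word, ' '.join('{:.6f}'.format(x) for x in weights[idx])))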
Example no. 8
class LSTMEncDec:
    __LOSS_FUNCS__ = ('mean_squared_error', 'categorical_crossentropy')
    __DECODER_BUILDS__ = None

    def __init__(self,
                 word_vec,
                 word_to_index,
                 index_to_word,
                 weight_file=None,
                 enc_layer_output=(32, ),
                 dec_layer_output=(32, ),
                 learning_rate=0.001,
                 sequence_len=200,
                 output_len=2000,
                 directory='.',
                 out_type=0,
                 decoder_type=0):
        """
        :param out_type:
            0: word vector output/similarity inference
            1: for softmax word distribution output.
        :param decoder_type:
            0: non-readout LSTM decoder
            1: recurrentshop's readout decoder
            2: seq2seq decoder.
        """
        self.__DECODER_BUILDS__ = (self.__build_repeat_decoder__,
                                   self.__build_readout_decoder__,
                                   self.__build_seq2seq_decoder__)
        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.sequence_len = sequence_len
        self.output_len = output_len
        self.directory = directory
        self.enc_layer_output = enc_layer_output
        self.dec_layer_output = dec_layer_output
        self.decoder_type = decoder_type
        self.out_type = out_type
        try:
            loss = self.__LOSS_FUNCS__[out_type]
        except IndexError:
            raise ValueError('Invalid output type %s.' % self.out_type)
        self.encoder = Sequential(name='Encoder')
        self.decoder = Sequential(name='Decoder')
        self.embed = None
        self.batch_size = 0

        input_layer, output_layer = self.config_model(word_vec)

        self.model = Model(inputs=[input_layer], outputs=[output_layer])
        if weight_file is not None:
            self.model.load_weights(weight_file)
        self.compile(learning_rate, loss)

    def config_model(self, word_vec):
        """
        Creates the encoder-decoder structure and returns the symbolic input and output
        """
        train_embed = True

        # Configure input layer
        input_layer = Input(shape=(self.sequence_len, ), name='Input')

        # Embedding layer is initialized with the word-vector array; set
        # train_embed = False above to freeze it when the output is decoded
        # against the same array.
        self.embed = Embedding(input_dim=np.size(word_vec, 0),
                               output_dim=np.size(word_vec, 1),
                               weights=[word_vec],
                               trainable=train_embed,
                               mask_zero=True,
                               name='Embed')

        # Configure encoder network with the given output sizes.
        # Embedding for encoder only since decoder receives the question vector.
        self.encoder.add(self.embed)
        for el in self.enc_layer_output[:-1]:
            self.encoder.add(
                LSTM(el, return_sequences=True, consume_less='mem'))
        self.encoder.add(LSTM(self.enc_layer_output[-1])
                         )  # Final LSTM layer only outputs the last vector
        # Encoder outputs the question vector as a tensor with each time-step output being the final question vector
        question_vec = self.encoder(input_layer)

        # Configure decoder network with the given output sizes.
        # Layer connecting to encoder output
        try:
            # Pick the decoder builder by decoder_type (out_type selects the
            # loss/output layer, not the decoder architecture).
            self.__DECODER_BUILDS__[self.decoder_type]()
        except IndexError:
            raise ValueError('Invalid decoder type %s.' % self.decoder_type)

        if self.out_type == 0:
            # Final layer outputting a sequence of word vectors
            self.decoder.add(
                TimeDistributed(
                    Dense(np.size(word_vec, 1), activation='linear')))
        else:
            # Final layer outputting a sequence of word distribution vectors
            self.decoder.add(
                TimeDistributed(
                    Dense(len(self.index_to_word), activation='softmax')))
        output_layer = self.decoder(question_vec)

        return input_layer, output_layer

    def __build_repeat_decoder__(self):
        # Repeat the final vector for answer input
        self.decoder.add(
            RepeatVector(self.sequence_len,
                         input_shape=(self.enc_layer_output[-1], )))
        self.decoder.add(
            LSTM(self.dec_layer_output[0],
                 input_shape=(self.sequence_len, self.enc_layer_output[-1]),
                 name='ConnectorLSTM',
                 return_sequences=True,
                 consume_less='mem'))
        for dl in self.dec_layer_output[1:]:
            self.decoder.add(
                LSTM(dl, return_sequences=True, consume_less='mem'))

    def __build_readout_decoder__(self):
        self.decoder.add(
            RepeatVector(
                self.sequence_len,
                input_shape=(self.enc_layer_output[-1],
                             )))  # Repeat the final vector for answer input
        # Using recurrentshop's container with readout
        container = RecurrentContainer(readout=True,
                                       return_sequences=True,
                                       output_length=self.sequence_len)
        if len(self.dec_layer_output) > 1:
            container.add(
                LSTMCell(output_dim=self.dec_layer_output[0],
                         input_dim=self.enc_layer_output[-1]))
            for dl in self.dec_layer_output[1:-1]:
                container.add(LSTMCell(output_dim=dl))
            container.add(LSTMCell(output_dim=self.enc_layer_output[-1]))
        else:
            container.add(
                LSTMCell(input_dim=self.enc_layer_output[-1],
                         output_dim=self.enc_layer_output[-1]))

        if self.enc_layer_output[-1] != self.dec_layer_output[-1]:
            print(
                'WARNING: Overriding final decoder output to %s for readout compatibility'
                % self.enc_layer_output[-1])
        self.decoder.add(container)

    def __build_seq2seq_decoder__(self):
        # Using recurrentshop's decoder container
        container = RecurrentContainer(
            return_sequences=True,
            readout='add',
            output_length=self.sequence_len,
            input_shape=(self.enc_layer_output[-1], ),
            decode=True)
        if len(self.dec_layer_output) > 1:
            container.add(
                LSTMCell(output_dim=self.dec_layer_output[0],
                         input_dim=self.enc_layer_output[-1]))
            for dl in self.dec_layer_output[1:-1]:
                container.add(LSTMCell(output_dim=dl))
            container.add(LSTMCell(output_dim=self.enc_layer_output[-1]))
        else:
            container.add(
                LSTMCell(input_dim=self.enc_layer_output[-1],
                         output_dim=self.enc_layer_output[-1]))

        if self.enc_layer_output[-1] != self.dec_layer_output[-1]:
            print(
                'WARNING: Overriding final decoder output to %s for readout compatibility'
                % self.enc_layer_output[-1])
        self.decoder.add(container)

    def compile(self, learning_rate, loss):
        if self.out_type == 0:
            metrics = ['mean_absolute_error']
        else:
            metrics = []
        self.model.compile(optimizer=RMSprop(lr=learning_rate),
                           loss=loss,
                           metrics=metrics,
                           sample_weight_mode='temporal')

    def train(self,
              Xtrain,
              ytrain,
              nb_epoch,
              Xval=None,
              yval=None,
              train_mask=None,
              val_mask=None,
              batch_size=10,
              queries=None):
        """
        Uses a generator to expand integer labels into one-hot vectors batch-by-batch to save memory.
        See utils.generate_batch().
        """
        self.batch_size = batch_size
        callback = EncDecCallback(self, queries, True)
        logger = CSVLogger(self.directory + '/epochs.csv')
        nb_class = len(self.index_to_word)
        total_len = np.size(ytrain, 0)

        if self.out_type == 0:
            generator = utils.generate_vector_batch
        else:
            generator = utils.generate_batch

        if Xval is None or yval is None:
            self.model.fit_generator(generator(Xtrain, ytrain,
                                               self.embed.get_weights()[0],
                                               train_mask, nb_class, total_len,
                                               batch_size),
                                     steps_per_epoch=total_len / batch_size,
                                     workers=1,
                                     epochs=nb_epoch,
                                     callbacks=[callback, logger],
                                     verbose=1,
                                     max_q_size=1)
        else:
            self.model.fit_generator(
                generator(Xtrain, ytrain,
                          self.embed.get_weights()[0], train_mask, nb_class,
                          total_len, batch_size),
                steps_per_epoch=total_len / batch_size,
                epochs=nb_epoch,
                callbacks=[callback, logger],
                verbose=1,
                max_q_size=1,
                workers=1,
                validation_steps=Xval.shape[0] / self.batch_size,
                validation_data=generator(Xval, yval,
                                          self.embed.get_weights()[0],
                                          val_mask, nb_class, Xval.shape[0],
                                          batch_size))

    def generate_response(self, query):
        """
        Pre-processes a raw query string and returns a response string
        """
        tokens = nltk.word_tokenize(query.lower())[:self.sequence_len]
        indices = [
            self.word_to_index[w]
            if w in self.word_to_index else self.word_to_index[UNKNOWN_TOKEN]
            for w in tokens
        ]
        indices.extend([0] * (self.sequence_len - len(indices)))
        indices = np.asarray(indices, dtype=np.int32).reshape(
            (1, self.sequence_len))
        output = self.model.predict(indices, batch_size=1, verbose=0)
        vectors = self.embed.get_weights()[0]
        response = []

        if self.out_type == 0:
            for word_vec in output[0]:
                word = self.index_to_word[utils.nearest_vector_index(
                    vectors, word_vec)]
                if word == MASK_TOKEN:
                    continue
                elif word == SENTENCE_END_TOKEN:
                    break
                response.append(word)
        else:
            out_idx = np.argmax(output, axis=2)
            # noinspection PyTypeChecker
            for idx in out_idx[0]:
                word = self.index_to_word[idx]
                if word == MASK_TOKEN:
                    continue
                elif word == SENTENCE_END_TOKEN:
                    response.append(word)
                    break
                response.append(word)

        return ' '.join(response)

    def generate_candidates(self, query, top=3):
        """
        Generates a list of top candidates for each word position given a raw string query.
        Only applies for softmax model.
        """
        tokens = nltk.word_tokenize(query.lower())[:self.sequence_len]
        indices = [
            self.word_to_index[w]
            if w in self.word_to_index else self.word_to_index[UNKNOWN_TOKEN]
            for w in tokens
        ]
        indices.extend([0] * (self.sequence_len - len(indices)))
        indices = np.asarray(indices, dtype=np.int32).reshape(
            (1, self.sequence_len))
        output = self.model.predict(indices, batch_size=1, verbose=0)
        vectors = self.embed.get_weights()[0]
        response, candidates = [], []

        if self.out_type == 0:
            for word_vec in output[0]:
                word = self.index_to_word[utils.nearest_vector_index(
                    vectors, word_vec)]
                if word == MASK_TOKEN:
                    continue
                elif word == lstm.tokens.SENTENCE_END_TOKEN:
                    break
                response.append(word)
        else:
            out_idx = utils.k_largest_idx(output, top)
            # noinspection PyTypeChecker
            for ca in out_idx[0]:
                word = self.index_to_word[ca[0]]
                if word == MASK_TOKEN:
                    continue
                elif word == SENTENCE_END_TOKEN:
                    response.append(word)
                    break
                response.append(word)
                candidates.append([self.index_to_word[c] for c in ca])

        return ' '.join(response), candidates

    def log(self, string='', out=True):
        with open(self.directory + '/log.txt', mode='at') as f:
            if out:
                print(string)
            print(string, file=f)
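
A usage sketch, assuming embedding_matrix is a vocab-by-dimension numpy array and w2i / i2w are the word-index mappings built elsewhere:

# Hypothetical usage; the input data and all sizes here are assumptions.
enc_dec = LSTMEncDec(embedding_matrix, w2i, i2w,
                     enc_layer_output=(64,), dec_layer_output=(64,),
                     sequence_len=50, out_type=1, decoder_type=0)
enc_dec.train(X_train, y_train, nb_epoch=5, batch_size=32)
print(enc_dec.generate_response('how are you ?'))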
Example no. 9
def compile_model(inputs, repeat):
    (vocab_size1, max_sent1, sent_maxlen1, query_maxlen1) = inputs[0]
    (vocab_size2, max_sent2, sent_maxlen2, query_maxlen2) = inputs[1]
    mvocab_size = vocab_size1
    if (mvocab_size < vocab_size2):
        mvocab_size = vocab_size2
    story_input1 = Input((max_sent1, sent_maxlen1))
    story_input2 = Input((max_sent2, sent_maxlen2))
    query_input1 = Input((sent_maxlen1, ))
    query_input2 = Input((sent_maxlen2, ))

    # H = Dense(EMBED_HIDDEN_SIZE)

    embedBlayer1 = Embedding(vocab_size1,
                             EMBED_HIDDEN_SIZE,
                             input_length=sent_maxlen1,
                             init=INIT_WEIGHT)
    embedBlayer2 = Embedding(vocab_size2,
                             EMBED_HIDDEN_SIZE,
                             input_length=sent_maxlen2,
                             init=INIT_WEIGHT)

    embeddingBs = embedBlayer1(query_input1), embedBlayer2(query_input2)
    #u = Lambda(lambda x: K.sum(x, axis=2), output_shape=lambda s: (s[0], s[1]))(embeddingB)
    u = [
        Lambda(lambda x: K.sum(x, axis=1),
               output_shape=(EMBED_HIDDEN_SIZE, ))(embeddingB)
        for embeddingB in embeddingBs
    ]
    embedlayer1 = None
    embedlayer2 = None

    for hop in range(HOPS):
        embeddingAs = None
        if hop == 0:
            embeddingAs = [
                embedBlayer1(story_input1),
                embedBlayer2(story_input2)
            ]
        else:
            embeddingAs = [
                embedlayer1(story_input1),
                embedlayer2(story_input2)
            ]
        #ms = Lambda(lambda x: K.sum(x, axis=3), output_shape=lambda s: (s[0], s[1], s[2]))(embeddingA)
        mss = [
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent1,
                                 EMBED_HIDDEN_SIZE))(embeddingAs[0]),
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent2, EMBED_HIDDEN_SIZE))(embeddingAs[1])
        ]

        dotproducts = [
            merge([mss[0], u[-2]],
                  mode=row_wise_dot,
                  output_shape=(max_sent1, )),
            merge([mss[1], u[-1]],
                  mode=row_wise_dot,
                  output_shape=(max_sent2, ))
        ]
        #dotproduct = merge([ms, u], mode=row_wise_cos, output_shape=(max_sent,))
        probs = [
            Activation('softmax')(dotproduct) for dotproduct in dotproducts
        ]

        embedlayer1 = Embedding(vocab_size1,
                                EMBED_HIDDEN_SIZE,
                                input_length=sent_maxlen1,
                                init=INIT_WEIGHT)
        embedlayer2 = Embedding(vocab_size2,
                                EMBED_HIDDEN_SIZE,
                                input_length=sent_maxlen2,
                                init=INIT_WEIGHT)
        embeddingCs = [embedlayer1(story_input1), embedlayer2(story_input2)]
        cs = [
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent1,
                                 EMBED_HIDDEN_SIZE))(embeddingCs[0]),
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent2, EMBED_HIDDEN_SIZE))(embeddingCs[1])
        ]
        c_temps = [
            Lambda(lambda x: tf.transpose(x, [0, 2, 1]),
                   output_shape=(EMBED_HIDDEN_SIZE, max_sent1))(cs[0]),
            Lambda(lambda x: tf.transpose(x, [0, 2, 1]),
                   output_shape=(EMBED_HIDDEN_SIZE, max_sent2))(cs[1])
        ]

        os = [
            merge([c_temp, prob],
                  mode=row_wise_dot,
                  output_shape=(EMBED_HIDDEN_SIZE, ))
            for c_temp, prob in zip(c_temps, probs)
        ]
        newus = [
            merge([u[-2], os[0]],
                  mode='sum',
                  output_shape=(EMBED_HIDDEN_SIZE, )),
            merge([u[-1], os[1]],
                  mode='sum',
                  output_shape=(EMBED_HIDDEN_SIZE, ))
        ]
        # u.append(H(newu))
        u.append(newus[0])
        u.append(newus[1])

    # Applying w matrix
    #dl = Dense(vocab_size, input_dim=(EMBED_HIDDEN_SIZE,))(u[-1])

    # Using last C as W per adjacent weight tying
    func1 = lambda x: tf.matmul(
        x, tf.transpose(embedlayer1.get_weights()[0], [1, 0]))
    func2 = lambda x: tf.matmul(
        x, tf.transpose(embedlayer2.get_weights()[0], [1, 0]))
    dls = [Lambda(func1)(newus[0]), Lambda(func2)(newus[1])]

    preds = [
        Dense(vocab_size1, activation='softmax')(dls[0]),
        Dense(vocab_size2, activation='softmax')(dls[1])
    ]

    # `preds` holds the softmax outputs expected by categorical_crossentropy;
    # the raw dot products in `dls` are not probabilities.
    model = Model(
        inputs=[story_input1, query_input1, story_input2, query_input2],
        outputs=preds)

    # opt = Adam(lr=0.001,
    # beta_1=0.9,
    # beta_2=0.999,
    # epsilon=1e-08,
    # decay=0.0)

    opt = SGD(lr=0.0, momentum=0.0, decay=0.0, nesterov=False)

    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model, [LearningRateScheduler(step_decay)]
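
step_decay is defined elsewhere; a typical step-decay schedule for LearningRateScheduler, with every constant below being an assumption, might be:

import math

def step_decay_sketch(epoch):
    # Hypothetical stand-in for step_decay: halve the rate every 25 epochs.
    initial_lr, drop, epochs_per_drop = 0.01, 0.5, 25
    return initial_lr * math.pow(drop, math.floor(epoch / epochs_per_drop))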
Example no. 10
class HN():
    def __init__(self,
                 src_list,
                 save_dir,
                 data_directory,
                 data_name,
                 glove,
                 word_coverage,
                 non_test,
                 initial_learning_rate,
                 learning_rate_decay,
                 optimizer_kwargs,
                 adjust_learning_rate,
                 clip_batch_size,
                 set_min_batch_size,
                 char_rnn,
                 REMAINDER,
                 CHAR,
                 MAX_WORD_LENGTH,
                 BUCKET_DIVISION,
                 WordEmb_dropout,
                 WordRnn_dropout,
                 SentenceRnn_dropout,
                 rnn,
                 CHARACTER_RNN_DIMENSION,
                 conv_unit_size,
                 mse,
                 TRAIN,
                 pretrain,
                 mode,
                 mem_test=False,
                 test=False,
                 bucket_coverage=None,
                 max_n_epoch=55,
                 cnn_window_size=7,
                 memory=False,
                 optimizer_name='rmsprop',
                 std_batch_size=64,
                 embedding_regularizer_coefficient=None,
                 recurrent_activation='sigmoid',
                 conv_dense_activation='tanh',
                 entire_char_size=50,
                 kernel_regularizer_coefficient=None,
                 sec_period=30,
                 memory_fraction=1,
                 stack=[1, 1],
                 RNN_DIMENSION=50,
                 trainable_word_emb=True,
                 year=None,
                 flag_embedding_layer_mask_zero=True,
                 rnn_implementation=2,
                 tanh2_dropout=0,
                 patience=3,
                 complete_pretrain=False):
        save_filename = 'model.h5'
        if os.path.isfile(os.path.join(save_dir, save_filename)):
            print('===== will load weights ======')
        else:  # create new path
            save_dir = find_new_dir(save_dir)
        # copy source codes
        save_src(save_dir, src_list)
        # open log file
        log_path = os.path.join(save_dir, 'log.txt')
        self.log, sys.stdout = open(log_path, 'a', 1), open(log_path, 'a', 1)
        self.log.write('\n\n\n\n\n')
        # adjust config
        if test:
            data_name = 'small'
            max_n_epoch = 3
            BUCKET_DIVISION = 2
            conv_unit_size = 32
        if set_min_batch_size and K.backend() == 'theano':
            adjust_learning_rate = False
            import theano
            theano.config.optimizer = 'fast_run'
        if pretrain:
            print('pre-train is impossible now due to fast-mem_test')
            sys.exit()
        # parameters
        self.log_path, self.sec_period, self.std_batch_size = log_path, sec_period, std_batch_size
        self.adjust_learning_rate = adjust_learning_rate
        self.best_validation_save_path = os.path.join(save_dir,
                                                      'best-accuracy.txt')
        self.best_save_path = os.path.join(save_dir, 'best.h5')
        self.model_save_path = os.path.join(save_dir, save_filename)
        self.mem_test_flag = mem_test
        self.learning_rate_decay = learning_rate_decay
        self.custom_objects = {'AttentionLayer': AttentionLayer}
        self.word_embedding_dim = 200
        self.rnn_implementation = rnn_implementation
        self.trainable_word_emb, self.flag_embedding_layer_mask_zero = trainable_word_emb, flag_embedding_layer_mask_zero
        self.CHAR = CHAR
        self.REMAINDER = REMAINDER
        self.rnn, self.conv_dense_activation = rnn, conv_dense_activation
        self.recurrent_activation = recurrent_activation
        self.initial_learning_rate = initial_learning_rate
        self.TRAIN = TRAIN
        self.stack = stack
        self.pretrain, self.complete_pretrain = pretrain, complete_pretrain
        self.patience, self.mse, self.max_n_epoch = patience, mse, max_n_epoch
        self.MULTI_RNN_DIMENSION = RNN_DIMENSION  # int(RNN_DIMENSION / math.sqrt(N_LAYERS))
        self.conv_unit_size = conv_unit_size
        self.embedding_regularizer_coefficient = embedding_regularizer_coefficient
        self.kernel_regularizer_coefficient = kernel_regularizer_coefficient
        self.char_rnn_flag, self.CHARACTER_RNN_DIMENSION = char_rnn, CHARACTER_RNN_DIMENSION
        self.WordEmb_dropout, self.WordRnn_dropout, self.SentenceRnn_dropout, self.tanh2_dropout = WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout, tanh2_dropout
        self.optimizer_name, self.optimizer_kwargs = optimizer_name, optimizer_kwargs
        self.save_dir = save_dir
        self.mode = mode
        # prepare dataset
        self.path = Path(directory_path=data_directory,
                         name=data_name,
                         glove=glove,
                         year=year)
        MEMORY = self.memory_control(memory_fraction)
        if memory:
            MEMORY = memory
        self.log.write('MEMORY: {}\n'.format(MEMORY))
        self.dataset = Dataset(log_path,
                               self.path,
                               set_min_batch_size=set_min_batch_size,
                               bucket_coverage=bucket_coverage,
                               clip_batch_size=clip_batch_size,
                               non_test_flag=non_test,
                               word_coverage=word_coverage,
                               save_dir=save_dir,
                               MAX_WORD_LENGTH=MAX_WORD_LENGTH,
                               MAX_N_CHARACTER=entire_char_size)
        self.dataset.bucketize2(BUCKET_DIVISION, MEMORY)
        self.dataset.import_word_embedding(EMBEDDING_DIM=200)
        self.log.write(
            'vocabulary_size={}, embedding_matrix_size, EMBEDDING_DIM = {}, {}\n'
            .format(len(self.dataset.word_count),
                    len(self.dataset.word_embedding_matrix),
                    self.word_embedding_dim))
        self.dataset.std_batch_size = std_batch_size
        self.max_word_length = self.dataset.MAX_WORD_LENGTH
        # print configurations
        self.log.write('test={}   mem_test={}   memory_fraction = {}\n'.format(
            test, mem_test, memory_fraction))
        self.log.write('data_name = {} {}\n'.format(data_name, year))
        self.log.write(
            'set_min_batch_size = {}   bucket_coverage = {}\n'.format(
                set_min_batch_size, bucket_coverage))
        self.log.write('TRAIN = {}   mse = {}\n'.format(TRAIN, mse))
        self.log.write('mode = {}'.format(self.mode))
        self.log.write(
            'glove = {},   non_test = {}   word_coverage = {}\n'.format(
                glove, non_test, word_coverage))
        self.log.write(
            'entire_char_size={},  MAX_WORD_LENGTH = {},  BUCKET_DIVISION = {}\n'
            .format(entire_char_size, MAX_WORD_LENGTH, BUCKET_DIVISION))
        self.log.write('save_dir = {}\n'.format(save_dir))
        self.log.write('log_path = {}\n'.format(log_path))
        self.log.write(
            'optimizer_kwargs = {}   initial_learning_rate = {}\n'.format(
                optimizer_kwargs, initial_learning_rate))
        self.log.write(
            'learning_rate_decay = {}\n'.format(learning_rate_decay))
        self.log.write('backend: {}\n'.format(K.backend()))
        self.log.write('REMAINDER = {}\n'.format(REMAINDER))
        self.log.write('CHAR = {},  char_rnn = {}\n'.format(CHAR, char_rnn))
        self.log.write('embedding_regularizer_coefficient = {}\n'.format(
            embedding_regularizer_coefficient))
        self.log.write('kernel_regularizer_coefficient = {}\n'.format(
            kernel_regularizer_coefficient))
        self.log.write('rnn = {},  stack = {}  conv_unit_size={}\n'.format(
            rnn, stack, conv_unit_size))
        self.log.write('optimizer_name = {}\n'.format(optimizer_name))
        self.log.write('cnn_window_size = {}\n'.format(cnn_window_size))
        self.log.write(
            'CHARACTER_RNN_DIMENSION = {}\n'.format(CHARACTER_RNN_DIMENSION))
        self.log.write('std_batch_size = {}\n'.format(std_batch_size))
        self.log.write(
            'WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout = {}, {}, {}\n'
            .format(WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout))
        self.log.write('pretrain = {}   trainable_word_emb = {}\n'.format(
            pretrain, trainable_word_emb))
        self.log.write(
            'adjust_learning_rate = {}\n'.format(adjust_learning_rate))
        self.log.write('tanh2_dropout = {}\n'.format(tanh2_dropout))
        self.log.write('patience ={}\n'.format(patience))
        self.log.write('clip_batch_size = {}\n'.format(clip_batch_size))
        self.log.write(
            'conv_dense_activation = {}    recurrent_activation = {}\n'.format(
                conv_dense_activation, recurrent_activation))
        self.make_layers()

    def run(self):
        if self.mem_test_flag is True:
            self.mem_test()
            self.run_without_mem_test(compiled=True)
        else:
            self.run_without_mem_test()
        print('=== finished ===')

    def mem_test(self):
        self.dataset.mem_test, self.mem_test_flag = True, True
        print('=== memory test ====')
        self.run_without_mem_test()
        self.dataset.mem_test, self.mem_test_flag = False, False

    def make_layers(self):
        self.word_embedding_layer = Embedding(
            len(self.dataset.word_embedding_matrix),
            self.word_embedding_dim,
            embeddings_initializer=Constant(
                self.dataset.word_embedding_matrix),
            trainable=self.trainable_word_emb,
            mask_zero=self.flag_embedding_layer_mask_zero,
            name='word_emb',
            embeddings_regularizer=l2(self.embedding_regularizer_coefficient))
        self.word_rnn = self.stack_rnn(self.stack[0])
        self.tanh1 = self.make_dense(DIMENSION_ATTENTION, activation='tanh')
        self.att1 = AttentionLayer(name='att1')

        self.rnn2 = self.stack_rnn(self.stack[1])
        self.tanh2 = self.make_dense(DIMENSION_ATTENTION, activation='tanh')
        self.att2 = AttentionLayer(name='att2')
        self.logit = self.make_dense(self.dataset.n_classes,
                                     activation='softmax',
                                     name='logit')
        if self.optimizer_name == 'rmsprop':
            self.optimizer = optimizers.RMSprop(lr=self.initial_learning_rate,
                                                **self.optimizer_kwargs)
        elif self.optimizer_name == 'sgd':
            self.optimizer = optimizers.SGD(lr=self.initial_learning_rate,
                                            momentum=0.9,
                                            **self.optimizer_kwargs)
        elif self.optimizer_name == 'adam':
            self.optimizer = optimizers.Adam(lr=self.initial_learning_rate,
                                             **self.optimizer_kwargs)
        else:
            self.log.write('unknown optimizer name')
            sys.exit(1)
        if self.CHAR:
            self.dataset.make_character_embedding_index()
            self.character_embedding_layer = Embedding(
                len(self.dataset.char_embedding_index),
                len(self.dataset.char_embedding_index) - 2,
                # weights=[self.dataset.char_embedding_matrix],
                embeddings_initializer=Constant(
                    self.dataset.char_embedding_matrix),
                mask_zero=self.char_rnn_flag,
                trainable=True,
                name='ch_emb',
                embeddings_regularizer=l2(
                    self.embedding_regularizer_coefficient))
            if not self.char_rnn_flag:  # character cnn
                self.conv1 = self.make_conv(self.conv_unit_size, 5)
                self.conv2 = self.make_conv(self.conv_unit_size, 2)
            else:  # character rnn
                self.char_rnn = self.make_rnn(self.CHARACTER_RNN_DIMENSION,
                                              False)
            # temp
            self.word_linear = self.make_dense(self.word_embedding_dim,
                                               activation='linear')
            self.char_linear = self.make_dense(self.word_embedding_dim,
                                               activation='linear')
            self.max_tanh = self.make_dense(self.word_embedding_dim,
                                            activation='tanh')
            # layers for merging words and characters
            self.conv_dense = self.make_dense(
                self.word_embedding_dim, activation=self.conv_dense_activation)
            self.max_relu = self.make_dense(self.word_embedding_dim,
                                            activation='relu')

    def memory_control(self, memory_fraction):
        memory = {
            'citron': 11169,
            'apple': 8108,
            'cacao': 4041,
            'lime': 3300,
            'tangerine': 4036
        }  # 'lime':3050
        # memory['citron'] = memory['cacao']
        # memory['apple'] = memory['cacao']
        # memory['tangerine'] = memory['cacao']
        # memory['lime'] = memory['cacao']
        memory['durian'] = memory['citron']  # 11169
        memory['lemon'] = memory['apple']
        host = socket.gethostname()
        if self.CHAR:
            MEMORY = int(10 * memory[host] * memory_fraction * 0.8)
            if False and memory[host] > 11000 and memory_fraction > 0.85:
                MEMORY = int(0.8 * MEMORY)
        else:
            MEMORY = int(10 * memory[host] * memory_fraction * 0.8)
        if K.backend() == 'tensorflow' and memory_fraction < 0.85:
            import tensorflow as tf
            from keras.backend.tensorflow_backend import set_session
            tf_config = tf.ConfigProto()
            tf_config.gpu_options.per_process_gpu_memory_fraction = memory_fraction
            set_session(tf.Session(config=tf_config))
        return MEMORY

    def make_conv(self, unit_size, kernel_size):
        return Conv1D(unit_size,
                      kernel_size,
                      activation='relu',
                      kernel_regularizer=l2(
                          self.kernel_regularizer_coefficient))

    def stack_rnn(self, stack):
        rnn = []
        for _ in range(stack):
            rnn += [self.make_bi_rnn()]
        return rnn

    def make_bi_rnn(self):
        return Bidirectional(self.make_rnn(self.MULTI_RNN_DIMENSION, True))

    def make_rnn(self, dimension, return_sequences=False):
        if self.rnn == 'gru':
            rnn = GRU
        elif self.rnn == 'lstm':
            rnn = LSTM
        return rnn(dimension,
                   return_sequences=return_sequences,
                   implementation=self.rnn_implementation,
                   recurrent_activation=self.recurrent_activation,
                   kernel_regularizer=l2(self.kernel_regularizer_coefficient),
                   recurrent_regularizer=l2(
                       self.kernel_regularizer_coefficient))

    def make_dense(self, dim, activation, name=None):
        return Dense(dim,
                     activation=activation,
                     name=name,
                     kernel_regularizer=l2(
                         self.kernel_regularizer_coefficient))

    def char_to_word_model(
            self, max_word_length):  # char_i_input: (batch_size, word-length)
        char_i_input = Input(shape=(max_word_length, ),
                             dtype='int32',
                             name='word_ch_input')
        embedded_characters = self.character_embedding_layer(char_i_input)
        if not self.char_rnn_flag:
            conv_tensor = self.conv1(embedded_characters)
            conv_tensor = MaxPooling1D(3)(conv_tensor)
            conv_tensor = self.conv2(conv_tensor)
            conv_tensor = MaxPooling1D(2)(conv_tensor)
            # conv_out_shape = K.int_shape(conv_tensor)
            # output = Flatten()(conv_tensor)
            # flatt_out_shape = (conv_out_shape[0], conv_out_shape[1]*conv_out_shape[2])
            # output = Lambda(lambda x: Flatten()(x), output_shape=flatt_out_shape)(conv_tensor)
            output = MyFlat()(conv_tensor)
        else:
            output = self.char_rnn(embedded_characters)
        print('flatten: ', output)
        model = Model(char_i_input, output)
        print('model.output_shape=', model.output_shape)
        return model

    def embedded_word_to_sentence(self, max_sentence_length, embed_dim):
        embedded_word = Input(shape=(max_sentence_length, embed_dim),
                              dtype='float32',
                              name='emb_word_in')
        masked = Masking()(embedded_word)
        masked = Dropout(self.WordEmb_dropout)(masked)
        for i in range(self.stack[0]):
            masked = self.word_rnn[i](masked)
        wordRnn = Dropout(self.WordRnn_dropout)(masked)
        word_tanh = self.tanh1(wordRnn)
        # word_tanh = Dropout(self.WordRnn_dropout)(word_tanh)
        attention = self.att1(word_tanh)
        sentenceEmb = Multiply()([wordRnn, attention])
        sentenceEmb = Lambda(lambda x: K.sum(x, axis=1),
                             output_shape=lambda x: (x[0], x[2]))(sentenceEmb)
        modelSentence = Model(embedded_word, sentenceEmb)
        #         print('word_to_sentence model summary')
        #         print(modelSentence.summary())
        # modelSentAttention = Model(embedded_word, attention)
        return modelSentence

    def embed_word_document(self, wordsInputs):
        embedded = TimeDistributed(self.word_embedding_layer)(wordsInputs)
        return Masking()(embedded)

    def embed_char_document(self, char_input):
        # char_input: (batch_size, n_sentences, sentence_length, word_length)
        if False:
            model = self.char_to_word_model(
                self.max_word_length
            )  # char_i_input: (batch_size, word-length)
            return TimeDistributed(TimeDistributed(model))(char_input)
        else:
            embedded = TimeDistributed(
                TimeDistributed(self.character_embedding_layer))(char_input)
            conv_tensor = TimeDistributed(TimeDistributed(
                self.conv1))(embedded)
            conv_tensor = TimeDistributed(TimeDistributed(
                MaxPooling1D(3)))(conv_tensor)
            conv_tensor = TimeDistributed(TimeDistributed(
                self.conv2))(conv_tensor)
            conv_tensor = TimeDistributed(TimeDistributed(
                MaxPooling1D(2)))(conv_tensor)
            output = TimeDistributed(TimeDistributed(Flatten()))(conv_tensor)
            output = self.conv_dense(output)
            return output

    def embedded_word_to_document(self,
                                  max_sentence_length,
                                  embed_dim,
                                  embedded,
                                  sentence_remainder=None,
                                  word_remainder=None):
        # input:(batch_size, sentence_size, sentence_length, embed_dim)
        # assume input is masked
        # embedded = append_word_remainder(word_remainder, words)
        # sentence level
        if self.REMAINDER:
            # expanded_word_remainder = K.expand_dims(word_remainder, axis=-1)
            embedded = Concatenate()([word_remainder, embedded])
            embed_dim += 1
        modelSentence = self.embedded_word_to_sentence(max_sentence_length,
                                                       embed_dim)
        sentenceEmbbeding = TimeDistributed(modelSentence)(embedded)
        # sentenceAttention = TimeDistributed(modelSentAttention)(embedded)
        # document level
        sentenceEmbbeding = Masking()(sentenceEmbbeding)
        if self.REMAINDER:
            # expanded_sentence_remainder = K.expand_dims(sentence_remainder, axis=-1)
            sentenceEmbbeding = Concatenate()(
                [sentence_remainder, sentenceEmbbeding])
        for i in range(self.stack[1]):
            sentenceEmbbeding = self.rnn2[i](sentenceEmbbeding)
        # sentenceEmbbeding = Dropout(self.SentenceRnn_dropout)(sentenceEmbbeding)
        sentence_tanh = self.tanh2(sentenceEmbbeding)
        sentence_tanh = Dropout(self.tanh2_dropout)(sentence_tanh)
        attentionSent = self.att2(sentence_tanh)
        documentEmb = Multiply()([sentenceEmbbeding, attentionSent])
        documentEmb = Lambda(lambda x: K.sum(x, axis=1),
                             output_shape=lambda x: (x[0], x[2]),
                             name="sum_att2")(documentEmb)
        documentOut = self.logit(documentEmb)
        return documentOut

    def append_word_remainder(self, word_remainder, words):
        if self.REMAINDER:
            words = concatenate([word_remainder, words])
        return words

    def remainder_input_tensor(self, max_n_sentences, max_length):
        if self.REMAINDER:
            # sentence and word remainder
            return Input(shape=(max_n_sentences, 1),
                         dtype='float32'), Input(shape=(max_n_sentences,
                                                        max_length, 1),
                                                 dtype='float32')
        else:
            return None, None

    def word_model(self, batch_size, max_n_sentences, max_length):
        # embed input
        sentence_remainder, word_remainder = self.remainder_input_tensor(
            max_n_sentences, max_length)
        documentInputs = Input(shape=(max_n_sentences, max_length),
                               dtype='int32',
                               name='word_input')
        embedded = self.embed_word_document(documentInputs)  # masked
        output = self.embedded_word_to_document(max_length,
                                                self.word_embedding_dim,
                                                embedded, sentence_remainder,
                                                word_remainder)
        # model creation
        if not self.REMAINDER:
            model = Model([documentInputs], output)
        else:
            model = Model([documentInputs, sentence_remainder, word_remainder],
                          output)
        # modelAttentionEv = Model(inputs=[documentInputs], outputs=[output,  sentenceAttention, attentionSent])
        self.compile_model(model)
        # self.compile_model(modelAttentionEv)
        return model

    def combined_model(self, batch_size, max_n_sentences, max_sentence_length,
                       max_word_length):
        sentence_remainder, word_remainder = self.remainder_input_tensor(
            max_n_sentences, max_sentence_length)
        exists = Input(shape=(max_n_sentences, max_sentence_length),
                       dtype='float32',
                       name='exists')
        known = Input(shape=(max_n_sentences, max_sentence_length),
                      dtype='float32',
                      name='known')
        # embed word
        word_input = Input(shape=(max_n_sentences, max_sentence_length),
                           dtype='int32',
                           name='doc-wd_in')
        embedded_word = self.embed_word_document(word_input)  # masked
        # embed char
        char_input = Input(shape=(max_n_sentences, max_sentence_length,
                                  max_word_length),
                           dtype='int32',
                           name='doc_ch_in')
        embedded_char = self.embed_char_document(char_input)
        # combine (masked during merging)
        if False:
            # concat_word = Concatenate()([embedded_char, embedded_word])
            # concat_word = self.conv_dense(concat_word)
            embedded_word = self.max_relu(embedded_word)
            embedded_char = self.max_relu(embedded_char)
            concat_word = Maximum()([embedded_word, embedded_char])
            # concat_word = self.max_tanh(concat_word)
        elif self.mode == 'max':
            # embedded_word = self.word_linear(embedded_word)
            # embedded_char = self.char_linear(embedded_char)
            concat_word = Maximum()([embedded_word, embedded_char])
            # concat_word = self.max_tanh(concat_word)
        elif self.mode == 'switch':
            if True:
                concat_word = Switch(self.word_embedding_dim, 4)(
                    [exists, known, embedded_word, embedded_char])
            else:
                concat_word = embedded_char

        def expand(x):
            y = K.expand_dims(exists)
            return K.repeat_elements(y, self.word_embedding_dim, axis=3)

        def calc_output_shape(in_shape):
            return in_shape + (self.word_embedding_dim, )

        expanded_exists = Lambda(expand,
                                 output_shape=calc_output_shape)(exists)
        concat_word = Multiply()([concat_word, expanded_exists])
        # to document
        output = self.embedded_word_to_document(max_sentence_length,
                                                self.word_embedding_dim,
                                                concat_word,
                                                sentence_remainder,
                                                word_remainder)
        if not self.REMAINDER:
            model = Model([exists, known, word_input, char_input], output)
        else:
            model = Model([
                exists, known, word_input, char_input, sentence_remainder,
                word_remainder
            ], output)
        self.compile_model(model)
        return model

    def compile_model(self, model):
        metrics = ['accuracy']
        if self.mse:
            metrics += [argmax_mse]
        model.compile(loss='categorical_crossentropy',
                      optimizer=self.optimizer,
                      metrics=metrics)

    def make_models(self):
        self.models = []
        if self.CHAR:
            compile_models = self.bucket_combined_models
        else:
            compile_models = self.bucket_word_models
        compile_models()
        self.log.write(str(self.models[0].summary()) + '\n')

        self.make_mini_batch_fn()

        self.my_keras = My_keras(
            self.mini_batch_fn,
            self.log_path,
            self.dataset,
            self.CHAR,
            self.REMAINDER,
            self.sec_period,
            self.dataset.n_classes,
            self.std_batch_size,
            adjust_learning_rate=self.adjust_learning_rate,
            mse=self.mse)
        return self.models

    def make_mini_batch_fn(self):
        def mini_batch_fn(is_train, bucket_i, xs, ys):
            if is_train:
                return self.models[bucket_i].train_on_batch(xs, ys)
            else:
                return self.models[bucket_i].test_on_batch(xs, ys)

        self.mini_batch_fn = mini_batch_fn

    def bucket_word_models(self):
        for i in range(len(self.dataset.bucket_bounds)):
            batch_size = self.dataset.bucket_batch_size[i]
            max_n_sentences = self.dataset.bucket_bounds[i][0]
            max_sentence_length = self.dataset.bucket_bounds[i][1]
            self.models += [
                self.word_model(batch_size, max_n_sentences,
                                max_sentence_length)
            ]

    def bucket_combined_models(self):
        for i in range(len(self.dataset.bucket_bounds)):
            batch_size = self.dataset.bucket_batch_size[i]
            max_n_sentences = self.dataset.bucket_bounds[i][0]
            max_sentence_length = self.dataset.bucket_bounds[i][1]
            self.models += [
                self.combined_model(batch_size, max_n_sentences,
                                    max_sentence_length,
                                    self.dataset.MAX_WORD_LENGTH)
            ]

    def set_learning_rate(self, models, learning_rate):
        K.set_value(self.optimizer.lr, learning_rate)

    def run_epochs(self,
                   models,
                   save_path,
                   max_n_epoch,
                   pretrain=False,
                   decay=None,
                   partial_train=False,
                   best_validation_accuracy=0):
        self.my_keras.models = models
        best_val_acc, n_patient = best_validation_accuracy, 0
        learning_rate = self.initial_learning_rate
        if not decay:
            decay = self.learning_rate_decay
        for i in range(max_n_epoch):
            self.log.write('learning_rate = {}\n'.format(learning_rate))
            if self.TRAIN:
                self.log.write('==== train ==== (epoch {})\n'.format(i + 1))
                train_acc, train_loss, train_rmse = self.my_keras.train_model(
                    self.dataset.train,
                    learning_rate,
                    partial_train=partial_train)
            self.log.write('========== validation ==========\n')
            val_acc, val_loss, val_rmse = self.my_keras.test_model(
                self.dataset.validation, partial_train=partial_train)
            self.log.write('========== test ==========\n')
            test_acc, test_loss, test_rmse = self.my_keras.test_model(
                self.dataset.test, partial_train=partial_train)
            self.log.write('{}   '.format(i + 1))
            if self.TRAIN:
                self.log.write('train loss = {:.5f}  train rmse = {}'.format(
                    train_loss, train_rmse))
            self.log.write('   val_loss = {:.5f}  val_rmse={}'.format(
                val_loss, val_rmse))
            self.log.write('   test_loss = {:.5f}  test_rmse={}\n'.format(
                test_loss, test_rmse))
            if self.TRAIN:
                self.log.write('train_acc = {:.5f}  '.format(train_acc))
            self.log.write('val_acc = {:.5f}  test_acc = {:.5f}\n'.format(
                val_acc, test_acc))
            with open('acc.txt', 'a') as f:
                f.write(
                    'val_loss = {:.5f},  val_acc = {:.5f},  test_loss = {:.5f},  test accuracy = {:.5f}\n'
                    .format(val_loss, val_acc, test_loss, test_acc))
            self.print_emb_matrix()
            # save the current
            if not self.mem_test_flag:
                models[0].save_weights(save_path)
            # save the best
            prev_best_val_acc = best_val_acc
            if not self.mem_test_flag and val_acc >= best_val_acc + 0.0005:
                best_val_acc = val_acc
                models[0].save_weights(self.best_save_path)
                self.save_best_validation(best_val_acc)
                self.log.write('====== saved ======\n')
            # increase the patience counter if the improvement is not large enough
            if val_acc >= prev_best_val_acc + 0.0005:
                n_patient = 0
            else:
                n_patient += 1
                print('n_patient = {}'.format(n_patient))
                if n_patient >= self.patience:
                    break
            # the break condition for pretrain
            if pretrain and val_acc - best_val_acc < 0.01:
                break  # terminate when pretrain and small difference

            learning_rate /= decay
            self.set_learning_rate(models, learning_rate)

            if self.mem_test_flag:
                break

    def load(self, models, save_path):
        if os.path.exists(save_path):
            with CustomObjectScope(self.custom_objects):
                models[0].load_weights(save_path)
                with open(self.best_validation_save_path, 'r') as f:
                    # raises ValueError if the file does not contain a float
                    best_val_acc = float(f.read())
            self.log.write(
                '====== {} loaded (best validation accuracy = {}) =====\n'.
                format(save_path, best_val_acc))
            return best_val_acc
        else:
            self.log.write(
                '======== failed loading .... NEW   {} =========\n'.format(
                    save_path))
            self.print_emb_matrix('before save')
            models[0].save_weights(
                save_path)  # save initial restore point for mem_test
            self.print_emb_matrix('after save')
            self.save_best_validation(0)
            return 0

    def run_without_mem_test(self, compiled=False):
        if not self.CHAR:  # only word
            self.log.write('========== word-model ==========\n')
            if not compiled:
                self.make_models()
            best_validation_accuracy = self.load(self.models,
                                                 self.model_save_path)
            self.run_epochs(self.models,
                            self.model_save_path,
                            self.max_n_epoch,
                            best_validation_accuracy=best_validation_accuracy)
        else:
            self.log.write('=== combined model ======\n')
            concat_save_path = os.path.join(self.save_dir, 'concat.h5')
            if not self.mem_test and self.pretrain and not os.path.exists(
                    concat_save_path):
                # pre-train
                self.word_embedding_layer.trainable = False
                pretrain_models = self.make_models()
                pretrain_save_path = os.path.join(self.save_dir, 'pretrain.h5')
                if not self.complete_pretrain or not os.path.exists(
                        pretrain_save_path):
                    self.log.write('=== pretrain ====\n')
                    best_validation_accuracy = self.load(
                        pretrain_models, self.best_save_path)
                    self.run_epochs(
                        pretrain_models,
                        pretrain_save_path,
                        max_n_epoch=2,
                        decay=1,
                        best_validation_accuracy=best_validation_accuracy)
                else:
                    self.log.write('=== finish pre-train ====\n')
                # convert to full-model
                self.load(pretrain_models, self.best_save_path)
                self.word_embedding_layer.trainable = True
                concat_models = self.make_models()
                concat_models[0].save_weights(concat_save_path)
                self.clear()
            if not compiled:
                self.make_models()
            best_validation_accuracy = self.load(self.models,
                                                 self.model_save_path)
            self.run_epochs(self.models,
                            self.model_save_path,
                            self.max_n_epoch,
                            best_validation_accuracy=best_validation_accuracy)

    def save_best_validation(self, accuracy):
        with open(self.best_validation_save_path, 'w') as f:
            f.write(str(accuracy))

    def clear(self):
        print('cleared')
        if os.environ['KERAS_BACKEND'] == 'tensorflow':
            K.clear_session()
            self.make_layers()

    def print_emb_matrix(self, in_str=None):
        with open(os.path.join(self.save_dir, 'weights.txt'), 'a') as f:
            f.write('=======================\n')
            if in_str:
                f.write(in_str + '\n')
            f.write('word_rnn = {}\n'.format(self.word_rnn[0].get_weights()))
            f.write('word = {}\n'.format(
                self.word_embedding_layer.get_weights()))
            if self.CHAR:
                f.write('char = {}\n'.format(
                    self.character_embedding_layer.get_weights()))
                f.write('conv1 = {}\n'.format(self.conv1.get_weights()))
                f.write('conv2 = {}\n'.format(self.conv2.get_weights()))
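The training loop above adjusts the learning rate between epochs by writing directly to the optimizer's backend variable and dividing by a decay factor. A minimal standalone sketch of that pattern; the toy model and random data are placeholders, not part of the original code:

import numpy as np
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense

# Toy setup: the point is only the K.set_value(...) / decay pattern used by
# set_learning_rate() and run_epochs() above.
model = Sequential([Dense(1, input_shape=(4,))])
model.compile(optimizer='rmsprop', loss='mse')

x, y = np.random.rand(32, 4), np.random.rand(32, 1)
learning_rate, decay = 0.001, 2.0
for epoch in range(3):
    K.set_value(model.optimizer.lr, learning_rate)   # push the new rate into the optimizer
    model.fit(x, y, epochs=1, verbose=0)
    learning_rate /= decay                           # same decay rule as run_epochs()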
Esempio n. 11
0
class LSTMLangModel:
    def __init__(self,
                 word_vec,
                 word_to_index,
                 index_to_word,
                 weight_file=None,
                 learning_rate=0.001,
                 sequence_len=2000,
                 directory='./models/LM_debug',
                 dropout=0.0,
                 outputs=(32, )):
        self.word_vec = word_vec
        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.sequence_len = sequence_len
        self.directory = directory
        self.outputs = outputs
        self.dropout = dropout

        self.model = Sequential()
        self.embed = Embedding(input_dim=np.size(word_vec, 0),
                               output_dim=np.size(word_vec, 1),
                               weights=[word_vec],
                               trainable=True,
                               mask_zero=True,
                               name='Embed',
                               input_shape=(self.sequence_len, ))
        self.model.add(self.embed)
        for lo in outputs:
            self.model.add(
                LSTM(lo,
                     implementation=1,
                     return_sequences=True,
                     dropout=self.dropout))
        self.model.add(
            TimeDistributed(
                Dense(len(self.index_to_word), activation='softmax')))

        if weight_file is not None:
            self.model.load_weights(weight_file)

        self.model.compile(RMSprop(lr=learning_rate),
                           'categorical_crossentropy',
                           sample_weight_mode='temporal',
                           metrics=[])

    def train(self,
              Xtrain,
              ytrain,
              nb_epoch,
              Xval=None,
              yval=None,
              train_mask=None,
              val_mask=None,
              batch_size=10):
        """
            Uses a generator to decompress labels from integers to hot-coded vectors batch-by-batch to save memory.
            See utils.commons.generate_batch().
        """
        callback = LangModelCallback(self)
        logger = CSVLogger(self.directory + '/epochs.csv')
        nb_class = len(self.index_to_word)
        total_len = np.size(ytrain, 0)

        generator = commons.generate_batch

        if Xval is None or yval is None:
            self.model.fit_generator(generator(Xtrain, ytrain,
                                               self.embed.get_weights()[0],
                                               train_mask, nb_class, total_len,
                                               batch_size),
                                     steps_per_epoch=total_len / batch_size,
                                     workers=1,
                                     epochs=nb_epoch,
                                     callbacks=[callback, logger],
                                     verbose=1,
                                     max_q_size=1)
        else:
            self.model.fit_generator(
                generator(Xtrain, ytrain,
                          self.embed.get_weights()[0], train_mask, nb_class,
                          total_len, batch_size),
                steps_per_epoch=total_len / batch_size,
                epochs=nb_epoch,
                callbacks=[callback, logger],
                verbose=1,
                max_q_size=1,
                workers=1,
                validation_steps=Xval.shape[0] / batch_size,
                validation_data=generator(Xval, yval,
                                          self.embed.get_weights()[0],
                                          val_mask, nb_class, Xval.shape[0],
                                          batch_size))

    def predict(self, query_tokens, top=None):
        if top is None:
            top = np.size(self.word_vec, 0)

        out_pos = len(query_tokens) - 1
        indices = [
            self.word_to_index[w] if w in self.word_to_index else
            self.word_to_index[tokens.UNKNOWN_TOKEN] for w in query_tokens
        ]
        indices.extend([0] * (self.sequence_len - len(indices)))
        indices = np.asarray(indices, dtype=np.int32).reshape(
            (1, self.sequence_len))

        output = self.model.predict(indices, batch_size=1, verbose=0)
        dist = np.asarray(output[0][out_pos])

        index_rank = np.flip(np.argsort(dist)[-top:], 0)
        result = []
        for idx in index_rank:
            result.append((self.index_to_word[idx], dist[idx]))

        return result

    def log(self, string='', out=True):
        f = open(self.directory + '/log.txt', mode='at')

        if out:
            print(string)
        print(string, file=f)
        f.close()

    def save(self):
        f1 = self.directory + '/weights.hdf5'
        f2 = self.directory + '/config.pkl'
        f3 = self.directory + '/dictionary.npz'

        self.model.save_weights(f1)
        config = {
            'seq_len': self.sequence_len,
            'word_vec_dim': np.shape(self.word_vec),
            'outputs': self.outputs,
            'dropout': self.dropout
        }
        pickle.dump(config, open(f2, 'wb'), pickle.HIGHEST_PROTOCOL)
        np.savez(f3,
                 wit=self.word_to_index,
                 itw=self.index_to_word,
                 wv=self.word_vec)
        logging.info('\nSaved model to %s' % self.directory)

    @staticmethod
    def load(directory):
        f1 = directory + '/weights.hdf5'
        f2 = directory + '/config.pkl'
        f3 = directory + '/dictionary.npz'

        logging.info('Loading model from %s...' % directory)
        try:
            config = pickle.load(open(f2, 'rb'))

            npz_file = np.load(f3)
            word_to_index, index_to_word, word_vec = npz_file["wit"].reshape(
                1)[0], npz_file["itw"], npz_file["wv"].reshape(
                    config['word_vec_dim'])

            logging.info('Done.')
            return LSTMLangModel(word_vec,
                                 word_to_index,
                                 index_to_word,
                                 weight_file=f1,
                                 sequence_len=config.get('seq_len', 2000),
                                 directory=directory,
                                 outputs=config.get('outputs', (32, )),
                                 dropout=config.get('dropout', 0.0))
        except FileNotFoundError:
            print('One or more model files cannot be found. Terminating...')
            sys.exit()
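commons.generate_batch is imported by the class above but not shown in the listing; the train() docstring says it expands integer labels into one-hot vectors one batch at a time. A minimal sketch of such a generator, assuming X holds padded index sequences, y holds integer label sequences, and mask holds per-timestep sample weights (names and shapes are assumptions, not the original helper):

import numpy as np

def generate_batch(X, y, embedding_weights, mask, nb_class, total_len, batch_size):
    # embedding_weights is accepted only to mirror the call above; this sketch does not use it.
    # Yield batches forever so the full one-hot tensor is never materialised in memory.
    while True:
        for start in range(0, total_len, batch_size):
            end = min(start + batch_size, total_len)
            X_batch = X[start:end]
            y_batch = np.asarray(y[start:end])                       # (batch, seq_len) integer labels
            y_onehot = np.eye(nb_class, dtype=np.float32)[y_batch]   # (batch, seq_len, nb_class)
            if mask is not None:
                yield X_batch, y_onehot, mask[start:end]             # 'temporal' sample weights
            else:
                yield X_batch, y_onehot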
Esempio n. 12
0
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

dot_product = merge([target, context], mode='dot', dot_axes=1)
dot_product = Reshape((1, ))(dot_product)
output = Dense(1, activation='sigmoid')(dot_product)

#output = Dense(len(set(labels)), activation='sigmoid')(dot_product)

model = Model(input=[input_target, input_context], output=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['acc'])

print(model.summary())

epochs = int(sys.argv[2])

model.fit_generator(generator(word_target, word_context, labels, 100),
                    steps_per_epoch=100,
                    epochs=epochs)

save_embeddings("embedding.txt",
                embedding.get_weights()[0], ValueIdentifierDict)

model = load_embedding("embedding.txt", 100)
#tsne_plot(model)

exit(0)
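The generator() passed to fit_generator() above is not part of the listing. A plausible sketch, an assumption rather than the original code, that yields ([targets, contexts], labels) batches of the requested size:

import numpy as np

def generator(word_target, word_context, labels, batch_size):
    # Sample random (target, context, label) triples and shape them the way
    # the two-input skip-gram model expects.
    word_target = np.asarray(word_target, dtype='int32')
    word_context = np.asarray(word_context, dtype='int32')
    labels = np.asarray(labels, dtype='int32')
    n = len(labels)
    while True:
        idx = np.random.randint(0, n, size=batch_size)
        yield [word_target[idx], word_context[idx]], labels[idx]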
Esempio n. 13
0
def collect_data():
    
    filename = "C:\\Users\\rkrit\\Documents\\CS 584_Data Mining\\source_code.zip"
    vocabulary,code = read_data(filename)
    code = np.squeeze(np.asarray(code))
    print(len(vocabulary))
    #print(vocabulary[:7])
    vocabulary_size = 83123
    vocab_size = vocabulary_size
    code3 = code
    data, count, dictionary, reverse_dictionary, code_data = build_dataset(vocabulary, vocabulary_size, code3)
    print(code_data)
    print(data)
    print(data[:7])
    window_size = 5
    vector_dim = 300
    epochs = 2000
    
    valid_size = 16     # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    
    sampling_table = sequence.make_sampling_table(16)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    print(couples[:10], labels[:10])
    
    # create some input variables
    input_target = Input((1,))
    input_context = Input((1,))
    
    embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
    
    target = embedding(input_target)
    target = Reshape((vector_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((vector_dim, 1))(context)
    print(len(embedding.get_weights()[0]))
    sum1 = 0
    final = []
    param = embedding.get_weights()[0]
    indices = []

    # [start, end) offsets of each vocabulary item's rows within param
    prev = 0
    for i in vocabulary:
        indices.append([prev, len(i) + prev])
        prev += len(i)
    sum=0
    for ind in indices:
        final.append(param[ind[0]:ind[1]])
        sum=sum+len(param[ind[0]:ind[1]])
        
        
    print(len(indices))
    print(sum)
    print(sum1)
    print(len(final))
    print(len(param))
    print(len(vocabulary[124]))
    print(len(final[124]))
    
    for code1,code2 in code_data:
        
        print("The cosine similarity for  ",code1,"  is  ", K.eval(cos_distance(np.array(param[code2]),np.array(param[code2+100:code2+200]))))   
    # setup a cosine similarity operation which will be output in a secondary model
   
    # Exploratory scratch model: a small dense classifier over the
    # 300-dimensional embedding vectors. It is kept under its own name so the
    # `target`/`context` tensors from the embedding branches above stay
    # connected to the skip-gram model built below.
    scratch_model = Sequential()
    scratch_model.add(Dense(units=50, activation="relu", input_shape=(300,)))
    scratch_model.add(Dense(units=50, activation="relu"))
    scratch_model.add(Dense(units=10, activation="softmax"))
    scratch_model.compile(optimizer=SGD(0.001),
                          loss="binary_crossentropy",
                          metrics=["accuracy"])
    print(couples[:10], labels[:10])
   
    #
    
    # setup a cosine similarity operation which will be output in a secondary model
    similarity = merge([target, context], mode='cos', dot_axes=0)
    
    # now perform the dot product operation to get a similarity measure
    dot_product = merge([target, context], mode='dot', dot_axes=1)
    dot_product = Reshape((1,))(dot_product)
    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)
    # create the primary training model
    model = Model(input=[input_target, input_context], output=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    
    # create a secondary validation model to run our similarity checks during training
    validation_model = Model(input=[input_target, input_context], output=similarity)
    #del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary
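cos_distance() is evaluated with K.eval() above but never defined in the listing. One plausible backend implementation, an assumption rather than the original, that returns the mean cosine similarity between its (broadcastable) arguments:

from keras import backend as K

def cos_distance(x1, x2):
    # L2-normalize along the last axis and average the dot products; values
    # close to 1 mean the embedding vectors point in the same direction.
    x1 = K.l2_normalize(K.cast(x1, 'float32'), axis=-1)
    x2 = K.l2_normalize(K.cast(x2, 'float32'), axis=-1)
    return K.mean(K.sum(x1 * x2, axis=-1))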
Esempio n. 14
0
model = Model([word_input, doc_input], dense)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.fit([X_train_data_arr, X_train_doc_idx_arr],
          label_train,
          validation_data=([X_test_data_arr,
                            X_test_doc_idx_arr], [label_test]),
          epochs=1,
          batch_size=128)
model.evaluate([X_train_data_arr, X_train_doc_idx_arr],
               label_train,
               batch_size=128,
               verbose=1,
               sample_weight=None)
em = embedding_layer_word.get_weights()[0]

em_norm = LA.norm(em, axis=1)
#norm = np.sqrt(np.reduce_sum(np.square(em), 1, keep_dims=True))
em_n = em / em_norm.reshape((-1, 1))  # row-normalize so dot products become cosine similarities
similarity = np.matmul(em_n, np.transpose(em_n))
# i = 559  # alternative query word index
i = 358
top_k = 10
nearest = (-similarity[i, :]).argsort()[1:top_k + 1]
log = 'Nearest to %s:' % index_word[i]
for k in range(top_k):
    close_word = index_word[nearest[k]]
    log = '%s %s,' % (log, close_word)

print(log)
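The nearest-neighbour lookup above can be packaged into a small helper. A sketch under the assumption that the embedding matrix and the index-to-word mapping are passed in explicitly; the toy matrix and vocabulary at the end are placeholders for illustration only:

import numpy as np

def nearest_words(query_idx, embeddings, index_word, top_k=10):
    # Row-normalize so dot products become cosine similarities, then rank.
    unit = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    sims = unit.dot(unit[query_idx])
    nearest = (-sims).argsort()[1:top_k + 1]   # position 0 is the query word itself
    return [(index_word[j], float(sims[j])) for j in nearest]

# Toy usage with a random matrix and a dummy vocabulary.
emb = np.random.rand(50, 8)
vocab = {j: 'w%d' % j for j in range(50)}
print(nearest_words(3, emb, vocab, top_k=5))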