Example #1
# Assumed imports: standalone Keras plus the keras_contrib CRF layer and its loss/metric helpers.
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, TimeDistributed, Dense
from keras.optimizers import Nadam
from keras_contrib.layers import CRF
from keras_contrib import losses, metrics


def build_ep_inference_model(model_settings):
    # architecture
    _input = Input(shape=(model_settings['max_len'], model_settings['emb_dim']), name='input')
    model = Bidirectional(LSTM(units=100, return_sequences=True), name='bilstm1')(_input)  # first biLSTM
    model = Bidirectional(LSTM(units=100, return_sequences=True), name='bilstm2')(model)  # second biLSTM
    model = TimeDistributed(Dense(model_settings['n_tags'], activation=None), name='td')(model)  # per-timestep dense layer
    crf = CRF(model_settings['n_tags'], name='crf')  # CRF output layer
    out = crf(model)
    model = Model(_input, out)
    model.compile(optimizer=Nadam(lr=0.01, clipnorm=1), loss=losses.crf_loss, metrics=[metrics.crf_accuracy])
    model.summary()
    return model
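
A minimal usage sketch for the function above; the model_settings keys are the ones read inside the function, and the concrete values are illustrative assumptions only:

# Illustrative settings: sequences of up to 100 pre-computed 300-dim embeddings, 17 tag classes.
model_settings = {'max_len': 100, 'emb_dim': 300, 'n_tags': 17}
ep_model = build_ep_inference_model(model_settings)
# The model consumes already-embedded input of shape (batch, 100, 300) and
# predicts one tag distribution per time step through the CRF layer.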
Example #2
def build_qt_inference_model(model_settings):
    # architecture
    _input = Input(shape=(model_settings['max_len'], model_settings['emb_dim']), name='input')
    model = Bidirectional(LSTM(units=100, return_sequences=True, dropout=0.5,
                               recurrent_dropout=0.5), name='bilstm1')(_input)  # biLSTM
    model = Bidirectional(LSTM(units=100, return_sequences=False, dropout=0.5,
                               recurrent_dropout=0.5), name='bilstm2')(model)  # 2nd biLSTM
    _output = Dense(model_settings['n_tags'], activation='softmax', name='output')(model)  # a dense layer
    model = Model(_input, _output)
    model.compile(optimizer=Nadam(clipnorm=1), loss='categorical_crossentropy', metrics=['accuracy']) 
    model.summary()
    return model
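
A corresponding sketch for the sequence-level variant, which needs the same Keras imports as Example #1 minus the keras_contrib CRF pieces; unlike Example #1 it returns a single softmax over n_tags for the whole sequence (values again illustrative):

model_settings = {'max_len': 100, 'emb_dim': 300, 'n_tags': 5}
qt_model = build_qt_inference_model(model_settings)
# Input: (batch, 100, 300) embedded sequences; output: (batch, 5) class probabilities.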
Example #3
    def draw(self):
        # Load the character vocabulary (the context manager closes the file, so no explicit close is needed).
        with open(self.conf.train_dict, "rb") as fp:
            vocabulary = pickle.load(fp)

        # Model Configuration
        input_character = Input(shape=(None, ), name="character")
        feature_character = Embedding(len(vocabulary.keys()) + 1, self.embedding_dim, mask_zero=True)(input_character)
        feature_character = Dropout(0.1)(feature_character)
        feature_character = Bidirectional(LSTM(self.bi_rnn_units // 2, return_sequences=True, recurrent_dropout=0.1))(feature_character)

        input_construction = Input(shape=(None, 4), name="cxn")

        model = concatenate([feature_character, input_construction])
        model = Bidirectional(LSTM(self.bi_rnn_units // 2, return_sequences=True, recurrent_dropout=0.6))(model)
        output = TimeDistributed(Dense(5, activation="softmax"))(model)

        model = Model(inputs=[input_character, input_construction], outputs=output)
        plot_model(model, self.conf.model_image.format("multi_input_and_output_model.png"), show_shapes=True)

        model.compile("rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
        model.summary()

        return model
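
The draw method above relies on names imported elsewhere in the original module (self.conf, self.embedding_dim, and self.bi_rnn_units are instance attributes of that class); a sketch of the imports it needs, assuming the standalone Keras API:

import pickle

from keras.models import Model
from keras.layers import Input, Embedding, Dropout, Bidirectional, LSTM, TimeDistributed, Dense, concatenate
from keras.utils import plot_model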
Example #4
	def train_model(self,
					X,
					y,
					labels,
					word_index,
					MAX_SEQUENCE_LENGTH,
					model_save_directory='./models/'):
		"""
			Train deep learning model
		"""
		
		embedding_matrix, nb_words = get_embedding('glove',word_index)
		
		input1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
		embedding = Embedding(input_dim=len(embedding_matrix), 
								output_dim=self.embedding_dim, 
								weights=[embedding_matrix], 
								input_length=MAX_SEQUENCE_LENGTH, 
								trainable=False)(input1)
#         embedding = Dropout(self.drop_rate_embedding)(embedding)
		model = Bidirectional(LSTM(units=self.num_lstm_units,
								  return_sequences=True,
								  recurrent_dropout=self.drop_rate_lstm))(embedding)
		
		model = TimeDistributed(Dense(units=self.num_lstm_units,
									 activation=self.activation_function))(model)
		crf = CRF(units=len(labels))
		output1 = crf(model)
		
		model = Model(input1, output1)
		model.compile(optimizer='rmsprop',
					  loss=crf.loss_function,
					  metrics=[crf.accuracy])
				
		print(model.summary())

		early_stopping = EarlyStopping(monitor='val_loss', patience=3)
		STAMP = 'lstm_%f_%.2f' % (self.num_lstm_units, self.drop_rate_lstm)
		checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time())) + '/'

		if not os.path.exists(checkpoint_dir):
			os.makedirs(checkpoint_dir)

		# bst_model_path is not defined in this excerpt; the definition below is an assumption.
		bst_model_path = checkpoint_dir + STAMP
		with open(bst_model_path + ".json", "w") as json_file:
			json_file.write(model.to_json())
Example #5
def define_model():
    wordseq = Input(shape=(max_sent_length, ))
    charSeq = Input(shape=(max_sent_length, max_wrd_len))
    wm = wordmodel(max_sent_length)(wordseq)
    cm = TimeDistributed(charmodel(max_wrd_len))(charSeq)
    cm = Reshape((max_sent_length, -1))(cm)
    combined_input = concatenate([wm, cm])
    model = Bidirectional(LSTM(units=100,
                               recurrent_dropout=0.1))(combined_input)
    out = Dense(10, activation="softmax")(model)  # softmax output layer

    model = Model([wordseq, charSeq], out)

    # load existing weights if a previous best checkpoint exists
    if os.path.isfile(outFileName + "-best.hdf5"):
        model.load_weights(outFileName + "-best.hdf5")
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    #plot_model(model, show_shapes=True, to_file=outFileName+'-plot.png')
    return model
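
A sketch of the input shapes define_model expects, read off its two Input layers; max_sent_length, max_wrd_len, wordmodel, charmodel, and outFileName are module-level names assumed to be defined elsewhere in the original script, so the fit call is left commented out:

import numpy as np

# Hypothetical sizes for illustration only.
n_samples, max_sent_length, max_wrd_len = 32, 50, 15
X_words = np.random.randint(1, 1000, size=(n_samples, max_sent_length))             # word-index sequences
X_chars = np.random.randint(1, 80, size=(n_samples, max_sent_length, max_wrd_len))  # char-index sequences
y = np.eye(10)[np.random.randint(0, 10, size=n_samples)]                             # one-hot labels, 10 classes

# model = define_model()
# model.fit([X_words, X_chars], y, batch_size=8, epochs=1)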
Example #6
    def train(self, epochs, embedding=None):
        # Embedded Words
        txt_input = Input(shape=(None, ), name='txt_input')
        txt_embed = Embedding(input_dim=self.num_words,
                              output_dim=MAX_LEN,
                              input_length=None,
                              name='txt_embedding',
                              trainable=False,
                              weights=([embedding]))(txt_input)
        txt_drpot = Dropout(0.1, name='txt_dropout')(txt_embed)

        # Embedded Part of Speech
        pos_input = Input(shape=(None, ), name='pos_input')
        pos_embed = Embedding(input_dim=self.num_pos,
                              output_dim=MAX_LEN,
                              input_length=None,
                              name='pos_embedding')(pos_input)
        pos_drpot = Dropout(0.1, name='pos_dropout')(pos_embed)

        # Embedded Characters
        char_in = Input(shape=(
            None,
            MAX_LEN_CHAR,
        ), name="char_input")
        emb_char = TimeDistributed(
            Embedding(input_dim=self.num_chars,
                      output_dim=MAX_LEN_CHAR,
                      input_length=None))(char_in)
        char_enc = TimeDistributed(
            LSTM(units=20, return_sequences=False,
                 recurrent_dropout=0.5))(emb_char)

        # Concatenate inputs
        x = concatenate([txt_drpot, pos_drpot, char_enc], axis=2)
        x = SpatialDropout1D(0.3)(x)

        # Deep Layers
        model = Bidirectional(
            LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
        model = Bidirectional(
            LSTM(units=100, return_sequences=True,
                 recurrent_dropout=0.1))(model)

        # Output
        out = TimeDistributed(Dense(self.num_entities,
                                    activation="softmax"))(model)
        model = Model(inputs=[txt_input, pos_input, char_in], outputs=[out])

        model.compile(optimizer="rmsprop",
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        plot_model(model, to_file=self.save_path + 'model_structure.png')
        print(model.summary())

        history = model.fit(
            [self.X_train, self.train_pos, self.train_characters],
            np.array(self.Y_train),
            batch_size=32,
            epochs=epochs,
            validation_data=([
                self.X_validation, self.valid_pos, self.valid_characters
            ], np.array(self.Y_validation)),
            verbose=1)

        model.save(self.save_path + 'model_ner')

        test_eval = model.evaluate(
            [self.X_test, self.test_pos, self.test_characters],
            np.array(self.Y_test))

        print('Test loss:', test_eval[0])
        print('Test accuracy:', test_eval[1])

        return model, history
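
A sketch of the pretrained matrix this train method expects; note that the snippet reuses MAX_LEN as the embedding dimension, so the matrix must have self.num_words rows and MAX_LEN columns (the sizes below are assumptions, and ner stands for an instance of the class defining train):

import numpy as np

# Hypothetical sizes; in the original they come from the dataset and its constants.
num_words, MAX_LEN = 20000, 75
embedding = np.random.normal(size=(num_words, MAX_LEN)).astype("float32")

# model, history = ner.train(epochs=10, embedding=embedding)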
Example #7
def bilstm(X_train, X_test, Y_train, Y_test, wordembeddings):
    np.random.seed(1234)
    tf.random.set_seed(1234)
    random.seed(1234)

    max_length_sentence = X_train.str.split().str.len().max()
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                          lower=True)
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index
    EMBEDDING_DIM = 300
    vocabulary_size = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(X_train)
    sequences_valid = tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(sequences_train, maxlen=max_length_sentence)
    X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
    y_train = np.asarray(Y_train)
    y_val = np.asarray(Y_test)
    #print(word_index)
    '''
    print('Shape of data tensor:', X_train.shape)
    print('Shape of data tensor:', X_val.shape)
    print('Shape of data tensor:', y_train.shape)
    print('Shape of data tensor:', y_val.shape)
    
    print(X_train)
    print("*"*100)
    print(X_val)
    print("*"*100)
    print(y_train)
    print("*"*100)
    print(y_val)
    '''

    # Build the embedding matrix; the fallback chain assumes the Title-case/UPPER-case
    # variants exist in wordembeddings when the base vector is empty, and otherwise
    # falls back to a random vector.
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if (word in wordembeddings.keys()):
            embedding_vector = wordembeddings[word]
            if len(embedding_vector) == 0:  #if array is empty
                embedding_vector = wordembeddings[word.title()]
                if len(embedding_vector) == 0:
                    embedding_vector = wordembeddings[word.upper()]
                    if len(embedding_vector) == 0:
                        embedding_vector = np.array([
                            round(np.random.rand(), 8) for i in range(0, 300)
                        ])

        else:
            #print("WORD NOT IN DICT",word)
            embedding_vector = np.array(
                [round(np.random.rand(), 8) for i in range(0, 300)])

        if len(embedding_vector) != 0:
            embedding_matrix[i] = embedding_vector

    # NOTE: embedding_layer is defined but never used; the model below builds its
    # own Embedding from the same matrix (and leaves it trainable).
    embedding_layer = Embedding(vocabulary_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=False)  #Try with True

    inputs = Input(shape=(X_train.shape[1], ))
    model = (Embedding(vocabulary_size,
                       EMBEDDING_DIM,
                       input_length=max_length_sentence,
                       weights=[embedding_matrix]))(inputs)

    model = Bidirectional(GRU(64))(
        model)  # !!!!!!! CHANGE THIS FOR OTHER MODELS
    model = (Dense(900, activation='relu'))(model)
    model = (Dense(400, activation='relu'))(model)
    model = (Dense(250, activation='relu'))(model)
    model = (Dense(204, activation='softmax'))(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    callbacks = [EarlyStopping(monitor='val_loss')]
    hist_adam = model.fit(
        X_train,
        y_train,
        batch_size=1000,
        epochs=200,
        verbose=1,
        validation_data=(X_val, y_val),
        callbacks=callbacks
    )  #!!!!!!!!!!!!!!!!!!!!!!!CHANGE BATCH SIZE TO 1000 #change epochs to 200

    model.save(config.bigru_prepocessed_dataset1_chai
               )  # !!!!!!! CHANGE THIS FOR OTHER MODELS

    y_pred = model.predict(X_val)
    print(y_pred)

    y_val_class = pd.DataFrame(y_val).idxmax(axis=1)
    print(y_val_class)

    y_val_class_argmax = np.argmax(y_val, axis=1)
    y_pred_class_argmax = np.argmax(y_pred, axis=1)

    y_pred_class = pd.DataFrame(y_pred).idxmax(axis=1)
    print(y_pred_class)

    print(classification_report(y_val_class, y_pred_class))

    plt.suptitle('Optimizer : Adam', fontsize=10)
    plt.ylabel('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.plot(hist_adam.history['loss'], color='b', label='Training Loss')
    plt.plot(hist_adam.history['val_loss'], color='r', label='Validation Loss')
    plt.legend(loc='upper right')

    plt.savefig(
        '/home/ubuntu/asset_classification/results/bigru_model_dataset1_preprocessed_chai.png'
    )  # !!!!!!! CHANGE THIS FOR OTHER MODELS

    tf.keras.utils.plot_model(
        model, to_file=config.bigru_architecture,
        show_shapes=True)  # !!!!!!! CHANGE THIS FOR OTHER MODELS

    return (y_pred, y_val_class, y_pred_class, y_val_class_argmax,
            y_pred_class_argmax)
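
A usage sketch for bilstm under assumptions read off the code: X_train/X_test are pandas Series of raw sentences, Y_train/Y_test are one-hot arrays with 204 classes, and wordembeddings maps words to 300-dimensional vectors. The call is left commented out because the function also saves the model and plots to paths taken from the project's config module:

import numpy as np
import pandas as pd

# Toy inputs; real data comes from the project's preprocessing pipeline.
X_train = pd.Series(["replace the faulty pump", "inspect the valve"])
X_test = pd.Series(["repair the pump"])
Y_train = np.eye(204)[[0, 1]]   # one-hot labels, 204 classes
Y_test = np.eye(204)[[0]]
wordembeddings = {w: np.random.rand(300)
                  for w in "replace the faulty pump inspect valve repair".split()}

# y_pred, y_val_class, y_pred_class, y_val_argmax, y_pred_argmax = bilstm(
#     X_train, X_test, Y_train, Y_test, wordembeddings)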
Example #8
#              recurrent_dropout=0.25)(inputs)

# attention layer
# model = attention_3d_block(model)

# Output FC layer
model = TimeDistributed(Dense(nb_classes, activation="softmax"))(model)

model = Model(inputs=inputs, outputs=model)
# model = multi_gpu_model(model, gpus=2)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              sample_weight_mode="temporal",
              metrics=['accuracy'])
model.summary()

# train on videos with sample weighting
# model.fit(x=X_train_m,
#           y=Y_train_,
#           validation_data=(X_vali_m, Y_vali_, M_vali[:, :, 0]),
#           epochs=nb_epoch,
#           batch_size=batch_size,
#           verbose=1,
#           # sample_weight=M_train[:, :, 0],
#           sample_weight=sample_weights,
#           callbacks=[lr_reducer, early_stopper, tensor_board, checkpointer])


model.fit_generator(train_generator(X_train, Y_train),
                    verbose=1,
Example #9
    def main(self, glove):
        # get word embeddings
        utils = wordUtils.Utils()

        if glove:
            # use glove
            self.words_list, self.embedding_matrix = utils.load_glove()
            unword_n = len(self.words_list)

        else:
            self.words_list, self.embedding_matrix = utils.load_word2vec()
            unword_n = len(self.words_list)

        # get the training corpus
        cr = corpusreader.CorpusReader(self.textfile, self.annotfile)
        corpus = cr.trainseqs
        print(len(corpus))
        train = []
        print("Processing training data", datetime.now())
        for doc in corpus:
            tmp_dic = {}

            tmp_dic['tokens'] = doc['tokens']

            # convert SOBIE tags to numbers
            tags = doc['bio']
            tags = [self.lablist[i] for i in tags]
            tmp_dic['bion'] = tags
            train.append(tmp_dic)


        n_emb = 0
        n_unk = 0

        # get the number of the embedding
        for idx in range(len(train)):
            words = train[idx]['tokens']
            words_id = []
            for i in words:
                # get the number of the embedding
                try:
                    # the index of the word in the embedding matrix
                    index = self.words_list.index(i)
                    n_emb = n_emb + 1
                except ValueError:
                    # use the embedding full of zeros to identify an unknown word
                    n_unk = n_unk + 1
                    index = unword_n

                # the index of the word in the embedding matrix
                words_id.append(index)

            train[idx]['tokens'] = words_id


        # get all sizes from the sequences with training data
        train_l_d = {}
        train_l_labels = {}
        for seq in train:
            # corpus
            l = len(seq['tokens'])
            if l not in train_l_d: train_l_d[l] = []
            train_l_d[l].append(seq['tokens'])

            # labels
            l1 = len(seq['bion'])
            if l1 not in train_l_labels: train_l_labels[l1] = []
            train_l_labels[l1].append(seq['bion'])

        sizes = list(train_l_d.keys())
        for i in sizes:
            if len(train_l_d[i]) != len(train_l_labels[i]):
                print("merda")

            for m in range(len(train_l_d[i])):
                if len(train_l_d[i][m]) != len(train_l_labels[i][m]):
                    print("XXX")

        input = Input(shape=(None,))
        el = Embedding(len(self.words_list) + 1, 200, weights=[self.embedding_matrix], trainable=False)(input)
        model = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout = 0.1))(el)  # variational biLSTM
        model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(input, out)
        model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
        model.summary()

        f_best = -1
        f_index = -1
        # OK, start actually training
        for epoch in range(self.epochsN):
            print("Epoch", epoch, "start at", datetime.now())
            # Train in batches of different sizes - randomize the order of sizes
            # Except for the first few epochs
            if epoch > 2:
                random.shuffle(sizes)
            for size in sizes:
                batch = train_l_d[size]
                labs = train_l_labels[size]

                tx = np.array([seq for seq in batch])
                y = [seq for seq in labs]

                ty = [to_categorical(i, num_classes=self.lab_len) for i in y]

                # This trains in mini-batches
                model.fit(tx, np.array(ty), verbose=0, epochs=1)
            print("Trained at", datetime.now())

            # save all epochs
            save_load_utils.save_all_weights(model, 'words-results/epoch_%s.h5' % epoch)
            # test the results
            test_data = 'corpus_char/tmVarCorpus/treated/test_data.txt'
            test_labels = 'corpus_char/tmVarCorpus/treated/test_labels.tsv'
            self.test_model(test_data, test_labels, model, glove)
            f = self.eval()

            if f > f_best:
                f_best = f
                f_index = epoch


        # Pick the best model, and save it with a useful name
        print("Choosing the best epoch")
        shutil.copyfile("words-results/epoch_%s.h5" % f_index, "words_glove_%s.h5" % f_index)
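
Assumed imports for the snippet above (standalone Keras plus keras_contrib; wordUtils and corpusreader are project-local modules from the original repository, and the CRF here is the older keras_contrib layer that exposes loss_function and accuracy):

import random
import shutil
from datetime import datetime

import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.utils import to_categorical
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils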