Example #1
    def test_unitnorm_constraint(self):
        lookup = Sequential()
        lookup.add(Embedding(3, 2, weights=[self.W1], W_constraint=unitnorm()))
        lookup.add(Flatten())
        lookup.add(Dense(2, 1))
        lookup.add(Activation('sigmoid'))
        lookup.compile(loss='binary_crossentropy', optimizer='sgd', class_mode='binary')
        lookup.train(self.X1, np.array([[1], [0]], dtype='int32'))
        norm = np.linalg.norm(lookup.params[0].get_value(), axis=1)
        self.assertTrue(np.allclose(norm, np.ones_like(norm).astype('float32')))
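The assertion above verifies that every embedding row ends up with unit L2 norm after training. A minimal numpy sketch of what the unitnorm constraint enforces (a toy matrix, not the test fixture self.W1):

import numpy as np

W = np.array([[3.0, 4.0], [1.0, 0.0], [0.5, 0.5]])      # stand-in embedding matrix
W_unit = W / np.linalg.norm(W, axis=1, keepdims=True)    # rescale each row to unit length
print(np.linalg.norm(W_unit, axis=1))                    # -> approximately [1. 1. 1.]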
Example #3
ada = Adagrad()

model.compile(loss='mse', optimizer=ada)



model.fit(X[:10000], Y[:10000], batch_size=120, nb_epoch=5)


model.fit(X[:400000], Y[:400000], batch_size=3000, nb_epoch=5)

for it, (seq, label) in enumerate(zip(seq_data, Y_trans)):
    if it % 10 == 0:
        print 'Iteration: {}'.format(it)
    model.train(np.array([seq]), [label])



io.save('./yelp-datafile-1-30.h5', {'funny': np.array(funny_votes),
                                    'useful': np.array(useful_votes),
                                    'stars': np.array(review_stars),
                                    'sequenced_data': seq_data,
                                    'padded_data': X,
                                    'meta': 'Yelp data over the partitions 1 thru 29. sequenced_data is an embedding from the Keras Tokenizer'})
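The loop above feeds one sequence at a time to model.train; the saved 'padded_data' array X is the batched alternative. A minimal sketch of how such a padded matrix is typically produced from the Keras Tokenizer output (the maxlen value is an assumption, not taken from this script):

from keras.preprocessing.sequence import pad_sequences

# seq_data: list of integer word-index sequences from the Keras Tokenizer.
# Shorter sequences are zero-padded, longer ones truncated to maxlen.
X = pad_sequences(seq_data, maxlen=200)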





Example #4
# let's train the model using SGD + momentum (how original).
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

dataset = geometric_3d_dataset.Geometric3DDataset(patch_size=patch_size,
                                                  task=geometric_3d_dataset.Geometric3DDataset.CLASSIFICATION_TASK,
                                                  centered=True)

for e in range(nb_epoch):

    train_iterator = dataset.iterator(batch_size=batch_size,
                                      num_batches=nb_train_batches)

    for b in range(nb_train_batches):
        X_batch, Y_batch = train_iterator.next()
        loss = model.train(X_batch, Y_batch)
        print 'loss: ' + str(loss)

    test_iterator = dataset.iterator(batch_size=batch_size,
                                     num_batches=nb_test_batches)

    for b in range(nb_test_batches):
        X_batch, Y_batch = test_iterator.next()
        error = model.test(X_batch, Y_batch)
        print 'error: ' + str(error)





    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_features, window_size=4, negative_samples=1., sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                loss = model.train(X, labels)
                losses.append(loss)
                if len(losses) % 100 == 0:
                    progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")

    if save:
        print("Saving model...")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        six.moves.cPickle.dump(model, open(os.path.join(save_dir, model_save_fname), "wb"))
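For reference, sequence.skipgrams turns one integer sequence into (target, context) index pairs plus binary labels, which is what feeds model.train above. A tiny sketch on a toy sequence (not the Tokenizer output used in this script):

from keras.preprocessing import sequence

couples, labels = sequence.skipgrams([1, 2, 3], vocabulary_size=10,
                                     window_size=1, negative_samples=1.)
# couples: [word, context] index pairs; labels: 1 for observed pairs, 0 for sampled negatives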


class MelanomaModel:

    def __init__(self, nb_train_batches, batch_size, is3D):

        self.model = None
        self.train_data_set = None
        self.test_data_set = None
        self.valid_data_set = None
        self.is3D = is3D

        # compute the number of mini-batches for training, validation and testing
        self.nb_train_batches = nb_train_batches
        self.batch_size = batch_size

    def create_model(self):
        self.model = Sequential()
        self.model.add(Convolution3D(16, stack_size=1, nb_row=11, nb_col=11, nb_depth=6, border_mode='valid'))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling3D(poolsize=(3, 3, 1)))
        self.model.add(Convolution3D(32, stack_size=16, nb_row=5, nb_col=5, nb_depth=1, border_mode='valid' ))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling3D(poolsize=(3, 3, 1)))
        self.model.add(Convolution3D(64, stack_size=32, nb_row=3, nb_col=3, nb_depth=1, border_mode='valid' ))
        self.model.add(MaxPooling3D(poolsize=(3, 3, 1)))
        self.model.add(Flatten3D())
        self.model.add(Dense(4096, 1024, init='normal'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(1024, 512, init='normal'))
        self.model.add(Activation('relu'))
        self.model.add(Dense(512, 2, init='normal'))

        # train the self.model using RMSprop
        optimizer = RMSprop(rho=0.9, epsilon=1e-3, lr=0.001)
        self.model.compile(loss='mean_squared_error', optimizer=optimizer)

    def load_model(self, model_file_path):
        # open in binary mode so cPickle can read the serialized model
        with open(model_file_path, 'rb') as model_file:
            self.model = cPickle.load(model_file)

    def load_melanoma_dataset(self, data_dir, training_perc):
        # Preparing melanoma dataset
        # data directory folder path
        file_names = ([data_dir + filename for filename in os.listdir(data_dir) if ".h5" in filename])
        random.shuffle(file_names)

        train_file_names = file_names[0:int(training_perc*len(file_names))]
        test_file_names = file_names[int(training_perc*len(file_names)):]

        if self.is3D:
            self.train_data_set = MelanomaDataset3D(data_dir, examples=train_file_names)
            self.test_data_set = MelanomaDataset3D(data_dir, examples=test_file_names)
        else:
            self.train_data_set = MelanomaDataset2D(data_dir, examples=train_file_names)
            self.test_data_set = MelanomaDataset2D(data_dir, examples=test_file_names)

    # storing and printing average error over all the mini-batches in an epoch
    def train_model(self, nb_epoch, model_starting_id, model_snapshot_freq, stats_snapshot_freq):
        losses = []
        errors = []

        last_error = float("inf")

        for e in range(nb_epoch):

            print " Performing Epoch no : " + str(e)+".......",

            train_iterator = self.train_data_set.iterator(batch_size=self.batch_size,
                                                          num_batches=self.nb_train_batches,
                                                          mode='even_shuffled_sequential')

            for b in range(self.nb_train_batches):
                X_batch, Y_batch = train_iterator.next()
                loss = self.model.train(X_batch, Y_batch)
                sys.stdout.write("Loss: %f%%   \r" % (loss))
                sys.stdout.flush()
                losses.append(loss)

            test_iterator = self.test_data_set.iterator(batch_size=self.batch_size,
                                                        mode='sequential')

            errors1 = []
            while test_iterator.has_next():
                X_batch, Y_batch, batch_files = test_iterator.next()
                error = self.model.test(X_batch, Y_batch)
                errors1.append(error)

            mean_error = np.mean(errors1)
            errors.append(mean_error)
            print "error:   "+ str(mean_error)

            if mean_error < last_error:
                last_error = mean_error
                pickle.dump(self.model, open("best_model_"+str(e)+".pkl", "wb"))

            if e % stats_snapshot_freq == 0 and e > 0:
                pickle.dump(losses, open("loss.pkl", "wb"))
                pickle.dump(errors, open("error.pkl", "wb"))

            if e % model_snapshot_freq == 0 and e > 0:
                pickle.dump(self.model, open("trained_model.pkl", "wb"))
                model_starting_id += 1
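train_model above keeps a simple best-checkpoint rule: whenever the mean test error improves, the model is pickled. Stripped of the dataset iterators, the pattern looks like this (train_one_epoch and evaluate are hypothetical placeholders):

import pickle

best_error = float("inf")
for epoch in range(nb_epoch):
    train_one_epoch()              # placeholder for the mini-batch training loop
    error = evaluate()             # placeholder returning the mean test error
    if error < best_error:
        best_error = error
        with open("best_model_%d.pkl" % epoch, "wb") as f:
            pickle.dump(model, f)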
Example #8
lenx = 0
for line in documents:
    tmp_len = len(line)
    if tmp_len > lenx:
        lenx = tmp_len

print(lenx)

model = gensim.models.Word2Vec(documents,
                               size=200,
                               window=10,
                               min_count=1,
                               workers=10)
#model = gensim.models.Word2Vec(documents)

model.train(documents, total_examples=len(documents), epochs=10)

doc_df = pd.DataFrame(preprocess(politics_data))
print(doc_df)

doc_df = doc_df.fillna(0.0)
doc_df
type(doc_df)

doc_nparr = doc_df.values
type(doc_nparr)
doc_nparr


def vectorize_str(str_arr):
    tmp_arr = []
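The snippet breaks off before vectorize_str is finished. A common way to complete such a helper (this body is an assumption, not the original code) is to average the 200-dimensional Word2Vec vectors of the words the model knows:

import numpy as np

def vectorize_str(str_arr):
    # assumed completion: mean of the known word vectors, zeros if none are known
    vecs = [model.wv[w] for w in str_arr if w in model.wv.vocab]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)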
Example #9
#Word2Vec
import gensim
import pandas as pd
from nltk import word_tokenize
model = gensim.models.Word2Vec.load("word2vec.model")
Bigger_list = []
for i in corpus:
    tokenized_text = word_tokenize(i)
    #print(tokenized_text)
    Bigger_list.append(tokenized_text)
# build vocabulary and train model
model = gensim.models.Word2Vec(Bigger_list,
                               size=4,
                               window=3,
                               min_count=5,
                               workers=10)
model.train(Bigger_list, total_examples=len(Bigger_list), epochs=20)
print(model)
model.save("word2vec.model")
model.save("model.bin")
vocab = list(model.wv.vocab)
l = model.wv.vectors
df = pd.DataFrame(index=range(3896), columns=range(224))
df[:] = 0
for j in range(0, 3896):
    w = 0
    text = corpus[j]
    text = text.split()
    for k in range(0, len(text)):
        if text[k] in vocab and w <= 223:
            df.iloc[j, w:w + 4] = df.iloc[j, w:w + 4] + l[vocab.index(text[k])]
        w = w + 4
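The loop lays each document out positionally: word k of document j occupies columns 4*k to 4*k+3 of df, provided k < 56 (56 x 4 = 224 columns) and the word survived min_count=5. Two quick shape checks make the layout explicit:

print(df.shape)   # (3896, 224): one row per document, 56 word slots of width 4
print(l.shape)    # (vocabulary size, 4): one 4-dimensional vector per vocabulary word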
Example #10





print ("Method = Linear SVM with doc2vec features")
np.random.seed(0)
class LabeledLineSentence(object):
  def __init__(self, data ): self.data = data
  def __iter__(self):
    for uid, line in enumerate( self.data ): yield TaggedDocument( line.split(" ") , ["S_%s" % uid] )
model = Doc2Vec( alpha=0.025 , min_alpha=0.025 )
sentences = LabeledLineSentence( train_texts + test_texts )
model.build_vocab( sentences )
model.train( sentences )
for w in model.vocab.keys():
  try: model[w] = embeddings[w] 
  except : continue
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
train_rep = np.array( [ model.docvecs[i] for i in range( train_matrix.shape[0] ) ] )
test_rep = np.array( [ model.docvecs[i + train_matrix.shape[0]] for i in range( test_matrix.shape[0] ) ] )
model = LinearSVC( random_state=0 )
model.fit( train_rep , train_labels )
results = model.predict( test_rep )
print ("Accuracy = " + repr( sklearn.metrics.accuracy_score( test_labels , results )  ))
print (sklearn.metrics.classification_report( test_labels , results ))
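For scoring a genuinely unseen document, gensim's Doc2Vec offers infer_vector; a short sketch, assuming the Doc2Vec instance is kept under its own name (d2v here) instead of being overwritten by the LinearSVC as above:

new_doc = "an unseen piece of text".split(" ")
vec = d2v.infer_vector(new_doc)        # d2v: the trained Doc2Vec model
prediction = model.predict([vec])      # model: the fitted LinearSVC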
Example #11
import numpy as np
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU

model = Sequential()

model.add(
    LSTM(input_shape=(None, emb),
         output_dim=hidden,
         return_sequences=True))
model.add(TimeDistributedDense(output_dim=4))
model.add(Activation('softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam')
for seq, label in zip(sequences, y):
    model.train(np.array([seq]), [label])
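The network emits one 4-way softmax per timestep (TimeDistributedDense), so tagging a sequence after training reduces to an argmax over the last axis:

probs = model.predict(np.array([seq]))   # shape (1, timesteps, 4)
tags = probs.argmax(axis=-1)[0]          # most likely class for each timestep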
Example #12
    horizontal_flip=False)
# compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied)
datagen.fit(x_train)
#create model
model = Sequential()
model.add(Conv2D(32, (5, 5), strides=(1, 1), input_shape=x_train.shape[1:],
                 padding='valid', activation='relu', kernel_initializer='uniform'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (5, 5), strides=(1, 1), padding='valid',
                 activation='relu', kernel_initializer='uniform'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
# fit the model on batches with real-time data augmentation
# (one epoch = number of samples divided by the generator batch size)
model.fit_generator(datagen.flow(x_train, y_train, batch_size=128),
                    steps_per_epoch=len(x_train) // 128, epochs=epochs)
# here's a more "manual" example
for e in range(epochs):
    print ('Epoch:',e)
    batches = 0
    for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
        loss = model.train_on_batch(x_batch, y_batch)
        batches += 1
        if batches >= len(x_train) / 32:
            # we need to break the loop by hand because
            # the generator loops indefinitely
            break
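The break condition mirrors what fit_generator does internally: one epoch is len(x_train) // batch_size batches drawn from the endlessly looping generator, so the two numbers must be kept in step when the batch size changes. A hypothetical helper makes that explicit:

def steps_for(n_samples, batch_size):
    # number of generator batches that make up one pass over the data
    return n_samples // batch_size

steps = steps_for(len(x_train), 32)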


Example #13
model.add(Flatten(nb_filter * 14 * 14 * 14))
model.add(Dense(nb_filter * 14 * 14 * 14, nb_classes, init='normal'))
model.add(Activation('softmax'))

# let's train the model using SGD + momentum (how original).
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

dataset = geometric_3d_dataset.Geometric3DDataset(
    patch_size=patch_size,
    task=geometric_3d_dataset.Geometric3DDataset.CLASSIFICATION_TASK,
    centered=True)

for e in range(nb_epoch):

    train_iterator = dataset.iterator(batch_size=batch_size,
                                      num_batches=nb_train_batches)

    for b in range(nb_train_batches):
        X_batch, Y_batch = train_iterator.next()
        loss = model.train(X_batch, Y_batch)
        print 'loss: ' + str(loss)

    test_iterator = dataset.iterator(batch_size=batch_size,
                                     num_batches=nb_test_batches)

    for b in range(nb_test_batches):
        X_batch, Y_batch = test_iterator.next()
        error = model.test(X_batch, Y_batch)
        print 'error: ' + str(error)
from keras.models import Sequential
from keras.layers import Dense

# Instantiate a Sequential model
model = Sequential()

# Build the input and hidden layer
model.add(Dense(4, input_shape=(2,), activation="tanh"))
# Add output layer, use sigmoid
model.add(Dense(1,activation="sigmoid"))

# Compile model
model.compile(optimizer='sgd', loss='binary_crossentropy')
# Train model
model.fit(coordinates, labels, epochs=20)
# Predict with trained model
preds = model.predict(coordinates)
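A quick sanity check on those predictions (assuming labels is the 0/1 numpy array used for training, as the binary cross-entropy loss implies):

import numpy as np

# threshold the sigmoid outputs at 0.5 and compare with the training labels
accuracy = np.mean((preds.ravel() > 0.5) == np.asarray(labels).ravel())
print("training accuracy:", accuracy)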
################################################################################
Exploring dollar bills

# Import seaborn
import seaborn as sns

# Use pairplot and set the hue to be our class
sns.pairplot(banknotes, hue='class') 

# Show the plot
plt.show()

# Describe the data
Example #15
def train_data(ds_idx):
    time.sleep(5000 * random())
    # the data, shuffled and split between train and test sets
    (X_train, y_train), (X_test, y_test) = cifar100.load_data(test_split=0.15)

    train_idx = np.where((y_train >= ds_idx * 10) & (y_train < (1 + ds_idx) * 10))[0]
    test_idx = np.where((y_test >= ds_idx * 10) & (y_test < (1 + ds_idx) * 10))[0]

    X_train = np.array([X_train[i] for i in train_idx])
    y_train  = np.array([y_train[i] for i in train_idx])
    X_test = np.array([X_test[i] for i in test_idx])
    y_test = np.array([y_test[i] for i in test_idx])

    print X_train.shape[0], 'train samples'
    print X_test.shape[0], 'test samples'

    y_train -= ds_idx * 10
    y_test -= ds_idx * 10

    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    model = Sequential()

    model.add(Convolution2D(32, 3, 3, 3, border_mode='full'))
    model.add(Activation('relu'))
    model.add(Dropout(0.8))
    model.add(Convolution2D(32, 32, 3, 3))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(poolsize=(2, 2)))
    model.add(Dropout(0.75))

    model.add(Convolution2D(64, 32, 3, 3, border_mode='full'))
    model.add(Activation('relu'))
    model.add(Dropout(0.7))
    model.add(Convolution2D(64, 64, 3, 3))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(poolsize=(2, 2)))
    model.add(Dropout(0.6))

    model.add(Flatten(64*8*8))
    model.add(Dense(64*8*8, 512, init='normal'))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

    model.add(Dense(512, nb_classes, init='normal'))
    model.add(Activation('softmax'))

    # let's train the model using SGD + momentum (how original).
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=sgd)

    if not data_augmentation:
        print "Not using data augmentation or normalization"

        X_train = X_train.astype("float32")
        X_test = X_test.astype("float32")
        X_train /= 255
        X_test /= 255
        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=10)
        score = model.evaluate(X_test, Y_test, batch_size=batch_size)
        print 'Test score:', score

    else:
        print "Using real time data augmentation"

        # this will do preprocessing and realtime data augmentation
        datagen = ImageDataGenerator(
            featurewise_center=True, # set input mean to 0 over the dataset
            samplewise_center=False, # set each sample mean to 0
            featurewise_std_normalization=True, # divide inputs by std of the dataset
            samplewise_std_normalization=False, # divide each input by its std
            zca_whitening=False, # apply ZCA whitening
            rotation_range=20, # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=0.3, # randomly shift images horizontally (fraction of total width)
            height_shift_range=0.3, # randomly shift images vertically (fraction of total height)
            horizontal_flip=True, # randomly flip images
            vertical_flip=False) # randomly flip images

        # compute quantities required for featurewise normalization
        # (std, mean, and principal components if ZCA whitening is applied)
        datagen.fit(X_train)
        best_score = 0.0
        best_epoch = 0

        for e in range(nb_epoch):
            print '-'*40
            print 'Epoch', e
            print '-'*40
            print "Training..."
            # batch train with realtime data augmentation
            progbar = generic_utils.Progbar(X_train.shape[0])
            for X_batch, Y_batch in datagen.flow(X_train, Y_train):
                loss = model.train(X_batch, Y_batch)
                progbar.add(X_batch.shape[0], values=[("train loss", loss)])

            print "Testing..."
            # test time!
            progbar = generic_utils.Progbar(X_test.shape[0])
            pred = model.predict_classes(X_test, batch_size=batch_size)
            score = np_utils.accuracy(pred, Y_test)
            best_epoch, best_score = (best_epoch, best_score) if best_score >= score else (e, score)
            print 'Score: ', score
            print 'Best: ', best_score, ' at epoch: ', best_epoch
            #for X_batch, Y_batch in datagen.flow(X_test, Y_test):
                #score = model.test(X_batch, Y_batch)
                #progbar.add(X_batch.shape[0], values=[("test loss", score)])
        all_time_best.append((best_epoch, best_score))
"""# Training Word2Vec Model"""

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec, word2vec
import numpy

num_features = 300  # Word vector dimensionality
min_word_count = 2 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words
gdrive =  "/content/gdrive/My Drive/For_study/Final/Code/Model"

model = Word2Vec(readData(SEMEVAL_FOUR), size=100, window=5, min_count=1, workers=4)
# common_texts
# train further on the SEMEVAL_FIVE_TRAIN sentences (total_examples must match the corpus size)
train_sentences = readData(SEMEVAL_FIVE_TRAIN)
model.train(train_sentences, total_examples=len(train_sentences), epochs=150)
model.save(gdrive + "word2vec.model")

# sentences = readData(SEMEVAL_FOUR)
sentences = glove
print("Training model....", len(sentences))
model = word2vec.Word2Vec(sentences, workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

model.train(sentences, total_examples=len(sentences), epochs=150)

# To make the model memory efficient (no further training is possible after this)
model.init_sims(replace=True)
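Once training is done, the learned vectors can be probed directly through model.wv; a short check (the query word is a placeholder and must occur in the training vocabulary):

print(model.wv.most_similar("good", topn=5))   # nearest neighbours of a placeholder probe word
vec = model.wv["good"]                         # raw vector, num_features dimensions
print(vec.shape)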