Example #1
    def __init__(self,
                 datafile="chinese_news.csv",
                 checkpoint_dir='./training_checkpoints'):
        self.model = None
        self.checkpoint_dir = checkpoint_dir
        self.data = Data(datafile)
        self.dataset = None
Example #2
def job(args, train_csv, test_csv, embeddings, cache):
    """ Reads data, makes preprocessing, trains model and records results.
        Gets args as argument and passes values of it's fields to functions."""

    data = Data(train_csv, test_csv, cache)

    # read and preprocess data
    to_cache = not args.no_cache
    data.read_embedding(embeddings, args.unk_std, args.max_vectors, to_cache)
    data.preprocess(args.tokenizer, args.var_length)
    data.embedding_lookup()

    # split train dataset
    data_iter = data.split(args.kfold, args.split_ratio, args.stratified, args.test, args.seed)

    # iterate through folds
    loss_function = nn.BCEWithLogitsLoss()
    for fold, d in enumerate(data_iter):
        print(f'\n__________ fold {fold} __________')
        # get dataloaders
        if len(d) == 2:
            train, val = d
            test = data.test
        else:
            train, val, test = d
        dataloaders = iterate(train, val, test, args.batch_size)  # train, val and test dataloaders

        # choose model, optimizer, lr scheduler
        model = choose_model(args.model, data.text, args.n_layers, args.hidden_dim, args.dropout)
        optimizer = choose_optimizer(filter(lambda p: p.requires_grad, model.parameters()), args)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lrstep, gamma=0.1)
        learn = Learner(model, dataloaders, loss_function, optimizer, scheduler, args)
        learn.fit(args.epoch, args.n_eval, args.f1_tresh, args.early_stop, args.warmup_epoch, args.clip)

        # load best model
        learn.model, info = learn.recorder.load()
        # save val predictions
        y_pred, y_true, ids = learn.predict_probs()
        val_ids = [data.qid.vocab.itos[i] for i in ids]
        pred_to_csv(val_ids, y_pred, y_true)
        # choose best threshold for val predictions
        best_th, max_f1 = choose_thresh(y_pred, y_true, [0.1, 0.5, 0.01], message=True)
        learn.recorder.append_info({'best_th': best_th, 'max_f1': max_f1})


        # predict test labels
        test_label, test_prob, _, test_ids, tresh = learn.predict_labels(is_test=True, thresh=args.f1_tresh)
        if args.test:
            test_loss, test_f1, _, _, _ = learn.evaluate(learn.test_dl, args.f1_tresh)
            learn.recorder.append_info({'test_loss': test_loss, 'test_f1': test_f1}, message='Test set results: ')

        # save test predictions to submission.csv
        test_ids = [data.qid.vocab.itos[i] for i in test_ids]
        submit(test_ids, test_label, test_prob)
        record_path = learn.recorder.record(fold)  # directory path with all records
        print('\n')
    return record_path
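
For context, a hedged driver sketch for job(); the Namespace fields mirror the attributes the function reads above, and every value shown is an illustrative placeholder rather than a project default:

from argparse import Namespace

# Hypothetical invocation; all values below are placeholders, not the project's defaults.
args = Namespace(no_cache=False, unk_std=0.01, max_vectors=500000,
                 tokenizer='spacy', var_length=True,
                 kfold=5, split_ratio=0.9, stratified=True, test=False, seed=42,
                 batch_size=512, model='BiLSTM', n_layers=2, hidden_dim=100, dropout=0.2,
                 lrstep=[3, 6], epoch=8, n_eval=1, f1_tresh=0.33,
                 early_stop=2, warmup_epoch=1, clip=1.0, lr=1e-3, optimizer='adam')
record_path = job(args, 'train.csv', 'test.csv', 'glove.840B.300d.txt', cache='./cache')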
Example #3
def interpret(image, label, model, filename):
    """Plot a gradient-based saliency map for a single image and save it to `filename`."""
    input = tf.Variable(Data._normalize(image))
    with tf.GradientTape() as tape:
        prediction = model(tf.expand_dims(input, axis=0), training=False)
        scce = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scce(label, prediction)
    gradients = tape.gradient(loss, input)

    plt.style.use('grayscale')
    fig, axes = plt.subplots(nrows=1, ncols=2)
    x = axes[0].imshow(input.numpy())
    y = axes[1].imshow(
        np.squeeze(gradients / np.max(gradients)) * input.numpy())
    plt.savefig(filename)
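
A hedged call sketch for interpret(); how the image and label are pulled out of Data is an assumption for illustration only, as is the trained model variable:

# Hypothetical driver; the Data attribute names below are assumptions, not part of the example.
data = Data()
image, label = data.test_images[0], data.test_labels[0]
interpret(image, label, trained_model, filename='saliency_0.png')  # trained_model: an already-trained model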
Example #4
from keras.layers import Input, Reshape, Activation
from keras.layers import Dropout, BatchNormalization, GaussianNoise
from keras.constraints import maxnorm
from keras.optimizers import RMSprop, SGD
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence

# TODO

dataset = Data(path="data/",
               stem=False,
               simply=True,
               stop_word=False,
               delete_class=['0', '000'],
               codif='bagofwords',
               max_features=None)
X_train, y_train, X_test, y_test = dataset.train_test_split(0.8)

x, y = dataset.get_non_coded()

# create the model
embedding_vector_length = 2
model = Sequential()
model.add(
    Embedding(n_features,
              embedding_vector_length,
              input_length=max_review_length))
model.add(LSTM(2))
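
A hedged continuation sketch; the output layer, the compile settings and the fit call are assumptions (binary target, arbitrary epoch and batch values), not part of the original snippet:

# Assumed completion: single sigmoid output plus a standard compile/fit loop.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)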
Example #5
    from song_tkinter import UserPlaylistEntry, NewPlaylistOutput
    from preprocess import Data
    from post_cluster import Graph_Save

    print('Running main.py. Tkinter interface will appear', end=' ')
    print('when everything finishes loading.\n', end='\r')

    print('Parsing args...', end='\r')
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--graphs-file-name', type=str)
    args = arg_parser.parse_args()
    print('Done parsing args!\n', end='\r')

    # Preprocessed data
    print('Restoring preprocessed data...', end='\r')
    data_obj = Data()
    print('Done restoring preprocessed data!\n', end='\r')

    # Spotify
    print('Initializing Spotipy client...', end='\r')
    credentials_manager = spotipy.oauth2.SpotifyClientCredentials(
        'daf1fbca87e94c9db377c98570e32ece', '1a674398d1bb44859ccaa4488df1aaa9')
    sp = spotipy.Spotify(client_credentials_manager=credentials_manager)
    print('Done initializing Spotipy client!\n', end='\r')

    # Restore centroid_to_graph
    print('Restoring Graphs... This will take a while (3 - 10 min).', end='\r')
    graphs_file = open(args.graphs_file_name, 'rb')
    centroid_to_graph_save = pickle.load(file=graphs_file)
    centroid_to_graph = dict()
    for centroid in centroid_to_graph_save:
Example #6
"""

from toy_simpleNN import ToySimpleNN
from preprocess import Data

hyperparam = {
    'activation': 'tanh',
    'regularization': 'l2',
    'batch_size': 20,
    'learning_rate': 1e-4,
    'valid_rate': 0.1,
    'optimizer': 'SGD',
    'train_step': 1000
}

data_set = Data()
data_set.load_OUTCAR("C:/Users/Seungwoo Hwang/Desktop/toy-simpleNN/OUTCAR")
#data_set = Data("OUTCAR")

x_train, y_train, x_test, y_test = data_set.train_test_split()

model = ToySimpleNN(act=hyperparam['activation'],
                    train_step=hyperparam['train_step'])
model.set_optimizer(optimizer=hyperparam['optimizer'])
model.load_data(data_set)

model.train()

model.test()
Example #7
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
# from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint
from preprocess import Data
import pickle

data = Data()

story_maxlen = data.story_maxlen
query_maxlen = data.query_maxlen

vocab_size = data.vocab_size

inputs_train = data.inputs_train
queries_train = data.queries_train
answers_train = data.answers_train

inputs_test = data.inputs_test
queries_test = data.queries_test
answers_test = data.answers_test

# placeholders
input_sequence = Input((story_maxlen, ), name='story_inputs')
question = Input((query_maxlen, ), name='question_inputs')

# encoders
# embed the input sequence into a sequence of vectors
input_encoder_m = Sequential(name='story_m_embed_dropout')
Example #8
import time

from preprocess import Data
from model import get_model
from trainer import train
from config import Config
from generator import Generator

if __name__ == '__main__':
    data = Data()
    data.process()
    model = get_model()
    if Config.LOAD_MODEL:
        model.load_weights(Config.CHECKPOINT)
    Gen = Generator(Config.all_columns_wi, data)
    train(model,
          data,
          epochs=Config.EPOCHS,
          batch_size=Config.BATCH_SIZE,
          generator=Gen.input_generator)
    model.save('model' + str(time.time()) + '.h5')
    print("FINISHED TRAINING")
Example #9
from preprocess import Data
import re
from keras.models import load_model
import numpy as np


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]


data = Data()

story = tokenize('Sandra went to the beach. John went to the hallway')
question = tokenize('Where is Sandra ?')
# this is just a dummy answer
# which won't be used in our inference process
_ = 'bathroom'

story_vec, question_vec, _ = data.vectorize_stories([(story, question, _)])

print('-')
print('loading model...')
model = load_model('model_25_1.00.h5')
print('-')
print('predicting...')
preds = model.predict([story_vec, question_vec])
pred_idx = np.argmax(preds[0])
Example #10
# decoder part
decoder_input_h = Input(shape=(LATENT_DIM, ), name='decoder_input_h')
decoder_input_c = Input(shape=(LATENT_DIM, ), name='decoder_input_c')
decoder_states_inputs = [decoder_input_h, decoder_input_c]
decoder_inputs = model.get_layer('decoder_inputs').input
decoder_lstm = model.get_layer('decoder_lstm')
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states_outputs = [state_h, state_c]
decoder_dense = model.get_layer('decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states_outputs)

NUM_SAMPLES = 10000
data = Data(NUM_SAMPLES)
DECODER_NUM_TOKENS = data.DECODER_NUN_TOKENS
target_token_index = data.target_token_index
reverse_target_token_index = data.reverse_target_token_index
DECODER_MAXLEN = data.DECODER_MAXLEN


def decode_sequence(input_seq):
    states_values = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, DECODER_NUM_TOKENS))
    target_seq[0, 0, target_token_index['\t']] = 1

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
Example #11
class Model():
    def __init__(self,
                 datafile="chinese_news.csv",
                 checkpoint_dir='./training_checkpoints'):
        self.model = None
        self.checkpoint_dir = checkpoint_dir
        self.data = Data(datafile)
        self.dataset = None

    def build(self, batch_size):
        self.batch_size = batch_size
        if tf.test.is_gpu_available():
            rnn = tf.keras.layers.CuDNNLSTM
        else:
            import functools
            rnn = functools.partial(tf.keras.layers.LSTM,
                                    recurrent_activation='relu')

        self.model = tf.keras.Sequential([
            # Embedding
            tf.keras.layers.Embedding(CHAR_SIZE,
                                      EMBEDDING_DIM,
                                      batch_input_shape=[batch_size, None]),
            # Bidirectional LSTM
            tf.keras.layers.Bidirectional(
                rnn(RNN_UNITS,
                    return_sequences=True,
                    recurrent_initializer='glorot_uniform',
                    stateful=True)),
            # Dropout
            tf.keras.layers.Dropout(DROPOUT_RATE),

            # Dense
            tf.keras.layers.Dense(CHAR_SIZE, activation='softmax')
        ])

    def input_fn(self, file_path):
        def parse(example_proto):
            features = {"idx_lst": tf.VarLenFeature(tf.int64)}
            parsed_features = tf.parse_single_example(example_proto, features)
            idx_lst = tf.sparse.to_dense(parsed_features["idx_lst"])
            idx_lst = tf.cast(idx_lst, tf.int32)
            return idx_lst[:-1], idx_lst[1:]

        dataset = (tf.data.TFRecordDataset(file_path).map(parse))
        if SHUFFLE:
            dataset = dataset.shuffle(buffer_size=BUFFER_SIZE)
        dataset = dataset.repeat()
        return dataset.padded_batch(BATCH_SIZE,
                                    padded_shapes=([MAX_LEN_SEN - 1],
                                                   [MAX_LEN_SEN - 1]))

    def get_dataset(self):
        if self.dataset is None:
            char2idx = self.data.get_char2idx
            sentences = self.data.get_sentence()
            with tf.python_io.TFRecordWriter(TRAIN_RECORD_FILE) as writer:
                for sen in sentences:
                    idx_lst = [char2idx(c) for c in sen]
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'idx_lst':
                            tf.train.Feature(int64_list=tf.train.Int64List(
                                value=idx_lst))
                        }))
                    writer.write(example.SerializeToString())

            self.dataset = self.input_fn(TRAIN_RECORD_FILE)

        return self.dataset

    def train(self, epochs=3):
        def loss(labels, logits):
            return tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits)

        self.model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)

        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(self.checkpoint_dir, "ckpt_{epoch}"),
            save_weights_only=True)

        dataset = self.get_dataset()

        history = self.model.fit(dataset,
                                 epochs=epochs,
                                 steps_per_epoch=MAX_LEN_SEN,
                                 callbacks=[checkpoint_callback])
        return history

    def load(self):
        try:
            tf.train.latest_checkpoint(self.checkpoint_dir)
            self.build(batch_size=1)

            self.model.load_weights(
                tf.train.latest_checkpoint(self.checkpoint_dir))

            self.model.build(tf.TensorShape([1, None]))

        except:
            self.build(batch_size=BATCH_SIZE)

    def predict(self, start_string, generate_size=1000):
        char2idx = self.data.get_char2idx
        idx2char = self.data.get_idx2char

        # Converting our start string to numbers (vectorizing)
        input_eval = [char2idx(s) for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)
        text_generated = []

        # Low temperatures result in more predictable text.
        # Higher temperatures result in more surprising text.
        # Experiment to find the best setting.
        temperature = 1.0

        # Here batch size == 1
        self.model.reset_states()

        for i in range(generate_size):

            if np.random.rand() < PUNCTUATION_RATE:
                if np.random.rand() < 0.5:
                    text_generated.append(',')
                else:
                    text_generated.append('。')

            predictions = self.model(input_eval)
            # remove the batch dimension
            predictions = tf.squeeze(predictions, 0)

            # use a multinomial distribution to sample the next character from the model
            predictions = predictions / temperature
            predicted_id = tf.multinomial(predictions,
                                          num_samples=1)[-1, 0].numpy()

            # We pass the predicted character as the next input to the model
            # along with the previous hidden state
            input_eval = tf.expand_dims([predicted_id], 0)
            nchar = idx2char(predicted_id)
            if nchar == UNK:
                print(' ')
            text_generated.append(nchar)
        return (start_string + ''.join(text_generated))
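
A hedged usage sketch for the Model class above; BATCH_SIZE comes from the module constants the class already references, and the start string is a placeholder:

# Hypothetical driver; constants are assumed to be defined as in the module above.
m = Model(datafile="chinese_news.csv")
m.build(batch_size=BATCH_SIZE)
m.train(epochs=3)

m.load()  # rebuilds with batch size 1 and loads the latest checkpoint
print(m.predict(start_string='今日', generate_size=200))  # placeholder start string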
Example #12
if __name__ == '__main__':
    """
    Given input playlist of size n, adventure, and existing graphs,
    recommend n songs
    """
    print('Parsing args...', end='\r')
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--playlist-link', type=str)
    arg_parser.add_argument('--adventure', type=int)
    arg_parser.add_argument('--graphs-file-name', type=str)
    args = arg_parser.parse_args()
    print('Done parsing args!\n', end='\r')

    # Preprocessed data
    print('Restoring preprocessed data...', end='\r')
    data = Data()
    print('Done restoring preprocessed data!\n', end='\r')

    # Spotify
    print('Initializing Spotipy client...', end='\r')
    credentials_manager = spotipy.oauth2.SpotifyClientCredentials(
        'daf1fbca87e94c9db377c98570e32ece', '1a674398d1bb44859ccaa4488df1aaa9')
    sp = spotipy.Spotify(client_credentials_manager=credentials_manager)
    print('Done initializing Spotipy client!\n', end='\r')

    # Restore centroid_to_graph
    print('Restoring Graphs...', end='\r')
    graphs_file = open(args.graphs_file_name, 'rb')
    centroid_to_graph_save = pickle.load(file=graphs_file)
    centroid_to_graph = dict()
    for centroid in centroid_to_graph_save:
Example #13
    with tf.GradientTape() as tape:
        prediction = model(tf.expand_dims(input, axis=0), training=False)
        scce = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scce(label, prediction)
    gradients = tape.gradient(loss, input)

    plt.style.use('grayscale')
    fig, axes = plt.subplots(nrows=1, ncols=2)
    x = axes[0].imshow(input.numpy())
    y = axes[1].imshow(
        np.squeeze(gradients / np.max(gradients)) * input.numpy())
    plt.savefig(filename)


if __name__ == "__main__":
    data = Data()
    data.normalize()
    data.split_data()

    model = Model()
    model(tf.keras.Input(shape=(hp.img_size, hp.img_size, 3)))

    model.summary()

    model.compile(optimizer=model.optimizer,
                  loss=model.loss_fn,
                  metrics=["sparse_categorical_accuracy"])

    print("TRAINING")
    train(model=model,
          path_to_weights=os.path.join(os.path.dirname(os.getcwd()),
Example #14
from __future__ import annotations
import random
from collections import deque
import pickle
from argparse import ArgumentParser
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import spotipy
from Point import Point
from preprocess import Data
from spotify_client import Spotify_Client
from k_means import KMeansAlgo
from typing import Any, List


DATA = Data()
CLIENT_CREDENTIALS_MANAGER = spotipy.oauth2.SpotifyClientCredentials(
    'daf1fbca87e94c9db377c98570e32ece', '1a674398d1bb44859ccaa4488df1aaa9')
SPOTIPY = spotipy.Spotify(client_credentials_manager=CLIENT_CREDENTIALS_MANAGER)


class Graph:
    """
    Represents an individual graph of vertices (songs).
    Vertices are connected based on an epsilon value, which represents distance.

    Instance Attributes:
        - points: list of Point objects
        - epsilon: float representing a distance
        - id_point_mapping: a dictionary mapping a str ID to a Point object
        - song_ids: list of ids
Example #15
	if (os.path.exists('config.js')):
		os.remove('config.js')
	
	Metrics.saveConfig('config.js', text)
	Metrics.copyDirectory(mydir, latest)


if __name__ == '__main__':

	verbose = True

	"""
	Pre-process
	"""

	base = Data('mammography-consolidated.csv', verbose=verbose,normalize=True)
	training, validation, testing = base.split()
	training_bkp = training.copy()
	validation_bkp = validation.copy()
	testing_bkp = testing.copy()

	"""
	Setup experiment config
	"""

	# sampling
	# sampling_options = [Oversampling.SmoteRegular]
	sampling_options = [Oversampling.DontUse, Oversampling.Repeat, Oversampling.SmoteRegular, Undersampling.ClusterCentroids, Undersampling.SMOTEENN]

	# learning_rule = stochastic gradient descent ('sgd'), 'momentum', 'nesterov', 'adadelta', 'adagrad', 'rmsprop'
	learning_rule_options = ['momentum']
Example #16
    # For each word in the vocabulary, cluster its contexts.
    # Every centroid then represents a sense; there can be multiple senses per word.
    def get_senses(self):
        f = open('data/centroids', 'w')
        for word in self.dataset.vocab:
            X = np.array(self.dataset.contexts[word])
            if len(X) >= 1:
                #vmf_soft = VonMisesFisherMixture(n_clusters=self.K, posterior_type='soft').fit(X)
                kmeans = MiniBatchKMeans(n_clusters=self.K, init='k-means++').fit(X)
                self.senses[word] = kmeans.cluster_centers_
                self.temp_vocab.append(word)
                f.write(str(word))
                for C in kmeans.cluster_centers_:
                    f.write("\n$")
                    for x in C:
                        f.write(" " + str(x))
                f.write("\n")
        f.write('#')
        f.close()

  
if __name__ == "__main__":

    model = MPVSM()
    # options: picard.txt, nahodi.txt, 1984.txt
    model.dataset = Data("data/hrv/picard.txt", N=10, truncate=20000) # must truncate because memory error :<
    print "10 most common words:", ', '.join(model.dataset.vocab[:10])
    model.get_senses()

    # Current result: it returns the words it most often co-occurred with in a sentence (e.g. djeca -> napustena)