def job(args, train_csv, test_csv, embeddings, cache):
    """Reads data, preprocesses it, trains the model and records the results.

    Receives args and passes the values of its fields on to the helper functions."""
    data = Data(train_csv, test_csv, cache)

    # read and preprocess data
    to_cache = not args.no_cache
    data.read_embedding(embeddings, args.unk_std, args.max_vectors, to_cache)
    data.preprocess(args.tokenizer, args.var_length)
    data.embedding_lookup()

    # split train dataset
    data_iter = data.split(args.kfold, args.split_ratio, args.stratified, args.test, args.seed)

    # iterate through folds
    loss_function = nn.BCEWithLogitsLoss()
    for fold, d in enumerate(data_iter):
        print(f'\n__________ fold {fold} __________')
        # get dataloaders
        if len(d) == 2:
            train, val = d
            test = data.test
        else:
            train, val, test = d
        dataloaders = iterate(train, val, test, args.batch_size)  # train, val and test dataloaders

        # choose model, optimizer, lr scheduler
        model = choose_model(args.model, data.text, args.n_layers, args.hidden_dim, args.dropout)
        optimizer = choose_optimizer(filter(lambda p: p.requires_grad, model.parameters()), args)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lrstep, gamma=0.1)
        learn = Learner(model, dataloaders, loss_function, optimizer, scheduler, args)
        learn.fit(args.epoch, args.n_eval, args.f1_tresh, args.early_stop, args.warmup_epoch, args.clip)

        # load best model
        learn.model, info = learn.recorder.load()

        # save val predictions
        y_pred, y_true, ids = learn.predict_probs()
        val_ids = [data.qid.vocab.itos[i] for i in ids]
        pred_to_csv(val_ids, y_pred, y_true)

        # choose best threshold for val predictions
        best_th, max_f1 = choose_thresh(y_pred, y_true, [0.1, 0.5, 0.01], message=True)
        learn.recorder.append_info({'best_th': best_th, 'max_f1': max_f1})

        # predict test labels
        test_label, test_prob, _, test_ids, tresh = learn.predict_labels(is_test=True, thresh=args.f1_tresh)
        if args.test:
            test_loss, test_f1, _, _, _ = learn.evaluate(learn.test_dl, args.f1_tresh)
            learn.recorder.append_info({'test_loss': test_loss, 'test_f1': test_f1}, message='Test set results: ')

        # save test predictions to submission.csv
        test_ids = [data.qid.vocab.itos[i] for i in test_ids]
        submit(test_ids, test_label, test_prob)
        record_path = learn.recorder.record(fold)  # directory path with all records
        print('\n')
    return record_path
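# A minimal sketch of the threshold search used above, assuming choose_thresh
# scans [start, end) with the given step over numpy arrays of probabilities and
# binary labels and keeps the threshold with the best F1; the project's real
# implementation may differ.
import numpy as np
from sklearn.metrics import f1_score


def choose_thresh_sketch(y_pred, y_true, thresh_range, message=False):
    start, end, step = thresh_range
    best_th, max_f1 = start, 0.0
    for th in np.arange(start, end, step):
        f1 = f1_score(y_true, (y_pred > th).astype(int))
        if f1 > max_f1:
            best_th, max_f1 = th, f1
    if message:
        print(f'Best threshold {best_th:.2f} with F1 {max_f1:.4f}')
    return best_th, max_f1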
def interpret(image, label, model, filename):
    input = tf.Variable(Data._normalize(image))
    with tf.GradientTape() as tape:
        prediction = model(tf.expand_dims(input, axis=0), training=False)
        scce = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scce(label, prediction)
    gradients = tape.gradient(loss, input)

    plt.style.use('grayscale')
    fig, axes = plt.subplots(nrows=1, ncols=2)
    x = axes[0].imshow(input.numpy())
    y = axes[1].imshow(
        np.squeeze(gradients / np.max(gradients)) * input.numpy())
    plt.savefig(filename)
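# Hypothetical usage sketch for interpret(); the attribute names used to pull a
# single test sample out of Data (`test_data`, `test_labels`) are assumptions
# and may differ in the actual preprocess module, so the call is left commented.
#
#   data = Data()
#   data.normalize()
#   data.split_data()
#   interpret(data.test_data[0], data.test_labels[0], model, 'saliency_0.png')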
from keras.layers import Input, Reshape, Activation
from keras.layers import Dropout, BatchNormalization, GaussianNoise
from keras.constraints import maxnorm
from keras.optimizers import RMSprop, SGD
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence

# TODO
dataset = Data(path="data/",
               stem=False,
               simply=True,
               stop_word=False,
               delete_class=['0', '000'],
               codif='bagofwords',
               max_features=None)
X_train, y_train, X_test, y_test = dataset.train_test_split(0.8)
x, y = dataset.get_non_coded()

# create the model
embedding_vector_length = 2
model = Sequential()
model.add(
    Embedding(n_features, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(2))
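# Hedged sketch of how the model above might be finished and trained; the
# single-unit sigmoid output, the binary cross-entropy loss and the direct fit
# on the encoded data are assumptions, since the original stops at the LSTM
# layer (see the TODO). `n_features` and `max_review_length` are expected to
# come from the dataset.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=3, batch_size=64)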
from song_tkinter import UserPlaylistEntry, NewPlaylistOutput
from preprocess import Data
from post_cluster import Graph_Save

print('Running main.py. Tkinter interface will appear', end=' ')
print('when everything finishes loading.\n', end='\r')

print('Parsing args...', end='\r')
arg_parser = ArgumentParser()
arg_parser.add_argument('--graphs-file-name', type=str)
args = arg_parser.parse_args()
print('Done parsing args!\n', end='\r')

# Preprocessed data
print('Restoring preprocessed data...', end='\r')
data_obj = Data()
print('Done restoring preprocessed data!\n', end='\r')

# Spotify
print('Initializing Spotipy client...', end='\r')
credentials_manager = spotipy.oauth2.SpotifyClientCredentials(
    'daf1fbca87e94c9db377c98570e32ece', '1a674398d1bb44859ccaa4488df1aaa9')
sp = spotipy.Spotify(client_credentials_manager=credentials_manager)
print('Done initializing Spotipy client!\n', end='\r')

# Restore centroid_to_graph
print('Restoring Graphs... This will take a while (3 - 10 min).', end='\r')
graphs_file = open(args.graphs_file_name, 'rb')
centroid_to_graph_save = pickle.load(file=graphs_file)
centroid_to_graph = dict()
for centroid in centroid_to_graph_save:
""" from toy_simpleNN import ToySimpleNN from preprocess import Data hyperparam = { 'activation': 'tanh', 'regularization': 'l2', 'batch_size': 20, 'learning_rate': 1e-4, 'valid_rate': 0.1, 'optimizer': 'SGD', 'train_step': 1000 } data_set = Data() data_set.load_OUTCAR("C:/Users/Seungwoo Hwang/Desktop/toy-simpleNN/OUTCAR") #data_set = Data("OUTCAR") x_train, y_train, x_test, y_test = data_set.train_test_split() model = ToySimpleNN(act=hyperparam['activation'], train_step=hyperparam['train_step']) model.set_optimizer(optimizer=hyperparam['optimizer']) model.load_data(data_set) model.train() model.test()
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
# from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint
from preprocess import Data
import pickle

data = Data()
story_maxlen = data.story_maxlen
query_maxlen = data.query_maxlen
vocab_size = data.vocab_size
inputs_train = data.inputs_train
queries_train = data.queries_train
answers_train = data.answers_train
inputs_test = data.inputs_test
queries_test = data.queries_test
answers_test = data.answers_test

# placeholders
input_sequence = Input((story_maxlen, ), name='story_inputs')
question = Input((query_maxlen, ), name='question_inputs')

# encoders
# embed the input sequence into a sequence of vectors
input_encoder_m = Sequential(name='story_m_embed_dropout')
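# Hedged continuation following the standard Keras bAbI end-to-end memory
# network pattern; the embedding sizes (64), the dropout rate (0.3) and the
# layer names for the two extra encoders are assumptions, not values taken from
# the original file.
input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
input_encoder_m.add(Dropout(0.3))
# output: (samples, story_maxlen, embedding_dim)

# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential(name='story_c_embed_dropout')
input_encoder_c.add(Embedding(input_dim=vocab_size, output_dim=query_maxlen))
input_encoder_c.add(Dropout(0.3))

# embed the question into a sequence of vectors
question_encoder = Sequential(name='question_embed_dropout')
question_encoder.add(Embedding(input_dim=vocab_size, output_dim=64,
                               input_length=query_maxlen))
question_encoder.add(Dropout(0.3))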
import time

from preprocess import Data
from model import get_model
from trainer import train
from config import Config
from generator import Generator

if __name__ == '__main__':
    data = Data()
    data.process()

    model = get_model()
    if Config.LOAD_MODEL:
        model.load_weights(Config.CHECKPOINT)

    Gen = Generator(Config.all_columns_wi, data)

    train(model,
          data,
          epochs=Config.EPOCHS,
          batch_size=Config.BATCH_SIZE,
          generator=Gen.input_generator)

    model.save('model' + str(time.time()) + '.h5')
    print("FINISHED TRAINING")
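# Hedged sketch of what config.py might expose; only the attributes the script
# above actually reads are listed, and every value here is an assumption.
class Config:
    LOAD_MODEL = False                    # whether to resume from a checkpoint
    CHECKPOINT = 'checkpoints/latest.h5'  # hypothetical checkpoint path
    EPOCHS = 10
    BATCH_SIZE = 32
    all_columns_wi = None                 # column/word-index mapping passed to Generator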
from preprocess import Data
import re
from keras.models import load_model
import numpy as np


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split(r'(\W+)?', sent) if x.strip()]


data = Data()

story = tokenize('Sandra went to the beach. John went to the hallway')
question = tokenize('Where is Sandra ?')
# this is just a dummy answer, which won't be used in our inference process
_ = 'bathroom'

story_vec, question_vec, _ = data.vectorize_stories([(story, question, _)])

print('-')
print('loading model...')
model = load_model('model_25_1.00.h5')

print('-')
print('predicting...')
preds = model.predict([story_vec, question_vec])
pred_idx = np.argmax(preds[0])
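# Hedged sketch of mapping the predicted index back to a word; the word -> index
# dictionary (assumed here to be exposed as data.word_idx, as in the Keras bAbI
# examples) is an assumption about the preprocess module.
idx_to_word = {idx: word for word, idx in data.word_idx.items()}
print('answer:', idx_to_word.get(int(pred_idx), '<unk>'),
      '| confidence:', float(preds[0][pred_idx]))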
# decoder part
decoder_input_h = Input(shape=(LATENT_DIM, ), name='decoder_input_h')
decoder_input_c = Input(shape=(LATENT_DIM, ), name='decoder_input_c')
decoder_states_inputs = [decoder_input_h, decoder_input_c]

decoder_inputs = model.get_layer('decoder_inputs').input
decoder_lstm = model.get_layer('decoder_lstm')
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states_outputs = [state_h, state_c]
decoder_dense = model.get_layer('decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states_outputs)

NUM_SAMPLES = 10000
data = Data(NUM_SAMPLES)
DECODER_NUM_TOKENS = data.DECODER_NUN_TOKENS
target_token_index = data.target_token_index
reverse_target_token_index = data.reverse_target_token_index
DECODER_MAXLEN = data.DECODER_MAXLEN


def decode_sequence(input_seq):
    states_values = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, DECODER_NUM_TOKENS))
    target_seq[0, 0, target_token_index['\t']] = 1
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
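        # Hedged completion of the inference loop, following the standard Keras
        # character-level seq2seq pattern; the '\n' stop token and the greedy
        # argmax sampling are assumptions.
        output_tokens, h, c = decoder_model.predict([target_seq] + states_values)

        # greedily pick the next token and append it to the running output
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_token_index[sampled_token_index]
        decoded_sentence += sampled_char

        # stop on the end-of-sequence token or when the output gets too long
        if sampled_char == '\n' or len(decoded_sentence) > DECODER_MAXLEN:
            stop_condition = True

        # feed the sampled token and the updated states back into the decoder
        target_seq = np.zeros((1, 1, DECODER_NUM_TOKENS))
        target_seq[0, 0, sampled_token_index] = 1.
        states_values = [h, c]

    return decoded_sentence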
class Model():
    def __init__(self, datafile="chinese_news.csv", checkpoint_dir='./training_checkpoints'):
        self.model = None
        self.checkpoint_dir = checkpoint_dir
        self.data = Data(datafile)
        self.dataset = None

    def build(self, batch_size):
        self.batch_size = batch_size
        if tf.test.is_gpu_available():
            rnn = tf.keras.layers.CuDNNLSTM
        else:
            import functools
            rnn = functools.partial(tf.keras.layers.LSTM,
                                    recurrent_activation='relu')
        self.model = tf.keras.Sequential([
            # Embedding
            tf.keras.layers.Embedding(CHAR_SIZE, EMBEDDING_DIM,
                                      batch_input_shape=[batch_size, None]),
            # Bidirectional LSTM
            tf.keras.layers.Bidirectional(
                rnn(RNN_UNITS,
                    return_sequences=True,
                    recurrent_initializer='glorot_uniform',
                    stateful=True)),
            # Dropout
            tf.keras.layers.Dropout(DROPOUT_RATE),
            # Dense
            tf.keras.layers.Dense(CHAR_SIZE, activation='softmax')
        ])

    def input_fn(self, file_path):
        def parse(example_proto):
            features = {"idx_lst": tf.VarLenFeature(tf.int64)}
            parsed_features = tf.parse_single_example(example_proto, features)
            idx_lst = tf.sparse.to_dense(parsed_features["idx_lst"])
            idx_lst = tf.cast(idx_lst, tf.int32)
            return idx_lst[:-1], idx_lst[1:]

        dataset = (tf.data.TFRecordDataset(file_path).map(parse))
        if SHUFFLE:
            dataset = dataset.shuffle(buffer_size=BUFFER_SIZE)
        dataset = dataset.repeat()
        return dataset.padded_batch(BATCH_SIZE,
                                    padded_shapes=([MAX_LEN_SEN - 1],
                                                   [MAX_LEN_SEN - 1]))

    def get_dataset(self):
        if self.dataset is None:
            char2idx = self.data.get_char2idx
            sentences = self.data.get_sentence()
            with tf.python_io.TFRecordWriter(TRAIN_RECORD_FILE) as writer:
                for sen in sentences:
                    idx_lst = [char2idx(c) for c in sen]
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'idx_lst':
                            tf.train.Feature(int64_list=tf.train.Int64List(
                                value=idx_lst))
                        }))
                    writer.write(example.SerializeToString())
            self.dataset = self.input_fn(TRAIN_RECORD_FILE)
        return self.dataset

    def train(self, epochs=3):
        def loss(labels, logits):
            return tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits)

        self.model.compile(optimizer=tf.train.AdamOptimizer(), loss=loss)
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(self.checkpoint_dir, "ckpt_{epoch}"),
            save_weights_only=True)
        dataset = self.get_dataset()
        history = self.model.fit(dataset,
                                 epochs=epochs,
                                 steps_per_epoch=MAX_LEN_SEN,
                                 callbacks=[checkpoint_callback])
        return history

    def load(self):
        try:
            tf.train.latest_checkpoint(self.checkpoint_dir)
            self.build(batch_size=1)
            self.model.load_weights(
                tf.train.latest_checkpoint(self.checkpoint_dir))
            self.model.build(tf.TensorShape([1, None]))
        except:
            self.build(batch_size=BATCH_SIZE)

    def predict(self, start_string, generate_size=1000):
        char2idx = self.data.get_char2idx
        idx2char = self.data.get_idx2char

        # Converting our start string to numbers (vectorizing)
        input_eval = [char2idx(s) for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)

        text_generated = []

        # Low temperature results in more predictable text.
        # Higher temperature results in more surprising text.
        # Experiment to find the best setting.
        temperature = 1.0

        # Here batch size == 1
        self.model.reset_states()
        for i in range(generate_size):
            if np.random.rand() < PUNCTUATION_RATE:
                if np.random.rand() < 0.5:
                    text_generated.append(',')
                else:
                    text_generated.append('。')

            predictions = self.model(input_eval)
            # remove the batch dimension
            predictions = tf.squeeze(predictions, 0)

            # using a multinomial distribution to predict the word returned by the model
            predictions = predictions / temperature
            predicted_id = tf.multinomial(predictions,
                                          num_samples=1)[-1, 0].numpy()

            # We pass the predicted word as the next input to the model
            # along with the previous hidden state
            input_eval = tf.expand_dims([predicted_id], 0)

            nchar = idx2char(predicted_id)
            if nchar == UNK:
                print(' ')
            text_generated.append(nchar)

        return (start_string + ''.join(text_generated))
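# Hedged usage sketch for the Model class above; the start string and the
# generation length are arbitrary example values.
if __name__ == '__main__':
    m = Model(datafile='chinese_news.csv')
    m.load()              # builds the network and restores the latest checkpoint if one exists
    # m.train(epochs=3)   # uncomment to (re)train and write new checkpoints
    print(m.predict('新闻', generate_size=200))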
if __name__ == '__main__':
    """
    Given input playlist of size n, adventure, and existing graphs,
    recommend n songs
    """
    print('Parsing args...', end='\r')
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--playlist-link', type=str)
    arg_parser.add_argument('--adventure', type=int)
    arg_parser.add_argument('--graphs-file-name', type=str)
    args = arg_parser.parse_args()
    print('Done parsing args!\n', end='\r')

    # Preprocessed data
    print('Restoring preprocessed data...', end='\r')
    data = Data()
    print('Done restoring preprocessed data!\n', end='\r')

    # Spotify
    print('Initializing Spotipy client...', end='\r')
    credentials_manager = spotipy.oauth2.SpotifyClientCredentials(
        'daf1fbca87e94c9db377c98570e32ece', '1a674398d1bb44859ccaa4488df1aaa9')
    sp = spotipy.Spotify(client_credentials_manager=credentials_manager)
    print('Done initializing Spotipy client!\n', end='\r')

    # Restore centroid_to_graph
    print('Restoring Graphs...', end='\r')
    graphs_file = open(args.graphs_file_name, 'rb')
    centroid_to_graph_save = pickle.load(file=graphs_file)
    centroid_to_graph = dict()
    for centroid in centroid_to_graph_save:
if __name__ == "__main__":
    data = Data()
    data.normalize()
    data.split_data()

    model = Model()
    model(tf.keras.Input(shape=(hp.img_size, hp.img_size, 3)))
    model.summary()
    model.compile(optimizer=model.optimizer,
                  loss=model.loss_fn,
                  metrics=["sparse_categorical_accuracy"])

    print("TRAINING")
    train(model=model,
          path_to_weights=os.path.join(os.path.dirname(os.getcwd()),
from __future__ import annotations

import random
from collections import deque
import pickle
from argparse import ArgumentParser

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import spotipy

from Point import Point
from preprocess import Data
from spotify_client import Spotify_Client
from k_means import KMeansAlgo
from typing import Any, List

DATA = Data()
CLIENT_CREDENTIALS_MANAGER = spotipy.oauth2.SpotifyClientCredentials(
    'daf1fbca87e94c9db377c98570e32ece', '1a674398d1bb44859ccaa4488df1aaa9')
SPOTIPY = spotipy.Spotify(client_credentials_manager=CLIENT_CREDENTIALS_MANAGER)


class Graph:
    """
    Represents an individual graph of vertices (songs). Vertices are connected
    based on an epsilon value, which represents distance.

    Instance Attributes:
        - points: list of Point objects
        - epsilon: float representing a distance
        - id_point_mapping: a dictionary mapping a str ID to a Point object
        - song_ids: list of ids
    if (os.path.exists('config.js')):
        os.remove('config.js')

    Metrics.saveConfig('config.js', text)
    Metrics.copyDirectory(mydir, latest)


if __name__ == '__main__':
    verbose = True

    """
    Pre-process
    """
    base = Data('mammography-consolidated.csv', verbose=verbose, normalize=True)
    training, validation, testing = base.split()

    training_bkp = training.copy()
    validation_bkp = validation.copy()
    testing_bkp = testing.copy()

    """
    Setup experiment config
    """
    # sampling
    # sampling_options = [Oversampling.SmoteRegular]
    sampling_options = [Oversampling.DontUse,
                        Oversampling.Repeat,
                        Oversampling.SmoteRegular,
                        Undersampling.ClusterCentroids,
                        Undersampling.SMOTEENN]

    # learning_rule = stochastic gradient descent ('sgd'), 'momentum',
    # 'nesterov', 'adadelta', 'adagrad', 'rmsprop'
    learning_rule_options = ['momentum']
    # for each word in the vocabulary, cluster its contexts.
    # every centroid then represents a sense; multiple senses per word
    def get_senses(self):
        f = open(r'data\centroids', 'w')
        for word in self.dataset.vocab:
            X = np.array(self.dataset.contexts[word])
            if len(X) >= 1:
                # vmf_soft = VonMisesFisherMixture(n_clusters=self.K, posterior_type='soft').fit(X)
                kmeans = MiniBatchKMeans(n_clusters=self.K, init='k-means++').fit(X)
                self.senses[word] = kmeans.cluster_centers_
                self.temp_vocab.append(word)
                f.write(str(word))
                for C in kmeans.cluster_centers_:
                    f.write("\n$")
                    for x in C:
                        f.write(" " + str(x))
                f.write("\n")
                f.write('#')
        f.close()


if __name__ == "__main__":
    model = MPVSM()
    # options: picard.txt, nahodi.txt, 1984.txt
    model.dataset = Data("data/hrv/picard.txt", N=10, truncate=20000)  # must truncate because of a memory error :<
    print("10 most common words:", ', '.join(model.dataset.vocab[:10]))
    model.get_senses()
    # Current result: for a word, returns the words it most often co-occurred
    # with in a sentence (e.g. djeca -> napustena)
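# Hedged sketch: given a context vector for a word, pick the closest learned
# sense (centroid) by cosine similarity. `nearest_sense` and `context_vec` are
# hypothetical names and not part of the original code.
def nearest_sense(model, word, context_vec):
    centroids = np.asarray(model.senses[word])
    sims = centroids @ context_vec / (
        np.linalg.norm(centroids, axis=1) * np.linalg.norm(context_vec) + 1e-12)
    return int(np.argmax(sims))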