def rp_to_estimate_noise(data_params, n_examples, n_runs, delta_matrix):
    """Estimate the label-noise matrix, averaged over several noisy-label runs.

    For each set of noisy labels ``y_train_tilde``, fits cleanlab's
    LearningWithNoisyLabels (for noise-robust train-set predictions) plus a
    plain logistic regression (for class-probability estimates), feeds both
    into ``estimate_noise_matrix``, and averages the per-run matrices.

    :param data_params: dataset configuration forwarded to ``get_data``
    :param n_examples: number of examples forwarded to ``get_data``
    :param n_runs: number of noisy-label draws forwarded to ``get_data``
    :param delta_matrix: noise specification forwarded to ``get_data``
    :return: element-wise mean of the per-run noise matrices (np.ndarray)
    """
    train, val, test = get_data(data_params, n_examples, n_runs, delta_matrix)
    X_train, y_train, y_train_tildes = train
    X_val, y_val = val
    X_test, y_test = test

    noise_matrices = []
    for y_train_tilde in y_train_tildes:
        # Noise-robust classifier: only its train-set predictions are needed.
        lnl = cleanlab.classification.LearningWithNoisyLabels(
            clf=LogisticRegression(solver='lbfgs',
                                   multi_class='multinomial',
                                   class_weight='balanced'))
        lnl.fit(X_train, y_train_tilde)
        y_train_pred = lnl.predict(X_train)

        # Plain logistic regression supplies the probability estimates
        # consumed by estimate_noise_matrix.
        lr = LogisticRegression(solver='lbfgs',
                                multi_class='multinomial',
                                class_weight='balanced')
        lr.fit(X_train, y_train_tilde)
        y_pred_proba = lr.predict_proba(X_train)

        # NOTE: the original also computed predict(X_test) for both models,
        # but those results were never used — dropped as dead code.
        noise_matrix = estimate_noise_matrix(X_train, y_train_tilde,
                                             y_train_pred, y_pred_proba)
        noise_matrices.append(noise_matrix)

    return np.mean(noise_matrices, axis=0)
def run(self, sess, network, world, root, iterations=1600, tau=1):
    """Run MCTS-style simulations from ``root`` and return a visit-count policy.

    Each iteration descends the tree with ``self.tree_policy`` on a deep copy
    of ``world``, evaluates the reached leaf with the neural network, and
    backs the predicted value up the tree. Finally the children's visit
    counts, tempered by ``tau``, are normalized into a probability vector.

    :param sess: TensorFlow session used to evaluate the network
    :param network: model exposing ``softmax_policy``, ``pred_Q``, ``obs``
        and ``training_flag`` tensors
    :param world: environment state; deep-copied per iteration so simulations
        do not mutate the caller's copy
    :param root: tree root node; must have no parent
    :param iterations: number of simulations to run
    :param tau: temperature — lower values sharpen the visit-count policy
    :return: np.ndarray of action probabilities summing to 1
    """
    if root.parent is not None:
        raise ValueError("Root's parent must be None.")
    for i in range(iterations):
        # Descend to a leaf; rew/raw_obs come back from the simulated world.
        node, _, rew, raw_obs = find_leaf(root,
                                          self.tree_policy,
                                          world=deepcopy(world))
        # Expectimax-ish
        #reward = node.simulation(world, rew)
        # NOTE(review): get_data appears to slice the last two observations
        # and pick this node's row — confirm against get_data's contract.
        ob = get_data(np.array(raw_obs)[-2:])[node.id:node.id + 1]
        P, v = sess.run(
            [network.softmax_policy, network.pred_Q],
            feed_dict={
                # assumes each observation element exposes a matrix `.A`
                # (e.g. np.matrix) — TODO confirm
                network.obs: np.array([[y.A for y in x] for x in ob]),
                network.training_flag: False
            })
        node.Ps = P[0]          # prior probabilities cached on the leaf
        self.backup(node, v[0][0])  # propagate predicted value to the root
    # Convert visit counts N into a tempered policy.
    pi = [0] * len(root.actions)
    for action in np.arange(len(root.actions)):
        pi[action] = root.children[str(action)].N**(1 / tau)
    pi = np.array(pi)
    return pi / float(np.sum(pi))
def load_model():
    """Restore the trained character model and publish a OneStep sampler.

    Rebuilds the model from the vocabulary derived from 'discord_data.txt',
    loads the epoch-20 checkpoint, and stores the resulting OneStep wrapper
    in the module-level MODEL global.
    """
    global MODEL
    _dataset, ids_from_chars, chars_from_ids = train.get_data(
        'discord_data.txt')
    base_model = train.create_model(ids_from_chars)
    checkpoint_path = os.path.join('./training_checkpoints_discord_2',
                                   "ckpt_{epoch}.ckpt")
    train.restore(base_model, 20, checkpoint_path)
    MODEL = train.OneStep(base_model, chars_from_ids, ids_from_chars)
def classified_data(n_categories=30):
    """Split the test dataset into per-category subsets.

    :param n_categories: number of category ids to extract (default 30,
        matching the original hard-coded value — backward compatible)
    :return: tuple ``(classified_data_list, class_names)`` where element ``i``
        is the rows with ``CATEGORY_ID == i`` and its CATEGORY display name
    """
    total_data = get_data(is_test=True)
    # Columns: ['ASIN', 'FILENAME', 'IMAGE_URL', 'TITLE', 'AUTHOR',
    #           'CATEGORY_ID', 'CATEGORY']
    classified_data_list = []
    class_names = []
    for i in range(n_categories):
        # Filter once per category instead of twice (original evaluated the
        # boolean mask separately for the rows and for the name).
        subset = total_data[total_data.CATEGORY_ID == i]
        classified_data_list.append(subset)
        # NOTE(review): raises IndexError if a category id has no rows —
        # matches original behavior.
        class_names.append(subset.CATEGORY.values[0])
    return classified_data_list, class_names
def main():
    """
    DEPRECATED (only one class)
    Load and validate a model. Generate and save precision and recall scores.
    :return:
    """
    train_labels = train.get_data()
    model_of_interest = "17-7-25/"
    model = train.standard_load_model(model_of_interest)

    # Test-data window and generator configuration.
    batch_size = 30
    valid_l = 28000
    valid_h = 31000
    dm = 512
    predictions = 3
    test_generator = train.ImageSequence(
        train_labels=train_labels[valid_l:valid_h],
        batch_size=batch_size,
        dm=dm,
        start=valid_l,
        predictions=predictions)

    # Confusion matrix indexed as [prediction][actual].
    membership = 0  # column of predicted and actual results to examine
    confusion = np.zeros((2, 2))

    # Build the confusion matrix over all test batches.
    num_batches = len(test_generator)
    for batch in range(num_batches):
        x_test, y_act = test_generator[batch]
        y_act = np.round(y_act)
        y_pred = np.round(model.predict(x_test), 0)
        print(str(batch) + " out of " + str(num_batches))
        for i in range(y_pred.shape[0]):
            # NOTE(review): this produces backwards results for model ants/
            # and model showers/ (original author's observation).
            prediction = int(y_pred[i][membership])
            actual = int(y_act[i][membership])
            confusion[prediction][actual] += 1

    # Performance metrics. NOTE(review): divides by zero if the test window
    # contains no actual/predicted positives — matches original behavior.
    actual_positives = confusion[0][1] + confusion[1][1]
    recall = confusion[1][1] / actual_positives
    predicted_positives = confusion[1][0] + confusion[1][1]
    precision = confusion[1][1] / predicted_positives

    # Save the results. Distinct names for the report rows so the float
    # metrics are no longer shadowed by same-named lists.
    notes = [
        "file: train.csv",
        "range: " + str(valid_l) + ":" + str(valid_h),
        "batch size: " + str(batch_size)
    ]
    precision_row = ["precision: ", str(precision)]
    recall_row = ["recall: ", str(recall)]
    write_performance_single(model_of_interest, confusion, precision_row,
                             recall_row, notes)
def main():
    """Run a small grid search over Adam hyper-parameters on the training labels."""
    learning_rates = [.01, .1, 1]
    beta1_values = [.9]
    beta2_values = [.999]
    epsilon_values = [.1]
    labels = train.get_data()
    search_parameters(learning_rates,
                      beta1_values,
                      beta2_values,
                      epsilon_values,
                      train_labels=labels)
def train(epochs=1, batch_size=128, path=''):
    """Train a GAN on MNIST and save generator visualizations per epoch.

    Alternates discriminator and generator updates each step, using the
    soft-label trick (0.9 real / 0.1 fake) for discriminator targets.

    :param epochs: number of training epochs
    :param batch_size: mini-batch size; also reused as the number of
        mini-batches per epoch (see NOTE below)
    :param path: directory/prefix passed through to visualize_generator
    """
    # Import the MNIST dataset using Keras, will only
    # use the 60,000 training examples.
    (X_train, _), _ = get_data(True)
    # Creating GAN
    generator = make_generator()
    discriminator = make_discriminator()
    adversial_net = make_adversial_network(generator, discriminator)
    visualize_generator(0, generator, path=path)
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}')
        discr_loss = 0
        gen_loss = 0
        # NOTE(review): batch_size doubles as the steps-per-epoch count here
        # — confirm this is intentional rather than len(X_train)//batch_size.
        for _ in tqdm(range(batch_size)):
            noise = generate_latent_noise(batch_size)
            generated_images = generator.predict(noise)
            # Sample a real batch without replacement.
            real_images = X_train[np.random.choice(X_train.shape[0],
                                                   batch_size,
                                                   replace=False)]
            discrimination_data = np.concatenate(
                [real_images, generated_images])
            # Labels for generated and real data, uses soft label trick:
            # first batch_size entries (real images) get 0.9, the rest 0.1.
            discrimination_labels = 0.1 * np.ones(2 * batch_size)
            discrimination_labels[:batch_size] = 0.9
            # To train, we alternate between training just the discriminator
            # and just the generator.
            discriminator.trainable = True
            discr_loss += discriminator.train_on_batch(discrimination_data,
                                                       discrimination_labels)
            # Trick to 'freeze' discriminator weights in adversial_net. Only
            # the generator weights will be changed, which are shared with
            # the generator.
            discriminator.trainable = False
            # N.B, changing the labels because now we want to 'fool' the
            # discriminator.
            gen_loss += adversial_net.train_on_batch(noise,
                                                     np.ones(batch_size))
        # Average losses over the steps run this epoch.
        print(f'Discriminator Loss: {discr_loss/batch_size}')
        print(f'Generator Loss: {gen_loss/batch_size}')
        visualize_generator(epoch + 1, generator, path=path)
def get(self, crypto):
    """Return the last 100 de-normalized values for a crypto as chart points.

    :param crypto: symbol used to locate the CSV file ("<crypto>.csv")
    :return: list of {"x": str(offset_from_end), "y": float(value)} dicts,
        truncated to the most recent 100 entries
    """
    # get_data returns (.., .., normalization factor, normalized series);
    # multiply back to recover the actual values.
    _, _, norm, actual_data = get_data(filename=crypto + ".csv",
                                       num_days_to_predict=1)
    actual_data *= norm
    print(actual_data.shape)
    # enumerate replaces the original manual counter; the unused
    # `result = []` local was removed.
    values = []
    for i, data in enumerate(actual_data):
        values.append({
            "x": str(-actual_data.shape[0] + i),
            "y": float(data[0])
        })
    return values[-100:]
def learn_with_noisy_labels(data_params, n_examples, n_runs, delta_matrix):
    """Compare cleanlab against plain logistic regression under label noise.

    For each noisy label draw, scores three settings on the test set with
    weighted F1: cleanlab's LearningWithNoisyLabels on noisy labels ("rp"),
    logistic regression on noisy labels ("baseline_noisy"), and logistic
    regression on clean labels ("baseline_clean").

    :return: tuple ``(names, results)`` where ``names`` is
        ["rp", "baseline_clean", "baseline_noisy"] and ``results[i]`` is
        ``(mean_f1, std_f1)`` for the matching name
    """
    train, val, test = get_data(data_params, n_examples, n_runs, delta_matrix)
    X_train, y_train, y_train_tildes = train
    X_val, y_val = val
    X_test, y_test = test

    def _make_lr():
        # One shared classifier configuration for all three settings.
        return LogisticRegression(solver='lbfgs',
                                  multi_class='multinomial',
                                  class_weight='balanced')

    def _fit_and_score(clf, y_fit):
        # Fit on the given training labels, score weighted-F1 on the test set.
        clf.fit(X_train, y_fit)
        return f1_score(y_test, clf.predict(X_test), average='weighted')

    rp_scores = []
    baseline_noisy_scores = []
    baseline_clean_scores = []
    for y_train_tilde in y_train_tildes:
        rp_scores.append(
            _fit_and_score(
                cleanlab.classification.LearningWithNoisyLabels(
                    clf=_make_lr()), y_train_tilde))
        baseline_noisy_scores.append(_fit_and_score(_make_lr(),
                                                    y_train_tilde))
        baseline_clean_scores.append(_fit_and_score(_make_lr(), y_train))

    scores = [rp_scores, baseline_clean_scores, baseline_noisy_scores]
    names = ["rp", "baseline_clean", "baseline_noisy"]
    res = [(sum(sc) / len(sc), np.std(np.array(sc))) for sc in scores]
    return names, res
def main():
    """Train and evaluate the LSTM reranking model end to end.

    Loads the preprocessed data bundle, builds LSTMReranking, trains with SGD
    using legacy Keras-1 generator APIs (fit_generator / nb_epoch), restores
    the best checkpoint, and reports test accuracy and wall-clock time.

    Relies on module-level names: batch_size, nb_epoch, MODEL_FILE,
    get_data, get_embedding_matrix, data_generator, LSTMReranking.
    """
    start_time = time()
    data = get_data()
    # Train/dev splits are converted to arrays; test splits are left as-is
    # because they are only consumed through the generator.
    X_train = np.asarray(data['X_train'])
    X_dev = np.asarray(data['X_dev'])
    X_test = data['X_test']
    X_train_feats = np.asarray(data['X_train_feats'])
    X_dev_feats = np.asarray(data['X_dev_feats'])
    X_test_feats = data['X_test_feats']
    y_train = np.asarray(data['y_train'])
    y_dev = np.asarray(data['y_dev'])
    y_test = data['y_test']
    word_index = data['word_index']
    tag_index = data['tag_index']
    feature_sizes = data['feature_sizes']
    tag_size = len(tag_index)
    vocab_size = len(word_index)
    embedding_matrix = get_embedding_matrix(word_index)
    print('\nLoading candidate predictions for training data...')
    print('\nTraining reranking models...')
    model = LSTMReranking(vocab_size, feature_sizes, tag_size,
                          embedding_matrix)
    model.summary()
    sgd = SGD(lr=0.01, momentum=0.7, clipnorm=5)
    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy',
                  metrics=['acc'])
    print('\nTrain...')
    # Checkpoint the best model by validation accuracy; stop after 3 epochs
    # without improvement.
    checkpointer = ModelCheckpoint(MODEL_FILE,
                                   monitor='val_acc',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto')
    early_stopping = EarlyStopping(monitor='val_acc',
                                   min_delta=0,
                                   patience=3,
                                   verbose=1,
                                   mode='auto')
    # NOTE(review): validation_data uses the TEST split, so early stopping
    # and checkpointing select on test accuracy — confirm this is intended.
    # Sample counts are truncated to whole batches.
    model.fit_generator(data_generator(X_train, X_train_feats, y_train,
                                       tag_size, batch_size=batch_size),
                        samples_per_epoch=len(X_train)//batch_size*batch_size,
                        nb_epoch=nb_epoch,
                        verbose=1,
                        callbacks=[checkpointer, early_stopping],
                        validation_data=data_generator(X_test, X_test_feats,
                                                       y_test, tag_size,
                                                       batch_size=batch_size),
                        nb_val_samples=len(X_test)//batch_size*batch_size
                        )
    # Restore the best checkpoint before final evaluation.
    model.load_weights(MODEL_FILE)
    print('\nTesting...')
    _, acc = model.evaluate_generator(data_generator(X_test, X_test_feats,
                                                     y_test, tag_size),
                                      val_samples=len(X_test))
    print('Test accuracy: {}.'.format(acc))
    seconds = time() - start_time
    minutes = seconds / 60
    print('[Finished in {} seconds ({} minutes)]'.format(
        str(round(seconds, 1)), str(round(minutes, 1))))
from datetime import datetime
import os
import tensorflow as tf
import numpy as np
from config import Config
from dataset import MidiDataset
from model import get_model
from train import get_data

if __name__ == "__main__":
    # Restore a trained melody-completion model from a fixed checkpoint and
    # run a single sanity-check prediction.
    # NOTE(review): absolute, machine-specific path — parameterize if reused.
    filepath = "/home/guy/melody_completions/runs/run_20200613_215938/model.33-0.14.h5"
    x_train, y_train, x_test, y_test = get_data(
        base_folder=Config().BASE_FOLDER)
    model = get_model()
    model.load_weights(filepath)
    # Predict on the first training example; the result is not used further.
    y_pred = model.predict(x_train[:1])
def main():
    """Tune learning rate and batch size for a (optionally DP) model.

    Parses CLI arguments, fixes DP-related hyper-parameters, then sweeps
    the lr x batch-size grid, training a fresh model per combination and
    appending per-epoch accuracies to a timestamped CSV under
    experiment_stats/.

    Relies on module-level names: argparse, nn, np, datetime, get_data,
    hybrid_model, train.
    """
    # Pre-trained model
    VALID_ARCH_CHOICES = ("vgg16", "vgg13", "densenet121")
    # Parse command line arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("data_dir",
                    help="Directory containing the dataset (default: data)",
                    default="data",
                    nargs="?")
    ap.add_argument(
        "--arch",
        help="Model architecture from 'torchvision.models' (default: vgg16)",
        choices=VALID_ARCH_CHOICES,
        default=VALID_ARCH_CHOICES[0])
    ap.add_argument(
        "--cpu",
        help="Use CPU (else GPU) for training (default if not set: GPU)",
        action="store_true")
    args = ap.parse_args()
    device = "cpu" if args.cpu else "cuda"
    args.device = device
    # Fixed DP / training hyper-parameters stored on args for downstream use.
    args.noise = 0.25
    args.clip = 1.0
    args.batch_size = 64
    args.hidden_units = 256
    args.delta = 1e-4

    # Loss function (model/optimizer are rebuilt per grid point below).
    criterion = nn.NLLLoss()

    # ===== TUNING ===========================================================
    # Hyperparameters to test
    lr_range = [1e-4]                       # <== choice (enumeration)
    batch_size_range = [32, 16, 8, 2]       # <== choice (enumeration)
    epochs = 30                             # <== choice (1 value=max)
    # Number of iterations for each parameter combination.
    # Renamed from `iter` to avoid shadowing the builtin.
    n_iters = 1                             # <== choice (single value)
    # DP or not DP, that is the question
    args.disable_dp = False                 # <== choice (boolean)
    # ========================================================================

    # File to export results
    dp_or_not = "noDP_" if args.disable_dp else "DP_"
    file = "experiment_stats/accuracy_data_" + dp_or_not
    file += str(datetime.datetime.today()).replace(' ', '_') + ".csv"
    steps = len(lr_range) * len(batch_size_range) * n_iters
    step = 0
    # Write column titles
    with open(file, 'w') as f:
        f.write(
            'learning_rate, batch_size, n_epochs, accuracy, n_times_for_avg\n')

    # Experiment loops
    for lr in lr_range:
        args.learning_rate = lr
        for bs in batch_size_range:
            args.batch_size = bs
            # Load the dataset into a dataloader (default test batch size).
            trainloader, testloader, mapping = get_data(
                data_folder=args.data_dir, batch_size=bs)
            args.sample_size = len(trainloader.dataset)
            accuracy_sum = []
            for _ in range(n_iters):
                # Reset the model for every run so results are independent.
                model, optimizer = hybrid_model(
                    arch=args.arch,
                    hidden_units=args.hidden_units,
                    args=args)
                step += 1
                _, acc = train(
                    model=model,
                    trainloader=trainloader,
                    testloader=testloader,
                    epochs=epochs,
                    print_every=None,
                    criterion=criterion,
                    optimizer=optimizer,
                    device=device,
                    arch=args.arch,
                    model_dir='',
                    serialize=False,
                    detail=False,
                    args=args,
                )
                # Convert per-epoch accuracies to percentages.
                acc = np.multiply(acc, 100)
                accuracy_sum.append(acc)
                print(f' {step}/{steps}\tlr={lr}, bs={bs},')
                for n_epoch, accur in enumerate(acc, start=1):
                    line = f'{lr}, {bs}, {n_epoch}, {accur:.2f}, 1\n'
                    with open(file, 'a') as f:
                        f.write(line)
                    print(f'\t. ×{n_epoch} epoch{"s" if n_epoch > 1 else " "}'
                          f' => accuracy = {accur:.2f}%')
            # Sum up for identical settings, repeated `n_iters` times
            if n_iters > 1:
                acc_avg = np.average(accuracy_sum, axis=0)
                for n_epoch, accur in enumerate(acc_avg, start=1):
                    line = f'{lr}, {bs}, {n_epoch}, {accur:.2f}, {n_iters}\n'
                    with open(file, 'a') as f:
                        f.write(line)
                    print(
                        f'\t\t>>> Average on {n_iters} iterations >>>\tlr={lr}, bs={bs},'
                        f' ×{n_epoch} epoch{"s" if n_epoch > 1 else " "}'
                        f' => accuracy = {accur:.2f}%')
def eval_score(results_file_base, results_file_ext, weights_filename,
               saliency=False):
    '''
    Evaluate a trained model on the score dataset

    Arguments
    - results_file_base: str
        Base results file name. Usually includes run_id but leaves out file extension
    - results_file_ext: str
        Results file extension, excluding the period (e.g. 'results')
    - weights_filename: str
        Filename of saved Tensorflow weights
    - saliency: bool, default = False
        Whether to compute and plot the saliency map
    '''
    # read results of best run
    results_file = results_file_base + '.' + results_file_ext
    results_file_dtypes = results_file + '_dtypes'
    # dtypes_series = pd.read_csv('dtype_series', header=None)
    # dtypes_series = dtypes_series.set_index(0).squeeze()
    # dtypes_dict = dtypes_series.to_dict()
    df = pd.read_csv(results_file, header=0, float_precision='high',
                     sep='\t')  # dtype=dtypes_dict
    # The first row holds the hyper-parameters of the best run.
    series = df.iloc[0]
    params = series.to_dict()
    # Get data
    datasets = ['train', 'test', 'score']
    metrics = ['loss', 'acc', 'auroc', 'auroc_sk']
    X, Y = train.get_data(params, datasets)
    # unpack params
    rand_seed = params['rand_seed']
    kernel_reg_const = params['kernel_reg_const']
    num_features = params['num_features']
    q = params['q']
    # node_array was serialized as a string like "[16, 8]"; parse the layer
    # sizes back into an int array.
    node_array = params['node_array'].split(',')
    for i in range(len(node_array)):
        node_array[i] = int(node_array[i].strip('[] '))
    node_array = np.array(node_array)
    # rebuild model
    model = models.DNN(num_features, node_array, kernel_reg_const, rand_seed)
    # recreate results dict: one empty list per metric per dataset
    loss, acc, auroc, auroc_sk, y_prob = {}, {}, {}, {}, {}
    for res in [loss, acc, auroc, auroc_sk, y_prob]:
        for dataset in datasets:
            res[dataset] = []
    results = {
        'best_index': 0,
        'loss': loss,
        'acc': acc,
        'auroc': auroc,
        'auroc_sk': auroc_sk,
        'y_prob': y_prob
    }
    # restore graph (TF1 session-style API)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, weights_filename)
        # evaluate model on all datasets, including score
        train.evaluate_model(X, Y, model, q, results, datasets, sess)
        for dataset in datasets:
            y_prob = sess.run(model.y_prob,
                              feed_dict={model.x: X[dataset]})
            results['y_prob'][dataset] = y_prob
        # plot ROC curve and save results
        train.plot_ROC(X, Y, results, datasets, results_file_base)
        train.save_results(X, params, results, metrics, datasets,
                           results_file_base)
        # compute and plot saliency map
        if saliency:
            saliency_vecs = train.saliency(X, Y, model, sess)
            train.plot_saliency(saliency_vecs, num_features,
                                results_file_base)
def run_possibilities(dataset_path, logs_path, possibilities):
    """Train and evaluate a DualStudent model for each hyper-parameter tuple.

    Loads the (partially labeled) dataset once, then for every
    ``(consistency_loss, schedule, sigma, consistency_scale,
    stabilization_scale, xi)`` tuple builds a Config, trains a DualStudent
    model, evaluates it on the validation split, and logs hparams/metrics to
    a TensorBoard directory named after the config. Runs whose log directory
    already exists are skipped, making the sweep resumable after crashes.

    Relies on module-level constants (NORMALIZATION, UNLABELED_PERCENTAGE,
    SEED, N_HIDDEN_LAYERS, N_UNITS, N_EPOCHS, BATCH_SIZE, OPTIMIZER,
    SCHEDULE_LENGTH, PADDING_VALUE) and helpers (get_data, timit, Config,
    DualStudent, get_optimizer, get_number_of_classes).
    """
    x_train_labeled, x_train_unlabeled, y_train_labeled, x_val, y_val = get_data(
        dataset_path=dataset_path,
        normalization=NORMALIZATION,
        unlabeled_percentage=UNLABELED_PERCENTAGE,
        seed=SEED)
    _, evaluation_mapping, _ = timit.get_phone_mapping()
    n_classes = get_number_of_classes()
    for consistency_loss, schedule, sigma, consistency_scale, stabilization_scale, xi in possibilities:
        hparams = {
            'consistency_loss': consistency_loss,
            'schedule': schedule,
            'sigma': sigma,
            'consistency_scale': consistency_scale,
            'stabilization_scale': stabilization_scale,
            'xi': xi
        }
        # Echo the combination being run.
        for k, v in hparams.items():
            print(f'{k}={v}, ', end='')
        print()
        config = Config(version='mono_directional',
                        n_hidden_layers=N_HIDDEN_LAYERS,
                        n_units=N_UNITS,
                        n_epochs=N_EPOCHS,
                        batch_size=BATCH_SIZE,
                        unlabeled_percentage=UNLABELED_PERCENTAGE,
                        optimizer=OPTIMIZER,
                        consistency_loss=consistency_loss,
                        consistency_scale=consistency_scale,
                        stabilization_scale=stabilization_scale,
                        xi=xi,
                        sigma=sigma,
                        schedule=schedule,
                        schedule_length=SCHEDULE_LENGTH,
                        normalization=NORMALIZATION,
                        seed=SEED)
        # One log directory per config; its existence marks a finished run.
        logs_path_ = logs_path / str(config)
        if logs_path_.is_dir():  # skip what already done (e.g. in case of crashes)
            print('already done, skipping...')
            continue
        logs_path_.mkdir(parents=True)
        logs_path_ = str(logs_path_)
        model = DualStudent(n_classes=n_classes,
                            n_hidden_layers=config.n_hidden_layers,
                            n_units=config.n_units,
                            consistency_loss=config.consistency_loss,
                            consistency_scale=config.consistency_scale,
                            stabilization_scale=config.stabilization_scale,
                            xi=config.xi,
                            padding_value=PADDING_VALUE,
                            sigma=config.sigma,
                            schedule=config.schedule,
                            schedule_length=config.schedule_length,
                            version=config.version)
        model.compile(optimizer=get_optimizer(config.optimizer))
        model.train(x_labeled=x_train_labeled,
                    x_unlabeled=x_train_unlabeled,
                    y_labeled=y_train_labeled,
                    n_epochs=config.n_epochs,
                    batch_size=config.batch_size,
                    seed=config.seed)
        results = model.test(x=x_val,
                             y=y_val,
                             batch_size=config.batch_size,
                             evaluation_mapping=evaluation_mapping)
        # Record hparams and final metrics for TensorBoard's HParams plugin.
        with tf.summary.create_file_writer(logs_path_).as_default():
            hp.hparams(hparams)
            for k, v in results.items():
                tf.summary.scalar(k, v, step=N_EPOCHS)
import os import urllib.request import numpy as np print("reading yaml") config = yaml.safe_load(open("./config.yml", "r")) print("config loaded") print("Let's start") last_date = datetime.now().date() print("loading model") model = pickle.load(open(config["model"], 'rb')) with open('model.pkl', 'rb') as fin: modelCovid = pickle.load(fin) print("getting data") df = train.get_data() print("init Ok") result = None mapping_reg_dep = pd.read_csv("mapping_region_dep.csv", dtype={ "region": str, "dep": str }) def get_risks(request): """HTTP Cloud Function. Args: request (flask.Request): The request object. <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
from box import Box
from train import get_data

# Build a Box from the training data and display its environment.
data = get_data()
box_instance = Box(data)
# Box environment represented by nested lists.
environment = box_instance.read_box()
box_instance.show_box(environment)
import tensorflow as tf
from train import get_data
from keras.models import load_model
import numpy as np

if __name__ == "__main__":
    # Load the saved model and report its loss/accuracy on the test split.
    model = load_model('my_model.h5')
    _train_X, _train_y, _val_X, _val_y, test_X, test_y = get_data()
    loss, accuracy = model.evaluate(test_X, test_y)
    print(loss, accuracy)
    # Per-sample inspection, kept for debugging:
    #for value, prediction in zip(test_y, model.predict(test_X)):
    #    print(value, prediction, np.mean(np.square(value - prediction)))
from transformers import BertTokenizer import argparse import os import torch if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--ckpt') parser.add_argument('--batch_size', type=int, default=16) parser.add_argument('--branch', type=int, default=0) parser.add_argument('--task', type=int) parser.add_argument('--train_tasks', nargs='+', type=int) args = parser.parse_args() tokenizer = BertTokenizer.from_pretrained('bert-large-cased') test_data = get_data(args.task, 'test') test_loader = DataLoader(test_data, args.batch_size, collate_fn=lambda x: collate(tokenizer, x), pin_memory=True) model = BertMultiTask([get_n_classes(t) for t in args.train_tasks]) model.load_state_dict(torch.load(args.ckpt)) model = model.cuda().eval() all_correct = correct = total = 0 tp = torch.zeros(get_n_classes(args.task)) fp = torch.zeros(get_n_classes(args.task)) fn = torch.zeros(get_n_classes(args.task)) for inputs, labels in tqdm(test_loader):
from train import get_data
from train import train
from train import validate_model
from nltk.tokenize import word_tokenize
import re
import string
import io
import csv

# Preprocess dataset.csv into processed.csv (classification,text per line),
# then train and validate a model on the processed data.
with open('dataset.csv') as f:
    reader = csv.reader(f, delimiter=',')
    # Skip header row
    next(reader)
    with open('processed.csv', 'w', encoding='UTF-8') as new_file:
        for row in reader:
            for messages in row[1:-3]:
                # Remove all non-ASCII characters, then tokenize.
                messages = messages.encode("ascii", "ignore").decode()
                text = preprocess(word_tokenize(messages))
            # NOTE(review): `text` keeps only the LAST message column and is
            # undefined if row[1:-3] is empty — matches original behavior.
            for classification in row[:1]:
                # Write each classification to the new processed file.
                new_file.write(classification + "," + text + "\n")
# Both files are closed automatically by the `with` blocks; the original's
# explicit close() calls were redundant and have been removed.

X_train, X_test, y_train, y_test = get_data()
model = train(X_train, y_train)
validate_model(model, X_test, y_test)