def get_meta_features():
    ignored = ['svc']
    train_data, test_data, features, target = load_data('data.csv', small=True, part=20)
    train_data = train_data.reset_index(drop=True)
    classifiers = get_classifiers(len(features), ignored=ignored)
    total = time()

    # get meta features
    for clf_name in sorted(classifiers):
        start_time = time()
        print('processing ' + clf_name)
        train_loss, valid_loss = process_clf(classifiers[clf_name], clf_name,
                                             train_data, test_data, features, target)
        passed = (time() - start_time) / 60
        print('total (train,valid) Log Loss = (%.5f,%.5f). took %.2f minutes'
              % (train_loss, valid_loss, passed))

    # average neural nets' outputs
    test_data['meta_net'] = np.zeros(len(test_data))
    train_data['meta_net'] = np.zeros(len(train_data))
    for n in range(TOTAL_NETS):
        col = 'meta_net' + str(n).zfill(2)
        test_data['meta_net'] += test_data[col]
        train_data['meta_net'] += train_data[col]
        test_data.drop(col, axis=1, inplace=True)
        train_data.drop(col, axis=1, inplace=True)
    test_data['meta_net'] /= TOTAL_NETS
    train_data['meta_net'] /= TOTAL_NETS

    # write to file
    train_data.to_csv('train_meta.csv', index=False)
    test_data.to_csv('test_meta.csv', index=False)
    print('Generating meta features took %.2f minutes' % ((time() - total) / 60))
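# A minimal sketch (not part of the original pipeline) of how the meta-feature CSVs written
# above could feed a second-level stacking model. The column names other than 'meta_net'
# are assumptions: process_clf presumably adds one 'meta_<clf_name>' column per classifier,
# and 'target' is assumed to be the label column.
import pandas as pd
from sklearn.linear_model import LogisticRegression


def stack_meta_features_example(target_col='target'):
    train_meta = pd.read_csv('train_meta.csv')
    test_meta = pd.read_csv('test_meta.csv')
    # use every first-level prediction column as an input feature
    meta_cols = [c for c in train_meta.columns if c.startswith('meta_')]
    stacker = LogisticRegression()
    stacker.fit(train_meta[meta_cols], train_meta[target_col])
    # probabilities of the second-level model on the test meta features
    return stacker.predict_proba(test_meta[meta_cols])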
# coding: utf-8
import helper  # helper functions for saving and loading the model and its parameters
import numpy as np
import random
import time
import tensorflow as tf

data_dir = './data/headers_full.txt'
text = helper.load_data(data_dir)

# model file
load_dir = './models/word_emb'

tokens = {
    ".": "||PERIOD||",
    ",": "||COMMA||",
    '"': "||QUOT_MARK||",
    ";": "||SEMICOL||",
    "!": "||EXCL_MARK||",
    "?": "||QUEST_MARK||",
    "(": "||L_PARENTH||",
    ")": "||R_PARENTH||",
    "--": "||DASH||",
    "\n": "||RETURN||"
}

# replace punctuation with lower-cased placeholder tokens so it survives word-level tokenization
for key, token in tokens.items():
    text = text.replace(key, ' {} '.format(token.lower()))

lines = text.split(' ||period|| ')
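# A quick illustration (not in the original script) of what the replacement loop above
# produces on a short made-up string.
sample = 'Breaking news, read all about it!\n'
for key, token in tokens.items():
    sample = sample.replace(key, ' {} '.format(token.lower()))
print(sample)  # -> 'Breaking news ||comma||  read all about it ||excl_mark||  ||return|| '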
__author__ = 'harri'

import costs
import helper
import layers
import theano

# Load data
data = helper.load_data(path=None, return_shared=True)
train_X, train_y = data["train"]
val_X, val_y = data["validation"]

# Some useful variables.
batch_size, input_dim = train_X.get_value(borrow=True).shape
hidden_dim = 100
output_dim = 10
learning_rate = 0.1
mini_batch_size = 100
n_epochs = 100

# Build network
hidden_layer = layers.DenseLayer(nonlinearity=theano.tensor.nnet.relu,
                                 input_dim=input_dim, output_dim=hidden_dim,
                                 name="hidden0")
output_layer = layers.DenseLayer(nonlinearity=theano.tensor.nnet.softmax,
                                 input_dim=hidden_dim, output_dim=output_dim,
                                 name="softmax_layer")
net = layers.NeuralNetwork(layers=[hidden_layer, output_layer])
# Because translating the entire English language into French would take a huge amount of
# training time, we provide only a small subset of the English corpus.
#
# In[4]:

"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from os.path import isdir

import helper
import problem_unittests as tests

source_path = 'data/small_vocab_en'
target_path = 'data/small_vocab_fr'
source_text = helper.load_data(source_path)
target_text = helper.load_data(target_path)

if not isdir('checkpoints'):
    get_ipython().system('mkdir checkpoints')


# ## Explore the Data
#
# Play around with view_sentence_range to view and become familiar with different parts of the data.
#
# In[3]:

view_sentence_range = (0, 10)
def load_buggy(project):
    buggy_path = path.join('data', project, 'buggy_changes.json')
    return load_data(buggy_path)
def load_annotated(project):
    annotated_path = path.join('data', project, 'commits_changes.json')
    return load_data(annotated_path)
def load_data():
    orgs, dummycolumns = helper.load_data(nrows=100)
    events, founders, degrees, orgs = add_features(orgs, nrows=100)
    return orgs, dummycolumns, events, founders, degrees
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 17 21:38:42 2019

@author: tanma
"""
import helper

codes = helper.load_data('cipher.txt')
plaintext = helper.load_data('plaintext.txt')

from keras.preprocessing.text import Tokenizer


def tokenize(x):
    """
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    x_tk = Tokenizer(char_level=True)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk


# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
if not os.path.exists(cov_dir):
    os.makedirs(cov_dir)

logging.basicConfig(
    format='[%(asctime)s] - %(message)s',
    datefmt='%Y/%m/%d %H:%M:%S',
    level=logging.INFO,
    handlers=[
        logging.FileHandler(os.path.join(cov_dir, 'output.log')),
        logging.StreamHandler()
    ])

## Load benign images from mnist, cifar, or svhn
x_train, y_train, x_test, y_test = load_data(dataset_name)

## Load keras pretrained model for the specific dataset
model_path = "{}{}/{}.h5".format(MODEL_DIR, dataset_name, model_name)
model = load_model(model_path)
model.summary()

x_adv_path = "{}x_test.npy".format(adv_dir)
x_adv = np.load(x_adv_path)

l = [0, 8]
xlabel = []
cov_nc1 = []
cov_nc2 = []
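## A hedged sketch (not from the original script) of the kind of statistic the cov_nc*
## lists presumably accumulate: the fraction of units whose scaled activation exceeds a
## threshold t on a batch of inputs, assuming a tf.keras model. The function name and
## threshold are illustrative only.
from tensorflow.keras.models import Model


def neuron_coverage(model, x, t=0.5):
    covered, total = 0, 0
    for layer in model.layers:
        if not layer.get_weights():
            continue  # skip layers without parameters (input, flatten, ...)
        # read out this layer's activations on the batch
        sub_model = Model(inputs=model.input, outputs=layer.output)
        acts = sub_model.predict(x).reshape(len(x), -1)
        # min-max scale each unit over the batch, then check whether any input
        # activates it above the threshold
        amin, amax = acts.min(0), acts.max(0)
        scaled = (acts - amin) / (amax - amin + 1e-8)
        covered += int((scaled.max(0) > t).sum())
        total += acts.shape[1]
    return covered / total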
num_trials = 10
model_name = 'cnn-deep'
sigmas = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.0, 3, 4, 5]

# save path
results_path = utils.make_directory('../../results', 'initialization_sweep')
params_path = utils.make_directory(results_path, 'model_params')
save_path = utils.make_directory(results_path, 'conv_filters')

#------------------------------------------------------------------------------------------------

# load dataset
data_path = '../../data/synthetic_dataset.h5'
data = helper.load_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = data

file_path = os.path.join(results_path, 'performance_initializations.tsv')
with open(file_path, 'w') as f:
    f.write('%s\t%s\t%s\n' % ('model', 'ave roc', 'ave pr'))

    for activation in activations:
        for sigma in sigmas:
            trial_roc_mean = []
            trial_roc_std = []
            trial_pr_mean = []
            trial_pr_std = []
            for trial in range(num_trials):
                keras.backend.clear_session()
def main():
    start = datetime.now()

    # get the data
    train_data = helpers.load_data(numpy_path, 'train_set.npy')
    valid_data = helpers.load_data(numpy_path, 'valid_set.npy')
    test_data = helpers.load_data(numpy_path, 'test_set.npy')

    # filter the data
    test_data_labels = np.array([item[0] for item in test_data[:, 2]])
    test_data_countries = np.array([item[0] for item in test_data[:, 0]])
    test_data_month = test_data[:, 5]

    # convert the data
    train_dataset, train_shape = convert_dataset(train_data, batchsize=batchsize,
                                                 shuffle=1000, shape=True)
    valid_dataset = convert_dataset(valid_data, batchsize=1000, shuffle=100)
    test_dataset = convert_dataset(test_data, batchsize=1000)

    # build the model
    model = build_model(train_shape[1], train_shape[2])

    # Print Model
    # modelprovider.printModel(model, dir=os.path.join(
    #     logdir, expname), name=expname + ".png")

    # compile the model
    lossfn = loss.crps_cost_function
    opt = Adam(lr=learning_rate, amsgrad=True)
    model.compile(loss=lossfn, optimizer=opt)

    # load model if it exists
    checkpoint_dir = os.path.join(logdir, expname, 'checkpoints/')

    # train 10 times
    print('[INFO] Starting training')
    predictions = []
    for i in range(1, 11):
        print('Round number: ' + str(i))
        model = build_model(train_shape[1], train_shape[2])
        # compile a new model with new initial weights
        model.compile(loss=lossfn, optimizer=opt)

        # checkpoint callbacks
        # all checkpoints
        cp_callback_versuch = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/') + "checkpoint_{epoch}",
            monitor='val_loss', save_weights_only=True, mode='min', verbose=0)
        # best checkpoint
        cp_callback = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/checkpoint'),
            monitor='val_loss', save_weights_only=True, mode='min',
            save_best_only=True, verbose=0)

        # train the model
        if train_model:
            model.fit(
                train_dataset,
                epochs=epochs,
                initial_epoch=initial_epochs,
                batch_size=batchsize,
                verbose=1,
                validation_data=valid_dataset,
                validation_batch_size=1000,
                callbacks=[cp_callback, cp_callback_versuch],
            )

        # load the best checkpoint of round i
        model.load_weights(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/checkpoint')).expect_partial()
        predictions.append(model.predict(test_dataset, batch_size=1000, verbose=0))

    # convert to numpy array
    predictions = np.array(predictions)
    # make sure std is positive
    predictions[:, :, 1] = np.abs(predictions[:, :, 1])
    mean_predictions = np.mean(predictions, 0)

    # calculate the score for each record in the test set
    test_crps = crps.norm_data(test_data_labels, mean_predictions)

    # print the results with filters
    helpers.printIntCountries(test_data_labels, test_data_countries, mean_predictions)
    helpers.printHist(helpers.datasetPIT(mean_predictions, test_data_labels))

    np.save(os.path.join(logdir, expname, 'prediction'), predictions)
    print(datetime.now() - start)
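# A hedged, stand-alone sketch of the closed-form CRPS of a Gaussian predictive
# distribution, which is what crps.norm_data presumably evaluates given the
# (mean, std) columns of mean_predictions; this helper is an assumption for
# illustration, not the project's own implementation.
import numpy as np
from scipy.stats import norm


def crps_normal(y, mu, sigma):
    """CRPS of N(mu, sigma^2) against observations y (array-like, element-wise)."""
    z = (y - mu) / sigma
    return sigma * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi))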
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())


# ## Dataset
# We begin by investigating the dataset that will be used to train and evaluate your pipeline. The most
# common datasets used for machine translation are from [WMT](http://www.statmt.org/). However, that would
# take a long time to train a neural network on. We'll be using a dataset we created for this project that
# contains a small vocabulary. You'll be able to train your model in a reasonable time with this dataset.

# ### Load Data
# The data is located in `data/small_vocab_en` and `data/small_vocab_fr`. The `small_vocab_en` file contains
# English sentences, with their French translations in the `small_vocab_fr` file. Load the English and French
# data from these files by running the cell below.

# In[4]:

# Load English data
english_sentences = helper.load_data('data/small_vocab_en')
# Load French data
french_sentences = helper.load_data('data/small_vocab_fr')
print('Dataset Loaded')


# ### Files
# Each line in `small_vocab_en` contains an English sentence, with the respective translation in each line of
# `small_vocab_fr`. View the first two lines from each file.

# In[5]:

for sample_i in range(2):
    print('small_vocab_en Line {}: {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}: {}'.format(sample_i + 1, french_sentences[sample_i]))
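# A small sketch (not one of the original cells shown here) to get a feel for how small
# the vocabulary actually is; it only uses the sentence lists loaded above.
import collections

english_words_counter = collections.Counter(
    [word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter(
    [word for sentence in french_sentences for word in sentence.split()])
print('{} unique English words.'.format(len(english_words_counter)))
print('{} unique French words.'.format(len(french_words_counter)))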
# Plot per-episode
if __name__ == "__main__" and True:
    # data_sources = helper.gen_find('./data/', '1000eps_RandomWalk-17-states_RandomBinomial-n-7-k-3_RandomAction_*.csv')
    data_dir = "../data/"
    run_dir = "1000eps_RandomWalk-17-states_IntToVector-n-17"
    data_sources = helper.gen_find(os.path.join(data_dir, run_dir), "*.csv")

    lm_values = [1 - (1/2)**i for i in range(0, 11)]
    lm_values = [0]  # REMOVE

    for data_path in data_sources:
        print("Running experiment on file:", data_path)

        # Get list of observed feature vectors for this dataset
        data = helper.load_data(data_path)
        # REMOVE #####################
        # data = data + data + data
        ##############################
        fvec_lst = helper.get_run_features(data)
        fmapping = helper.get_run_feature_mapping(data)
        num_features = len(data[0][1])

        # Iterate over the various lambda values for the experiment
        for lm in lm_values:
            print("Using lambda=", lm)
            gamma_val = 1
            algo_params = \
                {
# Shift data in principal component coordinates by dt time steps
shifted_coords = np.roll(pc_coordinates, -dt, axis=0)
# subtracting the data from its shifted version gives us the velocity in each time step
velocity = np.diag(cdist(pc_coordinates, shifted_coords))

# plot velocity against time
plot_data(np.arange(0, velocity.shape[0] - 1, 1), velocity[:-1],
          name="3", labels=["Arc length", "velocity"])

# the velocity data is periodic and repeats roughly every 2000 time steps
one_period = velocity[:2000]

# plot a single period
plot_data(np.arange(0, one_period.shape[0], 1), one_period,
          name="3_2", labels=["Arc length", "velocity"])
plt.locator_params(axis='x', nbins=4)

# arclengths = [np.sum(one_period[:i]) for i in range(one_period.shape[0])]
# plot_data(np.arange(0, one_period.shape[0], 1), arclengths, name="3_3", labels=["Time", "Arc length"])


if __name__ == "__main__":
    mi_timesteps = load_data("MI_timesteps.txt")
    # remove header and burn-in period of first 1000 time steps
    mi_timesteps = mi_timesteps[1001:]

    with InteractiveMode():
        part1(mi_timesteps)
        part2(mi_timesteps)
        part3(dt=1)
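# Side note (an observation, not part of the original script): taking the diagonal of the
# full pairwise distance matrix computes n^2 distances only to keep n of them; the same
# per-step velocity can be obtained directly from the row-wise norm of the displacement.
import numpy as np


def stepwise_velocity(coords, dt=1):
    shifted = np.roll(coords, -dt, axis=0)
    # equals np.diag(cdist(coords, shifted)) for the Euclidean metric
    return np.linalg.norm(coords - shifted, axis=1)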
__author__ = 'harri'
__project__ = 'dds'

import theano
import lasagne
import helper
import numpy as np
import cPickle as cp
import matplotlib.pyplot as plt

# Load/prepare the data
data = helper.load_data()

train_X, train_y = data["train"]
train_y = np.reshape(train_y, (-1, 1))
N, d = train_X.shape
train_X = theano.shared(lasagne.utils.floatX(train_X), "train_X")
train_y = theano.shared(lasagne.utils.floatX(train_y), "train_y")

val_X, val_y = data["validation"]
val_y = np.reshape(val_y, (-1, 1))
val_X = theano.shared(lasagne.utils.floatX(val_X), "val_X")
val_y = theano.shared(lasagne.utils.floatX(val_y), "val_y")


def get_errors(penalty=0):
    # Build network.
    input_layer = lasagne.layers.InputLayer((None, d), name="input_layer")
    output_layer = lasagne.layers.DenseLayer(incoming=input_layer, num_units=1,
                                             name="output_layer", nonlinearity=None)
    # Build cost and symbolic variables.
from BLSTM_CRF2 import BLSTM_CRF2
from helper import load_data, add_features, prepare_sentence, prepare_tags, prepare_features
import pickle
import torch
from sklearn.metrics import classification_report

with open("out/word_to_ix", "rb") as inf:
    word_to_ix = pickle.load(inf)

with open("out/tag_to_ix", "rb") as inf:
    tag_to_ix = pickle.load(inf)

# load validation set
val_path = "data/twitter_ner/validation.txt"
examples = load_data(val_path)
features = add_features(examples)
features_dim = len(features[0][0])
print("feature dim: ", features_dim)

EMBEDDING_DIM = 16
HIDDEN_DIM = 16

model = BLSTM_CRF2(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, features_dim)
model.load_state_dict(torch.load("out/lstm_crf_2/epoch20.hdf5"))

# predict on validation set
true_tags = []
pred_tags = []
with torch.no_grad():
mode = args.mode
classif_mode = args.classifier
limit_load = False
if args.limit == 1:
    limit_load = True
limit_size = args.limit_size

tok_store = "token_stash2.p"
pp = pprint.PrettyPrinter(indent=4)
proc_arts = None

print(os.path.isfile(tok_store))

if not os.path.isfile(tok_store):
    # Load then preprocess, and cache the processed articles for later runs
    articles = helper.load_data(DATA_PATH, limit=limit_load, limit_num=limit_size)
    clean_arts = helper.trim_and_token(articles)
    proc_arts = helper.lang_proc(clean_arts)
    with open(tok_store, "wb") as store:
        pickle.dump(proc_arts, store)
else:
    # Reuse the cached token stash
    with open(tok_store, "rb") as store:
        proc_arts = pickle.load(store)

if mode == 1:
    helper.run_bag_of_words(proc_arts, classif=classif_mode)
elif mode == 2:
def __init__(self, source_data_path):
    self.data = load_data(source_data_path)