def get_meta_features():
    ignored = ['svc']
    train_data, test_data, features, target = load_data('data.csv', small=True, part=20)
    train_data = train_data.reset_index(drop=True)
    classifiers = get_classifiers(len(features), ignored=ignored)
    total = time()

    # get meta features
    for clf_name in sorted(classifiers):
        start_time = time()
        print('processing ' + clf_name)
        train_loss, valid_loss = process_clf(classifiers[clf_name], clf_name, train_data, test_data, features, target)
        passed = (time() - start_time) / 60
        print('total (train,valid) Log Loss = (%.5f,%.5f). took %.2f minutes' % (train_loss, valid_loss, passed))

    # average neural nets' outputs
    test_data['meta_net'] = np.zeros(len(test_data))
    train_data['meta_net'] = np.zeros(len(train_data))
    for n in range(TOTAL_NETS):
        col = 'meta_' + 'net' + str(n).zfill(2)
        test_data['meta_net'] += test_data[col]
        train_data['meta_net'] += train_data[col]
        test_data.drop(col, axis=1, inplace=True)
        train_data.drop(col, axis=1, inplace=True)
    test_data['meta_net'] /= TOTAL_NETS
    train_data['meta_net'] /= TOTAL_NETS

    # write to file
    train_data.to_csv('train_meta.csv', index=False)
    test_data.to_csv('test_meta.csv', index=False)
    print('Generating meta features took %.2f minutes' % ((time() - total) / 60))
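# The meta-feature files written above are presumably consumed by a
# second-level (stacking) model. A minimal sketch of that step, assuming the
# generated meta_* columns and the original target column survive into
# train_meta.csv (column names here are illustrative, not from the source):
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train_meta = pd.read_csv('train_meta.csv')
meta_cols = [c for c in train_meta.columns if c.startswith('meta_')]

stacker = LogisticRegression()
stacker.fit(train_meta[meta_cols], train_meta['target'])
train_pred = stacker.predict_proba(train_meta[meta_cols])
print('stacked train Log Loss = %.5f' % log_loss(train_meta['target'], train_pred))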
Example #2
# coding: utf-8

import helper  # Helper functions for saving and loading the model and parameters
import numpy as np
import random
import time
import tensorflow as tf

data_dir = './data/headers_full.txt'
text = helper.load_data(data_dir)

# Model file
load_dir = './models/word_emb'

tokens = {
        ".": "||PERIOD||",
        ",": "||COMMA||",
        '"': "||QUOT_MARK||",
        ";": "||SEMICOL||",
        "!": "||EXCL_MARK||",
        "?": "||QUEST_MARK||",
        "(": "||L_PARENTH||",
        ")": "||R_PARENTH||",
        "--": "||DASH||",
        "\n": "||RETURN||"
    }

for key, token in tokens.items():
    text = text.replace(key, ' {} '.format(token.lower()))

lines = text.split(' ||period||  ')
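# A hypothetical next step that this snippet does not show: build lookup
# tables mapping each word/token to an integer id for the embedding model
# (a sketch, not the original author's code).
from collections import Counter

word_counts = Counter(text.split())
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(sorted_vocab)}
int_to_vocab = {ii: word for word, ii in vocab_to_int.items()}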
Example #3
__author__ = 'harri'

import costs
import helper
import layers
import theano

#Load data
data = helper.load_data(path=None, return_shared=True)
train_X, train_y = data["train"]
val_X, val_y = data["validation"]


#Some useful variables.
batch_size, input_dim = train_X.get_value(borrow=True).shape
hidden_dim = 100
output_dim = 10
learning_rate = 0.1
mini_batch_size = 100
n_epochs = 100

#Build network

hidden_layer = layers.DenseLayer(nonlinearity=theano.tensor.nnet.relu, input_dim=input_dim,
                                 output_dim=hidden_dim, name="hidden0")

output_layer = layers.DenseLayer(nonlinearity=theano.tensor.nnet.softmax, input_dim=hidden_dim,
                                 output_dim=output_dim, name="softmax_layer")

net = layers.NeuralNetwork(layers=[hidden_layer, output_layer])
Example #4
# Because translating the entire body of English text into French would require a great deal of training time, we provide only a small portion of the English corpus.
#

# In[4]:


"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from os.path import isdir
import helper
import problem_unittests as tests

source_path = 'data/small_vocab_en'
target_path = 'data/small_vocab_fr'
source_text = helper.load_data(source_path)
target_text = helper.load_data(target_path)

if not isdir('checkpoints'):
    get_ipython().system('mkdir checkpoints')


# ## Explore the Data
#
# Adjust view_sentence_range to view and become familiar with different parts of the data.
# 

# In[3]:


view_sentence_range = (0, 10)
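# The rest of this cell is cut off; a minimal sketch of how view_sentence_range
# is typically used here (an assumption based on the variables defined above):
source_lines = source_text.split('\n')
target_lines = target_text.split('\n')
for i in range(*view_sentence_range):
    print('small_vocab_en Line {}: {}'.format(i + 1, source_lines[i]))
    print('small_vocab_fr Line {}: {}'.format(i + 1, target_lines[i]))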
Example #5
def load_buggy(project):
    dir = path.join('data', project, 'buggy_changes.json')
    return load_data(dir)
Example #6
def load_annotated(project):
    dir = path.join('data', project, 'commits_changes.json')
    return load_data(dir)
Example #7
def load_data():
    orgs, dummycolumns = helper.load_data(nrows=100)
    events, founders, degrees, orgs = add_features(orgs, nrows=100)
    return orgs, dummycolumns, events, founders, degrees
Example #8
# coding: utf-8

import helper  # Helper functions for saving and loading the model and parameters
import numpy as np
import random
import time
import tensorflow as tf

data_dir = './data/headers_full.txt'
text = helper.load_data(data_dir)

# Model file
load_dir = './models/word_emb'

tokens = {
    ".": "||PERIOD||",
    ",": "||COMMA||",
    '"': "||QUOT_MARK||",
    ";": "||SEMICOL||",
    "!": "||EXCL_MARK||",
    "?": "||QUEST_MARK||",
    "(": "||L_PARENTH||",
    ")": "||R_PARENTH||",
    "--": "||DASH||",
    "\n": "||RETURN||"
}

for key, token in tokens.items():
    text = text.replace(key, ' {} '.format(token.lower()))

lines = text.split(' ||period||  ')
Example #9
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 17 21:38:42 2019

@author: tanma
"""


import helper

codes = helper.load_data('cipher.txt')
plaintext = helper.load_data('plaintext.txt')

from keras.preprocessing.text import Tokenizer


def tokenize(x):
    """
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """

    x_tk = Tokenizer(char_level=True)
    x_tk.fit_on_texts(x)

    return x_tk.texts_to_sequences(x), x_tk

# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
Example #10
    if not os.path.exists(cov_dir):
        os.makedirs(cov_dir)

    logging.basicConfig(
        format='[%(asctime)s] - %(message)s',
        datefmt='%Y/%m/%d %H:%M:%S',
        level=logging.INFO,
        handlers=[
            logging.FileHandler(
                os.path.join(cov_dir, 'output.log')),
            logging.StreamHandler()
        ])

    ## Load benign images from mnist, cifar, or svhn
    x_train, y_train, x_test, y_test = load_data(dataset_name)

    ## Load keras pretrained model for the specific dataset
    model_path = "{}{}/{}.h5".format(MODEL_DIR,
                                    dataset_name, model_name)
    model = load_model(model_path)
    model.summary()

    x_adv_path = "{}x_test.npy".format(adv_dir)
    x_adv = np.load(x_adv_path)

    l = [0, 8]

    xlabel = []
    cov_nc1 = []
    cov_nc2 = []
Example #11
num_trials = 10
model_name = 'cnn-deep'
sigmas = [
    0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.0, 3, 4, 5
]

# save path
results_path = utils.make_directory('../../results', 'initialization_sweep')
params_path = utils.make_directory(results_path, 'model_params')
save_path = utils.make_directory(results_path, 'conv_filters')

#------------------------------------------------------------------------------------------------

# load dataset
data_path = '../../data/synthetic_dataset.h5'
data = helper.load_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = data

file_path = os.path.join(results_path, 'performance_initializations.tsv')
with open(file_path, 'w') as f:
    f.write('%s\t%s\t%s\n' % ('model', 'ave roc', 'ave pr'))

    for activation in activations:
        for sigma in sigmas:
            trial_roc_mean = []
            trial_roc_std = []
            trial_pr_mean = []
            trial_pr_std = []
            for trial in range(num_trials):
                keras.backend.clear_session()
Example #12
def main():
    start = datetime.now()

    # get the data
    train_data = helpers.load_data(numpy_path, 'train_set.npy')
    valid_data = helpers.load_data(numpy_path, 'valid_set.npy')
    test_data = helpers.load_data(numpy_path, 'test_set.npy')

    # filter the data
    test_data_labels = np.array([item[0] for item in test_data[:, 2]])
    test_data_countries = np.array([item[0] for item in test_data[:, 0]])
    test_data_month = test_data[:, 5]

    # convert the data
    train_dataset, train_shape = convert_dataset(train_data,
                                                 batchsize=batchsize,
                                                 shuffle=1000,
                                                 shape=True)
    valid_dataset = convert_dataset(valid_data, batchsize=1000, shuffle=100)
    test_dataset = convert_dataset(test_data, batchsize=1000)

    # build the model
    model = build_model(train_shape[1], train_shape[2])

    # Print Model
    # modelprovider.printModel(model, dir=os.path.join(
    #     logdir, expname), name=expname+".png")

    # compiling the model
    lossfn = loss.crps_cost_function
    opt = Adam(lr=learning_rate, amsgrad=True)
    model.compile(loss=lossfn, optimizer=opt)

    # Load the model if it exists
    checkpoint_dir = os.path.join(logdir, expname, 'checkpoints/')

    # train the model 10 times and average the predictions
    print('[INFO] Starting training')
    predictions = []
    for i in range(1, 11):
        print('Round number: ' + str(i))
        model = build_model(train_shape[1], train_shape[2])

        # compile a new model with fresh initial weights
        model.compile(loss=lossfn, optimizer=opt)

        # checkpoint callbacks
        # all checkpoints
        cp_callback_versuch = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/') +
            "checkpoint_{epoch}",
            monitor='val_loss',
            save_weights_only=True,
            mode='min',
            verbose=0)
        # best checkpoint
        cp_callback = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/checkpoint'),
            monitor='val_loss',
            save_weights_only=True,
            mode='min',
            save_best_only=True,
            verbose=0)

        # train the model
        if train_model:
            model.fit(
                train_dataset,
                epochs=epochs,
                initial_epoch=initial_epochs,
                batch_size=batchsize,
                verbose=1,
                validation_data=valid_dataset,
                validation_batch_size=1000,
                callbacks=[cp_callback, cp_callback_versuch],
            )
        # load the best checkpoint of round i
        model.load_weights(
            os.path.join(checkpoint_dir,
                         'round-' + str(i) + '/checkpoint')).expect_partial()

        predictions.append(
            model.predict(test_dataset, batch_size=1000, verbose=0))

    # convert to numpy array
    predictions = np.array(predictions)
    # Make sure std is positive
    predictions[:, :, 1] = np.abs(predictions[:, :, 1])
    mean_predictions = np.mean(predictions, 0)
    # calculate the score for each record in test set
    test_crps = crps.norm_data(test_data_labels, mean_predictions)

    # print the results with filters
    helpers.printIntCountries(test_data_labels, test_data_countries,
                              mean_predictions)
    helpers.printHist(helpers.datasetPIT(mean_predictions, test_data_labels))

    np.save(os.path.join(logdir, expname, 'prediction'), predictions)
    print(datetime.now() - start)
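# crps.norm_data is not shown in this example; as a point of reference, a
# minimal sketch of the closed-form CRPS of a Gaussian predictive
# distribution (an assumption about what that helper computes):
import numpy as np
from scipy.stats import norm


def crps_gaussian(y, mu, sigma):
    """Closed-form CRPS of N(mu, sigma) at observation y; lower is better."""
    z = (y - mu) / sigma
    return sigma * (z * (2 * norm.cdf(z) - 1) + 2 * norm.pdf(z) - 1 / np.sqrt(np.pi))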
Example #13

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())


# ## Dataset
# We begin by investigating the dataset that will be used to train and evaluate your pipeline.  The most common machine translation datasets come from [WMT](http://www.statmt.org/), but those would take a long time to train a neural network on.  Instead, we'll use a dataset created for this project that contains a small vocabulary, so you can train your model in a reasonable amount of time.
# ### Load Data
# The data is located in `data/small_vocab_en` and `data/small_vocab_fr`. The `small_vocab_en` file contains English sentences, and `small_vocab_fr` contains their French translations. Load the English and French data from these files by running the cell below.

# In[4]:


# Load English data
english_sentences = helper.load_data('data/small_vocab_en')
# Load French data
french_sentences = helper.load_data('data/small_vocab_fr')

print('Dataset Loaded')


# ### Files
# Each line in `small_vocab_en` contains an English sentence, with its French translation on the corresponding line of `small_vocab_fr`.  View the first two lines from each file.

# In[5]:


for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))
Example #14
# Plot per-episode
if __name__ == "__main__" and True:
	# data_sources = helper.gen_find('./data/', '1000eps_RandomWalk-17-states_RandomBinomial-n-7-k-3_RandomAction_*.csv')
	
	data_dir = "../data/"
	run_dir = "1000eps_RandomWalk-17-states_IntToVector-n-17"
	data_sources = helper.gen_find(os.path.join(data_dir, run_dir), "*.csv")

	lm_values = [1 - (1/2)**i for i in range(0, 11)]
	lm_values = [0] # REMOVE

	for data_path in data_sources:
		print("Running experiment on file:", data_path)

		# Get list of observed feature vectors for this dataset
		data = helper.load_data(data_path)
		
		#REMOVE######################
		#data = data + data + data
		#############################
		fvec_lst = helper.get_run_features(data)
		fmapping = helper.get_run_feature_mapping(data)
		num_features = len(data[0][1])

		# Iterate over the various lambda values for the experiment
		for lm in lm_values:
			print("Using lambda=", lm)

			gamma_val = 1
			algo_params = \
			{
Example #15
    # Shift the data in principal component coordinates by dt time steps
    shifted_coords = np.roll(pc_coordinates, -dt, axis=0)
    # subtracting the data from its shifted version gives the velocity at each time step
    velocity = np.diag(cdist(pc_coordinates, shifted_coords))
    # plot velocity against time
    plot_data(np.arange(0, velocity.shape[0] - 1, 1),
              velocity[:-1],
              name="3",
              labels=["Arc length", "velocity"])
    # the velocity data is periodic and repeats roughly every 2000 time steps
    one_period = velocity[:2000]
    # plot a single period
    plot_data(np.arange(0, one_period.shape[0], 1),
              one_period,
              name="3_2",
              labels=["Arc length", "velocity"])
    plt.locator_params(axis='x', nbins=4)

    # arclengths = [np.sum(one_period[:i]) for i in range(one_period.shape[0])]
    # plot_data(np.arange(0, one_period.shape[0],1), arclengths, name="3_3", labels=["Time", "Arc length"])


if __name__ == "__main__":
    mi_timesteps = load_data("MI_timesteps.txt")
    # remove header and burn-in period of first 1000 time steps
    mi_timesteps = mi_timesteps[1001:]
    with InteractiveMode():
        part1(mi_timesteps)
        part2(mi_timesteps)
        part3(dt=1)
Example #16
__author__ = 'harri'
__project__ = 'dds'

import theano
import lasagne
import helper
import numpy as np
import cPickle as cp
import matplotlib.pyplot as plt

#Load/prepare the data
data = helper.load_data()
train_X, train_y = data["train"]
train_y = np.reshape(train_y, (-1,1))
N,d = train_X.shape
train_X = theano.shared(lasagne.utils.floatX(train_X), "train_X")
train_y = theano.shared(lasagne.utils.floatX(train_y), "train_y")
val_X, val_y = data["validation"]
val_y = np.reshape(val_y, (-1,1))
val_X = theano.shared(lasagne.utils.floatX(val_X), "val_X")
val_y = theano.shared(lasagne.utils.floatX(val_y), "val_y")


def get_errors(penalty=0):


    #Build network.
    input_layer = lasagne.layers.InputLayer((None,d), name="input_layer")
    output_layer = lasagne.layers.DenseLayer(incoming=input_layer,num_units=1, name="output_layer", nonlinearity=None)

    #Build cost and symbolic variables.
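    # The example is truncated here; a hypothetical continuation of the cost
    # construction using standard theano/lasagne APIs (a sketch, not the
    # original author's code):
    X = theano.tensor.matrix("X")
    y = theano.tensor.matrix("y")
    prediction = lasagne.layers.get_output(output_layer, X)
    loss = lasagne.objectives.squared_error(prediction, y).mean()
    loss += penalty * lasagne.regularization.regularize_layer_params(
        output_layer, lasagne.regularization.l2)
    params = lasagne.layers.get_all_params(output_layer, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.01)
    train_fn = theano.function([X, y], loss, updates=updates)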
Example #17
from BLSTM_CRF2 import BLSTM_CRF2
from helper import load_data, add_features, prepare_sentence, prepare_tags, prepare_features
import pickle
import torch
from sklearn.metrics import classification_report

inf = open("out/word_to_ix", "rb")
word_to_ix = pickle.load(inf)
inf.close()

with open("out/tag_to_ix", "rb") as inf:
    tag_to_ix = pickle.load(inf)

#load val set
val_path = "data/twitter_ner/validation.txt"
examples = load_data(val_path)
features = add_features(examples)

features_dim = len(features[0][0])
print("feature dim: ", features_dim)
EMBEDDING_DIM = 16
HIDDEN_DIM = 16

model = BLSTM_CRF2(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,
                   features_dim)
model.load_state_dict(torch.load("out/lstm_crf_2/epoch20.hdf5"))

# predict on val set
true_tags = []
pred_tags = []
with torch.no_grad():
Example #18
mode = args.mode
classif_mode = args.classifier
limit_load = False
if args.limit == 1:
	limit_load = True
limit_size = args.limit_size

tok_store = "token_stash2.p"

pp = pprint.PrettyPrinter(indent=4)

proc_arts = None
print(os.path.isfile(tok_store))
if not os.path.isfile(tok_store):
	#Load then preprocess
	articles = helper.load_data(DATA_PATH, limit=limit_load, limit_num=limit_size)
	clean_arts = helper.trim_and_token(articles)

	proc_arts = helper.lang_proc(clean_arts)
	store = open(tok_store, "wb")
	pickle.dump(proc_arts, store)
	store.close()
else:
	store = open(tok_store, "rb")
	proc_arts = pickle.load(store)
	store.close()


if mode == 1:
	helper.run_bag_of_words(proc_arts, classif=classif_mode)
elif mode == 2:
Example #19
    def __init__(self, source_data_path):
        self.data = load_data(source_data_path)