Example No. 1
def run():
    """
    Runs the full data gathering, processing, and modeling pipeline
    """

    # Get raw data
    raw_data = data.get_data()

    # Calculate new cases
    new_cases = data.new_cases(raw_data)

    # Get smoothed data
    original, smoothed = data.smooth_cases(new_cases)

    # Plot original vs smoothed
    plots.original_smoothed(original, smoothed)

    # Get posteriors from optimized Bayesian model
    posterior = model.bayesian_model(smoothed,
                                     p=0.95,
                                     save_posteriors_df=True,
                                     optimize=True)

    # Load the most-likely Rt estimates written to disk, with dates parsed as the index
    posterior = pd.read_csv("data/most_likely_rt.csv",
                            parse_dates=["date"],
                            index_col=["date"])

    # Plot Rt estimate
    plots.plot_rt(posterior, annotate=True)
Example No. 2
def main(args):
    if args.verbose:
        sprint(0, '[ args ]')
        for k in sorted(args.__dict__.keys()):
            sprint(2, "({}): {}".format(k, args.__dict__[k]))
    # fin = os.path.join(args.sdir, args.data + '.pkl')
    # if not args.save and os.path.exists(fin):
    #     sprint(1, '> loading %s' % fin)
    #     with open(fin, 'r') as f:
    #         masks = pkl.load(f)
    #     train, C = get_data(args.data, 'train')
    # else:
    #     train, C = get_data(args.data, 'train')
    #     jdicts = [build(args, t, i) for i,t in enumerate(train)]
    #     masks = map(fmask, [build_mask(jdicts, c) for c in C])
    #     masks = [np.swapaxes(np.swapaxes(m, 0, 2), 1, 2) for m in masks]
    #     # masks = [np.swapaxes(np.swapaxes(build_mask(jdicts, c), 1, 3), 2, 3) for c in C]
    #     # # masks = map(fmask, [build_mask(jdicts, c) for c in C])
    #     # masks = map(fmask, masks)
    #     if args.save:
    #         if not os.path.exists(args.sdir):
    #             sprint(2, '! creating directory %s' % args.sdir)
    #             os.mkdir(args.sdir)
    #         fout = os.path.join(args.sdir, args.data + '.pkl')
    #         sprint(1, '+ writing to %s' % fout)
    #         with open(fout, 'w') as f:
    #             pkl.dump(masks, f)
    train, C = get_data(args.data, 'train')
    X, y = train[0]['X'] / train[0]['X'].max(), train[0]['y']
    # Sum the normalized images of each class into one per-class mask
    masks = [X[[i for i in range(len(y)) if y[i] == c]].sum(axis=0) for c in C]
    sprint(1, '[ calculating stats')
    stats = list(zip(*all_stats(train, masks).reshape(-1, 2)))
    # del train
    return [m.reshape(-1, 28, 28) for m in masks], stats
Example No. 3
def test_new_cases():
    data = get_data()
    cases = new_cases(data)

    assert isinstance(cases, pd.DataFrame)
    assert len(cases.columns) == 1
    assert cases.dtypes["cases"] == int
    assert cases.index.nunique() == len(cases)
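For reference, a minimal sketch of a new_cases helper that would satisfy these assertions and fit the pipeline in Example No. 1; the cumulative-count input and the diff-based approach are assumptions, not the repository's actual implementation.

import pandas as pd


def new_cases(raw_data: pd.DataFrame) -> pd.DataFrame:
    # Assumes raw_data holds a cumulative "cases" column indexed by unique
    # dates; daily new cases are the day-over-day difference.
    return raw_data[["cases"]].diff().dropna().astype(int)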
Example No. 4
def main(
    log_dir,
    model_type,
    vocab_size,
    emb_size,
    batch_size,
    epochs,
    maxlen,
    min_acc,
    num_samples,
    oov_token,
):
    _create_dir_if_not_exist(log_dir)

    (str_X_train, y_train), (str_X_val, y_val), (str_X_test,
                                                 y_test) = get_data()
    test_samples = np.random.choice(str_X_test, num_samples)

    preprocessor = Preprocessor(maxlen=maxlen,
                                vocab_size=vocab_size,
                                oov_token=oov_token)
    preprocessor.fit_on_texts(str_X_train + str_X_val + str_X_test)

    X_train = preprocessor.transform(str_X_train)
    X_val = preprocessor.transform(str_X_val)
    X_test = preprocessor.transform(str_X_test)

    # Define model
    model = get_model(model_type, maxlen, vocab_size, emb_size)
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
    )

    score = model.evaluate(X_test, y_test, verbose=0)
    print("Test accuracy:", score[1])
    assert score[1] > min_acc, \
        f"score doesn't meet the minimum threshold {min_acc}"

    test_requests = _generate_test_requests(model, preprocessor, test_samples)
    _save_json(join(log_dir, "test_requests.json"), test_requests)
    preprocessor.save(join(log_dir, "preprocessor.pkl"))

    # HACK
    # For some reason the SavedModel format does not work with the `lstm` model
    if model_type == "lstm":
        model.save(join(log_dir, "model.h5"))
    else:
        model.save(join(log_dir, "saved_model/model"))
Example No. 5
def main(args, data='train', dir='plot'):
    if args.verbose:
        sprint(0, '[ args ]')
        for k in sorted(args.__dict__.keys()):
            sprint(2, "({}): {}".format(k, args.__dict__[k]))
    train, C = get_data(args.data, 'train')
    jdicts = [build(args, t, i) for i, t in enumerate(train)]
    masks = {c: build_mask(jdicts, c) for c in C}
    plot(args, jdicts, masks)
    return jdicts, masks, train
Example No. 6
def test_smooth_cases():
    data = get_data()
    cases = new_cases(data)
    processed = smooth_cases(cases)

    assert isinstance(processed, tuple)
    assert len(processed) == 2

    original = processed[0]
    smoothed = processed[1]

    assert isinstance(original, pd.DataFrame)
    assert isinstance(smoothed, pd.DataFrame)
    assert original.shape == smoothed.shape
    assert len(cases) != len(smoothed)
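A minimal sketch of a smooth_cases helper consistent with these assertions (a two-DataFrame tuple of equal shape, shorter than the raw input); the 7-day centered rolling mean is an assumption, not the repository's actual implementation.

import pandas as pd


def smooth_cases(cases: pd.DataFrame) -> tuple:
    # Rows without a full 7-day window become NaN and are dropped, so the
    # smoothed frame is shorter than the raw input.
    smoothed = (cases.rolling(7, center=True, min_periods=7)
                     .mean()
                     .round()
                     .dropna())
    original = cases.loc[smoothed.index]
    return original, smoothed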
Example No. 7
def main(args):
    if not os.path.exists(args.params_dir):
        os.makedirs(args.params_dir)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = args.log_level
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.dataset == 'synthetic':
        global N_VOCAB
        train_data, val_data, test_data, N_VOCAB = get_data(args)
        Ss, Ts, Xs, Ys = train_data['Is'], train_data['Ts'], train_data['Xs'], train_data['Ys']
        dSs, dTs, dXs, dYs = val_data['Is'], val_data['Ts'], val_data['Xs'], val_data['Ys']
        tSs, tTs, tXs, tYs = test_data['Is'], test_data['Ts'], test_data['Xs'], test_data['Ys']
    else:
        Ss, Ts, Is, As, Ys, Yis = make_train_dataset()
        dSs, dTs, dIs, dAs, dYs, dYis = make_dev_dataset()
        tSs, tTs, tIs, tAs, tYs, tYis = make_dev_dataset()
        Xs = AtoI(As)
        dXs = AtoI(dAs)
        tXs = AtoI(tAs)

    print('*' * 80)
    print('Train samples: {} | Validation samples: {} | Test samples: {}'.format(len(Ss), len(dSs), len(tSs)))
    print('Vocab size: {}'.format(N_VOCAB))
    print('*' * 80)

    model = build_2d_model(args)
    param_fpath = '../params/params_unet' + str(args.run) + '.h5'
    os.system('rm ' + param_fpath)
    checkpointer = tfk.callbacks.ModelCheckpoint(param_fpath, monitor='val_acc', verbose=0,
                                                 save_weights_only=True, save_best_only=True, mode='max', period=1)
    model.fit(x=[Ts, Xs], y=[Ys], validation_data=([dTs, dXs], [dYs]), batch_size=args.batch_size, epochs=args.epochs, callbacks=[checkpointer])

    # Loads the weights
    model.load_weights(param_fpath)

    # Re-evaluate the model
    print('*' * 80)
    print('Validation set performance')
    model.evaluate([dTs, dXs], [dYs], batch_size=args.batch_size)
    print('*' * 80)
    print('Test set performance')
    print('-' * 80)
    model.evaluate(x=[tTs, tXs], y=[tYs], batch_size=args.batch_size)
    print('*' * 80)
Example No. 8
	parser.add_argument('-p', '--prepick', nargs='*')
	args = parser.parse_args()
	
	if args.sets:
		import src.sets as s
		if 'teams' in args.sets:
			s.make_team_sets()
		if 'player' in args.sets:
			s.make_player_sets()
		if 'picks' in args.sets:
			s.make_picks_sets()

	if args.data:
		import src.data as d
		if 'get' in args.data:
			d.get_data()
		if 'all' in args.data:
			d.get_all_data()
		if 'picks' in args.data:
			d.make_pick_data()
		if 'influ' in args.data:
			d.plot_influential()

	if args.freq:
		import src.freq as f
		if 'phrase' in args.freq:
			f.find_freq_phrases()
		if 'timeline' in args.freq:
			f.plot_timelines()
		if 'top' in args.freq:
			f.plot_freq_bar()
Example No. 9
def train():
    data, word2index, index2word = get_data()
    data = torch.from_numpy(data)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=Config['batch_size'],
                                              shuffle=True)

    model = PoetryModel(len(word2index), Config['embedding_dim'],
                        Config['hidden_dim'])
    if os.path.exists(Config['model_path']):
        model.load_state_dict(torch.load(Config['model_path']))
    model.to(Config['device'])

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=Config['learning_rate'])
    criterion = nn.CrossEntropyLoss()

    print('training...')
    for epoch in range(Config['epochs']):
        log = []  # per-epoch log records
        for index, data in enumerate(data_loader):
            # Transpose to shape (sequence_length, batch_size); sequence_length is max_len
            # contiguous() rewrites the underlying storage in order; otherwise view() would raise an error
            data = data.long().transpose(1,
                                         0).contiguous().to(Config['device'])

            x, y = data[:-1, :], data[1:, :]
            y_, _ = model(x)

            optimizer.zero_grad()
            loss = criterion(y_, y.view(-1))
            loss.backward()
            optimizer.step()
            if (index + 1) % Config['print_every'] == 0:
                print('epoch %d, iter %d, loss %.4f' %
                      (epoch, index + 1, loss.item()))
                temp = {}  # holds the intermediate data for this checkpoint
                temp['epoch'] = epoch
                temp['iter'] = index + 1
                temp['loss'] = loss.item()

                # Continue a poem from each of several different starting characters
                print('普通诗词')
                temp['普通诗词'] = []
                for w in list('山上一把火'):
                    poetry = generate(model, w, word2index, index2word)
                    print(poetry)
                    temp['普通诗词'].append(poetry)

                # Generate an acrostic poem
                print('藏头诗')
                poetry_acrostic = generate_acrostic(model, '我想回校', word2index,
                                                    index2word)
                print(poetry_acrostic)
                temp['藏头诗'] = poetry_acrostic

                log.append(temp)

        with open(os.path.join(Config['log_json_path'],
                               'epoch_%d.json' % epoch),
                  'w',
                  encoding='utf-8',
                  errors='ignore') as f:
            # Convert log to a JSON string and back to a dict before saving
            json.dump(json.loads(json.dumps(log)), f, ensure_ascii=False)

    torch.save(model.state_dict(), Config['model_path'])
Example No. 10
def one_class(c=0):
    train, classes = get_data('mnist', 'train')
    X, y = train[0]['X'], train[0]['y']
    shape = train[0]['shape']
    I = [i for i in range(len(y)) if y[i] == c]
    return one_mask(X[I])
Example No. 11
def main():
    df = data.get_data()
    chart = chart_all(df)
    chart.save(CFG['OutputFile'], embed_options={'actions': False})
Example No. 12
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = adj.coalesce()
    sp_adj = sp.coo_matrix((adj.values(), adj.indices()),
                           shape=list(adj.size()))

    rowsum = np.array(sp_adj.sum(axis=1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    norm_sp_adj = sp_adj.dot(d_mat_inv_sqrt).transpose().dot(
        d_mat_inv_sqrt).tocoo()

    entries = norm_sp_adj.data
    row = norm_sp_adj.row
    col = norm_sp_adj.col
    indices = torch.LongTensor([row, col])
    entries = torch.FloatTensor(entries)
    return sparse.FloatTensor(indices, entries, adj.size())


if __name__ == '__main__':
    from src.data import Corpus, get_data, get_vocabulary, get_labels
    vocab = get_vocabulary('data/20ng-vocabulary.txt')
    labels = get_labels('data/20ng-labels.txt')
    corpus = get_data('data/test.txt', labels)
    clean_text(corpus, vocab)
    adj = build_adj_matrix(corpus, vocab)
    print(adj)
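A hypothetical usage sketch for normalize_adj on a toy 3-node graph; it assumes the module imports numpy as np, scipy.sparse as sp, and torch.sparse as sparse, and it uses the legacy torch.sparse.FloatTensor constructor. This is not code from the repository.

import torch

# Undirected triangle: every node has degree 2, so every normalized
# entry a_ij / sqrt(d_i * d_j) becomes 1/2.
indices = torch.LongTensor([[0, 0, 1, 1, 2, 2],
                            [1, 2, 0, 2, 0, 1]])
values = torch.ones(6)
adj = torch.sparse.FloatTensor(indices, values, torch.Size([3, 3]))

norm_adj = normalize_adj(adj)
print(norm_adj.to_dense())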
Example No. 13
import os, sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random

from src.data import get_data
from src.models import get_model
from src.train.trainer import train
from src.train.test import predict
from src.explainer import explain

#### Model Definition

data = get_data(load_existing=True)
(X_train, X_test, y_train, y_test, feature_names, label_encoder) = data

input_dim, output_dim = X_train.shape[1], y_train.shape[1]
print("input_dim, output_dim:", input_dim, output_dim)
model = get_model(input_dim, output_dim)

model = train(model, data, load_existing=True)

# predict(model, label_encoder, data)

explain(model, data, n_samples=1, submodular_pick=True)
Example No. 14
    model.load_state_dict(torch.load(save_path))
    return train_losses, val_losses


if __name__ == '__main__':
    from src.data import Corpus, get_data, get_vocabulary, get_labels
    from src.preprocessing import clean_text, build_adj_matrix
    from src.models.gcn import GCN

    seed = 0
    val_split = 0.1

    vocab = get_vocabulary('data/20ng-vocabulary.txt')
    labels = get_labels('data/20ng-labels.txt')
    corpus = get_data('data/train-20news.txt', labels)
    test_corpus = get_data('data/test-20news.txt', labels)

    # Mask out unknown words
    clean_text(corpus, vocab)
    clean_text(test_corpus, vocab)

    # Split validation set
    corpus.shuffle(seed)
    len_train = int(len(corpus) * (1 - val_split))
    train_corpus = Corpus(corpus[:len_train])
    val_corpus = Corpus(corpus[len_train:])

    num_documents = len(train_corpus) + len(val_corpus) + len(test_corpus)
    train_adj_matrix = build_adj_matrix(train_corpus,
                                        vocab,
Example No. 15
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from src.model import MobileNet
from src.data import get_data
from src.data import get_image_value

dim = (224, 224)
model_type = 'Mobilenet'
bw = False

train_paths, train_labels = get_data('train')
train_images = np.array([get_image_value(i, dim, bw, model_type) for i in train_paths])
train_dict = dict(images=train_images, labels=train_labels)


valid_paths, valid_labels = get_data('valid')
valid_images = np.array([get_image_value(i, dim, bw, model_type) for i in valid_paths])
valid_dict = dict(images=valid_images, labels=valid_labels)

test_paths, test_labels = get_data('test')
test_images = np.array([get_image_value(i, dim, bw, model_type) for i in test_paths])
test_dict = dict(images=test_images, labels=test_labels)

data_dir = './data/images/syn/'
img_height = 224
img_width = 224
Example No. 16
def test_get_data():
    data = get_data()

    assert isinstance(data, pd.DataFrame)
Example No. 17
from src.train import train
from src.data import get_train_val_test, get_data
import sys

if __name__ == '__main__':
    batch_size = 64
    print(sys.argv)
    model_name = sys.argv[1]
    dataset = sys.argv[2]
    epoch = int(sys.argv[3])
    if dataset == 'fairface' or dataset == 'utkface':
        train_set, val_set, test_set = get_train_val_test(
            batch_size, 'rec/{}'.format(dataset))
    else:
        train_path = sys.argv[4]
        val_path = sys.argv[5]
        train_set = get_data(batch_size, train_path, True)
        val_set = test_set = get_data(batch_size, val_path)
    train(model_name,
          dataset,
          train_set,
          val_set,
          test_set,
          batch_size,
          epoch=epoch)