def run():
    """Runs the full data gathering, processing and modeling pipeline."""
    # Get raw data
    raw_data = data.get_data()

    # Calculate new cases
    new_cases = data.new_cases(raw_data)

    # Get smoothed data
    original, smoothed = data.smooth_cases(new_cases)

    # Plot original vs smoothed
    plots.original_smoothed(original, smoothed)

    # Get posteriors from optimized Bayesian model (also written to disk),
    # then reload the most likely Rt values it saved
    posterior = model.bayesian_model(smoothed, p=0.95, save_posteriors_df=True, optimize=True)
    posterior = pd.read_csv("data/most_likely_rt.csv", parse_dates=["date"], index_col=["date"])

    # Plot Rt estimate
    plots.plot_rt(posterior, annotate=True)
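# Minimal entry-point sketch (an assumption -- the surrounding module is not
# shown here); it simply runs the pipeline above when the file is executed
# directly.
if __name__ == "__main__":
    run()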
def main(args):
    if args.verbose:
        sprint(0, '[ args ]')
        for k in sorted(args.__dict__.keys()):
            sprint(2, "({}): {}".format(k, args.__dict__[k]))
    # fin = os.path.join(args.sdir, args.data + '.pkl')
    # if not args.save and os.path.exists(fin):
    #     sprint(1, '> loading %s' % fin)
    #     with open(fin, 'r') as f:
    #         masks = pkl.load(f)
    #     train, C = get_data(args.data, 'train')
    # else:
    #     train, C = get_data(args.data, 'train')
    #     jdicts = [build(args, t, i) for i, t in enumerate(train)]
    #     masks = map(fmask, [build_mask(jdicts, c) for c in C])
    #     masks = [np.swapaxes(np.swapaxes(m, 0, 2), 1, 2) for m in masks]
    #     # masks = [np.swapaxes(np.swapaxes(build_mask(jdicts, c), 1, 3), 2, 3) for c in C]
    #     # # masks = map(fmask, [build_mask(jdicts, c) for c in C])
    #     # masks = map(fmask, masks)
    # if args.save:
    #     if not os.path.exists(args.sdir):
    #         sprint(2, '! creating directory %s' % args.sdir)
    #         os.mkdir(args.sdir)
    #     fout = os.path.join(args.sdir, args.data + '.pkl')
    #     sprint(1, '+ writing to %s' % fout)
    #     with open(fout, 'w') as f:
    #         pkl.dump(masks, f)
    train, C = get_data(args.data, 'train')
    X, y = train[0]['X'] / train[0]['X'].max(), train[0]['y']
    # Sum the normalized images belonging to each class to build one mask per class
    masks = [sum(X[[i for i in range(len(y)) if y[i] == c]]) for c in C]
    sprint(1, '[ calculating stats')
    stats = list(zip(*all_stats(train, masks).reshape(-1, 2)))
    # del train
    return [m.reshape(-1, 28, 28) for m in masks], stats
def test_new_cases():
    data = get_data()
    cases = new_cases(data)
    assert isinstance(cases, pd.DataFrame)
    assert len(cases.columns) == 1
    assert cases.dtypes["cases"] == int
    assert cases.index.nunique() == len(cases)
def main(
    log_dir,
    model_type,
    vocab_size,
    emb_size,
    batch_size,
    epochs,
    maxlen,
    min_acc,
    num_samples,
    oov_token,
):
    _create_dir_if_not_exist(log_dir)

    (str_X_train, y_train), (str_X_val, y_val), (str_X_test, y_test) = get_data()
    test_samples = np.random.choice(str_X_test, num_samples)

    preprocessor = Preprocessor(maxlen=maxlen, vocab_size=vocab_size, oov_token=oov_token)
    preprocessor.fit_on_texts(str_X_train + str_X_val + str_X_test)
    X_train = preprocessor.transform(str_X_train)
    X_val = preprocessor.transform(str_X_val)
    X_test = preprocessor.transform(str_X_test)

    # Define model
    model = get_model(model_type, maxlen, vocab_size, emb_size)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
    )

    score = model.evaluate(X_test, y_test, verbose=0)
    print("Test accuracy:", score[1])
    assert score[1] > min_acc, f"score doesn't meet the minimum threshold {min_acc}"

    test_requests = _generate_test_requests(model, preprocessor, test_samples)
    _save_json(join(log_dir, "test_requests.json"), test_requests)
    preprocessor.save(join(log_dir, "preprocessor.pkl"))

    # HACK
    # For some reason the SavedModel format is not working with the `lstm` model
    if model_type == "lstm":
        model.save(join(log_dir, "model.h5"))
    else:
        model.save(join(log_dir, "saved_model/model"))
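# Sketch of reloading the exported artifacts for inference. Assumptions:
# Preprocessor.save() pickles the object (so pickle.load can restore it) and
# the model was exported in SavedModel format (the non-lstm branch above).
import pickle
from os.path import join

import tensorflow as tf


def load_for_inference(log_dir):
    with open(join(log_dir, "preprocessor.pkl"), "rb") as f:
        preprocessor = pickle.load(f)
    model = tf.keras.models.load_model(join(log_dir, "saved_model/model"))
    return model, preprocessor


# Example usage (paths and input text are illustrative):
# model, preprocessor = load_for_inference("logs")
# probs = model.predict(preprocessor.transform(["an example review"]))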
def main(args, data='train', dir='plot'):
    if args.verbose:
        sprint(0, '[ args ]')
        for k in sorted(args.__dict__.keys()):
            sprint(2, "({}): {}".format(k, args.__dict__[k]))
    train, C = get_data(args.data, 'train')
    jdicts = [build(args, t, i) for i, t in enumerate(train)]
    masks = {c: build_mask(jdicts, c) for c in C}
    plot(args, jdicts, masks)
    return jdicts, masks, train
def test_smooth_cases():
    data = get_data()
    cases = new_cases(data)
    processed = smooth_cases(cases)
    assert isinstance(processed, tuple)
    assert len(processed) == 2
    original = processed[0]
    smoothed = processed[1]
    assert isinstance(original, pd.DataFrame)
    assert isinstance(smoothed, pd.DataFrame)
    assert original.shape == smoothed.shape
    assert len(cases) != len(smoothed)
def main(args):
    if not os.path.exists(args.params_dir):
        os.makedirs(args.params_dir)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = args.log_level
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.dataset == 'synthetic':
        global N_VOCAB
        train_data, val_data, test_data, N_VOCAB = get_data(args)
        Ss, Ts, Xs, Ys = train_data['Is'], train_data['Ts'], train_data['Xs'], train_data['Ys']
        dSs, dTs, dXs, dYs = val_data['Is'], val_data['Ts'], val_data['Xs'], val_data['Ys']
        tSs, tTs, tXs, tYs = test_data['Is'], test_data['Ts'], test_data['Xs'], test_data['Ys']
    else:
        Ss, Ts, Is, As, Ys, Yis = make_train_dataset()
        dSs, dTs, dIs, dAs, dYs, dYis = make_dev_dataset()
        tSs, tTs, tIs, tAs, tYs, tYis = make_dev_dataset()
        Xs = AtoI(As)
        dXs = AtoI(dAs)
        tXs = AtoI(tAs)

    print('*' * 80)
    print('Train samples: {} | Validation samples: {} | Test samples: {}'.format(len(Ss), len(dSs), len(tSs)))
    print('Vocab size: {}'.format(N_VOCAB))
    print('*' * 80)

    model = build_2d_model(args)

    param_fpath = '../params/params_unet' + str(args.run) + '.h5'
    os.system('rm ' + param_fpath)
    checkpointer = tfk.callbacks.ModelCheckpoint(param_fpath, monitor='val_acc', verbose=0,
                                                 save_weights_only=True, save_best_only=True,
                                                 mode='max', period=1)

    model.fit(x=[Ts, Xs], y=[Ys],
              validation_data=([dTs, dXs], [dYs]),
              batch_size=args.batch_size, epochs=args.epochs,
              callbacks=[checkpointer])

    # Load the best checkpointed weights
    model.load_weights(param_fpath)

    # Re-evaluate the model
    print('*' * 80)
    print('Validation set performance')
    model.evaluate([dTs, dXs], [dYs], batch_size=args.batch_size)
    print('*' * 80)
    print('Test set performance')
    print('-' * 80)
    model.evaluate(x=[tTs, tXs], y=[tYs], batch_size=args.batch_size)
    print('*' * 80)
parser.add_argument('-p', '--prepick', nargs='*')
args = parser.parse_args()

if args.sets:
    import src.sets as s
    if 'teams' in args.sets:
        s.make_team_sets()
    if 'player' in args.sets:
        s.make_player_sets()
    if 'picks' in args.sets:
        s.make_picks_sets()

if args.data:
    import src.data as d
    if 'get' in args.data:
        d.get_data()
    if 'all' in args.data:
        d.get_all_data()
    if 'picks' in args.data:
        d.make_pick_data()
    if 'influ' in args.data:
        d.plot_influential()

if args.freq:
    import src.freq as f
    if 'phrase' in args.freq:
        f.find_freq_phrases()
    if 'timeline' in args.freq:
        f.plot_timelines()
    if 'top' in args.freq:
        f.plot_freq_bar()
def train():
    data, word2index, index2word = get_data()
    data = torch.from_numpy(data)
    data_loader = torch.utils.data.DataLoader(data, batch_size=Config['batch_size'], shuffle=True)

    model = PoetryModel(len(word2index), Config['embedding_dim'], Config['hidden_dim'])
    if os.path.exists(Config['model_path']):
        model.load_state_dict(torch.load(Config['model_path']))
    model.to(Config['device'])

    optimizer = torch.optim.Adam(model.parameters(), lr=Config['learning_rate'])
    criterion = nn.CrossEntropyLoss()

    print('training...')
    for epoch in range(Config['epochs']):
        log = []  # per-epoch log entries
        for index, data in enumerate(data_loader):
            # Transpose to shape (sequence_length, batch_size), where sequence_length is max_len.
            # contiguous() rewrites the underlying storage in order; without it view() would raise.
            data = data.long().transpose(1, 0).contiguous().to(Config['device'])
            x, y = data[:-1, :], data[1:, :]
            y_, _ = model(x)
            optimizer.zero_grad()
            loss = criterion(y_, y.view(-1))
            loss.backward()
            optimizer.step()

            if (index + 1) % Config['print_every'] == 0:
                print('epoch %d, iter %d, loss %.4f' % (epoch, index + 1, loss.item()))
                temp = {}  # holds the intermediate results for this logging step
                temp['epoch'] = epoch
                temp['iter'] = index + 1
                temp['loss'] = loss.item()

                # Continue a poem from each of several starting characters
                print('普通诗词')  # "ordinary poem"
                temp['普通诗词'] = []
                for w in list('山上一把火'):
                    poetry = generate(model, w, word2index, index2word)
                    print(poetry)
                    temp['普通诗词'].append(poetry)

                # Generate an acrostic poem
                print('藏头诗')  # "acrostic poem"
                poetry_acrostic = generate_acrostic(model, '我想回校', word2index, index2word)
                print(poetry_acrostic)
                temp['藏头诗'] = poetry_acrostic

                log.append(temp)

        with open(os.path.join(Config['log_json_path'], 'epoch_%d.json' % epoch),
                  'w', encoding='utf-8', errors='ignore') as f:
            # Round-trip log through dumps/loads before saving
            json.dump(json.loads(json.dumps(log)), f, ensure_ascii=False)
        torch.save(model.state_dict(), Config['model_path'])
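# Sketch of sampling from a saved checkpoint without re-training. It reuses
# only names that appear above (get_data, PoetryModel, Config, generate,
# generate_acrostic); the starting characters are just illustrative.
def sample_from_checkpoint(start_word='月', acrostic_head='春夏秋冬'):
    _, word2index, index2word = get_data()
    model = PoetryModel(len(word2index), Config['embedding_dim'], Config['hidden_dim'])
    model.load_state_dict(torch.load(Config['model_path'], map_location=Config['device']))
    model.to(Config['device'])
    # Continue a poem from a single starting character
    print(generate(model, start_word, word2index, index2word))
    # Generate an acrostic poem from the given head characters
    print(generate_acrostic(model, acrostic_head, word2index, index2word))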
def one_class(c=0):
    train, classes = get_data('mnist', 'train')
    X, y = train[0]['X'], train[0]['y']
    shape = train[0]['shape']
    # Indices of the samples belonging to class c
    I = [i for i in range(len(y)) if y[i] == c]
    return one_mask(X[I])
def main():
    df = data.get_data()
    chart = chart_all(df)
    chart.save(CFG['OutputFile'], embed_options={'actions': False})
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = adj.coalesce()
    sp_adj = sp.coo_matrix((adj.values(), adj.indices()), shape=list(adj.size()))
    rowsum = np.array(sp_adj.sum(axis=1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    norm_sp_adj = sp_adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
    entries = norm_sp_adj.data
    row = norm_sp_adj.row
    col = norm_sp_adj.col
    indices = torch.LongTensor([row, col])
    entries = torch.FloatTensor(entries)
    return sparse.FloatTensor(indices, entries, adj.size())


if __name__ == '__main__':
    from src.data import Corpus, get_data, get_vocabulary, get_labels

    vocab = get_vocabulary('data/20ng-vocabulary.txt')
    labels = get_labels('data/20ng-labels.txt')
    corpus = get_data('data/test.txt', labels)
    clean_text(corpus, vocab)
    adj = build_adj_matrix(corpus, vocab)
    print(adj)
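# Small usage sketch for normalize_adj (assumes the same imports as the
# function above: scipy.sparse as sp, numpy as np, torch, and torch.sparse as
# sparse). It symmetrically normalizes a toy 3-node adjacency as
# D^(-1/2) * A * D^(-1/2).
import torch

# Undirected toy graph with edges (0,1) and (1,2), stored as a sparse COO tensor
toy_indices = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])
toy_values = torch.ones(4)
toy_adj = torch.sparse_coo_tensor(toy_indices, toy_values, (3, 3))

normalized = normalize_adj(toy_adj)
print(normalized.to_dense())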
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random

from src.data import get_data
from src.models import get_model
from src.train.trainer import train
from src.train.test import predict
from src.explainer import explain

#### Model Definition
data = get_data(load_existing=True)
(X_train, X_test, y_train, y_test, feature_names, label_encoder) = data
input_dim, output_dim = X_train.shape[1], y_train.shape[1]
print("input_dim, output_dim:", input_dim, output_dim)

model = get_model(input_dim, output_dim)
model = train(model, data, load_existing=True)
# predict(model, label_encoder, data)
explain(model, data, n_samples=1, submodular_pick=True)
    model.load_state_dict(torch.load(save_path))
    return train_losses, val_losses


if __name__ == '__main__':
    from src.data import Corpus, get_data, get_vocabulary, get_labels
    from src.preprocessing import clean_text, build_adj_matrix
    from src.models.gcn import GCN

    seed = 0
    val_split = 0.1

    vocab = get_vocabulary('data/20ng-vocabulary.txt')
    labels = get_labels('data/20ng-labels.txt')
    corpus = get_data('data/train-20news.txt', labels)
    test_corpus = get_data('data/test-20news.txt', labels)

    # Mask out unknown words
    clean_text(corpus, vocab)
    clean_text(test_corpus, vocab)

    # Split validation set
    corpus.shuffle(seed)
    len_train = int(len(corpus) * (1 - val_split))
    train_corpus = Corpus(corpus[:len_train])
    val_corpus = Corpus(corpus[len_train:])

    num_documents = len(train_corpus) + len(val_corpus) + len(test_corpus)
    train_adj_matrix = build_adj_matrix(train_corpus, vocab,
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from src.model import MobileNet
from src.data import get_data
from src.data import get_image_value

dim = (224, 224)
model_type = 'Mobilenet'
bw = False

train_paths, train_labels = get_data('train')
train_images = np.array([get_image_value(i, dim, bw, model_type) for i in train_paths])
train_dict = dict(images=train_images, labels=train_labels)

valid_paths, valid_labels = get_data('valid')
valid_images = np.array([get_image_value(i, dim, bw, model_type) for i in valid_paths])
valid_dict = dict(images=valid_images, labels=valid_labels)

test_paths, test_labels = get_data('test')
test_images = np.array([get_image_value(i, dim, bw, model_type) for i in test_paths])
test_dict = dict(images=test_images, labels=test_labels)

data_dir = './data/images/syn/'
img_height = 224
img_width = 224
def test_get_data():
    data = get_data()
    assert isinstance(data, pd.DataFrame)
from src.train import train
from src.data import get_train_val_test, get_data
import sys

if __name__ == '__main__':
    batch_size = 64
    print(sys.argv)
    model_name = sys.argv[1]
    dataset = sys.argv[2]
    epoch = int(sys.argv[3])

    if dataset == 'fairface' or dataset == 'utkface':
        train_set, val_set, test_set = get_train_val_test(
            batch_size, 'rec/{}'.format(dataset))
    else:
        train_path = sys.argv[4]
        val_path = sys.argv[5]
        train_set = get_data(batch_size, train_path, True)
        val_set = test_set = get_data(batch_size, val_path)

    train(model_name, dataset, train_set, val_set, test_set, batch_size, epoch=epoch)
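# Example invocations (the script name "main.py" and the model name "resnet"
# are assumptions; the positional arguments follow the sys.argv parsing above:
# model name, dataset, epochs, and -- for custom datasets -- train/val paths):
#
#   python main.py resnet fairface 20
#   python main.py resnet custom 20 rec/custom_train rec/custom_val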