def benchmark(model, board, test=True):
    """Evaluate a split model on one or more (round, location, board) configs.

    Args:
        model: split model exposing ``score(sensors, env, board_ids, targets)``.
        board: a single config tuple, or a list of them; the board id is the
            last element of each tuple.
        test: if True score the held-out split, otherwise the training split.

    Returns:
        dict mapping "<gas> <metric>" (NO2/O3) to the metric value.
    """
    idx = 1 if test else 0
    if isinstance(board, list):
        chunks = []
        for t in board:
            d = load(*t)[idx]
            # Fixed defect: each chunk is tagged with its own board id so the
            # split model routes rows through the correct per-board sensor
            # model.  Previously every row was tagged with the last id seen.
            d['board'] = t[-1]
            chunks.append(d)
        data = pd.concat(chunks)
    else:
        data = load(*board)[idx]
        data['board'] = board[-1]
    scores, _ = model.score(data[sensor_features], data[env_features],
                            data['board'], data[Y_features])
    result = {}
    for i, gas in enumerate(["NO2", "O3"]):
        for score, value in scores.items():
            result["%s %s" % (gas, score)] = value[i]
    return result
def benchmark(model, test, train=False, dump_preds=None):
    """Score a model on one or more configs, optionally dumping predictions.

    Args:
        model: fitted model exposing ``score(X, Y) -> (scores, preds)``; may
            carry a ``features`` attribute overriding the default X_features.
        test: a single (round, location, board) tuple or a list of them.
        train: if True evaluate on the training split instead of test.
        dump_preds: optional path; when given, the data plus per-gas
            predictions are written there as CSV.

    Returns:
        dict mapping "<gas> <metric>" (NO2/O3) to the metric value.
    """
    idx = 0 if train else 1
    if isinstance(test, list):
        data = pd.concat([load(*t)[idx] for t in test])
    else:
        data = load(*test)[idx]
    if hasattr(model, 'features') and model.features is not None:
        features = model.features
    else:
        features = X_features
    scores, preds = model.score(data[features], data[Y_features])
    if dump_preds is not None:
        dump_preds = Path(dump_preds)
        # mkdir(parents=True, exist_ok=True) replaces the racy
        # exists()-then-makedirs sequence in one call.
        dump_preds.parent.mkdir(parents=True, exist_ok=True)
        df = data.copy()
        df['preds-no2'] = preds[:, 0]
        df['preds-o3'] = preds[:, 1]
        # to_csv(path) writes directly; no need to build the string manually.
        df.to_csv(dump_preds)
    result = {}
    for i, gas in enumerate(["NO2", "O3"]):
        for score, value in scores.items():
            result["%s %s" % (gas, score)] = value[i]
    return result
def analyze_tree(config, tree):
    """Measure per-gas leaf variance of the first forest member on two datasets.

    Returns:
        np.ndarray of shape (len(Y_features), 2) — per gas, the variance
        statistic for the donovan and elcajon datasets respectively.
    """
    test_data0 = load(2, 'donovan', 18)[0]
    test_data1 = load(3, 'elcajon', 18)[0]
    variances = []
    for i, gas in enumerate(Y_features):
        # Fixed defect: DataFrame.as_matrix() was removed in pandas 1.0;
        # .to_numpy() is the supported equivalent.
        leaves = tree.models[0].estimators_[i].apply(
            test_data0[X_features].to_numpy().astype(np.float32))
        tree_var0 = analyze_tree_result(leaves, test_data0[gas])
        # NOTE(review): leaf assignments are computed from test_data0 but
        # paired with test_data1's targets here — confirm this is intentional
        # (row counts of the two datasets must agree for this to be valid).
        tree_var1 = analyze_tree_result(leaves, test_data1[gas])
        variances.append([tree_var0, tree_var1])
    return np.stack(variances)
def benchmark(model, test, train=False):
    """Score a model on the train or test split of one or more configs.

    Args:
        model: fitted model with ``score(X, Y)``; an optional ``features``
            attribute overrides the default X_features.
        test: a single (round, location, board) tuple or a list of them.
        train: when True, evaluate on the training split.

    Returns:
        The model's scores stacked into a single numpy array.
    """
    split = 0 if train else 1
    if isinstance(test, list):
        frames = [load(*cfg)[split] for cfg in test]
        data = pd.concat(frames)
    else:
        data = load(*test)[split]
    feats = X_features
    if hasattr(model, 'features') and model.features is not None:
        feats = model.features
    score = model.score(data[feats], data[Y_features])
    return np.stack(score)
def level2(out_dir, X_features):
    """Train level-2 models: per board, hold out one (round, location) pair.

    For every board that appears in all three deployments, fits one model per
    leave-one-out training configuration and pickles it via ``fs``.
    """
    out_path = out_dir / 'level2' / 'models'
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    for board_id in tqdm.tqdm(boards):
        print("Training board:", board_id)
        print(boards[board_id])
        # Only boards present in all three deployments can be held out.
        if len(boards[board_id]) != 3:
            continue
        for test_config in boards[board_id]:
            train_config = boards[board_id] - {test_config}
            frames = [load(rnd, loc, board_id)[0] for rnd, loc in train_config]
            data = pd.concat(frames)
            name = 'board%u_%s.pkl' % (
                board_id, '-'.join(map(str, list(train_config))))
            with fs.open(str(out_path / name), 'wb') as fp:
                joblib.dump(
                    ((board_id, train_config),
                     Model(X_features).fit(data[X_features], data[Y_features])),
                    fp)
def main(): """ Train a CO neural network model and save as a pickle file. Note: must add this line to metasense-transfer/metasense/data.py on line 9 data['epa-co'] = data['co'] """ X_features = ['no2', 'o3', 'co', 'temperature', 'absolute-humidity', \ 'pressure'] Y_features = ['epa-co'] nn_model = NeuralNetwork(X_features, nn.Relu(len(X_features[:]), 200) >> \ nn.Relu(200) >> nn.Relu(200) >> nn.Relu(200) >> nn.Linear(1)) round_num = 3 location = 'elcajon' board = 18 train, test = load(round_num, location, board) nn_model.fit(train[X_features], train[Y_features]) joblib.dump((board, nn_model), PICKLE_PATH) print(f'Saved pickle of model to {PICKLE_PATH}') """ print predictions predictions = nn_model.predict(test[X_features]) print('----Predictions on test set----') print(predictions) print('----EPA data for test set----') print(test[Y_features]) """ """ print testing error
def train(out_dir, dim, seed, load_model=None):
    """Train (or resume) a SplitModel over all boards.

    Args:
        out_dir: output Path; models are written under out_dir / 'models'.
        dim: latent dimension of the per-board sensor models.
        seed: unused here; kept for interface compatibility.
        load_model: optional path of a pickled model to resume from.

    NOTE(review): depends on the module-level ``args`` namespace
    (lr, batch_size, round, location, board, num_iters).
    """
    (out_dir / 'models').mkdir(exist_ok=True, parents=True)
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    if load_model is None:
        sensor_models = {
            # board_id: nn.Relu(100) >> nn.Relu(100) >> nn.Linear(dim) for board_id in boards
            board_id: nn.Linear(3, dim) for board_id in boards
        }
        calibration_model = nn.Relu(dim + 3, 50) >> nn.Relu(50) >> nn.Linear(2)
        split_model = SplitModel(sensor_models, calibration_model,
                                 log_dir=out_dir, lr=args.lr,
                                 batch_size=args.batch_size)
    else:
        split_model = joblib.load(load_model)
    data = {}
    print("Filtering round: %s" % args.round)
    print("Filtering location: %s" % args.location)
    for board_id in boards:
        board_train = []
        for round, location in boards[board_id]:
            # Skip the configuration held out via command-line args.
            if (args.round, args.location, args.board) == (round, location, board_id):
                print("Removing: ", round, location, board_id)
                continue
            board_train.append(load(round, location, board_id)[0])
        if len(board_train) > 0:
            print("Loaded board[%u]: %u" % (board_id, len(board_train)))
            board_train = pd.concat(board_train)
            board_train['board'] = board_id
            if board_id not in data:
                data[board_id] = []
            data[board_id].append(board_train)
    data = [pd.concat(ds) for ds in data.values()]
    # Oversample smaller boards so each board contributes equally.
    max_size = max([d.shape[0] for d in data])
    for i in range(len(data)):
        d = data[i]
        if d.shape[0] < max_size:
            # Fixed defect: DataFrame.append was removed in pandas 2.0;
            # pd.concat is the supported equivalent.
            data[i] = pd.concat(
                [d, d.sample(max_size - d.shape[0], replace=True)])
    data = pd.concat(data)
    split_model.fit(data[sensor_features], data[env_features], data['board'],
                    data[Y_features],
                    dump_every=(out_dir / 'models' / 'model_latest.pkl', 1000),
                    n_iters=args.num_iters)
    joblib.dump(split_model, out_dir / 'models' / 'model.pkl')
def level1(out_dir, X_features):
    """Fit one level-1 model per (round, location, board) and pickle each."""
    (out_dir / 'level1' / 'models').mkdir(exist_ok=True, parents=True)
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                key = (round, location, board_id)
                print("Training: Round %u - %s - Board %u" % key)
                train, _ = load(round, location, board_id)
                fitted = Model(X_features).fit(
                    train[X_features], train[Y_features])
                target = out_dir / 'level1' / 'models' / (
                    'round%u_%s_board%u.pkl' % key)
                joblib.dump((key, fitted), target)
def level1(out_dir, X_features):
    """Fit a per-deployment model for every board and pickle it via fs."""
    out_path = out_dir / 'level1' / 'models'
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                train, _ = load(round, location, board_id)
                print("Training:", round, location, board_id)
                fname = 'round%u_%s_board%u.pkl' % (round, location, board_id)
                with fs.open(str(out_path / fname), 'wb') as fp:
                    joblib.dump(
                        ((round, location, board_id),
                         Model(X_features).fit(
                             train[X_features], train[Y_features])),
                        fp)
def level3(out_dir, X_features, seed):
    """Train one level-3 model per board on all its deployments pooled."""
    (out_dir / 'level3' / 'models').mkdir(exist_ok=True, parents=True)
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    for board_id in tqdm.tqdm(boards):
        splits = [load(rnd, loc, board_id) for rnd, loc in boards[board_id]]
        train_data = pd.concat([s[0] for s in splits])
        fitted = Model(X_features).fit(
            train_data[X_features], train_data[Y_features])
        joblib.dump((board_id, fitted),
                    out_dir / 'level3' / 'models' / ('board%u.pkl' % board_id))
def benchmark(model, board, test=True):
    """Score a split model on one or more configs; return stacked scores.

    Args:
        model: split model exposing ``score(sensors, env, board_ids, targets)``.
        board: a single (round, location, board) tuple or a list of them;
            the board id is the last element of each tuple.
        test: if True score the held-out split, otherwise the training split.

    Returns:
        The model's scores stacked into a single numpy array.
    """
    idx = 1 if test else 0
    if isinstance(board, list):
        chunks = []
        for t in board:
            d = load(*t)[idx]
            # Fixed defect: tag each chunk with its own board id so per-board
            # sensor models are selected correctly (previously every row was
            # tagged with the last board id in the list).
            d['board'] = t[-1]
            chunks.append(d)
        data = pd.concat(chunks)
    else:
        data = load(*board)[idx]
        data['board'] = board[-1]
    score = model.score(data[sensor_features], data[env_features],
                        data['board'], data[Y_features])
    return np.stack(score)
def level3(out_dir, X_features, seed):
    """Train one model per board on all of its deployments, stored via fs."""
    out_path = out_dir / 'level3' / 'models'
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    for board_id in tqdm.tqdm(boards):
        splits = [load(rnd, loc, board_id) for rnd, loc in boards[board_id]]
        train_data = pd.concat([s[0] for s in splits])
        with fs.open(str(out_path / ('board%u.pkl' % board_id)), 'wb') as fp:
            joblib.dump(
                (board_id,
                 Model(X_features).fit(
                     train_data[X_features], train_data[Y_features])),
                fp)
import joblib
from pathlib import Path

from metasense import BOARD_CONFIGURATION as DATA
from metasense.data import load
from metasense.models import SubuForest

X_features = ['no2', 'o3', 'co', 'temperature', 'humidity', 'pressure']
Y_features = ['epa-no2', 'epa-o3']

out_dir = Path('results') / 'subu'
(out_dir / 'level1' / 'models').mkdir(exist_ok=True, parents=True)
(out_dir / 'level2' / 'models').mkdir(exist_ok=True, parents=True)

# Fixed defect: MODELS was referenced below without ever being defined,
# which raised a NameError on first use.
MODELS = {}

for round in DATA:
    if round not in MODELS:
        MODELS[round] = {}
    for location in DATA[round]:
        if location not in MODELS[round]:
            MODELS[round][location] = {}
        for board_id in DATA[round][location]:
            print("Training: Round %u - %s - Board %u" % (round, location, board_id))
            # NOTE(review): sibling scripts index load(...)[0]/[1] for the
            # train/test splits; confirm that fitting on the raw return value
            # here is intentional.
            data = load(round, location, board_id)
            joblib.dump(
                ((round, location, board_id),
                 SubuForest().fit(data[X_features], data[Y_features])),
                out_dir / 'level1' / 'models' /
                ('round%u_%s_board%u.pkl' % (round, location, board_id)))
def fit_nn(X_data, Y_data, net, batch_size=20, n_iters=100000):
    """Run minibatch training against the module-level TF graph.

    NOTE(review): depends on module-level ``sess``, ``train_op``, ``loss``,
    ``X`` and ``Y``; the ``net`` argument is unused here.
    """
    N = X_data.shape[0]
    for i in tqdm.trange(n_iters):
        idx = random.sample(range(N), batch_size)
        _, l = sess.run([train_op, loss], {X: X_data[idx], Y: Y_data[idx]})
        if i % 1000 == 0:
            # Periodic progress report of the current minibatch loss.
            print(l)


if __name__ == "__main__":
    args = parse_args()
    model_path = Path('results') / args.name / 'models' / 'model_latest.pkl'
    dataset1 = load(args.round1, args.location1, args.board1)
    dataset2 = load(args.round2, args.location2, args.board2)
    # Align the two deployments row-wise; drop timestamps missing from either.
    train = dataset1[0].join(dataset2[0], lsuffix='-left').dropna()
    test = dataset1[1].join(dataset2[1], lsuffix='-left').dropna()
    model = joblib.load(model_path)
    fixer_model = pickle.loads(model.architecture)[0][args.board2]
    X = T.placeholder(T.floatx(), [None, 3])
    Y = T.placeholder(T.floatx(), [None, 3])
    Y_ = fixer_model(X)
    loss = T.mean((Y - Y_)**2)
    train_op = T.core.train.AdamOptimizer(1e-4).minimize(
        loss, var_list=fixer_model.get_parameters())
    # Fixed defect: DataFrame.as_matrix() was removed in pandas 1.0;
    # .to_numpy() is the supported equivalent.
    X_data_train = train[[s + '-left' for s in sensor_features]].to_numpy()
def train(out_dir, dim, seed, load_model=None):
    """Train (or resume) a SplitModel over all boards, skipping ignored triples.

    Args:
        out_dir: output Path; models are written under out_dir / 'models'.
        dim: latent dimension of the per-board sensor models.
        seed: unused here; kept for interface compatibility.
        load_model: optional path of a pickled model to resume from.

    NOTE(review): depends on the module-level ``args`` namespace (lr,
    batch_size, hidden_size, num_iters) and a module-level ``ignore``
    collection of (round, location, board) triples.
    """
    out_path = out_dir / 'models'
    if not out_path.exists():
        out_path.mkdir()
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    if load_model is None:
        sensor_models = {
            board_id: nn.Relu(100) >> nn.Relu(100) >> nn.Linear(dim)
            for board_id in boards
            # board_id: nn.Linear(3, dim) for board_id in boards
        }
        calibration_model = nn.Relu(dim + 3, args.hidden_size) >> nn.Relu(
            args.hidden_size) >> nn.Linear(2)
        split_model = SplitModel(sensor_models, calibration_model,
                                 log_dir=out_dir, lr=args.lr,
                                 batch_size=args.batch_size)
    else:
        split_model = joblib.load(load_model)
    data = {}
    print("Filtering: %s" % ignore)
    for board_id in boards:
        board_train = []
        for round, location in boards[board_id]:
            if (round, location, board_id) in ignore:
                print("Removing: ", round, location, board_id)
                continue
            board_train.append(load(round, location, board_id)[0])
        if len(board_train) > 0:
            print("Loaded board[%u]: %u" % (board_id, len(board_train)))
            board_train = pd.concat(board_train)
            board_train['board'] = board_id
            if board_id not in data:
                data[board_id] = []
            data[board_id].append(board_train)
    data = [pd.concat(ds) for ds in data.values()]
    # Oversample smaller boards so each board contributes equally.
    max_size = max([d.shape[0] for d in data])
    for i in range(len(data)):
        d = data[i]
        if d.shape[0] < max_size:
            # Fixed defect: DataFrame.append was removed in pandas 2.0;
            # pd.concat is the supported equivalent.
            data[i] = pd.concat(
                [d, d.sample(max_size - d.shape[0], replace=True)])
    data = pd.concat(data)

    def cb(model):
        # Periodic checkpoint callback invoked by SplitModel.fit.
        with open(str(out_path / 'model_latest.pkl'), 'wb') as fp:
            joblib.dump(split_model, fp)

    print("Total data size:", data.shape)
    split_model.fit(data[sensor_features], data[env_features], data['board'],
                    data[Y_features],
                    dump_every=(out_dir / 'models' / 'model_latest.pkl', 1000),
                    n_iters=args.num_iters, cb=cb)
    with open(str(out_path / 'model.pkl'), 'wb') as fp:
        joblib.dump(split_model, fp)
for location in LOCATION_PLOTS:
    LOCATION_PLOTS[location][1].set_title("%s - Temperature" % location)
for location in LOCATION_NO2:
    # Fixed copy-paste defect: these two loops set titles on LOCATION_PLOTS,
    # overwriting the temperature titles instead of labelling the NO2 and O3
    # figures (compare the ROUND_NO2/ROUND_O3 usage below).
    LOCATION_NO2[location][1].set_title("%s - NO2" % location)
for location in LOCATION_O3:
    LOCATION_O3[location][1].set_title("%s - O3" % location)
for location in HUMIDITY_PLOTS:
    HUMIDITY_PLOTS[location][1].set_title("%s - Humidity" % location)
total_data = None
for round in BOARDS:
    ROUND_PLOTS[round][1].set_title("Round %u - Temperature" % round)
    ROUND_NO2[round][1].set_title("Round %u - NO2" % round)
    ROUND_O3[round][1].set_title("Round %u - O3" % round)
    for location in BOARDS[round]:
        # load returns (train, test); concat pools both splits for plotting.
        data = pd.concat(load(round, location, BOARDS[round][location]))
        data['Location'] = location
        if total_data is None:
            total_data = data
        else:
            total_data = pd.concat([total_data, data])
        # Celsius -> Fahrenheit for the temperature axis.
        temperature = data['temperature'] * 9 / 5 + 32
        humidity = data['absolute-humidity']
        # Trim the top 1% to keep outliers from stretching the histograms.
        no2 = data[data["epa-no2"] < data["epa-no2"].quantile(0.99)]["epa-no2"]
        o3 = data[data["epa-o3"] < data["epa-o3"].quantile(0.99)]["epa-o3"]
        sns.distplot(temperature, ax=ROUND_PLOTS[round][1], label=location,
                     axlabel='Temperature (F)', kde_kws=dict(bw='silverman'),
                     norm_hist=False)
def level4(out_dir, seed):
    """Ensemble-evaluate level-1 models per board and write result tables.

    Groups every level-1 model by its board, averages their train/test scores
    over each of the board's (round, location) deployments, and writes
    train / test / difference tables as CSV and LaTeX under out_dir / 'level4'.
    """
    model_dir = out_dir / 'level1' / 'models'
    (out_dir / 'level4').mkdir(exist_ok=True)
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    columns = ['Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE']
    # Accumulate rows in plain lists: DataFrame.append was removed in
    # pandas 2.0, and building each frame once at the end is O(n) anyway.
    train_rows = []
    test_rows = []
    difference_rows = []
    models = {}
    for model_file in tqdm.tqdm(list(model_dir.glob('*'))):
        # Each pickle holds ((round, location, board), model); group by board.
        board_key, model = joblib.load(model_file)
        models.setdefault(board_key[2], []).append(model)
    for board_id in tqdm.tqdm(models):
        for t in boards[board_id]:
            train_data, test_data = load(t[0], t[1], board_id)
            train_result = np.mean([
                np.stack(model.score(train_data[X_features],
                                     train_data[Y_features]))
                for model in models[board_id]
            ], axis=0)
            test_result = np.mean([
                np.stack(model.score(test_data[X_features],
                                     test_data[Y_features]))
                for model in models[board_id]
            ], axis=0)
            difference = train_result - test_result

            def as_row(result):
                # Score arrays are (metric, gas): row 0 = MAE, row 1 = CvMAE.
                return {
                    'Model': board_id,
                    'Test': (t[0], t[1]),
                    'NO2 MAE': result[0, 0],
                    'O3 MAE': result[0, 1],
                    'NO2 CvMAE': result[1, 0],
                    'O3 CvMAE': result[1, 1],
                }

            train_rows.append(as_row(train_result))
            test_rows.append(as_row(test_result))
            difference_rows.append(as_row(difference))
    train_results = pd.DataFrame(train_rows, columns=columns)
    test_results = pd.DataFrame(test_rows, columns=columns)
    differences = pd.DataFrame(difference_rows, columns=columns)
    with open(str(out_dir / 'level4' / 'train.csv'), 'w') as fp:
        fp.write(train_results.to_csv())
    with open(str(out_dir / 'level4' / 'train.tex'), 'w') as fp:
        fp.write(train_results.to_latex())
    with open(str(out_dir / 'level4' / 'test.csv'), 'w') as fp:
        fp.write(test_results.to_csv())
    with open(str(out_dir / 'level4' / 'test.tex'), 'w') as fp:
        fp.write(test_results.to_latex())
    with open(str(out_dir / 'level4' / 'difference.csv'), 'w') as fp:
        fp.write(differences.to_csv())
    with open(str(out_dir / 'level4' / 'difference.tex'), 'w') as fp:
        fp.write(differences.to_latex())
# Script fragment: for each pickled SplitModel directory, parse the
# (round, location, board) triples encoded in its basename, load the model,
# and set up a TF graph to adapt ("fix") its per-board sensor model onto
# the deployments the model was NOT trained on.
# NOTE(review): this chunk is truncated mid `with graph.as_default():` block;
# the remainder of the graph construction/training is not visible here.
# NOTE(review): depends on module-level `models`, `all_triples`, `args`,
# `fs`, `joblib`, `pickle`, `np`, `random`, `T`, and `load` — confirm against
# the full file.
item_map = lambda x: (int(x[0]), x[1], int(x[2])) for model in tqdm.tqdm(models): # model_triples = set((int(item[0]),item[1], int(item[2])) for name in model.basename().split('-') for item in name.split("_")) model_triples = set( item_map(name.split("_")) for name in model.basename().split('-')) model_path = model / 'models' / 'model.pkl' with fs.open(model_path, 'rb') as fp: model = joblib.load(fp) for model_triple in model_triples: print(model_triple) trainable_triples = [ triple for triple in (all_triples - model_triples) if model_triple[0] == triple[0] and model_triple[1] == triple[1] ] dataset1 = load(*model_triple) dataset2 = [load(*tt) for tt in trainable_triples] train = dataset1[0].join(pd.concat([d[0] for d in dataset2]), lsuffix='-left').dropna() test = dataset1[1].join(pd.concat([d[1] for d in dataset2]), lsuffix='-left').dropna() fixer_model = pickle.loads( model.architecture)[0][trainable_triples[0][2]] np.random.seed(args.seed) T.core.set_random_seed(args.seed) random.seed(args.seed) graph = T.core.Graph() with graph.as_default(): X = T.placeholder(T.floatx(), [None, 3])
def level3(out_dir, experiment_dir, seed):
    """Evaluate every level-3 board model on each deployment of its board.

    Uses benchmark() (which also dumps per-deployment prediction CSVs under
    DATA_DIR) and writes train / test / difference tables as CSV and LaTeX
    under out_dir / 'level3'.
    """
    model_dir = experiment_dir / 'level3' / 'models'
    (out_dir / 'level3').mkdir_p()
    # Map each board id to the set of (round, location) pairs it appears in.
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                boards.setdefault(board_id, set()).add((round, location))
    columns = ['Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE']
    # Accumulate rows in lists: DataFrame.append was removed in pandas 2.0,
    # and building each frame once at the end is cheaper anyway.
    train_rows = []
    test_rows = []
    difference_rows = []
    for model_file in tqdm.tqdm(list(fs.glob(str(model_dir / '*')))):
        with fs.open(model_file, 'rb') as fp:
            board_id, model = joblib.load(fp)
        for t in boards[board_id]:
            # Removed a redundant load() whose result was never used;
            # benchmark() loads the splits itself.
            train_result = benchmark(
                model, (t[0], t[1], board_id), train=True,
                dump_preds=DATA_DIR /
                'level3/round{round}/{location}/board{board}_train.csv'.format(
                    round=t[0], location=t[1], board=board_id))
            test_result = benchmark(
                model, (t[0], t[1], board_id), train=False,
                dump_preds=DATA_DIR /
                'level3/round{round}/{location}/board{board}_test.csv'.format(
                    round=t[0], location=t[1], board=board_id))
            difference = {k: v - test_result[k]
                          for k, v in train_result.items()}
            base = {'Model': board_id, 'Test': (t[0], t[1])}
            train_rows.append({**base, **train_result})
            test_rows.append({**base, **test_result})
            difference_rows.append({**base, **difference})
    train_results = pd.DataFrame(train_rows, columns=columns)
    test_results = pd.DataFrame(test_rows, columns=columns)
    differences = pd.DataFrame(difference_rows, columns=columns)
    with open(str(out_dir / 'level3' / 'train.csv'), 'w') as fp:
        fp.write(train_results.to_csv())
    with open(str(out_dir / 'level3' / 'train.tex'), 'w') as fp:
        fp.write(train_results.to_latex())
    with open(str(out_dir / 'level3' / 'test.csv'), 'w') as fp:
        fp.write(test_results.to_csv())
    with open(str(out_dir / 'level3' / 'test.tex'), 'w') as fp:
        fp.write(test_results.to_latex())
    with open(str(out_dir / 'level3' / 'difference.csv'), 'w') as fp:
        fp.write(differences.to_csv())
    with open(str(out_dir / 'level3' / 'difference.tex'), 'w') as fp:
        fp.write(differences.to_latex())