Example #1
def benchmark(model, board, test=True):
    # `board` is a single (round, location, board_id) tuple or a list of them.
    idx = 1 if test else 0
    if isinstance(board, list):
        frames = []
        for t in board:
            d = load(*t)[idx]
            d['board'] = t[-1]  # tag each frame with its own board id
            frames.append(d)
        data = pd.concat(frames)
    else:
        data = load(*board)[idx]
        data['board'] = board[-1]
    scores, _ = model.score(data[sensor_features], data[env_features],
                            data['board'], data[Y_features])
    result = {}
    for i, gas in enumerate(["NO2", "O3"]):
        for score, value in scores.items():
            result["%s %s" % (gas, score)] = value[i]
    return result
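A hedged usage sketch (not from the source): load, sensor_features, env_features, and Y_features come from the metasense package, while split_model and the deployment triples below are illustrative.

# Hypothetical usage: score a trained split model on two held-out deployments.
deployments = [(2, 'donovan', 17), (3, 'elcajon', 17)]
scores = benchmark(split_model, deployments, test=True)
print(scores)  # e.g. {'NO2 MAE': ..., 'NO2 CvMAE': ..., 'O3 MAE': ...}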
Example #2
def benchmark(model, test, train=False, dump_preds=None):
    idx = 0 if train else 1
    if isinstance(test, list):
        data = pd.concat([load(*t)[idx] for t in test])
    else:
        data = load(*test)[idx]
    if hasattr(model, 'features') and model.features is not None:
        features = model.features
    else:
        features = X_features
    scores, preds = model.score(data[features], data[Y_features])
    if dump_preds is not None:
        dump_preds = Path(dump_preds)
        dump_preds.parent.mkdir(parents=True, exist_ok=True)  # no os import needed
        df = data.copy()
        df['preds-no2'] = preds[:, 0]
        df['preds-o3'] = preds[:, 1]
        df.to_csv(dump_preds)
    result = {}
    for i, gas in enumerate(["NO2", "O3"]):
        for score, value in scores.items():
            result["%s %s" % (gas, score)] = value[i]
    return result
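A usage sketch for the dump_preds path; the triple and output file below are illustrative, not from the source.

# Hypothetical usage: score on the test split and dump predictions to CSV.
result = benchmark(model, (3, 'elcajon', 18),
                   dump_preds='results/preds/board18_test.csv')
print(result['NO2 MAE'], result['O3 MAE'])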
Example #3
def analyze_tree(config, tree):
    test_data0 = load(2, 'donovan', 18)[0]
    test_data1 = load(3, 'elcajon', 18)[0]
    variances = []
    for i, gas in enumerate(Y_features):
        estimator = tree.models[0].estimators_[i]
        # Compute leaf assignments for each dataset (the original reused the
        # assignments from test_data0, which mismatches test_data1's rows;
        # .to_numpy() replaces the removed DataFrame.as_matrix()).
        result0 = estimator.apply(test_data0[X_features].to_numpy().astype(np.float32))
        result1 = estimator.apply(test_data1[X_features].to_numpy().astype(np.float32))
        tree_var0 = analyze_tree_result(result0, test_data0[gas])
        tree_var1 = analyze_tree_result(result1, test_data1[gas])
        variances.append([tree_var0, tree_var1])
    return np.stack(variances)
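analyze_tree_result is not shown in these examples; a minimal sketch of what it plausibly computes (mean within-leaf variance of the reference targets) is below. The grouping logic is an assumption, not the project's code.

# Assumed helper: group targets by the leaf each sample fell into and
# average the per-leaf variance of the EPA reference values.
def analyze_tree_result(leaf_ids, targets):
    df = pd.DataFrame({'leaf': leaf_ids, 'y': np.asarray(targets)})
    return df.groupby('leaf')['y'].var().mean()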
Example #4
def benchmark(model, test, train=False):
    idx = 0 if train else 1
    if isinstance(test, list):
        data = pd.concat([load(*t)[idx] for t in test])
    else:
        data = load(*test)[idx]
    if hasattr(model, 'features') and model.features is not None:
        features = model.features
    else:
        features = X_features
    score = model.score(data[features], data[Y_features])
    return np.stack(score)
Example #5
def level2(out_dir, X_features):
    out_path = out_dir / 'level2' / 'models'
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))
    for board_id in tqdm.tqdm(boards):
        print("Training board:", board_id)
        print(boards[board_id])
        if len(boards[board_id]) != 3:
            continue
        for test_config in boards[board_id]:
            train_config = boards[board_id] - {test_config}
            data = pd.concat([load(t[0], t[1], board_id)[0] for t in train_config])
            # Sort for a deterministic file name regardless of set iteration order.
            name = 'board%u_%s.pkl' % (board_id, '-'.join(map(str, sorted(train_config))))
            with fs.open(str(out_path / name), 'wb') as fp:
                joblib.dump(
                    (
                        (board_id, train_config),
                        Model(X_features).fit(data[X_features], data[Y_features])
                    ),
                    fp
                )
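A minimal invocation sketch, assuming pathlib's Path as elsewhere in these snippets; the output directory is illustrative.

# Hypothetical usage: train leave-one-deployment-out models for every
# board that appears in all three deployments.
level2(Path('results') / 'split', X_features)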
Example #6
def main():
    """
    Train a CO neural network model and save it as a pickle file.

    Note: this line must be added at line 9 of metasense-transfer/metasense/data.py:
    data['epa-co'] = data['co']
    """
    X_features = ['no2', 'o3', 'co', 'temperature', 'absolute-humidity',
                  'pressure']
    Y_features = ['epa-co']

    nn_model = NeuralNetwork(X_features,
                             nn.Relu(len(X_features), 200) >> nn.Relu(200)
                             >> nn.Relu(200) >> nn.Relu(200) >> nn.Linear(1))

    round_num = 3
    location = 'elcajon'
    board = 18

    train, test = load(round_num, location, board)
    nn_model.fit(train[X_features], train[Y_features])

    joblib.dump((board, nn_model), PICKLE_PATH)
    print(f'Saved pickle of model to {PICKLE_PATH}')
    """ print predictions
  predictions = nn_model.predict(test[X_features])
  print('----Predictions on test set----')
  print(predictions)
  print('----EPA data for test set----')
  print(test[Y_features])
  """
    """ print testing error
Example #7
def train(out_dir, dim, seed, load_model=None):
    (out_dir / 'models').mkdir(exist_ok=True, parents=True)
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))
    if load_model is None:
        sensor_models = {
            # board_id: nn.Relu(100) >> nn.Relu(100) >> nn.Linear(dim) for board_id in boards
            board_id: nn.Linear(3, dim)
            for board_id in boards
        }
        calibration_model = nn.Relu(dim + 3, 50) >> nn.Relu(50) >> nn.Linear(2)
        split_model = SplitModel(sensor_models,
                                 calibration_model,
                                 log_dir=out_dir,
                                 lr=args.lr,
                                 batch_size=args.batch_size)
    else:
        split_model = joblib.load(load_model)
    data = {}
    print("Filtering round: %s" % args.round)
    print("Filtering location: %s" % args.location)
    for board_id in boards:
        board_train = []
        for round, location in boards[board_id]:
            if (args.round, args.location, args.board) == (round, location,
                                                           board_id):
                print("Removing: ", round, location, board_id)
                continue
            board_train.append(load(*(round, location, board_id))[0])
        if len(board_train) > 0:
            print("Loaded board[%u]: %u" % (board_id, len(board_train)))
            board_train = pd.concat(board_train)
            board_train['board'] = board_id
            if board_id not in data:
                data[board_id] = []
            data[board_id].append(board_train)
    data = [pd.concat(ds) for ds in data.values()]
    max_size = max(d.shape[0] for d in data)
    # Oversample smaller boards to the largest board's size
    # (pd.concat replaces the removed DataFrame.append).
    for i, d in enumerate(data):
        if d.shape[0] < max_size:
            data[i] = pd.concat([d, d.sample(max_size - d.shape[0], replace=True)])
    data = pd.concat(data)
    split_model.fit(data[sensor_features],
                    data[env_features],
                    data['board'],
                    data[Y_features],
                    dump_every=(out_dir / 'models' / 'model_latest.pkl', 1000),
                    n_iters=args.num_iters)
    joblib.dump(split_model, out_dir / 'models' / 'model.pkl')
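The per-board balancing step above is easy to get wrong; a self-contained toy sketch of the same oversampling pattern (the frames here are illustrative):

import pandas as pd

# Pad each group up to the largest group's size by sampling with replacement.
frames = [pd.DataFrame({'x': range(n)}) for n in (5, 3, 2)]
max_size = max(len(f) for f in frames)
balanced = [f if len(f) == max_size
            else pd.concat([f, f.sample(max_size - len(f), replace=True)])
            for f in frames]
print([len(f) for f in balanced])  # [5, 5, 5]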
Example #8
def level1(out_dir, X_features):
    (out_dir / 'level1' / 'models').mkdir(exist_ok=True, parents=True)
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                print("Training: Round %u - %s - Board %u" %
                      (round, location, board_id))
                train, _ = load(round, location, board_id)
                joblib.dump(
                    ((round, location, board_id), Model(X_features).fit(
                        train[X_features], train[Y_features])),
                    out_dir / 'level1' / 'models' /
                    ('round%u_%s_board%u.pkl' % (round, location, board_id)))
Example #9
def level1(out_dir, X_features):
    out_path = out_dir / 'level1' / 'models'
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                train, _ = load(round, location, board_id)
                print("Training:", round, location, board_id)
                with fs.open(str(out_path / ('round%u_%s_board%u.pkl' % (round, location, board_id))), 'wb') as fp:
                    joblib.dump(
                        (
                            (round, location, board_id),
                            Model(X_features).fit(train[X_features], train[Y_features])
                        ), fp
                    )
Example #10
def level3(out_dir, X_features, seed):
    (out_dir / 'level3' / 'models').mkdir(exist_ok=True, parents=True)
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))
    for board_id in tqdm.tqdm(boards):
        data = [load(*(t[0], t[1], board_id)) for t in boards[board_id]]
        train_data = pd.concat([t[0] for t in data])
        joblib.dump((board_id, Model(X_features).fit(train_data[X_features],
                                                     train_data[Y_features])),
                    out_dir / 'level3' / 'models' / ('board%u.pkl' % board_id))
Example #11
def benchmark(model, board, test=True):
    # Same loading pattern as Example #1: tag each frame with its own board id.
    idx = 1 if test else 0
    if isinstance(board, list):
        frames = []
        for t in board:
            d = load(*t)[idx]
            d['board'] = t[-1]
            frames.append(d)
        data = pd.concat(frames)
    else:
        data = load(*board)[idx]
        data['board'] = board[-1]
    score = model.score(data[sensor_features], data[env_features],
                        data['board'], data[Y_features])
    return np.stack(score)
Example #12
def level3(out_dir, X_features, seed):
    out_path = out_dir / 'level3' / 'models'
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))
    for board_id in tqdm.tqdm(boards):
        data = [load(*(t[0], t[1], board_id)) for t in boards[board_id]]
        train_data = pd.concat([t[0] for t in data])
        with fs.open(str(out_path / ('board%u.pkl' % board_id)), 'wb') as fp:
            joblib.dump(
                (
                    board_id,
                    Model(X_features).fit(train_data[X_features], train_data[Y_features])
                ), fp
            )
Example #13
import joblib
from pathlib import Path
from metasense import BOARD_CONFIGURATION as DATA
from metasense.data import load
from metasense.models import SubuForest

X_features = ['no2', 'o3', 'co', 'temperature', 'humidity', 'pressure']
Y_features = ['epa-no2', 'epa-o3']


out_dir = Path('results') / 'subu'
(out_dir / 'level1' / 'models').mkdir(exist_ok=True, parents=True)
(out_dir / 'level2' / 'models').mkdir(exist_ok=True, parents=True)
MODELS = {}  # round -> location -> models; referenced in the loop below

for round in DATA:
    if round not in MODELS:
        MODELS[round] = {}
    for location in DATA[round]:
        if location not in MODELS[round]:
            MODELS[round][location] = {}
        for board_id in DATA[round][location]:
            print("Training: Round %u - %s - Board %u" % (round, location, board_id))
            train, _ = load(round, location, board_id)  # load() returns (train, test)
            joblib.dump(
                (
                    (round, location, board_id),
                    SubuForest().fit(train[X_features], train[Y_features])
                ), out_dir / 'level1' / 'models' / ('round%u_%s_board%u.pkl' % (round, location, board_id))
            )
Example #14
def fit_nn(X_data, Y_data, net, batch_size=20, n_iters=100000):
    # Minibatch SGD loop; relies on the module-level `sess`, `train_op`,
    # `loss`, `X`, and `Y` built in the __main__ block below.
    N = X_data.shape[0]
    for i in tqdm.trange(n_iters):
        idx = random.sample(range(N), batch_size)
        _, l = sess.run([train_op, loss], {X: X_data[idx], Y: Y_data[idx]})
        if i % 1000 == 0:
            print(l)


if __name__ == "__main__":
    args = parse_args()
    model_path = Path('results') / args.name / 'models' / 'model_latest.pkl'
    dataset1 = load(args.round1, args.location1, args.board1)
    dataset2 = load(args.round2, args.location2, args.board2)
    train = dataset1[0].join(dataset2[0], lsuffix='-left').dropna()
    test = dataset1[1].join(dataset2[1], lsuffix='-left').dropna()
    model = joblib.load(model_path)
    fixer_model = pickle.loads(model.architecture)[0][args.board2]

    X = T.placeholder(T.floatx(), [None, 3])
    Y = T.placeholder(T.floatx(), [None, 3])

    Y_ = fixer_model(X)
    loss = T.mean((Y - Y_)**2)
    train_op = T.core.train.AdamOptimizer(1e-4).minimize(
        loss, var_list=fixer_model.get_parameters())

    X_data_train = train[[s + '-left' for s in sensor_features]].to_numpy()
Example #15
def train(out_dir, dim, seed, load_model=None):
    out_path = out_dir / 'models'
    out_path.mkdir(exist_ok=True, parents=True)
    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))
    if load_model is None:
        sensor_models = {
            board_id: nn.Relu(100) >> nn.Relu(100) >> nn.Linear(dim)
            for board_id in boards
            # board_id: nn.Linear(3, dim) for board_id in boards
        }
        calibration_model = nn.Relu(dim + 3, args.hidden_size) >> nn.Relu(
            args.hidden_size) >> nn.Linear(2)
        split_model = SplitModel(sensor_models,
                                 calibration_model,
                                 log_dir=out_dir,
                                 lr=args.lr,
                                 batch_size=args.batch_size)
    else:
        split_model = joblib.load(load_model)
    data = {}
    print("Filtering: %s" % ignore)
    for board_id in boards:
        board_train = []
        for round, location in boards[board_id]:
            if (round, location, board_id) in ignore:
                print("Removing: ", round, location, board_id)
                continue
            board_train.append(load(*(round, location, board_id))[0])
        if len(board_train) > 0:
            print("Loaded board[%u]: %u" % (board_id, len(board_train)))
            board_train = pd.concat(board_train)
            board_train['board'] = board_id
            if board_id not in data:
                data[board_id] = []
            data[board_id].append(board_train)
    data = [pd.concat(ds) for ds in data.values()]
    max_size = max(d.shape[0] for d in data)
    # Oversample smaller boards to the largest board's size
    # (pd.concat replaces the removed DataFrame.append).
    for i, d in enumerate(data):
        if d.shape[0] < max_size:
            data[i] = pd.concat([d, d.sample(max_size - d.shape[0], replace=True)])
    data = pd.concat(data)

    def cb(model):
        with open(str(out_path / 'model_latest.pkl'), 'wb') as fp:
            joblib.dump(split_model, fp)

    print("Total data size:", data.shape)
    split_model.fit(data[sensor_features],
                    data[env_features],
                    data['board'],
                    data[Y_features],
                    dump_every=(out_dir / 'models' / 'model_latest.pkl', 1000),
                    n_iters=args.num_iters,
                    cb=cb)
    with open(str(out_path / 'model.pkl'), 'wb') as fp:
        joblib.dump(split_model, fp)
Example #16
for location in LOCATION_PLOTS:
    LOCATION_PLOTS[location][1].set_title("%s - Temperature" % location)
for location in LOCATION_NO2:
    LOCATION_NO2[location][1].set_title("%s - NO2" % location)
for location in LOCATION_O3:
    LOCATION_O3[location][1].set_title("%s - O3" % location)
for location in HUMIDITY_PLOTS:
    HUMIDITY_PLOTS[location][1].set_title("%s - Humidity" % location)
total_data = None
for round in BOARDS:
    ROUND_PLOTS[round][1].set_title("Round %u - Temperature" % round)
    ROUND_NO2[round][1].set_title("Round %u - NO2" % round)
    ROUND_O3[round][1].set_title("Round %u - O3" % round)
    for location in BOARDS[round]:
        data = pd.concat(load(round, location, BOARDS[round][location]))
        data['Location'] = location
        if total_data is None:
            total_data = data
        else:
            total_data = pd.concat([total_data, data])
        temperature = data['temperature'] * 9 / 5 + 32
        humidity = data['absolute-humidity']
        no2 = data[data["epa-no2"] < data["epa-no2"].quantile(0.99)]["epa-no2"]
        o3 = data[data["epa-o3"] < data["epa-o3"].quantile(0.99)]["epa-o3"]
        sns.distplot(temperature,
                     ax=ROUND_PLOTS[round][1],
                     label=location,
                     axlabel='Temperature (F)',
                     kde_kws=dict(bw='silverman'),
                     norm_hist=False)
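sns.distplot was deprecated in seaborn 0.11 and later removed; a rough modern equivalent of the call above, under the same data and axes, might be:

# Modern replacement for distplot: histplot with a KDE overlay.
sns.histplot(temperature,
             ax=ROUND_PLOTS[round][1],
             label=location,
             kde=True,
             stat='count')
ROUND_PLOTS[round][1].set_xlabel('Temperature (F)')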
Example #17
def level4(out_dir, seed):
    model_dir = out_dir / 'level1' / 'models'
    (out_dir / 'level4').mkdir(exist_ok=True)

    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))

    differences = pd.DataFrame(columns=[
        'Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE'
    ])
    train_results = pd.DataFrame(columns=[
        'Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE'
    ])
    test_results = pd.DataFrame(columns=[
        'Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE'
    ])
    models = {}
    for model_file in tqdm.tqdm(list(model_dir.glob('*'))):
        board_id, model = joblib.load(model_file)
        if board_id[2] not in models:
            models[board_id[2]] = []
        models[board_id[2]].append(model)
    for board_id in tqdm.tqdm(models):
        for t in boards[board_id]:
            train_data, test_data = load(*(t[0], t[1], board_id))
            train_result = np.mean([
                np.stack(model.score(train_data[X_features],
                                     train_data[Y_features]))
                for model in models[board_id]
            ], axis=0)
            test_result = np.mean([
                np.stack(model.score(test_data[X_features],
                                     test_data[Y_features]))
                for model in models[board_id]
            ], axis=0)
            difference = train_result - test_result
            # Row-label assignment replaces the removed DataFrame.append.
            train_results.loc[len(train_results)] = {
                'Model': board_id,
                'Test': (t[0], t[1]),
                'NO2 MAE': train_result[0, 0],
                'O3 MAE': train_result[0, 1],
                'NO2 CvMAE': train_result[1, 0],
                'O3 CvMAE': train_result[1, 1],
            }
            test_results.loc[len(test_results)] = {
                'Model': board_id,
                'Test': (t[0], t[1]),
                'NO2 MAE': test_result[0, 0],
                'O3 MAE': test_result[0, 1],
                'NO2 CvMAE': test_result[1, 0],
                'O3 CvMAE': test_result[1, 1],
            }
            differences.loc[len(differences)] = {
                'Model': board_id,
                'Test': (t[0], t[1]),
                'NO2 MAE': difference[0, 0],
                'O3 MAE': difference[0, 1],
                'NO2 CvMAE': difference[1, 0],
                'O3 CvMAE': difference[1, 1],
            }
    with open(str(out_dir / 'level4' / 'train.csv'), 'w') as fp:
        fp.write(train_results.to_csv())
    with open(str(out_dir / 'level4' / 'train.tex'), 'w') as fp:
        fp.write(train_results.to_latex())
    with open(str(out_dir / 'level4' / 'test.csv'), 'w') as fp:
        fp.write(test_results.to_csv())
    with open(str(out_dir / 'level4' / 'test.tex'), 'w') as fp:
        fp.write(test_results.to_latex())
    with open(str(out_dir / 'level4' / 'difference.csv'), 'w') as fp:
        fp.write(differences.to_csv())
    with open(str(out_dir / 'level4' / 'difference.tex'), 'w') as fp:
        fp.write(differences.to_latex())
    item_map = lambda x: (int(x[0]), x[1], int(x[2]))
    # Note: from here on `models` is expected to hold experiment directories
    # (path.py paths, hence .basename()), not the board-id dict built above;
    # this part of the snippet is truncated from a larger script.
    for model in tqdm.tqdm(models):
        model_triples = set(
            item_map(name.split("_")) for name in model.basename().split('-'))
        model_path = model / 'models' / 'model.pkl'
        with fs.open(model_path, 'rb') as fp:
            model = joblib.load(fp)
        for model_triple in model_triples:
            print(model_triple)
            trainable_triples = [
                triple for triple in (all_triples - model_triples) if
                model_triple[0] == triple[0] and model_triple[1] == triple[1]
            ]
            dataset1 = load(*model_triple)
            dataset2 = [load(*tt) for tt in trainable_triples]
            train = dataset1[0].join(pd.concat([d[0] for d in dataset2]),
                                     lsuffix='-left').dropna()
            test = dataset1[1].join(pd.concat([d[1] for d in dataset2]),
                                    lsuffix='-left').dropna()
            fixer_model = pickle.loads(
                model.architecture)[0][trainable_triples[0][2]]

            np.random.seed(args.seed)
            T.core.set_random_seed(args.seed)
            random.seed(args.seed)

            graph = T.core.Graph()
            with graph.as_default():
                X = T.placeholder(T.floatx(), [None, 3])
Example #18
def level3(out_dir, experiment_dir, seed):
    model_dir = experiment_dir / 'level3' / 'models'
    (out_dir / 'level3').mkdir(exist_ok=True, parents=True)

    boards = {}
    for round in DATA:
        for location in DATA[round]:
            for board_id in DATA[round][location]:
                if board_id not in boards:
                    boards[board_id] = set()
                boards[board_id].add((round, location))

    differences = pd.DataFrame(columns=[
        'Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE'
    ])
    train_results = pd.DataFrame(columns=[
        'Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE'
    ])
    test_results = pd.DataFrame(columns=[
        'Model', 'Test', 'NO2 MAE', 'O3 MAE', 'NO2 CvMAE', 'O3 CvMAE'
    ])
    for model_file in tqdm.tqdm(list(fs.glob(str(model_dir / '*')))):
        with fs.open(model_file, 'rb') as fp:
            board_id, model = joblib.load(fp)
        for t in boards[board_id]:
            train_result = benchmark(
                model, (t[0], t[1], board_id),
                train=True,
                dump_preds=DATA_DIR /
                'level3/round{round}/{location}/board{board}_train.csv'.format(
                    round=t[0], location=t[1], board=board_id))
            test_result = benchmark(
                model, (t[0], t[1], board_id),
                train=False,
                dump_preds=DATA_DIR /
                'level3/round{round}/{location}/board{board}_test.csv'.format(
                    round=t[0], location=t[1], board=board_id))
            difference = {}
            for k, v in train_result.items():
                difference[k] = v - test_result[k]
            # Row-label assignment replaces the removed DataFrame.append.
            train_results.loc[len(train_results)] = {
                'Model': board_id,
                'Test': (t[0], t[1]),
                **train_result,
            }
            test_results.loc[len(test_results)] = {
                'Model': board_id,
                'Test': (t[0], t[1]),
                **test_result,
            }
            differences.loc[len(differences)] = {
                'Model': board_id,
                'Test': (t[0], t[1]),
                **difference,
            }
    with open(str(out_dir / 'level3' / 'train.csv'), 'w') as fp:
        fp.write(train_results.to_csv())
    with open(str(out_dir / 'level3' / 'train.tex'), 'w') as fp:
        fp.write(train_results.to_latex())
    with open(str(out_dir / 'level3' / 'test.csv'), 'w') as fp:
        fp.write(test_results.to_csv())
    with open(str(out_dir / 'level3' / 'test.tex'), 'w') as fp:
        fp.write(test_results.to_latex())
    with open(str(out_dir / 'level3' / 'difference.csv'), 'w') as fp:
        fp.write(differences.to_csv())
    with open(str(out_dir / 'level3' / 'difference.tex'), 'w') as fp:
        fp.write(differences.to_latex())