Esempio n. 1
0
def main(tickers_path):
    """Compute every ticker's volatility one file at a time and print a report.

    Each path yielded by ``get_next_file(tickers_path)`` is handed to a
    ``TickerVolatility`` instance; its ``run()`` result is unpacked into a
    ``(ticker, volatility)`` pair. All pairs are collected into a dict that
    is passed to ``print_report``.
    """
    outcomes = (
        TickerVolatility(file_path=path).run()
        for path in get_next_file(tickers_path)
    )
    # A later file overwrites an earlier one that reports the same ticker.
    tickers = {ticker: volatility for ticker, volatility in outcomes}

    print_report(tickers)
def main(tickers_path):
    """Compute ticker volatilities with one worker thread per file.

    A ``TickerVolatility`` worker is built for every path yielded by
    ``get_next_file(tickers_path)``. All workers receive the same ``tickers``
    dict and the same ``Lock``. Every worker is started, then every worker is
    joined, and finally the shared dict is passed to ``print_report``.
    """
    tickers = {}
    lock = Lock()

    threads = [
        TickerVolatility(file_path=fname, tickers=tickers, lock=lock)
        for fname in get_next_file(tickers_path)
    ]

    # Start everything first so the workers overlap, then wait for all of them.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()

    print_report(tickers)
Esempio n. 3
0
def main(tickers_path):
    """Compute ticker volatilities in worker processes and print a report.

    One ``TickerVolatility`` worker is created per path yielded by
    ``get_next_file(tickers_path)``; each is given the shared ``collector``
    queue. ``(ticker, volatility)`` pairs are read from the queue until every
    worker has exited, any leftovers are drained, the workers are joined and
    the collected mapping is passed to ``print_report``.
    """
    tickers = {}
    # The queue holds at most two pending results, so it is consumed while the
    # workers run (presumably they block on a full queue -- confirm in the
    # worker implementation).
    collector = Queue(maxsize=2)

    processes = [
        TickerVolatility(file_path=fname, tickers_queue=collector)
        for fname in get_next_file(tickers_path)
    ]

    for process in processes:
        process.start()

    while True:
        try:
            ticker, volatility = collector.get(timeout=1)
        except Empty:
            if any(process.is_alive() for process in processes):
                continue
            # A worker may have enqueued its result and exited between the
            # timed-out get() above and the liveness check. Drain whatever is
            # still queued so that result is not lost.
            while True:
                try:
                    ticker, volatility = collector.get_nowait()
                except Empty:
                    break
                tickers[ticker] = volatility
            break
        else:
            tickers[ticker] = volatility

    for process in processes:
        process.join()

    print_report(tickers)
Esempio n. 4
0
def train_loop(loader, model, epochs=3, start_epoch=0, params=None, device=None,
               loss_func=torch.nn.CrossEntropyLoss, n_tops=None):
    """Train ``model`` for ``epochs`` epochs and report metrics after each one.

    Args:
        loader: Mapping with ``'train'`` and ``'val'`` entries; they are passed
            to ``train_epoch`` and ``get_accuracy``.
        model: Model whose ``parameters()`` are optimised with Adam.
        epochs: Number of epochs to run.
        start_epoch: First epoch index; epochs run over
            ``range(start_epoch, start_epoch + epochs)``.
        params: 9-item sequence unpacked as ``(L_RATE, DECAY_RATE,
            DECAY_EPOCHS, WEIGHT_DECAY, SAVE_MODEL, SAVE_MODEL_N,
            SAVE_MODEL_DIR, MODEL, N_LAYERS)``. Despite the ``None`` default
            it has to be supplied, otherwise the unpacking fails.
        device: Forwarded to ``train_epoch`` and ``get_accuracy``.
        loss_func: Forwarded to ``train_epoch`` and ``get_accuracy``.
        n_tops: Forwarded to ``get_accuracy`` and to the accuracy report.
            Defaults to ``[1, 5]``.
    """
    (L_RATE, DECAY_RATE, DECAY_EPOCHS, WEIGHT_DECAY, SAVE_MODEL, SAVE_MODEL_N,
     SAVE_MODEL_DIR, MODEL, N_LAYERS) = params
    # A fresh list per call: a list literal as the default would be shared
    # between calls and could be mutated by the helpers it is passed to.
    if n_tops is None:
        n_tops = [1, 5]

    optimizer = optim.Adam(model.parameters(), lr=L_RATE, weight_decay=WEIGHT_DECAY)
    if SAVE_MODEL:
        path = '{}{}'.format(MODEL, N_LAYERS) if MODEL == 'Darknet' else MODEL
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs('{}/{}'.format(SAVE_MODEL_DIR, path), exist_ok=True)

    # NOTE(review): these histories are filled in but never returned or saved.
    losses = {'train': [], 'validate': []}
    accuracies = {'train': [], 'validate': []}

    for epoch in range(start_epoch, epochs + start_epoch):
        t = time()
        if (epoch + 1) % DECAY_EPOCHS == 0:
            L_RATE *= (1 - DECAY_RATE)
            # NOTE(review): building a new Adam instance also discards the
            # optimizer's accumulated state -- confirm this is intended.
            optimizer = optim.Adam(model.parameters(), lr=L_RATE, weight_decay=WEIGHT_DECAY)

        # print epoch number
        print_report(part='start', epoch=epoch)
        # train loop
        train_epoch(loader['train'], model, optimizer, device, loss_func)
        # print metrics
        # NOTE(review): `dtype` is not defined in this function; it must be
        # provided at module level -- verify.
        val_acc, val_loss = get_accuracy(loader['val'], model, device, dtype, loss_func, n_tops)
        train_acc, train_loss = get_accuracy(loader['train'], model, device, dtype, loss_func, n_tops)
        metrics = train_loss, val_loss, train_acc, val_acc, n_tops
        print_report(part='accuracy', metrics=metrics)
        # collect metrics
        losses['train'].append(train_loss)
        losses['validate'].append(val_loss)
        accuracies['train'].append(train_acc)
        accuracies['validate'].append(val_acc)

        # save models
        if SAVE_MODEL:
            # NOTE(review): `cfg` is not defined in this function; it must be
            # provided at module level -- verify.
            save_checkpoint(model=model, cfg=cfg, epoch=epoch, loss=round(val_loss, 3))

        # print time
        print_report(part='end', t=int(time() - t))
Esempio n. 5
0
# Validation split used for the second report and the summary metrics below.
X_val, y_val = load_validation_data()

# Prediction pipeline: token counts -> TF-IDF weighting -> linear classifier
# trained with SGD (hinge loss, L2 penalty, fixed 5 iterations, fixed seed).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                      random_state=42, max_iter=5, tol=None),
    ),
])
text_clf.fit(X_train, y_train)

# Report on the training split first, then on the validation split. After the
# loop `predictions` holds the validation predictions, which the summary
# metrics below rely on.
for heading, features, labels in (
        ('=== training error ===', X_train, y_train),
        ('=== validation error ===', X_val, y_val),
):
    print(heading)
    predictions = text_clf.predict(features)
    print_report(predictions, labels)

# Summary metrics for the validation predictions.
for heading, summarise in (
        ('=== classification report ===', metrics.classification_report),
        ('=== confusion matrix ===', metrics.confusion_matrix),
):
    print(heading)
    print(summarise(y_val, predictions))
Esempio n. 6
0
def train_loop(cfg_path, gpu_n='0'):
    """Train a YOLOv1 model as configured by a YAML file, reporting mAP per epoch.

    The YAML file at ``cfg_path`` is read with ``yaml.safe_load`` and must
    provide the keys looked up below (``GPU``, ``MODEL``, ``LOAD_MODEL``,
    ``LOAD_MODEL_FILE``, ``SAVE_MODEL``, ``SAVE_MODEL_N``, ``SAVE_MODEL_DIR``,
    ``DATASET_DIR``, ``LEARNING_RATE``, ``DECAY_RATE``, ``DECAY_EPOCHS``,
    ``WEIGHT_DECAY``, ``EPOCHS``, ``BATCH_SIZE``, ``NUM_WORKERS``,
    ``PIN_MEMORY``, ``CSV_TRAIN``, ``CSV_VAL``).

    Args:
        cfg_path: Path of the YAML configuration file.
        gpu_n: CUDA device index; used as ``cuda:<gpu_n>`` when the config
            enables the GPU and CUDA is available, otherwise the CPU is used.

    Raises:
        NotImplementedError: If ``MODEL`` is anything but ``'Darknet'`` (no
            other model is set up yet).
    """
    # get configs
    with open(cfg_path, 'r') as stream:
        config = yaml.safe_load(stream)
    # is_available is a function and has to be called: the bare function
    # object is always truthy, which selected CUDA even on CPU-only hosts.
    use_cuda = config['GPU'] and torch.cuda.is_available()
    device = torch.device('cuda:{}'.format(gpu_n) if use_cuda else 'cpu')
    dtype = torch.float32  # TODO: find out how it affects speed and accuracy
    MODEL = config['MODEL']
    LOAD_MODEL = config['LOAD_MODEL']
    LOAD_MODEL_FILE = config['LOAD_MODEL_FILE']
    SAVE_MODEL = config['SAVE_MODEL']
    SAVE_MODEL_N = config['SAVE_MODEL_N']
    SAVE_MODEL_DIR = config['SAVE_MODEL_DIR']
    DATASET_DIR = config['DATASET_DIR']
    L_RATE = config['LEARNING_RATE']
    DECAY_RATE = config['DECAY_RATE']
    DECAY_EPOCHS = config['DECAY_EPOCHS']
    WEIGHT_DECAY = config['WEIGHT_DECAY']
    EPOCHS = config['EPOCHS']
    BATCH_SIZE = config['BATCH_SIZE']
    NUM_WORKERS = config['NUM_WORKERS']
    PIN_MEMORY = config['PIN_MEMORY']
    CSV_TRAIN = config['CSV_TRAIN']
    CSV_VAL = config['CSV_VAL']

    # set up model
    if MODEL == 'Darknet':
        # Use the locally computed device (the former `DEVICE` was undefined).
        model = YoloV1(grid_size=7, num_boxes=2, num_classes=20).to(device)
    else:
        # TODO: add the VGG backbone here. Fail with a clear message instead
        # of hitting an unbound `model` further down.
        raise NotImplementedError(
            "Model setup is only implemented for MODEL='Darknet', "
            'got {!r}'.format(MODEL))

    # Always defined, so the epoch range below cannot hit an unbound name
    # while checkpoint loading is still a TODO.
    start_epoch = 0
    if LOAD_MODEL:
        # TODO: load backbone and checkpoint, e.g.
        #   cfg_cp, start_epoch = load_checkpoint(LOAD_MODEL_FILE, model)
        # and, when the answer below is "yes", take LEARNING_RATE, DECAY_RATE,
        # DECAY_EPOCHS, WEIGHT_DECAY, BATCH_SIZE, NUM_WORKERS and PIN_MEMORY
        # from cfg_cp. Until then the answer is read but not used.
        val = input(
            'Do you want to use config from checkpoint? Answer "yes" or "no": '
        )
    else:
        model = init_weights(model)

    optimizer = optim.Adam(model.parameters(),
                           lr=L_RATE,
                           weight_decay=WEIGHT_DECAY)
    loss_fn = YoloLoss()
    loader_params = BATCH_SIZE, NUM_WORKERS, PIN_MEMORY, DATASET_DIR, CSV_TRAIN, CSV_VAL
    loader = get_dataloader(loader_params)

    # create folder to save models
    if SAVE_MODEL:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs('{}/{}'.format(SAVE_MODEL_DIR, MODEL), exist_ok=True)

    for epoch in range(start_epoch, EPOCHS + start_epoch):
        t = time()
        if (epoch + 1) % DECAY_EPOCHS == 0:
            L_RATE *= (1 - DECAY_RATE)
            # NOTE(review): building a new Adam instance also discards the
            # optimizer's accumulated state -- confirm this is intended.
            optimizer = optim.Adam(model.parameters(),
                                   lr=L_RATE,
                                   weight_decay=WEIGHT_DECAY)

        # print epoch number
        print_report(part='start', epoch=epoch)
        # train loop
        train_epoch(loader['train'], model, optimizer, device, loss_fn)

        # print metrics
        pred_bb, target_bb = get_bboxes(loader['train'],
                                        model,
                                        iou_threshold=0.5,
                                        threshold=0.4)
        train_map = mean_average_precision(pred_bb,
                                           target_bb,
                                           iou_threshold=0.5,
                                           box_format='midpoint')

        v_pred_bb, v_target_bb = get_bboxes(loader['val'],
                                            model,
                                            iou_threshold=0.5,
                                            threshold=0.4)
        val_map = mean_average_precision(v_pred_bb,
                                         v_target_bb,
                                         iou_threshold=0.5,
                                         box_format='midpoint')

        # The two leading -1 values stand in for the train/validation losses,
        # which are not computed here yet.
        metrics = -1, -1, train_map, val_map
        print_report(part='accuracy', metrics=metrics)

        # TODO: collect per-epoch losses/accuracies and, when SAVE_MODEL is
        # set, save a checkpoint:
        #   save_checkpoint(model=model, cfg=cfg, epoch=epoch, loss=round(val_loss, 3))

        # print time
        print_report(part='end', t=int(time() - t))
Esempio n. 7
0
def train_loop(cfg_path, gpu_n='0', stat_path='stat'):
    """Train a YOLOv1 model from a YAML config and log per-epoch stats to CSV.

    The YAML file at ``cfg_path`` is read with ``yaml.safe_load`` and printed.
    After every epoch the train/validation loss and mAP values are reported
    and appended as one row to ``<stat_path>/stat.csv``.

    Args:
        cfg_path: Path of the YAML configuration file.
        gpu_n: CUDA device index; used as ``cuda:<gpu_n>`` when the config
            enables the GPU and CUDA is available, otherwise the CPU is used.
        stat_path: Directory passed to ``create_stat`` and holding
            ``stat.csv``.

    Raises:
        NotImplementedError: If no checkpoint is loaded and ``MODEL`` is
            anything but ``'Darknet'`` (no other model can be built yet).
    """
    # get configs
    with open(cfg_path, 'r') as stream:
        config = yaml.safe_load(stream)
    print()
    print(config)
    print()
    # is_available is a function and has to be called: the bare function
    # object is always truthy, which selected CUDA even on CPU-only hosts.
    use_cuda = config['GPU'] and torch.cuda.is_available()
    device = torch.device('cuda:{}'.format(gpu_n) if use_cuda else 'cpu')
    dtype = torch.float32  # TODO: find out how it affects speed and accuracy
    MODEL = config['MODEL']
    LOAD_MODEL = config['LOAD_MODEL']
    LOAD_MODEL_FILE = config['LOAD_MODEL_FILE']
    SAVE_MODEL = config['SAVE_MODEL']
    SAVE_MODEL_N = config['SAVE_MODEL_N']
    SAVE_MODEL_DIR = config['SAVE_MODEL_DIR']
    DATASET_DIR = config['DATASET_DIR']

    EPOCHS = config['EPOCHS']
    BATCH_SIZE = config['BATCH_SIZE']
    NUM_WORKERS = config['NUM_WORKERS']
    PIN_MEMORY = config['PIN_MEMORY']
    CSV_TRAIN = config['CSV_TRAIN']
    CSV_VAL = config['CSV_VAL']

    OPTIMIZER = config['OPTIMIZER']

    # create stat file
    nid = create_stat(stat_path, config)

    # set up model
    S, B, C = 7, 2, 20  # TODO: add it to config

    if LOAD_MODEL:
        # TODO: load backbone
        model, cfg_save, epoch = load_checkpoint(
            LOAD_MODEL_FILE,
            device=device,
            S=S,
            B=B,
            C=C,
            cfg=config if MODEL == 'VGG16' else None)
        # TODO: init weight
    elif MODEL == 'Darknet':
        model = YoloV1(grid_size=S, num_boxes=B, num_classes=C).to(device)
    else:
        # TODO: add the VGG backbone here. Fail with a clear message instead
        # of hitting an unbound `model` on the next statement.
        raise NotImplementedError(
            "Building a fresh model is only implemented for MODEL='Darknet', "
            'got {!r}'.format(MODEL))
    # NOTE(review): the next two statements also run after a checkpoint was
    # loaded, so the loaded model goes through init_weights and the checkpoint
    # epoch is ignored -- confirm whether they belong to the non-checkpoint
    # path only.
    model = init_weights(model)
    start_epoch = 0

    loss_fn = YoloLoss()
    loader_params = BATCH_SIZE, NUM_WORKERS, PIN_MEMORY, DATASET_DIR, CSV_TRAIN, CSV_VAL
    loader = get_dataloader(loader_params, my_transforms)

    # create folder to save models
    if SAVE_MODEL:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs('{}/{}'.format(SAVE_MODEL_DIR, MODEL), exist_ok=True)
    losses = {'train': [], 'validate': []}
    accuracies = {'train': [], 'validate': []}

    optimizer = None
    opt_lr = None
    for epoch in range(start_epoch, EPOCHS + start_epoch):
        t = time()
        # The previous optimizer and learning rate are passed back in on
        # every epoch, alongside the epoch number.
        optimizer, opt_name, opt_lr = get_optimizer(optimizer, model,
                                                    OPTIMIZER, epoch, opt_lr)
        print_report(part='start', epoch=epoch)
        # train loop
        train_epoch(loader['train'], model, optimizer, device, loss_fn,
                    (opt_name, opt_lr))

        # print metrics -- both splits are evaluated with the same settings
        # (the validation call previously omitted S, B and C).
        metric_kwargs = dict(model=model,
                             iou_threshold=0.5,
                             threshold=0.4,
                             device=device,
                             loss_func=loss_fn,
                             S=S,
                             B=B,
                             C=C)
        train_loss, train_maps = get_metrics_NEW(loader=loader['train'],
                                                 **metric_kwargs)
        val_loss, val_maps = get_metrics_NEW(loader=loader['val'],
                                             **metric_kwargs)

        metrics = train_loss, val_loss, train_maps, val_maps
        print_report(part='accuracy', metrics=metrics)
        # collect metrics
        losses['train'].append(train_loss)
        losses['validate'].append(val_loss)
        accuracies['train'].append(np.mean(train_maps))
        accuracies['validate'].append(np.mean(val_maps))
        # write stats to CSV
        r = [
            nid,
            datetime.now(), epoch, train_loss, val_loss, train_maps, val_maps
        ]
        # newline='' lets the csv module control line endings; without it
        # extra blank rows appear on platforms that translate '\n'.
        with open('{}/stat.csv'.format(stat_path), 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(r)

        # TODO: save models when SAVE_MODEL is set:
        #   save_checkpoint(model=model, cfg=cfg, epoch=epoch, loss=round(val_loss, 3))

        # print time
        print_report(part='end', t=int(time() - t))
def _single_sample_frame(raw_input, columns):
    """Build a one-row DataFrame from a comma-separated feature string."""
    data = pd.DataFrame(data=[raw_input.split(',')], columns=columns)
    # Every value arrives as text; these three columns are cast to integers.
    for column in ('pagesCount', 'wordCount', 'fileSize'):
        data[column] = data[column].astype('int64')
    return data


def _load_pickled_model(path):
    """Load a pickled model from ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # come from a trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


def main(args):
    """Train, explain or predict with a random-forest classifier.

    The dataset at ``args.dataset_path`` is always loaded and split 80/20
    (fixed seed); the training part is also the reference passed to the
    ``utils`` preprocessing helpers in every mode.

    Args:
        args: Namespace providing ``dataset_path``, ``mode`` (``'train'``,
            ``'explain'`` or ``'predict'``), ``save_model_path`` and, for the
            explain/predict modes, ``input`` (comma-separated feature values
            in the dataset's column order). Any other mode does nothing
            beyond loading and splitting the data.
    """
    # load data
    X, y = utils.load_dataset(args.dataset_path)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)

    # Training mode
    if args.mode == 'train':
        print(f'Training data shape {X_train.shape}, {y_train.shape}')
        print(f'Test data shape {X_test.shape}, {y_test.shape}')

        print('Training class distrubution')
        class_counter = Counter(y_train)
        for k, v in class_counter.items():
            print(f'  Class={k}, Count={v}')

        # preprocess data
        # missing data, date engineer (date column), class imbalance, text data
        X_train_processed = utils.prepare_inputs(X_train, X_train)
        X_test_processed = utils.prepare_inputs(X_train, X_test)
        y_train_processed = utils.prepare_targets(y_train, y_train)
        y_test_processed = utils.prepare_targets(y_train, y_test)

        # try different classifiers, spot-checking which algorithm perform well
        print("Starting spot check")

        # evaluate each candidate model and summarize its performance
        models, names = utils.get_models()
        for candidate, name in zip(models, names):
            scores = utils.evaluate_model(X_train_processed, y_train_processed,
                                          candidate)
            print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

        print("End spot check")

        # get the best model
        print("Start model training")
        model = RandomForestClassifier(n_estimators=100)
        # fit the model
        model.fit(X_train_processed, y_train_processed)

        # save model (the context manager guarantees the file is flushed and
        # closed)
        print("Saving model")
        with open(args.save_model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        # evaluate the model
        y_train_preds = model.predict(X_train_processed)
        y_test_preds = model.predict(X_test_processed)

        # precision, recall, f-score for training for each category
        print("Evaluating training performance")
        utils.print_report(y_train_processed, y_train_preds, class_counter)

        # precision, recall, f-score for testing for each category
        print("Evaluating testing performance")
        utils.print_report(y_test_processed, y_test_preds, class_counter)

        # confusion matrix
        plot_confusion_matrix(model, X_test_processed, y_test_processed)
        plt.show()

        # feature_importances
        utils.plot_feature_importance(X_train_processed.columns, model)

    # Explain mode, how the classifier comes to the decision
    elif args.mode == 'explain':
        data = _single_sample_frame(args.input, X_train.columns)

        # process test data
        data_processed = utils.prepare_inputs(X_train, data)

        # load model
        model = _load_pickled_model(args.save_model_path)

        # Extract and plot a single tree (index 5, so the forest needs at
        # least six estimators)
        estimator = model.estimators_[5]

        plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
        tree.plot_tree(estimator,
                       feature_names=data_processed.columns,
                       filled=True)
        plt.show()

        # walk the decision path of the sample through every tree
        for index, member in enumerate(model.estimators_):
            print("Tree %d\n" % index)
            sample_id = 0
            fitted_tree = member.tree_
            utils.explore_tree(member,
                               fitted_tree.node_count,
                               fitted_tree.children_left,
                               fitted_tree.children_right,
                               fitted_tree.feature,
                               fitted_tree.threshold,
                               data_processed.columns,
                               data_processed,
                               sample_id=sample_id)

            prediction = [int(label) for label in member.predict(data_processed)]
            print(
                f'Prediction for sample {sample_id}: {utils.decode_targets(y_train, prediction)[sample_id]}'
            )

            print('\n' * 2)

    # Predict mode
    elif args.mode == 'predict':
        data = _single_sample_frame(args.input, X_train.columns)

        # process test data
        data_processed = utils.prepare_inputs(X_train, data)

        # load model
        model = _load_pickled_model(args.save_model_path)

        # predict data
        prediction = model.predict(data_processed)
        print(f'prediction : {utils.decode_targets(y_train, prediction)[0]}')
def _fit_and_report_time(label, estimator, features, targets, **fit_kwargs):
    """Fit ``estimator`` and print how long the fit took."""
    start_time = timeit.default_timer()
    estimator.fit(features, targets, **fit_kwargs)
    print("{classifer} took {time} to fit on X and y".format(
        classifer=label, time=(timeit.default_timer() - start_time)))


def _try_render_tree(decision_tree, feature_names, file_stem):
    """Best-effort graphviz rendering of a tree; return True on success."""
    try:
        dot_data = tree.export_graphviz(decision_tree,
                                        out_file=None,
                                        feature_names=feature_names)
        graphviz.Source(dot_data).render(file_stem)
    except Exception as exc:  # rendering is optional and must not abort training
        print(
            'if you wish to render the pruning graphs please install graphviz')
        print('(rendering failed: {!r})'.format(exc))
        return False
    return True


def classifer_creation(df_final, target_column):
    """Tune, build and fit five classifiers on ``df_final``.

    The frame is split 75/25 (fixed seed) into train and test parts. Parameter
    selection is run for an SVM, a decision tree, boosted trees, k-nearest
    neighbours and a neural network; the reports and best parameters are
    printed; learning curves are produced; each model is fitted on the
    training part with its fit time printed; finally the decision tree is
    pruned.

    Args:
        df_final: DataFrame holding the feature columns and the target column.
        target_column: Name of the target column in ``df_final``.

    Returns:
        Tuple ``(svc, dt, bt, knn, ANN, X_train, y_train, X_test, y_test)``.
    """
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.25,
                                                        random_state=42)

    # The targets are a single-column frame; the estimators get the flattened values.
    y_train_ravel = y_train.values.ravel()
    print(len(X_train.columns))
    svc_best_param, svc_report = svc_param_selection(X_train, y_train_ravel, 5)
    print('SVM finished')
    dt_best_param, dt_report = descision_tree_param_selection(
        X_train, y_train_ravel, 5)
    print('Descision tree finished')
    # The boosted-tree search additionally receives the decision tree's best parameters.
    bt_best_param, bt_report = boosted_tree_param_selection(
        X_train, y_train_ravel, dt_best_param, 5)
    print('Boosted tree finished')
    knn_best_param, knn_report = k_nearest_neighbors_param_selection(
        X_train, y_train_ravel, 5)
    print('KNN finished')
    ANN_best_param, ANN_report = ANN_param_selection(X_train, y_train_ravel, 5)
    print('ANN finished')

    print_report(svc_report, dt_report, bt_report, knn_report, ANN_report)
    params = {
        'SVC': svc_best_param,
        'DecisionTree': dt_best_param,
        'BoostedTrees': bt_best_param,
        'K-NearestNeighbors': knn_best_param,
        'NeuralNetworks': ANN_best_param
    }

    for classifier_name, best_params in params.items():
        print('{classifer}: {params}'.format(classifer=classifier_name,
                                             params=best_params))

    svc = svm.SVC(C=svc_best_param['C'],
                  gamma=svc_best_param['gamma'],
                  kernel=svc_best_param['kernel'])

    dt = DecisionTreeClassifier(
        max_depth=dt_best_param['max_depth'],
        min_samples_split=dt_best_param['min_samples_split'],
        min_samples_leaf=dt_best_param['min_samples_leaf'],
        max_features=dt_best_param['max_features'])

    # check for pruning
    max_prune, prune = get_prune(dt, X_train, y_train, X_test, y_test)
    print(
        'Pruning with highest score: {max_prune}, Pruning with acceptable loss in accuracy: {prune}'
        .format(max_prune=max_prune, prune=prune))

    bt = GradientBoostingClassifier(
        max_depth=bt_best_param['max_depth'],
        min_samples_split=bt_best_param['min_samples_split'],
        min_samples_leaf=bt_best_param['min_samples_leaf'],
        max_features=bt_best_param['max_features'],
        learning_rate=bt_best_param['learning_rate'],
        n_estimators=bt_best_param['n_estimators'])

    knn = KNeighborsClassifier(n_neighbors=knn_best_param['n_neighbors'],
                               p=knn_best_param['p'])

    ANN = create_model(len(X_train.columns))

    learning_curves(svc, dt, bt, knn, ANN, ANN_best_param, X, Y.values.ravel(),
                    target_column)

    for label, estimator in (('svc', svc), ('dt', dt), ('bt', bt),
                             ('knn', knn)):
        _fit_and_report_time(label, estimator, X_train, y_train_ravel)
    _fit_and_report_time('ann',
                         ANN,
                         X_train,
                         y_train_ravel,
                         epochs=ANN_best_param['epochs'],
                         batch_size=ANN_best_param['batch_size'],
                         verbose=0)

    # PRUNE THE MODEL
    # Only the graph rendering is best-effort. The pruning itself used to sit
    # inside the same bare `except`, so it was silently skipped whenever
    # rendering failed; it now always runs and its errors surface.
    rendered = _try_render_tree(dt, X_test.columns,
                                "pre-pruning-" + target_column)
    post_pruning(dt.tree_, 0, max_prune)
    if rendered:
        _try_render_tree(dt, X_test.columns, "post-pruning-" + target_column)

    return svc, dt, bt, knn, ANN, X_train, y_train, X_test, y_test
            resources = gscatalog.get_resources(workspace=ws)
            for res in resources:
                fullname = ws.name + ":" + res.name
                if args.item == res.name or args.item == fullname:
                    resource_found = res
                    break
            if resource_found is not None:
                break
        # Still not found ? trying on the layergroups
        # TODO: Cannot update layergroups properties
        # if resource_found is None:
        #     lgroups = gscatalog.get_layergroups()
        #     for lg in lgroups:
        #         if lg.name == args.item:
        #             resource_found = lg
        #             break
        # resource not found in the whole GeoServer
        if resource_found is None:
            logger.error("Ressource \"%s\" not found." % args.item)
            sys.exit()
        # Actually process the provided resources
        else:
            logger.debug("Resource \"%s\" found, processing ..." % resource_found.name)
            try:
                layer = gscatalog.get_layer(resource_found.workspace.name + ":" + resource_found.name)
                gn_to_gs_fix(layer, resource_found, args.dry_run, creds, args.disable_ssl_verification)
            except Inconsistency as e:
                errors.append(e)
    print_report(logger, errors)

Esempio n. 11
0
# Training loop: run `num_episodes` episodes, storing every transition in the
# replay memory and optimising the Q-network after each environment step.
frames_total = 0
for episode in range(num_episodes):
    state = env.reset()
    score = 0
    done = False
    while not done:
        frames_total += 1
        # Epsilon is recomputed from the total number of frames seen so far.
        epsilon = calculate_epsilon(frames_total)
        action = qnet_agent.select_action(state, epsilon)
        new_state, reward, done, info = env.step(action)
        score += reward

        memory.push(state, action, new_state, reward, done)
        qnet_agent.optimize()
        # env.render()
        state = new_state

    # Episode finished: record the score and refresh the plot.
    # NOTE(review): solved_after is overwritten on every episode, so it always
    # ends up as the index of the last episode -- confirm the intended
    # "solved" criterion.
    solved_after = episode
    rewards_total.append(score)
    plot_results(rewards_total)
    # Average over the episodes actually available: dividing by a fixed 100
    # under-reported the mean while fewer than 100 episodes had finished.
    recent_rewards = rewards_total[-100:]
    mean_reward_100 = sum(recent_rewards) / len(recent_rewards)
    if episode % report_interval == 0 and episode > 0:
        print_report(episode, report_interval, rewards_total,
                     mean_reward_100, epsilon, frames_total)

print("Average reward: %.2f" % (sum(rewards_total) / num_episodes))
recent_rewards = rewards_total[-100:]
print("Average reward (last 100 episodes): ",
      (sum(recent_rewards) / len(recent_rewards)))
print("Solved after %i episodes" % solved_after)
env.close()