Example #1
def main():
    """
    main processing module
    """
    options = parse_command_line()
    initialize_logging(options.log_name)
    log = logging.getLogger("main")
    log.info("program starts")

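    # halt_event is shared with the SIGTERM handler below, so a signal
    # can stop every customer greenlet cooperatively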
    halt_event = Event()
    gevent.signal(signal.SIGTERM, _handle_sigterm, halt_event)

    log.info("loading test script from %r" % (options.test_script, ))
    with open(options.test_script, "rt") as input_file:
        test_script = json.load(input_file)

    log.info("loading user identity files from %r" % (
        options.user_identity_dir, 
    ))
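    # build one GreenletCustomer per identity file matching the
    # "motoboto ... benchmark" naming pattern; the start times are
    # staggered below so the customers do not all begin at once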
    customer_list = list()
    for file_name in os.listdir(options.user_identity_dir):
        if not ('motoboto' in file_name and 'benchmark' in file_name):
            continue
        if options.max_users is not None \
        and len(customer_list) >= options.max_users:
            log.info("breaking at %s users" % (options.max_users, ))
            break

        log.info("loading %r" % (file_name, ))
        user_identity = load_identity_from_file(
            os.path.join(options.user_identity_dir, file_name)
        )
        customer = GreenletCustomer(halt_event, user_identity, test_script)
        customer.link_exception(_unhandled_greenlet_exception)
        customer.start_later(random.uniform(0.0, 15.0))
        customer_list.append(customer)

    log.info("waiting")
    try:
        halt_event.wait(options.test_duration)
    except KeyboardInterrupt:
        log.info("KeyBoardInterrupt")

    if test_script.get("audit-after", False):
        print >>sys.stderr, "run redis_stats_collector, press return when done"
        raw_input("waiting...")

    log.info("setting halt event")
    halt_event.set()
    
    total_error_count = 0
    log.info("joining")
    for customer in customer_list:
        customer.join()
        total_error_count += customer.error_count
    
    log.info("program ends {0} total errors, {1} unhandled exceptions".format(
        total_error_count, _unhandled_exception_count))
    return 0
Example #3
    all_rows = []
    for ind, game in enumerate(games_list):
        try:
            get_logger().info("match", match=game, date=date)
            Match_fields = df_all[df_all['event_name'] == game]
            row = get_MatchOdds(Match_fields, game, minutes_ahead)
            all_rows.append(row)
        except Exception:
            get_logger().error(traceback.format_exc())
    df_sql = pd.DataFrame(all_rows, columns=columns)
    return df_sql


if __name__ == "__main__":

    initialize_logging("favorites")
    engine = create_engine("mysql://{user}:{pw}@localhost/{db}".format(
        user="******", pw="Betfair", db="betfair"))
    data_loader = Loader()
    result = data_loader.load_data_by_date()
    i = 0
    for data in result:
        i += 1
        try:
            # df = process_day(data, first_IP_else_last_PE.get_MatchOdds_first_IP_else_last_PE)  # 8/11/2017
            # df.to_sql(con = engine, name = "favorite_first_IP_last_PE", if_exists = "append")
            # df = process_day(data, first_IP_else_last_PE.get_MatchOdds_last_PE_else_first_IP)  # 8/11/2017
            # df.to_sql(con = engine, name = "favorite_last_PE_first_IP", if_exists = "append")
            df = process_day(
                data,
                first_IP_else_last_PE.get_MatchOdds_last_PE_else_first_IP,
            )
            # the snippet is cut off here in the source; judging from the
            # commented-out lines above, the result was presumably stored
            # with something like:
            # df.to_sql(con=engine, name="favorite_last_PE_first_IP", if_exists="append")
Example #4
def main():

    # a few hard-coded arguments
    percentage_features = 0.1
    top_features = 5

    common.initialize_logging()
    args = parse_command_line()

    logging.info("Reading files in directory \"%s\"..." % args.directory)

    file_names = [f for f in os.listdir(args.directory) if f.endswith(".csv")]

    if args.files:
        logging.info(
            "Only files with the following patterns will be considered: %s" %
            str(args.files))

        selected_file_names = []
        for pattern in args.files:
            selected_file_names.extend(fnmatch.filter(file_names, pattern))

        file_names = selected_file_names

    logging.info("A total of %d files was selected: %s" %
                 (len(file_names), str(file_names)))

    logging.info("Reading files and merging information...")
    all_rankings = dict()
    for f in file_names:

        df = pandas.read_csv(os.path.join(args.directory, f), sep=',')
        all_rankings[f] = df.values

    n_features = len(all_rankings[file_names[0]])
    logging.info("The total number of features is %d" % n_features)

    top_features_percentage = math.ceil(percentage_features * n_features)

    logging.info(
        "Now evaluating features that are in the top %.2f%% (%d) or in the top %d"
        % (percentage_features * 100, top_features_percentage, top_features))

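    # for every feature, count how many ranking files place it within the
    # top percentage threshold and within the absolute top_features cutoff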
    features_dictionary = {
        f: {
            'top': 0,
            'percentage': 0
        }
        for f in all_rankings[file_names[0]][:, 0]
    }
    for f in all_rankings:

        for j in range(0, max(top_features_percentage, top_features)):

            if j < top_features_percentage:
                features_dictionary[all_rankings[f][j, 0]]['percentage'] += 1
            if j < top_features:
                features_dictionary[all_rankings[f][j, 0]]['top'] += 1

    # and now, some sorting
    list_features_percentage = sorted(
        [[f, features_dictionary[f]['percentage']]
         for f in features_dictionary],
        key=lambda x: x[1],
        reverse=True)
    list_features_top = sorted([[f, features_dictionary[f]['top']]
                                for f in features_dictionary],
                               key=lambda x: x[1],
                               reverse=True)

    logging.info(
        "Features that appear most frequently among the top %.2f%%: %s" %
        (percentage_features * 100, str(list_features_percentage)))
    logging.info("Features that appear most frequently among the top %d: %s" %
                 (top_features, str(list_features_top)))

    return
Example #5
def main():

    # let's create a folder with a unique name to store results
    folderName = datetime.datetime.now().strftime(
        "%Y-%m-%d-%H-%M") + "-regression"
    if not os.path.exists(folderName): os.makedirs(folderName)

    # initialize logging
    common.initialize_logging(folderName)

    regressorsList = [

        # human-designed regressors
        [
            HumanRegressor("y = a_0 + a_1 * x + a_2 * x**2 + a_3 * x**3",
                           map_variables_to_features={"x": 0}),
            "HumanRegressor"
        ],
        [PolynomialRegressor(2), "PolynomialRegressor2"],
        #[PolynomialRegressor(3), "PolynomialRegressor3"],
        # keras neural network
        #[ANNRegressor(epochs=500, batch_size=32, layers=[16,4]), "KerasRegressor8-4"],
        #[ANNRegressor(epochs=700, batch_size=32, layers=[16,8]), "KerasRegressor16-8"],

        # cross decomposition
        [PLSRegression(), "PLSRegression"],

        # ensemble
        [AdaBoostRegressor(), "AdaBoostRegressor"],
        [BaggingRegressor(), "BaggingRegressor"],
        [BaggingRegressor(n_estimators=100), "BaggingRegressor_100"],
        [BaggingRegressor(n_estimators=300), "BaggingRegressor_300"],
        [ExtraTreesRegressor(), "ExtraTreesRegressor"],
        [GradientBoostingRegressor(), "GradientBoostingRegressor"],
        [RandomForestRegressor(), "RandomForestRegressor"],
        [RandomForestRegressor(n_estimators=100), "RandomForestRegressor_100"],
        [RandomForestRegressor(n_estimators=300), "RandomForestRegressor_300"],

        # isotonic
        #[IsotonicRegression(), "IsotonicRegression"], # apparently wants "X" as a 1d array

        # kernel ridge
        [KernelRidge(), "KernelRidge"],

        # linear
        #[ARDRegression(), "ARDRegression"], # takes too much time to train
        [BayesianRidge(), "BayesianRidge"],
        [ElasticNetCV(), "ElasticNetCV"],
        [LarsCV(), "LarsCV"],
        [LassoCV(), "LassoCV"],
        [LinearRegression(), "LinearRegression"],
        [PassiveAggressiveRegressor(), "PassiveAggressiveRegressor"],

        # neighbors
        [KNeighborsRegressor(), "KNeighborsRegressor"],
        [RadiusNeighborsRegressor(), "RadiusNeighborsRegressor"],

        # neural networks
        #[BernoulliRBM(), "BernoulliRBM"], # has a different interface, no "predict"

        # svm
        [SVR(), "SVR"],
        [LinearSVR(), "LinearSVR"],
        [NuSVR(), "NuSVR"],

        # tree
        [DecisionTreeRegressor(max_depth=10), "DecisionTreeRegressor (max depth 10)"],
        [ExtraTreeRegressor(), "ExtraTreeRegressor"],

        # generalized additive models
        [LinearGAM(n_splines=20), "LinearGAM(n_splines=20)"],

        # gaussian processes
        [
            GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
            "GaussianProcessRegressor"
        ],
    ]

    X = y = X_train = X_test = y_train = y_test = variablesX = variablesY = None
    numberOfSplits = 10  # TODO change number of splits from command line

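    # dataset selection: exactly one of the following blocks should have
    # its condition flipped to True, the rest stay disabled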
    if True:
        # this is just a dumb benchmark
        X, y, variablesX, variablesY = common.loadEasyBenchmark()

    if False:
        X, y, variablesX, variablesY = common.loadChristianQuestionnaireRegression(
        )

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration2(
            "TIMBER")

    if False:
        X, y, variablesX, variablesY = common.loadLaurentBouvierNewData()

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration()

    if False:
        from sklearn.datasets import load_linnerud
        X, y = load_linnerud(return_X_y=True)

    if False:
        X, y, variablesX, variablesY = common.loadYingYingData()

    if False:
        X, y, variablesX, variablesY = common.loadCleaningDataGermanSpecific()
        #X, y, variablesX, variablesY = common.loadCleaningDataGerman()

    if False:
        X, y, variablesX, variablesY = common.loadInsects()

    if False:
        X, y, variablesX, variablesY = common.loadMilkProcessPipesDimensionalAnalysis(
        )
        #X, y, variablesX, variablesY = common.loadMilkProcessPipes()

    if False:  # ecosystem services
        X, y, variablesX, variablesY = common.loadEcosystemServices()

    if False:
        X, y, variablesX, variablesY = common.loadMarcoSoil()

    if False:
        # load dataset
        X, y = common.loadEureqaRegression()
        # randomly split between training and test
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadBiscuitExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadAromoptiExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    logging.info(
        "Regressing %d output variables, in function of %d input variables..."
        % (y.shape[1], X.shape[1]))

    # if the names of the variables are not specified, let's specify them!
    if variablesY is None:
        variablesY = ["y" + str(i) for i in range(0, len(y[0]))]
    if variablesX is None:
        variablesX = ["X" + str(i) for i in range(0, len(X[0]))]

    performances = dict()

    for variableIndex, variableY in enumerate(variablesY):

        logging.info("** Now evaluating models for variable \"%s\"... **" %
                     variableY)

        # obtain data
        y_ = y[:, variableIndex].ravel()

        # assume here that you will have train/test indexes instead
        # it's also easier for the plots, as we do not face the issue
        # of duplicate values (e.g. same value with two indexes)
        rs = ShuffleSplit(n_splits=numberOfSplits, random_state=42)
        #rs = LeaveOneOut()

        # initialize performance dictionary of arrays
        performances[variableY] = dict()
        for regressor, regressorName in regressorsList:
            performances[variableY][regressorName] = dict()
            performances[variableY][regressorName]["r^2"] = []
            performances[variableY][regressorName]["e.v"] = []
            performances[variableY][regressorName]["mse"] = []
            performances[variableY][regressorName]["mae"] = []
            performances[variableY][regressorName]["predicted"] = []

        # this is used to store all values of each fold, in order; maybe there's a smarter way to do it
        foldPointsInOrder = []

        # and now, for every regressor
        for foldIndex, indexes in enumerate(rs.split(X)):

            train_index, test_index = indexes

            X_train = X[train_index]
            y_train = y_[train_index]
            X_test = X[test_index]
            y_test = y_[test_index]

            # normalize
            logging.info("Normalizing data...")
            scalerX = StandardScaler()
            scalerY = StandardScaler()

            X_train = scalerX.fit_transform(X_train)
            X_test = scalerX.transform(X_test)

            y_train = scalerY.fit_transform(y_train.reshape(-1, 1)).ravel(
            )  # this "reshape/ravel" here is just to avoid warnings, it has no true effect on data
            y_test = scalerY.transform(y_test.reshape(-1, 1)).ravel()

            # now, we store points of the folder in order of how they appear
            foldPointsInOrder.extend(list(scalerY.inverse_transform(y_test)))

            for regressorIndex, regressorData in enumerate(regressorsList):

                regressor = regressorData[0]
                regressorName = regressorData[1]

                logging.info("Fold #%d/%d: training regressor #%d/%d \"%s\"" %
                             (foldIndex + 1, numberOfSplits, regressorIndex +
                              1, len(regressorsList), regressorName))

                try:
                    regressor.fit(X_train, y_train)

                    y_test_predicted = regressor.predict(X_test)
                    r2Test = r2_score(y_test, y_test_predicted)
                    mseTest = mean_squared_error(y_test, y_test_predicted)
                    maeTest = mean_absolute_error(y_test, y_test_predicted)
                    varianceTest = explained_variance_score(
                        y_test, y_test_predicted)

                    logging.info("R^2 score (test): %.4f" % r2Test)
                    logging.info("EV score (test): %.4f" % varianceTest)
                    logging.info("MSE score (test): %.4f" % mseTest)
                    logging.info("MAE score (test): %.4f" % maeTest)

                    # add performance to the list of performances
                    performances[variableY][regressorName]["r^2"].append(
                        r2Test)
                    performances[variableY][regressorName]["e.v"].append(
                        varianceTest)
                    performances[variableY][regressorName]["mse"].append(
                        mseTest)
                    performances[variableY][regressorName]["mae"].append(
                        maeTest)
                    # also record the predictions, to be used later in a global figure
                    performances[variableY][regressorName]["predicted"].extend(
                        list(scalerY.inverse_transform(y_test_predicted)))

                    try:
                        import matplotlib.pyplot as plt

                        # plotting first figure, with points 'x' and 'o'
                        y_predicted = regressor.predict(scalerX.transform(
                            X))  # 'X' was never wholly rescaled before
                        y_train_predicted = regressor.predict(X_train)

                        plt.figure()

                        plt.scatter(train_index,
                                    y_train,
                                    c="gray",
                                    label="training data")
                        plt.scatter(test_index,
                                    y_test,
                                    c="green",
                                    label="test data")

                        plt.plot(np.arange(len(y_predicted)),
                                 y_predicted,
                                 'x',
                                 c="red",
                                 label="regression")
                        plt.xlabel("order of data samples")
                        plt.ylabel("target")
                        plt.title(regressorName + ", R^2=%.4f (test)" % r2Test)
                        plt.legend()

                        logging.info("Saving figure...")
                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + ".pdf"))
                        plt.close()

                        # plotting second figure, with everything close to a middle line
                        plt.figure()

                        plt.plot(y_train,
                                 y_train_predicted,
                                 'r.',
                                 label="training set")  # points
                        plt.plot(y_test,
                                 y_test_predicted,
                                 'go',
                                 label="test set")  # points
                        plt.plot([
                            min(y_train.min(), y_test.min()),
                            max(y_train.max(), y_test.max())
                        ],
                                 [
                                     min(y_train_predicted.min(),
                                         y_test_predicted.min()),
                                     max(y_train_predicted.max(),
                                         y_test_predicted.max())
                                 ], 'k--')  # line

                        plt.xlabel("measured")
                        plt.ylabel("predicted")
                        plt.title(regressorName + " measured vs predicted, " +
                                  variableY)
                        plt.legend(loc='best')

                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + "-b.pdf"))
                        plt.close()

                        # also, save ordered list of features
                        featuresByImportance = relativeFeatureImportance(
                            regressor)

                        # if list exists, write feature importance to disk
                        # TODO horrible hack here, to avoid issues with GAM
                        if len(featuresByImportance
                               ) > 0 and "GAM" not in regressorName:
                            featureImportanceFileName = regressorName + "-" + variableY + "-featureImportance-fold" + str(
                                foldIndex) + ".csv"
                            with open(
                                    os.path.join(folderName,
                                                 featureImportanceFileName),
                                    "w") as fp:
                                fp.write("feature,importance\n")
                                for featureImportance, featureIndex in featuresByImportance:
                                    fp.write(variablesX[int(featureIndex)] +
                                             "," + str(featureImportance) +
                                             "\n")

                    except ImportError:
                        logging.info(
                            "Cannot import matplotlib. Skipping plots...")

                except Exception as e:
                    logging.info("Regressor \"" + regressorName +
                                 "\" failed on variable \"" + variableY +
                                 "\":" + str(e))

    logging.info("Final summary:")
    with open(os.path.join(folderName, "00_summary.txt"), "w") as fp:

        for variableY in variablesY:

            logging.info("For variable \"" + variableY + "\"")
            fp.write("For variable: " + variableY + " = f(" + variablesX[0])
            for i in range(1, len(variablesX)):
                fp.write("," + variablesX[i])
            fp.write(")\n")

            # create a list from the dictionary and sort it
            sortedPerformances = sorted(
                [(performances[variableY][regressorName], regressorName)
                 for regressorName in performances[variableY]],
                key=lambda x: np.mean(x[0]["r^2"]),
                reverse=True)

            for regressorData in sortedPerformances:
                regressorName = regressorData[1]
                regressorScore = regressorData[0]

                r2Mean = np.mean(regressorScore["r^2"])
                r2std = np.std(regressorScore["r^2"])

                varianceMean = np.mean(regressorScore["e.v"])
                varianceStd = np.std(regressorScore["e.v"])

                mseMean = np.mean(regressorScore["mse"])
                mseStd = np.std(regressorScore["mse"])

                maeMean = np.mean(regressorScore["mae"])
                maeStd = np.std(regressorScore["mae"])

                logging.info(
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)"
                    % (regressorName, r2Mean, r2std, varianceMean, varianceStd,
                       mseMean, mseStd, maeMean, maeStd))

                fp.write(
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)\n"
                    % (regressorName, r2Mean, r2std, varianceMean, varianceStd,
                       mseMean, mseStd, maeMean, maeStd))

                fp.write("\t\t- R^2:" +
                         str(["%.4f" % x
                              for x in regressorScore["r^2"]]) + "\n")
                fp.write("\t\t- E.V.:" +
                         str(["%.4f" % x
                              for x in regressorScore["e.v"]]) + "\n")
                fp.write("\t\t- MSE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mse"]]) + "\n")
                fp.write("\t\t- MAE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mae"]]) + "\n")

                # also, plot a "global" graph
                # issue here, if a regressor fails, you have incongruent matrixes: a check is in order
                # TODO also, the plot looks really bad if some values are negative; turn everything to absolute values?
                if len(foldPointsInOrder) == len(regressorScore["predicted"]):
                    fig = plt.figure()
                    ax = fig.add_subplot(111)

                    #bottom_left_corner = [min(foldPointsInOrder), max(foldPointsInOrder)]
                    #top_right_corner = [min(regressorScore["predicted"]), max(regressorScore["predicted"])]
                    x_bottom_top = [0, max(foldPointsInOrder)]
                    y_bottom_top = [0, max(foldPointsInOrder)]

                    ax.plot(foldPointsInOrder, regressorScore["predicted"],
                            'g.')  # points
                    ax.plot(x_bottom_top, y_bottom_top, 'k--',
                            label="1:1")  # line
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 1.20, y_bottom_top[1] * 1.20],
                            'r--',
                            label="20% error")
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 0.80, y_bottom_top[1] * 0.80],
                            'r--')

                    ax.set_title(regressorName + " measured vs predicted, " +
                                 variableY + " (all test)")
                    ax.set_xlabel("measured")
                    ax.set_ylabel("predicted")
                    ax.legend(loc='best')

                    plt.savefig(
                        os.path.join(
                            folderName,
                            regressorName + "-" + variableY + "-global-b.png"))
                    plt.close(fig)
Example #6
from strategy_handlers.strategies.DrawChaser import DrawChaser
from strategy_handlers.strategies_manager import strategy_manager
from strategy_handlers.strategies.marketMaker import MarketMaker
from common import initialize_logging
if __name__ == "__main__":
    initialize_logging("draw_better")
    thresh_draw = 2
    sm = strategy_manager(DrawChaser,
                          number_threads=1,
                          thresh_draw=thresh_draw,
                          event_id="28422314")
    sm.manage_strategies()
Example #7
def main():
    """
    main processing module
    """
    options = parse_command_line()
    initialize_logging(options.log_name)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    log.info("using test script %r" % (options.test_script, ))

    program_dir = os.path.dirname(__file__)
    program_path = os.path.join(program_dir, "customer_process.py")

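    # spawn one customer_process.py child per identity file, passing a
    # stripped-down environment limited to PYTHONPATH and the nimbus.io
    # settings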
    customer_process_list = list()
    for file_name in os.listdir(options.user_identity_dir):
        if options.max_users is not None \
        and len(customer_process_list) >= options.max_users:
            log.info("breaking at %s users" % (options.max_users, ))
            break

        log.info("user identity %r" % (file_name, ))
        user_identity_path = os.path.join(options.user_identity_dir, file_name)

        args = [
            sys.executable, program_path, options.test_script,
            user_identity_path
        ]

        environment = {
            "PYTHONPATH"                : os.environ["PYTHONPATH"],
            "NIMBUSIO_LOG_DIR"          : os.environ["NIMBUSIO_LOG_DIR"],
            "NIMBUS_IO_SERVICE_HOST"    : os.environ["NIMBUS_IO_SERVICE_HOST"],
            "NIMBUS_IO_SERVICE_PORT"    : os.environ["NIMBUS_IO_SERVICE_PORT"],
            "NIMBUS_IO_SERVICE_DOMAIN"  : \
                os.environ["NIMBUS_IO_SERVICE_DOMAIN"],
            "NIMBUS_IO_SERVICE_SSL"     : os.environ.get(
                "NIMBUS_IO_SERVICE_SSL", "0"
            )
        }

        process = subprocess.Popen(args, env=environment)
        customer_process_list.append(process)

    log.info("waiting")
    try:
        halt_event.wait(options.test_duration)
    except KeyboardInterrupt:
        log.info("KeyBoardInterrupt")
        halt_event.set()

    log.info("terminating processes")
    for process in customer_process_list:
        process.terminate()

    log.info("waiting for processes")
    for process in customer_process_list:
        process.wait()
        if process.returncode != 0:
            log.error("process returncode %s" % (process.returncode, ))

    log.info("program ends")
    return 0
Example #8
from strategy_handlers.strategies_manager import strategy_manager
from strategy_handlers.strategies.marketMaker import MarketMaker
from common import initialize_logging
if __name__ == "__main__":
    initialize_logging("market_maker")
    sm = strategy_manager(MarketMaker)
    sm.manage_strategies()
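Every example on this page delegates logger setup to a project-specific initialize_logging helper that is defined elsewhere in each repository and not shown here. For orientation only, a minimal sketch of such a helper might look like the following; the function name and the single-argument call pattern come from the examples above, while the handler, format, and level choices are assumptions:

import logging

def initialize_logging(log_name=None, level=logging.INFO):
    # hedged sketch, not the helper actually used by these projects:
    # log to "<log_name>.log" when a name is given, otherwise to stderr
    if log_name is not None:
        handler = logging.FileHandler("%s.log" % log_name)
    else:
        handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(
        "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"))
    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    root_logger.addHandler(handler)

Once the root logger is configured this way, calls such as logging.getLogger("main") in the examples above inherit the handler and level.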
Example #9
    def __init__(self, client):
        initialize_secdb()
        initialize_logging("db_recorder")
        self.betfair_client = client
        self.db_client = DBQuery()
Example #10
from common import initialize_logging
from strategy_handlers_under_goals.priceChaser import PriceChaser
from strategy_handlers_under_goals.utils import authenticate

initialize_logging("testing_price_chaser")

client = authenticate()

market_id = "1.132089559"
selection_id = 5851482

pc = PriceChaser(client, market_id, selection_id)
# pc.chasePrice(1000, 10)
orders = pc.get_betfair_matches()

print("here")
Example #11
from os.path import join

from common import initialize_logging, ROOT_DIR
from predictors.RFRPredictor import RFRPredictor
from strategy_handlers.strategies.UnderGoalsTimer import UnderGoalsTimer
from strategy_handlers.strategies_manager import strategy_manager

if __name__ == "__main__":
    initialize_logging("under_goals_2")

    time_limit = 5
    min_odds = 1.1
    max_odds = 4
    market_under_goals = 2
    min_vol = 0
    stake = 4
    number_parallel_strategies = 10
    # the country filter is currently disabled; the list is kept for reference
    # market_countries = ["GB", "ES", "DE", "IT", "PT", "FR", "BR", "NL", "BE"]
    market_countries = None
    sm = strategy_manager(UnderGoalsTimer,
                          event_id=None,
                          number_threads=number_parallel_strategies,
                          timer=time_limit,
                          market_under_goals=market_under_goals,
                          stake=stake,
                          min_odds=min_odds,
                          min_vol=min_vol,
                          market_countries=market_countries)
    sm.manage_strategies()
Example #12
from os.path import join

from common import initialize_logging, ROOT_DIR
from predictors.RFRPredictor import RFRPredictor
from strategy_handlers.strategies.MLPredictor import MLPredictor
from strategy_handlers.strategies_manager import strategy_manager

if __name__ == "__main__":
    initialize_logging("machine_learning_predictor")

    # join path components separately instead of embedding backslashes,
    # which are not valid escape sequences in the original strings
    path_models = join(ROOT_DIR, "predictors", "RFPPredictorModels", "predictor")
    path_encoder = join(ROOT_DIR, "predictors", "RFPPredictorModels", "encoder")
    runners = ["1", "x", "2"]
    min_odds = 1.1
    max_odds = 4
    min_pred = 0.5
    scale_with_pred = True
    stake = 4

    predictor = RFRPredictor(path_models,
                             path_encoder,
                             runners,
                             stake=stake,
                             scale_with_pred=scale_with_pred,
                             min_odds=min_odds,
                             max_odds=max_odds,
                             min_pred=min_pred)
    sm = strategy_manager(MLPredictor,
                          number_threads=100,
                          predictor=predictor,
                          max_odds=max_odds)
    # the snippet is cut off here in the source; the remaining keyword
    # arguments are unknown, but every other example on this page ends by
    # starting the manager:
    sm.manage_strategies()
Example #13
from common import initialize_logging
from data.hist_trades_export import Recorder
from data.sql_wrapper.connection import initialize_secdb

if __name__ == "__main__":
    initialize_secdb()
    initialize_logging("hist_betfair_data")

    json_trades_recorder = Recorder()
    json_trades_recorder.read_files()
Example #15
def file_generator(chunk):
    # reconstructed header: the snippet starts mid-function, but the call
    # below, file_generator(8), implies this signature; "outputpath" is
    # defined elsewhere in the original module. The original accumulator
    # was named "list", shadowing the builtin; it is renamed here.
    file_list = []
    i = 0
    for root, dirs, files in os.walk(
            "E:\\Betfair Data JSON\\data\\xds\\historic\\BASIC"):
        for name in files:
            if name[1] == '.':
                filepath = os.path.join(root, name)
                newfilepath = os.path.join(outputpath, name + '.decompressed')
                file_list.append((filepath, newfilepath))
                i += 1

                if i == chunk:
                    yield file_list
                    file_list = []
                    i = 0


if __name__ == "__main__":
    initialize_secdb()
    initialize_logging("decompress_betfair_data_2")

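    # decompress the files in batches: one process per file, joining the
    # whole batch before starting the next one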
    for list_files in file_generator(8):
        jobs = []
        for files in list_files:
            p = multiprocessing.Process(target=decompress_file, args=files)
            jobs.append(p)
            p.start()
        for p in jobs:
            p.join()

Example #16
        self._released = 0
        self._rejected = 0


# Main flow for sender app
if __name__ == "__main__":

    # parsing arguments
    parsed_opts, args = common.parse_opts(True)

    # same message body will be used by all sender instances
    message_body = common.generate_message_body(
        parsed_opts.message_size, "abcdedfgijklmnopqrstuvwxyz0123456789")

    # initializing logging
    common.initialize_logging(parsed_opts.log_level)

    # list of spawned sender processes
    processes = list()

    # Interrupts all running senders
    def interrupt_handler(sig, f):
        global interrupted
        interrupted = True
        for sender in processes:
            sender.interrupt()

    # Capturing SIGINT
    signal.signal(signal.SIGINT, interrupt_handler)
    signal.signal(signal.SIGTERM, interrupt_handler)
Example #17
def main() :
	
    # TODO argparse? maybe divide into "fast", "exhaustive", "heuristic"; also add option to specify file from command line (?)
    # hard-coded values here
    n_splits = 10
    final_report_file_name = "00_final_report.txt"
    
    # create uniquely named folder
    folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + "-classification" 
    if not os.path.exists(folder_name) : os.makedirs(folder_name)
    
    # start logging
    common.initialize_logging(folder_name)
    
    # this part can be used by some case studies, storing variable names
    variableY = variablesX = None

    # get data
    logging.info("Loading data...")
    #X, y, variablesX, variablesY = common.loadRallouData() # TODO replace here to load different data
    #X, y, variablesX, variablesY = common.loadCoronaData()
    #X, y, variablesX, variablesY = common.loadXORData()
    #X, y, variablesX, variablesY = common.loadMl4Microbiome()
    X, y, variablesX, variablesY = common.loadMl4MicrobiomeCRC()
    variableY = variablesY[0]

    logging.info("Shape of X: " + str(X.shape))
    logging.info("Shape of y: " + str(y.shape))
    
    # also take note of the classes, they will be useful in the following
    classes, classesCount = np.unique(y, return_counts=True)
	
    # let's output some details about the data, that might be important
    logging.info("Class distribution for the %d classes." % len(classes))
    for i, c in enumerate(classes) :
        logging.info("- Class %d has %.4f of the samples in the dataset." % (c, float(classesCount[i]) / float(y.shape[0])))
	
    # an interesting comparison: what's the performance of a random classifier?
    random_scores = []
    for i in range(0, 100) :
        y_random = np.random.randint( min(classes), high=max(classes)+1, size=y.shape[0] )
        random_scores.append( accuracy_score(y, y_random) )
    logging.info("As a comparison, randomly picking labels 100 times returns an average accuracy of %.4f (+/- %.4f)\n" % (np.mean(random_scores), np.std(random_scores)))

    # check: do the variables' names exist? if not, put some placeholders
    if variableY is None : variableY = "Y"
    if variablesX is None : variablesX = [ "X" + str(i) for i in range(0, X.shape[1]) ]
	
    # this is a utility dictionary, that will be used to create a more concise summary
    performances = dict()

    # perform stratified k-fold cross-validation, but explicitly
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    folds = [ [train_index, test_index] for train_index, test_index in skf.split(X, y) ]
	
    # TODO 	
    # - also call function for feature selection
    # - also keep track of time needed for each classification
    for classifierIndex, classifierOriginal in enumerate(classifier_list) :
		
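        # work on a fresh copy, so state fitted in one preprocessing run
        # or split cannot leak into the next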
        classifier = copy.deepcopy( classifierOriginal )
        classifier_string = str(classifier)

        # now, we automatically generate the name of the classifier, using a regular expression
        classifierName = classifier_string.split("(")[0]
        match = regex.search("n_estimators=([0-9]+)", classifier_string)
        if match : classifierName += "_" + match.group(1)

        logging.info("Classifier #%d/%d: %s..." % (classifierIndex+1, len(classifier_list), classifierName))
        
        # initialize local performance
        performances[classifierName] = dict()
        
        # vector that contains (at the moment) two possibilities
        dataPreprocessingOptions = ["raw", "normalized"]
		
        for dataPreprocessing in dataPreprocessingOptions :

            # create list
            performances[classifierName][dataPreprocessing] = []
			
            # this is used to produce a "global" confusion matrix for the classifier
            all_y_test = []
            all_y_pred = []

            # iterate over all splits
            splitIndex = 0
            for train_index, test_index in folds :

                X_train, X_test = X[train_index], X[test_index] 
                y_train, y_test = y[train_index], y[test_index] 
				
                if dataPreprocessing == "normalized" :
                    scaler = StandardScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
			
                logging.info("Training classifier %s on split #%d/%d (%s data)..." % (classifierName, splitIndex+1, n_splits, dataPreprocessing))
                try:
                    classifier.fit(X_train, y_train)
					
                    # instead of calling the classifier's "score" method, let's compute accuracy explicitly
                    y_train_pred = classifier.predict(X_train)
                    y_test_pred = classifier.predict(X_test)
					
                    trainScore = accuracy_score(y_train, y_train_pred)
                    testScore = accuracy_score(y_test, y_test_pred)

                    logging.info("Training score: %.4f ; Test score: %.4f", trainScore, testScore)
					
                    # store performance and information
                    performances[classifierName][dataPreprocessing].append( (testScore, trainScore) )
                    all_y_test = np.append(all_y_test, y_test)
                    all_y_pred = np.append(all_y_pred, y_test_pred)
					
                    # get features, ordered by importance 
                    featuresByImportance = get_relative_feature_importance(classifier)
					
                    # write feature importance to disk
                    featureImportanceFileName = classifierName + "-featureImportance-split-" + str(splitIndex) + "." + dataPreprocessing + ".csv"
                    with open( os.path.join(folder_name, featureImportanceFileName), "w") as fp :
                        fp.write("feature,importance\n")
                        for featureImportance, featureIndex in featuresByImportance :
                            fp.write( "\"" + variablesX[int(featureIndex)] + "\"," + str(featureImportance) + "\n")
					
                    # also create and plot confusion matrix for test
                    confusionMatrixFileName = classifierName + "-confusion-matrix-split-" + str(splitIndex) + "-" + dataPreprocessing + ".png"
                    confusionMatrix = confusion_matrix(y_test, y_test_pred)
                    plot_confusion_matrix(confusionMatrix, classes, os.path.join(folder_name, confusionMatrixFileName)) 

                except Exception as e :
                    logging.warning("\tunexpected error: %s", e)
				
                splitIndex += 1
	    
            # the classifier might have crashed, so we need a check here
            if len(performances[classifierName][dataPreprocessing]) > 0 :
                testPerformance = [ x[0] for x in performances[classifierName][dataPreprocessing] ]
                logging.info("Average performance (test) of classifier %s on %s data: %.4f (+/- %.4f)" % (classifierName, dataPreprocessing, np.mean(testPerformance), np.std(testPerformance)))
                            
                # plot a last confusion matrix including information for all the splits
                confusionMatrixFileName = classifierName + "-confusion-matrix-" + dataPreprocessing + ".png"
                confusionMatrix = confusion_matrix(all_y_test, all_y_pred)
                plot_confusion_matrix(confusionMatrix, classes, os.path.join(folder_name, confusionMatrixFileName)) 

                # but also save all test predictions, so that other metrics could be computed on top of them
                df = pd.DataFrame()
                df["y_true"] = all_y_test
                df["y_pred"] = all_y_pred
                df.to_csv(os.path.join(folder_name, classifierName + "-test-predictions-" + dataPreprocessing + ".csv"), index=False)

    # now, here we can write a final report
    # first, convert performance dictionary to list
    performances_list = []
    for classifier_name in performances :
        for data_preprocessing in performances[classifier_name] :
            
            if len(performances[classifier_name][data_preprocessing]) > 0 :
                performance = [ x[0] for x in performances[classifier_name][data_preprocessing] ]
                performance_mean = np.mean(performance)
                performance_std = np.std(performance)

                performances_list.append( [classifier_name + " (" + data_preprocessing + ")", performance_mean, performance_std, performance] )

    performances_list = sorted(performances_list, key = lambda x : x[1], reverse=True)

    final_report_file_name = os.path.join(folder_name, final_report_file_name)
    logging.info("Final results (that will also be written to file \"" + final_report_file_name + "\"...")

    with open(final_report_file_name, "w") as fp :

        fp.write("Final accuracy results for variable \"%s\", %d samples, %d classes:\n" % (variableY, len(X), len(classes))) 
        
        for result in performances_list :

            temp_string = "Classifier \"%s\", accuracy: mean=%.4f, stdev=%.4f" % (result[0], result[1], result[2])
            logging.info(temp_string)
            fp.write(temp_string + "\n")

            temp_string = "Folds: %s" % str(result[3])
            logging.info(temp_string)
            fp.write(temp_string + "\n\n")


    #		# this part can be skipped because it's computationally expensive; also skip if there are only two classes
    #		if False :
    #			# multiclass classifiers are treated differently
    #			logging.info("Now training OneVsOneClassifier with " + classifierName + "...")
    #			multiClassClassifier = OneVsOneClassifier( classifierData[0] ) 
    #			multiClassClassifier.fit(trainData, trainLabels)
    #			trainScore = multiClassClassifier.score(trainData, trainLabels)
    #			testScore = multiClassClassifier.score(testData, testLabels)
    #			logging.info("\ttraining score: %.4f ; test score: %.4f", trainScore, testScore)
    #			logging.info(common.classByClassTest(multiClassClassifier, testData, testLabels))
    #
    #			logging.info("Now training OneVsRestClassifier with " + classifierName + "...")
    #			currentClassifier = copy.deepcopy( classifierData[0] )
    #			multiClassClassifier = OneVsRestClassifier( currentClassifier ) 
    #			multiClassClassifier.fit(trainData, trainLabels)
    #			trainScore = multiClassClassifier.score(trainData, trainLabels)
    #			testScore = multiClassClassifier.score(testData, testLabels)
    #			logging.info("\ttraining score: %.4f ; test score: %.4f", trainScore, testScore)
    #			logging.info(common.classByClassTest(multiClassClassifier, testData, testLabels))
    #
    #			logging.info("Now training OutputCodeClassifier with " + classifierName + "...")
    #			multiClassClassifier = OutputCodeClassifier( classifierData[0] ) 
    #			multiClassClassifier.fit(trainData, trainLabels)
    #			trainScore = multiClassClassifier.score(trainData, trainLabels)
    #			testScore = multiClassClassifier.score(testData, testLabels)
    #			logging.info("\ttraining score: %.4f ; test score: %.4f", trainScore, testScore)
    #			logging.info(common.classByClassTest(multiClassClassifier, testData, testLabels))
	
	# TODO save files for each classifier:
	#	- recall?
	#	- accuracy?
	#	- "special" stuff for each classifier, for example the PDF tree for DecisionTree
	
    return