def main():
    """
    main processing module

    Load the JSON test script, create one GreenletCustomer for each selected
    user identity file, wait on the halt event for at most the configured
    test duration, then set the halt event and join every customer.

    Always returns 0.
    """
    options = parse_command_line()
    initialize_logging(options.log_name)
    log = logging.getLogger("main")
    log.info("program starts")

    # SIGTERM is routed to _handle_sigterm, which receives the halt event
    halt_event = Event()
    gevent.signal(signal.SIGTERM, _handle_sigterm, halt_event)

    script_path = options.test_script
    log.info("loading test script from %r" % (script_path, ))
    with open(script_path, "rt") as script_file:
        test_script = json.load(script_file)

    identity_dir = options.user_identity_dir
    log.info("loading user identity files from %r" % (identity_dir, ))

    customers = []
    for identity_name in os.listdir(identity_dir):
        # a file is used only when its name contains both markers
        if 'motoboto' not in identity_name \
        or 'benchmark' not in identity_name:
            continue

        user_limit = options.max_users
        if user_limit is not None and len(customers) >= user_limit:
            log.info("breaking at %s users" % (user_limit, ))
            break

        log.info("loading %r" % (identity_name, ))
        identity_path = os.path.join(identity_dir, identity_name)
        user_identity = load_identity_from_file(identity_path)

        customer = GreenletCustomer(halt_event, user_identity, test_script)
        customer.link_exception(_unhandled_greenlet_exception)
        # start_later is handed a random value between 0 and 15
        customer.start_later(random.uniform(0.0, 15.0))
        customers.append(customer)

    log.info("waiting")
    try:
        halt_event.wait(options.test_duration)
    except KeyboardInterrupt:
        log.info("KeyBoardInterrupt")

    # optional pause so an operator can run the audit before shutdown
    if test_script.get("audit-after", False):
        print >>sys.stderr, "run redis_stats_collector, press return when done"
        raw_input("waiting...")

    log.info("setting halt event")
    halt_event.set()

    log.info("joining")
    total_error_count = 0
    for customer in customers:
        customer.join()
        total_error_count += customer.error_count

    summary = "program ends {0} total errors, {1} unhandled exceptions"
    log.info(summary.format(total_error_count, _unhandled_exception_count))

    return 0
def main():
    """
    main processing module

    Reads the test script (JSON), launches a GreenletCustomer per accepted
    identity file, blocks on the halt event up to options.test_duration,
    and finally halts and joins the customers, logging the error totals.

    Returns 0 unconditionally.
    """
    options = parse_command_line()
    initialize_logging(options.log_name)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    gevent.signal(signal.SIGTERM, _handle_sigterm, halt_event)

    log.info("loading test script from %r" % (options.test_script,))
    with open(options.test_script, "rt") as input_file:
        test_script = json.load(input_file)

    log.info("loading user identity files from %r" % (options.user_identity_dir,))

    def _is_wanted(name):
        # identity files are recognised by two substrings in their name
        return "motoboto" in name and "benchmark" in name

    def _at_capacity(started):
        # max_users of None means no limit
        return options.max_users is not None and len(started) >= options.max_users

    customer_list = list()
    for entry in os.listdir(options.user_identity_dir):
        if not _is_wanted(entry):
            continue
        if _at_capacity(customer_list):
            log.info("breaking at %s users" % (options.max_users,))
            break

        log.info("loading %r" % (entry,))
        user_identity = load_identity_from_file(
            os.path.join(options.user_identity_dir, entry)
        )
        new_customer = GreenletCustomer(halt_event, user_identity, test_script)
        new_customer.link_exception(_unhandled_greenlet_exception)
        new_customer.start_later(random.uniform(0.0, 15.0))
        customer_list.append(new_customer)

    log.info("waiting")
    try:
        halt_event.wait(options.test_duration)
    except KeyboardInterrupt:
        log.info("KeyBoardInterrupt")

    audit_requested = test_script.get("audit-after", False)
    if audit_requested:
        print >>sys.stderr, "run redis_stats_collector, press return when done"
        raw_input("waiting...")

    log.info("setting halt event")
    halt_event.set()

    total_error_count = 0
    log.info("joining")
    for finished in customer_list:
        finished.join()
        total_error_count = total_error_count + finished.error_count

    log.info(
        "program ends {0} total errors, {1} unhandled exceptions".format(
            total_error_count, _unhandled_exception_count
        )
    )
    return 0
all_rows = [] for ind, game in enumerate(games_list): try: get_logger().info("match", match=game, date=date) Match_fields = df_all[df_all['event_name'] == game] row = get_MatchOdds(Match_fields, game, minutes_ahead) all_rows.append(row) except Exception: get_logger().error(traceback.format_exc()) df_sql = pd.DataFrame(all_rows, columns=columns) return df_sql if __name__ == "__main__": initialize_logging("favorites") engine = create_engine("mysql://{user}:{pw}@localhost/{db}".format( user="******", pw="Betfair", db="betfair")) data_loader = Loader() result = data_loader.load_data_by_date() i = 0 for data in result: i += 1 try: # df = process_day(data, first_IP_else_last_PE.get_MatchOdds_first_IP_else_last_PE) # 8/11/2017 # df.to_sql(con = engine, name = "favorite_first_IP_last_PE", if_exists = "append") # df = process_day(data, first_IP_else_last_PE.get_MatchOdds_last_PE_else_first_IP) # 8/11/2017 # df.to_sql(con = engine, name = "favorite_last_PE_first_IP", if_exists = "append") df = process_day( data, first_IP_else_last_PE.get_MatchOdds_last_PE_else_first_IP,
def main():
    """Count how often each feature ranks near the top across ranking files.

    Every selected ``.csv`` file in ``args.directory`` is read with pandas;
    the first column of each row is used as the feature identifier and the
    row order is taken as the ranking. For each feature the function counts
    in how many files it appears (a) within the first ``top_features`` rows
    and (b) within the first ``percentage_features`` share of rows, then
    logs both lists sorted by decreasing count.

    The function returns ``None``; if no file is selected it logs a warning
    and returns early.
    """
    # a few hard-coded arguments
    percentage_features = 0.1
    top_features = 5

    common.initialize_logging()
    args = parse_command_line()

    logging.info("Reading files in directory \"%s\"..." % args.directory)
    file_names = [f for f in os.listdir(args.directory) if f.endswith(".csv")]

    if args.files:
        logging.info(
            "Only files with the following patterns will be considered: %s" %
            str(args.files))
        selected_file_names = []
        for pattern in args.files:
            selected_file_names.extend(fnmatch.filter(file_names, pattern))
        file_names = selected_file_names

    logging.info("A total of %d files was selected: %s" %
                 (len(file_names), str(file_names)))

    # nothing to rank: the code below indexes file_names[0]
    if not file_names:
        logging.warning("No ranking file was selected, nothing to do.")
        return

    logging.info("Reading files and merging information...")
    all_rankings = dict()
    for f in file_names:
        df = pandas.read_csv(os.path.join(args.directory, f), sep=',')
        all_rankings[f] = df.values

    n_features = len(all_rankings[file_names[0]])
    logging.info("The total number of features is %d" % n_features)

    # int() keeps range() happy where math.ceil returns a float
    top_features_percentage = int(math.ceil(percentage_features * n_features))
    logging.info(
        "Now evaluating features that are in the top %.2f%% (%d) or in the top %d"
        % (percentage_features * 100, top_features_percentage, top_features))

    features_dictionary = {
        f: {
            'top': 0,
            'percentage': 0
        }
        for f in all_rankings[file_names[0]][:, 0]
    }

    for ranking in all_rankings.values():
        # never look past the last row of a (possibly shorter) ranking file
        window = min(len(ranking), max(top_features_percentage, top_features))
        for j in range(window):
            # features missing from the first file still get a counter
            counters = features_dictionary.setdefault(
                ranking[j, 0], {'top': 0, 'percentage': 0})
            if j < top_features_percentage:
                counters['percentage'] += 1
            if j < top_features:
                counters['top'] += 1

    # and now, some sorting
    list_features_percentage = sorted(
        [[f, features_dictionary[f]['percentage']] for f in features_dictionary],
        key=lambda x: x[1],
        reverse=True)
    list_features_top = sorted(
        [[f, features_dictionary[f]['top']] for f in features_dictionary],
        key=lambda x: x[1],
        reverse=True)

    logging.info(
        "Features that appear most frequently among the top %.2f%%: %s" %
        (percentage_features * 100, str(list_features_percentage)))
    logging.info("Features that appear most frequently among the top %d: %s" %
                 (top_features, str(list_features_top)))

    return
def main():
    """Benchmark a fixed list of regressors with repeated shuffle-split validation.

    Steps performed:
    - create a time-stamped results folder and initialize logging in it;
    - load one dataset (selected by the hard-coded ``if True`` / ``if False``
      switches below);
    - for every output variable and every ShuffleSplit fold, standardize the
      data, fit each regressor and record R^2, explained variance, MSE, MAE
      and the de-normalized test predictions;
    - per fold, save two figures per regressor (when matplotlib can be
      imported) and a feature-importance CSV (when
      ``relativeFeatureImportance`` returns a non-empty list);
    - write ``00_summary.txt`` with regressors sorted by mean R^2 and, when
      possible, one "global" measured-vs-predicted figure per regressor.

    A regressor that raises during a fold is logged and skipped for that
    fold; the function returns ``None``.
    """
    # let's create a folder with a unique name to store results
    folderName = datetime.datetime.now().strftime(
        "%Y-%m-%d-%H-%M") + "-regression"
    if not os.path.exists(folderName):
        os.makedirs(folderName)

    # initialize logging
    common.initialize_logging(folderName)

    # matplotlib is optional; decide once so that per-fold plots and the
    # final summary plots agree on whether plotting is available
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        plt = None
        logging.info("Cannot import matplotlib. Skipping plots...")

    regressorsList = [

        # human-designed regressors
        [
            HumanRegressor("y = a_0 + a_1 * x + a_2 * x**2 + a_3 * x**3",
                           map_variables_to_features={"x": 0}),
            "HumanRegressor"
        ],
        [PolynomialRegressor(2), "PolynomialRegressor2"],
        #[PolynomialRegressor(3), "PolynomialRegressor3"],

        # keras neural network
        #[ANNRegressor(epochs=500, batch_size=32, layers=[16,4]), "KerasRegressor8-4"],
        #[ANNRegressor(epochs=700, batch_size=32, layers=[16,8]), "KerasRegressor16-8"],

        # cross decomposition
        [PLSRegression(), "PLSRegression"],

        # ensemble
        [AdaBoostRegressor(), "AdaBoostRegressor"],
        [BaggingRegressor(), "BaggingRegressor"],
        [BaggingRegressor(n_estimators=100), "BaggingRegressor_100"],
        [BaggingRegressor(n_estimators=300), "BaggingRegressor_300"],
        [ExtraTreesRegressor(), "ExtraTreesRegressor"],
        [GradientBoostingRegressor(), "GradientBoostingRegressor"],
        [RandomForestRegressor(), "RandomForestRegressor"],
        [RandomForestRegressor(n_estimators=100), "RandomForestRegressor_100"],
        [RandomForestRegressor(n_estimators=300), "RandomForestRegressor_300"],

        # isotonic
        #[IsotonicRegression(), "IsotonicRegression"], # apparently wants "X" as a 1d array

        # kernel ridge
        [KernelRidge(), "KernelRidge"],

        # linear
        #[ARDRegression(), "ARDRegression"], # takes too much time to train
        [BayesianRidge(), "BayesianRidge"],
        [ElasticNetCV(), "ElasticNetCV"],
        [LarsCV(), "LarsCV"],
        [LassoCV(), "LassoCV"],
        [LinearRegression(), "LinearRegression"],
        [PassiveAggressiveRegressor(), "PassiveAggressiveRegressor"],

        # neighbors
        [KNeighborsRegressor(), "KNeighborsRegressor"],
        [RadiusNeighborsRegressor(), "RadiusNeighborsRegressor"],

        # neural networks
        #[BernoulliRBM(), "BernoulliRBM"], # has a different interface, no "predict"

        # svm
        [SVR(), "SVR"],
        [LinearSVR(), "LinearSVR"],
        [NuSVR(), "NuSVR"],

        # tree
        [DecisionTreeRegressor(), "DecisionTreeRegressor (max depth 10)"],
        [ExtraTreeRegressor(), "ExtraTreeRegressor"],

        # generalized additive models
        [LinearGAM(n_splines=20), "LinearGAM(n_splines=20)"],

        # gaussian processes
        [
            GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
            "GaussianProcessRegressor"
        ],
    ]

    X = y = X_train = X_test = y_train = y_test = variablesX = variablesY = None
    numberOfSplits = 10  # TODO change number of splits from command line

    # dataset switchboard: exactly one of the following is expected to be True
    if True:
        # this is just a dumb benchmark
        X, y, variablesX, variablesY = common.loadEasyBenchmark()

    if False:
        X, y, variablesX, variablesY = common.loadChristianQuestionnaireRegression(
        )

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration2(
            "TIMBER")

    if False:
        X, y, variablesX, variablesY = common.loadLaurentBouvierNewData()

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration()

    if False:
        from sklearn.datasets import load_linnerud
        X, y = load_linnerud(return_X_y=True)

    if False:
        X, y, variablesX, variablesY = common.loadYingYingData()

    if False:
        X, y, variablesX, variablesY = common.loadCleaningDataGermanSpecific()
        #X, y, variablesX, variablesY = common.loadCleaningDataGerman()

    if False:
        X, y, variablesX, variablesY = common.loadInsects()

    if False:
        X, y, variablesX, variablesY = common.loadMilkProcessPipesDimensionalAnalysis(
        )
        #X, y, variablesX, variablesY = common.loadMilkProcessPipes()

    if False:
        # ecosystem services
        X, y, variablesX, variablesY = common.loadEcosystemServices()

    if False:
        X, y, variablesX, variablesY = common.loadMarcoSoil()

    if False:
        # load dataset
        X, y = common.loadEureqaRegression()
        # randomly split between training and test
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadBiscuitExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadAromoptiExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))

        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    logging.info(
        "Regressing %d output variables, in function of %d input variables..."
        % (y.shape[1], X.shape[1]))

    # if the names of the variables are not specified, let's specify them!
    if variablesY is None:
        variablesY = ["y" + str(i) for i in range(0, len(y[0]))]
    if variablesX is None:
        variablesX = ["X" + str(i) for i in range(0, len(X[0]))]

    performances = dict()
    # measured test values of every fold, in order, kept separately for each
    # output variable so the summary plots compare like with like
    foldPointsByVariable = dict()

    for variableIndex, variableY in enumerate(variablesY):

        logging.info("** Now evaluating models for variable \"%s\"... **" %
                     variableY)

        # obtain data
        y_ = y[:, variableIndex].ravel()

        # assume here that you will have train/test indexes instead
        # it's also easier for the plots, as we do not face the issue
        # of duplicate values (e.g. same value with two indexes)
        rs = ShuffleSplit(n_splits=numberOfSplits, random_state=42)
        #rs = LeaveOneOut()

        # initialize performance dictionary of arrays
        performances[variableY] = dict()
        for regressor, regressorName in regressorsList:
            performances[variableY][regressorName] = {
                "r^2": [],
                "e.v": [],
                "mse": [],
                "mae": [],
                "predicted": [],
            }

        # this is used to store all values of each fold, in order; maybe there's a smarter way to do it
        foldPointsInOrder = []
        foldPointsByVariable[variableY] = foldPointsInOrder

        # and now, for every regressor
        for foldIndex, indexes in enumerate(rs.split(X)):

            train_index, test_index = indexes

            X_train = X[train_index]
            y_train = y_[train_index]
            X_test = X[test_index]
            y_test = y_[test_index]

            # normalize
            logging.info("Normalizing data...")
            scalerX = StandardScaler()
            scalerY = StandardScaler()

            X_train = scalerX.fit_transform(X_train)
            X_test = scalerX.transform(X_test)

            # this "reshape/ravel" here is just to avoid warnings, it has no true effect on data
            y_train = scalerY.fit_transform(y_train.reshape(-1, 1)).ravel()
            y_test = scalerY.transform(y_test.reshape(-1, 1)).ravel()

            # now, we store points of the folder in order of how they appear
            foldPointsInOrder.extend(list(scalerY.inverse_transform(y_test)))

            for regressorIndex, regressorData in enumerate(regressorsList):

                regressor = regressorData[0]
                regressorName = regressorData[1]
                scores = performances[variableY][regressorName]

                logging.info("Fold #%d/%d: training regressor #%d/%d \"%s\"" %
                             (foldIndex + 1, numberOfSplits,
                              regressorIndex + 1, len(regressorsList),
                              regressorName))

                try:
                    regressor.fit(X_train, y_train)

                    y_test_predicted = regressor.predict(X_test)
                    r2Test = r2_score(y_test, y_test_predicted)
                    mseTest = mean_squared_error(y_test, y_test_predicted)
                    maeTest = mean_absolute_error(y_test, y_test_predicted)
                    varianceTest = explained_variance_score(
                        y_test, y_test_predicted)

                    logging.info("R^2 score (test): %.4f" % r2Test)
                    logging.info("EV score (test): %.4f" % varianceTest)
                    logging.info("MSE score (test): %.4f" % mseTest)
                    logging.info("MAE score (test): %.4f" % maeTest)

                    # add performance to the list of performances
                    scores["r^2"].append(r2Test)
                    scores["e.v"].append(varianceTest)
                    scores["mse"].append(mseTest)
                    scores["mae"].append(maeTest)
                    # also record the predictions, to be used later in a global figure
                    scores["predicted"].extend(
                        list(scalerY.inverse_transform(y_test_predicted)))

                    if plt is not None:
                        # plotting first figure, with points 'x' and 'o'
                        # 'X' was never wholly rescaled before
                        y_predicted = regressor.predict(scalerX.transform(X))
                        y_train_predicted = regressor.predict(X_train)

                        plt.figure()

                        plt.scatter(train_index,
                                    y_train,
                                    c="gray",
                                    label="training data")
                        plt.scatter(test_index,
                                    y_test,
                                    c="green",
                                    label="test data")

                        plt.plot(np.arange(len(y_predicted)),
                                 y_predicted,
                                 'x',
                                 c="red",
                                 label="regression")
                        plt.xlabel("order of data samples")
                        plt.ylabel("target")
                        plt.title(regressorName +
                                  ", R^2=%.4f (test)" % r2Test)
                        plt.legend()

                        logging.info("Saving figure...")
                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + ".pdf"))
                        plt.close()

                        # plotting second figure, with everything close to a middle line
                        plt.figure()

                        plt.plot(y_train,
                                 y_train_predicted,
                                 'r.',
                                 label="training set")  # points
                        plt.plot(y_test,
                                 y_test_predicted,
                                 'go',
                                 label="test set")  # points
                        plt.plot([
                            min(y_train.min(), y_test.min()),
                            max(y_train.max(), y_test.max())
                        ], [
                            min(y_train_predicted.min(),
                                y_test_predicted.min()),
                            max(y_train_predicted.max(),
                                y_test_predicted.max())
                        ], 'k--')  # line

                        plt.xlabel("measured")
                        plt.ylabel("predicted")
                        plt.title(regressorName + " measured vs predicted, " +
                                  variableY)
                        plt.legend(loc='best')

                        plt.savefig(
                            os.path.join(
                                folderName, regressorName + "-" + variableY +
                                "-fold-" + str(foldIndex + 1) + "-b.pdf"))
                        plt.close()

                    # also, save ordered list of features; this does not
                    # need matplotlib, so it runs even when plots are skipped
                    featuresByImportance = relativeFeatureImportance(
                        regressor)

                    # if list exists, write feature importance to disk
                    # TODO horrible hack here, to avoid issues with GAM
                    if len(featuresByImportance
                           ) > 0 and "GAM" not in regressorName:
                        featureImportanceFileName = regressorName + "-" + variableY + "-featureImportance-fold" + str(
                            foldIndex) + ".csv"
                        with open(
                                os.path.join(folderName,
                                             featureImportanceFileName),
                                "w") as fp:
                            fp.write("feature,importance\n")
                            for featureImportance, featureIndex in featuresByImportance:
                                fp.write(variablesX[int(featureIndex)] + "," +
                                         str(featureImportance) + "\n")

                except Exception as e:
                    # best effort: one failing regressor must not stop the benchmark
                    logging.info("Regressor \"" + regressorName +
                                 "\" failed on variable \"" + variableY +
                                 "\":" + str(e))

    logging.info("Final summary:")
    with open(os.path.join(folderName, "00_summary.txt"), "w") as fp:

        for variableY in variablesY:

            logging.info("For variable \"" + variableY + "\"")
            fp.write("For variable: " + variableY + " = f(" +
                     ",".join(variablesX) + ")\n")

            # measured test values recorded for THIS variable
            foldPointsInOrder = foldPointsByVariable[variableY]

            # create a list from the dictionary and sort it
            sortedPerformances = sorted(
                [(performances[variableY][regressorName], regressorName)
                 for regressorName in performances[variableY]],
                key=lambda x: np.mean(x[0]["r^2"]),
                reverse=True)

            for regressorData in sortedPerformances:
                regressorName = regressorData[1]
                regressorScore = regressorData[0]

                r2Mean = np.mean(regressorScore["r^2"])
                r2std = np.std(regressorScore["r^2"])

                varianceMean = np.mean(regressorScore["e.v"])
                varianceStd = np.std(regressorScore["e.v"])

                mseMean = np.mean(regressorScore["mse"])
                mseStd = np.std(regressorScore["mse"])

                maeMean = np.mean(regressorScore["mae"])
                maeStd = np.std(regressorScore["mae"])

                summaryLine = (
                    "\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)"
                    % (regressorName, r2Mean, r2std, varianceMean,
                       varianceStd, mseMean, mseStd, maeMean, maeStd))
                logging.info(summaryLine)
                fp.write(summaryLine + "\n")

                fp.write("\t\t- R^2:" +
                         str(["%.4f" % x
                              for x in regressorScore["r^2"]]) + "\n")
                fp.write("\t\t- E.V.:" +
                         str(["%.4f" % x
                              for x in regressorScore["e.v"]]) + "\n")
                fp.write("\t\t- MSE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mse"]]) + "\n")
                fp.write("\t\t- MAE:" +
                         str(["%.4f" % x
                              for x in regressorScore["mae"]]) + "\n")

                # also, plot a "global" graph
                # issue here, if a regressor fails, you have incongruent matrixes: a check is in order
                # TODO also, the plot looks really bad if some values are negative; turn everything to absolute values?
                canPlotGlobal = (plt is not None
                                 and len(foldPointsInOrder) > 0
                                 and len(foldPointsInOrder) == len(
                                     regressorScore["predicted"]))
                if canPlotGlobal:
                    fig = plt.figure()
                    ax = fig.add_subplot(111)

                    #bottom_left_corner = [min(foldPointsInOrder), max(foldPointsInOrder)]
                    #top_right_corner = [min(regressorScore["predicted"]), max(regressorScore["predicted"])]
                    x_bottom_top = [0, max(foldPointsInOrder)]
                    y_bottom_top = [0, max(foldPointsInOrder)]

                    ax.plot(foldPointsInOrder, regressorScore["predicted"],
                            'g.')  # points
                    ax.plot(x_bottom_top, y_bottom_top, 'k--',
                            label="1:1")  # line
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 1.20, y_bottom_top[1] * 1.20],
                            'r--',
                            label="20% error")
                    ax.plot(x_bottom_top,
                            [y_bottom_top[0] * 0.80, y_bottom_top[1] * 0.80],
                            'r--')

                    ax.set_title(regressorName + " measured vs predicted, " +
                                 variableY + " (all test)")
                    ax.set_xlabel("measured")
                    ax.set_ylabel("predicted")
                    ax.legend(loc='best')

                    plt.savefig(
                        os.path.join(
                            folderName, regressorName + "-" + variableY +
                            "-global-b.png"))
                    plt.close(fig)
from strategy_handlers.strategies.DrawChaser import DrawChaser
from strategy_handlers.strategies_manager import strategy_manager
from strategy_handlers.strategies.marketMaker import MarketMaker
from common import initialize_logging

# Script entry point: runs the DrawChaser strategy through strategy_manager
# for one hard-coded event.
# NOTE(review): MarketMaker is imported but not used in this script.
if __name__ == "__main__":
    # logging is initialized under the name "draw_better"
    initialize_logging("draw_better")
    # forwarded unchanged to strategy_manager as the thresh_draw keyword
    thresh_draw = 2
    # single thread, fixed event id
    sm = strategy_manager(DrawChaser, number_threads=1, thresh_draw=thresh_draw, event_id="28422314")
    sm.manage_strategies()
def main():
    """
    main processing module

    Spawn one customer_process.py child per user identity file (up to
    options.max_users), wait on the halt event for at most
    options.test_duration, then terminate every child and wait for it,
    logging any non-zero return code.

    Always returns 0.
    """
    options = parse_command_line()
    initialize_logging(options.log_name)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    log.info("using test script %r" % (options.test_script, ))

    # the child script lives next to this file
    program_path = os.path.join(
        os.path.dirname(__file__), "customer_process.py"
    )

    # variables that must be present in our environment; they are copied
    # verbatim into the environment of every child
    required_names = (
        "PYTHONPATH",
        "NIMBUSIO_LOG_DIR",
        "NIMBUS_IO_SERVICE_HOST",
        "NIMBUS_IO_SERVICE_PORT",
        "NIMBUS_IO_SERVICE_DOMAIN",
    )

    children = []
    for identity_name in os.listdir(options.user_identity_dir):
        user_limit = options.max_users
        if user_limit is not None and len(children) >= user_limit:
            log.info("breaking at %s users" % (user_limit, ))
            break

        log.info("user identity %r" % (identity_name, ))
        user_identity_path = os.path.join(
            options.user_identity_dir, identity_name
        )
        command = [
            sys.executable,
            program_path,
            options.test_script,
            user_identity_path,
        ]

        environment = dict(
            (name, os.environ[name]) for name in required_names
        )
        # SSL is the only optional setting, defaulting to "0"
        environment["NIMBUS_IO_SERVICE_SSL"] = os.environ.get(
            "NIMBUS_IO_SERVICE_SSL", "0"
        )

        children.append(subprocess.Popen(command, env=environment))

    log.info("waiting")
    try:
        halt_event.wait(options.test_duration)
    except KeyboardInterrupt:
        log.info("KeyBoardInterrupt")
        halt_event.set()

    log.info("terminating processes")
    for child in children:
        child.terminate()

    log.info("waiting for processes")
    for child in children:
        child.wait()
        exit_status = child.returncode
        if exit_status != 0:
            log.error("process returncode %s" % (exit_status, ))

    log.info("program ends")
    return 0
from strategy_handlers.strategies_manager import strategy_manager
from strategy_handlers.strategies.marketMaker import MarketMaker
from common import initialize_logging

# Script entry point: runs the MarketMaker strategy through strategy_manager,
# leaving every other strategy_manager argument at its default.
if __name__ == "__main__":
    # logging is initialized under the name "market_maker"
    initialize_logging("market_maker")
    sm = strategy_manager(MarketMaker)
    sm.manage_strategies()
def __init__(self, client):
    """Set up the database/logging environment and keep the given client.

    :param client: object stored as ``self.betfair_client``
        (presumably an authenticated Betfair API client; confirm
        against callers).
    """
    # global set-up calls run before any attribute is assigned
    initialize_secdb()
    initialize_logging("db_recorder")
    self.betfair_client = client
    # a fresh DBQuery is created for this instance
    self.db_client = DBQuery()
from common import initialize_logging
from strategy_handlers_under_goals.priceChaser import PriceChaser
from strategy_handlers_under_goals.utils import authenticate

# Manual test script for PriceChaser: everything below runs at import time
# (there is no __main__ guard).
initialize_logging("testing_price_chaser")
client = authenticate()
# hard-coded market and selection used for the test
market_id = "1.132089559"
selection_id = 5851482
pc = PriceChaser(client, market_id, selection_id)
# pc.chasePrice(1000, 10)
# NOTE(review): the result is stored but never used afterwards
orders = pc.get_betfair_matches()
# marker printed once the calls above have returned
print("here")
from os.path import join
from common import initialize_logging, ROOT_DIR
from predictors.RFRPredictor import RFRPredictor
from strategy_handlers.strategies.UnderGoalsTimer import UnderGoalsTimer
from strategy_handlers.strategies_manager import strategy_manager

# Script entry point: runs the UnderGoalsTimer strategy through
# strategy_manager with the hard-coded settings below.
# NOTE(review): join, ROOT_DIR and RFRPredictor are imported but not used here.
if __name__ == "__main__":
    # logging is initialized under the name "under_goals_2"
    initialize_logging("under_goals_2")
    # passed to strategy_manager as the "timer" keyword
    time_limit = 5
    min_odds = 1.1
    # NOTE(review): max_odds is defined but never passed to strategy_manager
    max_odds = 4
    market_under_goals = 2
    min_vol = 0
    stake = 4
    # passed to strategy_manager as the "number_threads" keyword
    number_parallel_stragies = 10
    # NOTE(review): this list is overwritten by None on the next line, so
    # market_countries=None is what is actually passed; confirm this is the
    # intended toggle
    market_countries = ["GB", "ES", "DE", "IT", "PT", "FR", "BR", "NL", "BE"]
    market_countries = None
    sm = strategy_manager(UnderGoalsTimer, event_id=None, number_threads=number_parallel_stragies, timer=time_limit, market_under_goals=market_under_goals, stake=stake, min_odds=min_odds, min_vol=min_vol, market_countries=market_countries)
    sm.manage_strategies()
from os.path import join from common import initialize_logging, ROOT_DIR from predictors.RFRPredictor import RFRPredictor from strategy_handlers.strategies.MLPredictor import MLPredictor from strategy_handlers.strategies_manager import strategy_manager if __name__ == "__main__": initialize_logging("machine_learning_predictor") path_models = join(ROOT_DIR, "predictors\RFPPredictorModels\predictor") path_encoder = join(ROOT_DIR, "predictors\RFPPredictorModels\encoder") runners = ["1", "x", "2"] min_odds = 1.1 max_odds = 4 min_pred = 0.5 scale_with_pred = True stake = 4 predictor = RFRPredictor(path_models, path_encoder, runners, stake=stake, scale_with_pred=scale_with_pred, min_odds=min_odds, max_odds=max_odds, min_pred=min_pred) sm = strategy_manager(MLPredictor, number_threads=100, predictor=predictor, max_odds=max_odds,
from common import initialize_logging
from data.hist_trades_export import Recorder
from data.sql_wrapper.connection import initialize_secdb

# Script entry point: creates a Recorder and calls its read_files() method.
if __name__ == "__main__":
    # database set-up runs before logging is initialized
    initialize_secdb()
    # logging is initialized under the name "hist_betfair_data"
    initialize_logging("hist_betfair_data")
    json_trades_recorder = Recorder()
    json_trades_recorder.read_files()
i = 0 for root, dirs, files in os.walk( "E:\\Betfair Data JSON\\data\\xds\\historic\\BASIC"): for name in files: if name[1] == '.': filepath = os.path.join(root, name) newfilepath = os.path.join(outputpath, name + '.decompressed') list.append((filepath, newfilepath)) i += 1 if i == chunk: yield list list = [] i = 0 if __name__ == "__main__": initialize_secdb() initialize_logging("decompress_betfair_data_2") for list_files in file_generator(8): jobs = [] for files in list_files: p = multiprocessing.Process(target=decompress_file, args=files) jobs.append(p) p.start() for p in jobs: p.join() json_trades_recorder.read_json_files()
self._released = 0 self._rejected = 0 # Main flow for sender app if __name__ == "__main__": # parsing arguments parsed_opts, args = common.parse_opts(True) # same message body will be used by all sender instances message_body = common.generate_message_body( parsed_opts.message_size, "abcdedfgijklmnopqrstuvwxyz0123456789") # initializing logging common.initialize_logging(parsed_opts.log_level) # list of spawned sender processes processes = list() # Interrupts all running senders def interrupt_handler(sig, f): global interrupted interrupted = True for sender in processes: sender.interrupt() # Capturing SIGINT signal.signal(signal.SIGINT, interrupt_handler) signal.signal(signal.SIGTERM, interrupt_handler)
def main() :
    """Benchmark every classifier in ``classifier_list`` with stratified k-fold cross-validation.

    Steps performed:
    - create a time-stamped results folder and initialize logging in it;
    - load the dataset and log the class distribution, together with the
      accuracy obtained by drawing random labels 100 times;
    - build the stratified folds once, then for every classifier and for
      both "raw" and "normalized" (StandardScaler) data, fit on each fold
      and record test/train accuracy; per fold a feature-importance CSV and
      a confusion-matrix image are written;
    - per classifier/preprocessing pair, write an overall confusion matrix
      and a CSV with all test predictions;
    - write a final report sorted by decreasing mean test accuracy.

    A classifier that raises on a fold is logged as a warning and that fold
    is skipped. The function returns ``None``.
    """
    # TODO argparse? maybe divide into "fast", "exhaustive", "heuristic"; also add option to specify file from command line (?)

    # hard-coded values here
    n_splits = 10
    final_report_file_name = "00_final_report.txt"

    # create uniquely named folder
    folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + "-classification"
    if not os.path.exists(folder_name) : os.makedirs(folder_name)

    # start logging
    common.initialize_logging(folder_name)

    # this part can be used by some case studies, storing variable names
    variableY = variablesX = None

    # get data
    logging.info("Loading data...")
    #X, y, variablesX, variablesY = common.loadRallouData() # TODO replace here to load different data
    #X, y, variablesX, variablesY = common.loadCoronaData()
    #X, y, variablesX, variablesY = common.loadXORData()
    #X, y, variablesX, variablesY = common.loadMl4Microbiome()
    X, y, variablesX, variablesY = common.loadMl4MicrobiomeCRC()

    # only read the target name when the loader provided one, so that the
    # placeholder below is used otherwise
    if variablesY is not None and len(variablesY) > 0 :
        variableY = variablesY[0]

    logging.info("Shape of X: " + str(X.shape))
    logging.info("Shape of y: " + str(y.shape))

    # also take note of the classes, they will be useful in the following
    classes, classesCount = np.unique(y, return_counts=True)

    # let's output some details about the data, that might be important
    logging.info("Class distribution for the %d classes." % len(classes))
    for i, c in enumerate(classes) :
        logging.info("- Class %d has %.4f of the samples in the dataset." % (c, float(classesCount[i]) / float(y.shape[0])))

    # an interesting comparison: what's the performance of a random classifier?
    random_scores = []
    for i in range(0, 100) :
        y_random = np.random.randint( min(classes), high=max(classes)+1, size=y.shape[0] )
        random_scores.append( accuracy_score(y, y_random) )
    logging.info("As a comparison, randomly picking labels 100 times returns an average accuracy of %.4f (+/- %.4f)\n" % (np.mean(random_scores), np.std(random_scores)))

    # check: do the variables' names exist? if not, put some placeholders
    if variableY is None : variableY = "Y"
    if variablesX is None : variablesX = [ "X" + str(i) for i in range(0, X.shape[1]) ]

    # this is a utility dictionary, that will be used to create a more concise summary
    performances = dict()

    # perform stratified k-fold cross-validation, but explicitly; the folds
    # are materialized once so every classifier sees the same splits
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    folds = [ [train_index, test_index] for train_index, test_index in skf.split(X, y) ]

    # TODO
    # - also call function for feature selection
    # - also keep track of time needed for each classification
    for classifierIndex, classifierOriginal in enumerate(classifier_list) :

        classifier = copy.deepcopy( classifierOriginal )
        classifier_string = str(classifier)

        # now, we automatically generate the name of the classifier, using a regular expression
        classifierName = classifier_string.split("(")[0]
        match = regex.search("n_estimators=([0-9]+)", classifier_string)
        if match : classifierName += "_" + match.group(1)

        logging.info("Classifier #%d/%d: %s..." % (classifierIndex+1, len(classifier_list), classifierName))

        # initialize local performance
        performances[classifierName] = dict()

        # vector that contains (at the moment) two possibilities
        dataPreprocessingOptions = ["raw", "normalized"]

        for dataPreprocessing in dataPreprocessingOptions :

            # create list
            performances[classifierName][dataPreprocessing] = []

            # this is used to produce a "global" confusion matrix for the classifier
            all_y_test = []
            all_y_pred = []

            # iterate over all splits
            for splitIndex, (train_index, test_index) in enumerate(folds) :

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                if dataPreprocessing == "normalized" :
                    scaler = StandardScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)

                logging.info("Training classifier %s on split #%d/%d (%s data)..." % (classifierName, splitIndex+1, n_splits, dataPreprocessing))
                try:
                    classifier.fit(X_train, y_train)

                    # instead of calling the classifier's "score" method, let's compute accuracy explicitly
                    y_train_pred = classifier.predict(X_train)
                    y_test_pred = classifier.predict(X_test)

                    trainScore = accuracy_score(y_train, y_train_pred)
                    testScore = accuracy_score(y_test, y_test_pred)

                    logging.info("Training score: %.4f ; Test score: %.4f", trainScore, testScore)

                    # store performance and information
                    performances[classifierName][dataPreprocessing].append( (testScore, trainScore) )
                    all_y_test = np.append(all_y_test, y_test)
                    all_y_pred = np.append(all_y_pred, y_test_pred)

                    # get features, ordered by importance
                    featuresByImportance = get_relative_feature_importance(classifier)

                    # write feature importance to disk
                    featureImportanceFileName = classifierName + "-featureImportance-split-" + str(splitIndex) + "." + dataPreprocessing + ".csv"
                    with open( os.path.join(folder_name, featureImportanceFileName), "w") as fp :
                        fp.write("feature,importance\n")
                        for featureImportance, featureIndex in featuresByImportance :
                            fp.write( "\"" + variablesX[int(featureIndex)] + "\"," + str(featureImportance) + "\n")

                    # also create and plot confusion matrix for test
                    confusionMatrixFileName = classifierName + "-confusion-matrix-split-" + str(splitIndex) + "-" + dataPreprocessing + ".png"
                    confusionMatrix = confusion_matrix(y_test, y_test_pred)
                    plot_confusion_matrix(confusionMatrix, classes, os.path.join(folder_name, confusionMatrixFileName))

                except Exception as e :
                    # best effort: log the failure and move on to the next split;
                    # the "%s" placeholder is required for the exception text
                    # to actually appear in the log
                    logging.warning("\tunexpected error: %s", e)

            # the classifier might have crashed, so we need a check here
            if len(performances[classifierName][dataPreprocessing]) > 0 :
                testPerformance = [ x[0] for x in performances[classifierName][dataPreprocessing] ]
                logging.info("Average performance (test) of classifier %s on %s data: %.4f (+/- %.4f)" % (classifierName, dataPreprocessing, np.mean(testPerformance), np.std(testPerformance)))

                # plot a last confusion matrix including information for all the splits
                confusionMatrixFileName = classifierName + "-confusion-matrix-" + dataPreprocessing + ".png"
                confusionMatrix = confusion_matrix(all_y_test, all_y_pred)
                plot_confusion_matrix(confusionMatrix, classes, os.path.join(folder_name, confusionMatrixFileName))

                # but also save all test predictions, so that other metrics could be computed on top of them
                df = pd.DataFrame()
                df["y_true"] = all_y_test
                df["y_pred"] = all_y_pred
                df.to_csv(os.path.join(folder_name, classifierName + "-test-predictions-" + dataPreprocessing + ".csv"), index=False)

    # now, here we can write a final report
    # first, convert performance dictionary to list
    performances_list = []
    for classifier_name in performances :
        for data_preprocessing in performances[classifier_name] :
            if len(performances[classifier_name][data_preprocessing]) > 0 :
                performance = [ x[0] for x in performances[classifier_name][data_preprocessing] ]
                performance_mean = np.mean(performance)
                performance_std = np.std(performance)
                performances_list.append( [classifier_name + " (" + data_preprocessing + ")", performance_mean, performance_std, performance] )

    performances_list = sorted(performances_list, key = lambda x : x[1], reverse=True)

    final_report_file_name = os.path.join(folder_name, final_report_file_name)
    logging.info("Final results (that will also be written to file \"" + final_report_file_name + "\"...")
    with open(final_report_file_name, "w") as fp :

        fp.write("Final accuracy results for variable \"%s\", %d samples, %d classes:\n" % (variableY, len(X), len(classes)))

        for result in performances_list :

            temp_string = "Classifier \"%s\", accuracy: mean=%.4f, stdev=%.4f" % (result[0], result[1], result[2])
            logging.info(temp_string)
            fp.write(temp_string + "\n")

            temp_string = "Folds: %s" % str(result[3])
            logging.info(temp_string)
            fp.write(temp_string + "\n\n")

    #    # this part can be skipped because it's computationally expensive; also skip if there are only two classes
    #    if False :
    #        # multiclass classifiers are treated differently
    #        logging.info("Now training OneVsOneClassifier with " + classifierName + "...")
    #        multiClassClassifier = OneVsOneClassifier( classifierData[0] )
    #        multiClassClassifier.fit(trainData, trainLabels)
    #        trainScore = multiClassClassifier.score(trainData, trainLabels)
    #        testScore = multiClassClassifier.score(testData, testLabels)
    #        logging.info("\ttraining score: %.4f ; test score: %.4f", trainScore, testScore)
    #        logging.info(common.classByClassTest(multiClassClassifier, testData, testLabels))
    #
    #        logging.info("Now training OneVsRestClassifier with " + classifierName + "...")
    #        currentClassifier = copy.deepcopy( classifierData[0] )
    #        multiClassClassifier = OneVsRestClassifier( currentClassifier )
    #        multiClassClassifier.fit(trainData, trainLabels)
    #        trainScore = multiClassClassifier.score(trainData, trainLabels)
    #        testScore = multiClassClassifier.score(testData, testLabels)
    #        logging.info("\ttraining score: %.4f ; test score: %.4f", trainScore, testScore)
    #        logging.info(common.classByClassTest(multiClassClassifier, testData, testLabels))
    #
    #        logging.info("Now training OutputCodeClassifier with " + classifierName + "...")
    #        multiClassClassifier = OutputCodeClassifier( classifierData[0] )
    #        multiClassClassifier.fit(trainData, trainLabels)
    #        trainScore = multiClassClassifier.score(trainData, trainLabels)
    #        testScore = multiClassClassifier.score(testData, testLabels)
    #        logging.info("\ttraining score: %.4f ; test score: %.4f", trainScore, testScore)
    #        logging.info(common.classByClassTest(multiClassClassifier, testData, testLabels))

    # TODO save files for each classifier:
    #    - recall?
    #    - accuracy?
    #    - "special" stuff for each classifier, for example the PDF tree for DecisionTree

    return