Ejemplo n.º 1
0
def create_final_dataframe(setup):
    """Build 'final.csv' for a dataset by keeping only its feature columns.

    Reads <datasets>/<dataset_id>/single.csv, selects the feature list for
    the dataset family encoded in the id (accumulated vs. comparative odds)
    and writes the reduced frame to <datasets>/<dataset_id>/final.csv.
    """
    dataset_dir = os.path.join(SysConfig.path('datasets'), setup.dataset_id)
    df = pd.read_csv(os.path.join(dataset_dir, 'single.csv'))

    # The dataset id doubles as a family tag that picks the feature set.
    if 'sm_accum_odds' in setup.dataset_id:
        features = process_accum(df)
    elif 'sm_comp_odds' in setup.dataset_id:
        features = process_comp(df)
    else:
        features = list()

    df[features].to_csv(os.path.join(dataset_dir, 'final.csv'))
Ejemplo n.º 2
0
def create_single_dataframe(setup):
    """Merge every partial CSV of a dataset into one 'single.csv'.

    Reads all files under <datasets>/<dataset_id>/partial in sorted order
    (so the row order is deterministic) and concatenates them in one pass.

    Fix: the previous implementation concatenated inside the loop
    (``df = pd.concat([df, df_temp])``), which copies the accumulated frame
    on every iteration and is quadratic in total row count. Collecting the
    frames first and calling ``pd.concat`` once is linear and produces the
    same result (column order still sorted via ``sort=True``).
    """
    partial_dir = os.path.join(SysConfig.path('datasets'), setup.dataset_id,
                               'partial')
    files = sorted(os.listdir(partial_dir))

    frames = []
    for f in files:
        print('Concatenating file %s' % f)
        frames.append(pd.read_csv(os.path.join(partial_dir, f)))

    # Single concat instead of one per file: O(n) instead of O(n^2).
    df = pd.concat(frames, sort=True)

    df_file_path = os.path.join(SysConfig.path('datasets'), setup.dataset_id,
                                'single.csv')
    df.to_csv(df_file_path)
Ejemplo n.º 3
0
    def _load_model(self):
        """Load and return the pickled model for this setup's minute.

        The model is read from
        <models>/<classifier>/<dataset_id>/<features>/<minute>.pkl.
        """
        model_dir = os.path.join(SysConfig.path('models'),
                                 self.setup.classifier,
                                 self.setup.dataset_id,
                                 self.setup.features)
        model_path = os.path.join(model_dir, str(self.setup.minute) + '.pkl')

        with open(model_path, 'rb') as infile:
            return joblib.load(infile)
Ejemplo n.º 4
0
    def _save_model(self, model):
        """Pickle *model* under the models directory for this setup.

        The file is written to
        <models>/<classifier>/<dataset_id>/<features>/<minute>.pkl,
        creating intermediate directories as needed.

        Parameters
        ----------
        model : fitted estimator serializable by joblib.
        """
        file_name = str(self.setup.minute) + '.pkl'
        dir_name = os.path.join(SysConfig.path('models'),
                                self.setup.classifier, self.setup.dataset_id,
                                self.setup.features)

        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists() guard (another process could create the dir
        # between the check and the makedirs call).
        os.makedirs(dir_name, exist_ok=True)

        file_path = os.path.join(dir_name, file_name)

        with open(file_path, 'wb') as outfile:
            joblib.dump(model, outfile)
Ejemplo n.º 5
0
    def _save_cv_results(self, grid):
        """Dump a grid search's cv_results_ to a per-minute CSV.

        Writes <tuning>/<classifier>/<dataset_id>/<features>/<minute>.csv,
        creating intermediate directories as needed.

        Parameters
        ----------
        grid : fitted search object exposing ``cv_results_`` (e.g.
            sklearn GridSearchCV).
        """
        file_name = str(self.setup.minute) + '.csv'
        dir_name = os.path.join(SysConfig.path('tuning'),
                                self.setup.classifier, self.setup.dataset_id,
                                self.setup.features)

        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists() guard.
        os.makedirs(dir_name, exist_ok=True)

        file_path = os.path.join(dir_name, file_name)

        # to_csv accepts a path directly; no need to manage the file handle.
        pd.DataFrame(grid.cv_results_).to_csv(file_path)
Ejemplo n.º 6
0
def create_file(setup):
    """Write a bash job script with one training command per task.

    When the setup treats the minute as a feature, a single 'ALL' job with
    the '-tf' flag is emitted; otherwise one command per minute (0 to
    setup.minute - 1) goes into a 'MIN' job file under the jobs directory.
    """
    name_parts = [setup.classifier, setup.virtualenv, setup.dataset_id,
                  setup.features]

    commands = []
    if setup.is_minute_feature:
        job_filename = '_'.join(name_parts + ['ALL.sh'])
        label = '_'.join(
            [setup.classifier, '0', setup.dataset_id, setup.features])
        command = add_default_parameters(setup,
                                         get_prefix_command(label, setup))
        commands.append(command + ' -tf ')
    else:
        job_filename = '_'.join(name_parts + ['MIN.sh'])
        for minute in range(setup.minute):
            label = '_'.join([setup.classifier,
                              str(minute), setup.dataset_id, setup.features])
            command = add_default_parameters(setup,
                                             get_prefix_command(label, setup))
            commands.append(command + ' -t ' + str(minute))

    job_file = os.path.join(SysConfig.path('jobs'), job_filename)
    with open(job_file, 'w') as outfile:
        outfile.write('#!/usr/bin/env bash\n')
        for cmd in commands:
            outfile.write(cmd + '\n')
Ejemplo n.º 7
0
    def _save_results(self, acc_train, acc_test, rps_train, rps_test, minute):
        """Persist train/test accuracy and RPS scores as a per-minute JSON.

        Writes <results>/<classifier>/<dataset_id>/<features>/<minute>.json
        with keys acc_train, acc_test, rps_train, rps_test, creating the
        directory tree as needed.
        """
        results = dict()
        results['acc_train'] = acc_train
        results['acc_test'] = acc_test

        results['rps_train'] = rps_train
        results['rps_test'] = rps_test

        filename = str(minute) + '.json'

        score_dir = os.path.join(SysConfig.path('results'),
                                 self.setup.classifier, self.setup.dataset_id,
                                 self.setup.features)
        score_file = os.path.join(score_dir, filename)

        # exist_ok avoids the check-then-create race of the previous
        # os.path.exists() guard, matching the other _save_* helpers.
        os.makedirs(score_dir, exist_ok=True)

        with open(score_file, 'w') as outfile:
            json.dump(results, outfile)
Ejemplo n.º 8
0
def create_partial_frames(setup, selection, projection):
    """Export matched MongoDB documents to CSV chunks of 1000 rows each.

    Pages through the 'matches' collection with *selection*/*projection*,
    sorted by date ascending, and writes every page of ``partial_size``
    documents to <datasets>/<dataset_id>/partial/<nnnn>_<from>_<to>_.csv
    (note the trailing underscore before '.csv' produced by the join).

    The partial directory is wiped and recreated on every run so stale
    chunks from a previous export cannot leak into the dataset.
    """

    partial_size = 1000  # documents per partial CSV
    idx_from = 0
    idx_to = 0
    num_matches = session.get_collection('matches').count_documents(selection)

    dataset_dir = os.path.join(SysConfig.path('datasets'), setup.dataset_id)
    partial_dir = os.path.join(dataset_dir, 'partial')

    # Fresh dataset: create both dirs. Existing dataset: reset 'partial' only.
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)
        os.makedirs(partial_dir)
    else:
        shutil.rmtree(partial_dir)
        os.makedirs(partial_dir)

    idx_file = 1
    while idx_from < num_matches:

        # Zero-padded file counter keeps lexical and chronological order equal.
        idx_file_str = str(idx_file).zfill(4)

        # NOTE(review): idx_to may overshoot num_matches on the last chunk;
        # it is only used for the log line and the file name.
        idx_to = idx_to + partial_size
        print('Creating partial dataframe number [%s] from index %i to %i ' %
              (idx_file_str, idx_from, idx_to))

        # skip/limit paging over a date-sorted cursor; assumes the collection
        # is not mutated mid-export (otherwise pages could shift).
        data = session.get_collection('matches').find(selection, projection).\
            skip(idx_from).limit(partial_size).sort([("date", pymongo.ASCENDING)])

        # Flatten nested documents into columns joined with '_'.
        df = pd.DataFrame(json_normalize(list(data), sep='_'))

        file_name = '_'.join(
            [idx_file_str, str(idx_from),
             str(idx_to), '.csv'])
        file_path = os.path.join(partial_dir, file_name)
        df.to_csv(file_path)

        idx_from += partial_size
        idx_file += 1
Ejemplo n.º 9
0
    def __init__(self, setup):
        """Load the final dataset for *setup* and split it chronologically."""
        # Feature-group vocabularies consumed by get_features().
        self._list_features_cards = ['yellow_cards', 'red_cards']
        self._list_comparatives = ['ratio', 'sub']
        self._list_locales = ['home', 'away']
        self._list_features_trends = ['goals', 'corners', 'on_target',
                                      'off_target', 'attacks',
                                      'dangerous_attacks']
        self._list_features_scouts = self._list_features_trends + ['possession']

        self.setup = setup

        final_csv = os.path.join(SysConfig.path('datasets'),
                                 self.setup.dataset_id, 'final.csv')
        self.data = pd.read_csv(final_csv)
        self.features = self.get_features()
        self.target = 'result'

        # Split by position: the first train_size rows train, the rest test.
        self.train_size = self.setup.train_size
        self.train = self.data[:self.train_size]
        self.test = self.data[self.train_size:]

        self.x_train = self.train[self.features]
        self.y_train = self.train[self.target]
        self.x_test = self.test[self.features]
        self.y_test = self.test[self.target]
Ejemplo n.º 10
0
def transform_minutes_into_feature(setup, num_minutes=96):
    """Unpivot per-minute stat columns into rows with a 'minute' feature.

    Reads <datasets>/<dataset_id>/final.csv, where in-game statistics are
    stored one column per minute (e.g. 'accum_trends_goals_home_17'), and
    writes a new dataset '<dataset_id>_min/final.csv' with one row per
    (match, minute) and a single set of stat columns plus 'minute'.

    Side effect: ``setup.dataset_id`` is renamed to '<dataset_id>_min' so
    subsequent pipeline stages point at the new dataset.

    Fixes over the previous version:
    - the per-iteration ``pd.concat`` + ``sort_index`` was quadratic; the
      slices are now collected and concatenated/sorted once (the stable
      'merge' sort preserves ascending minute order within each match);
    - the minute column was assigned on a slice of ``df`` (SettingWithCopy
      territory); slices are now explicitly ``.copy()``-ed.

    Parameters
    ----------
    setup : pipeline setup object; its dataset_id selects the column layout.
    num_minutes : number of per-minute snapshots to extract (default 96,
        matching the previous hard-coded value).
    """
    file_path = os.path.join(SysConfig.path('datasets'), setup.dataset_id,
                             'final.csv')
    df = pd.read_csv(file_path)

    # Columns that are constant per match and copied into every minute row.
    features_basic = [
        'id', 'minute_max', 'date', 'observed_away', 'observed_draw',
        'observed_home', 'result'
    ]
    features_odds = [
        'odds_p_mean_home', 'odds_p_mean_draw', 'odds_p_mean_away',
        'odds_p_std_home', 'odds_p_std_draw', 'odds_p_std_away'
    ]

    features_columns = _minute_feature_columns(setup.dataset_id)

    partials = []
    for i in range(num_minutes):
        logger.debug("Extracting minute %i" % i)
        features_columns_by_minute = [
            '_'.join([a, str(i)]) for a in features_columns
        ]
        # copy() so renaming columns / adding 'minute' never touches df.
        df_partial = df[features_basic + features_odds +
                        features_columns_by_minute].copy()
        df_partial.columns = features_basic + features_odds + features_columns
        df_partial['minute'] = i
        partials.append(df_partial)

    # Single concat, then one stable sort by the original row index so all
    # minutes of a match are adjacent and in ascending minute order.
    df_new = pd.concat(partials).sort_index(kind='merge')

    setup.dataset_id = '_'.join([setup.dataset_id, 'min'])
    dataset_dir = os.path.join(SysConfig.path('datasets'), setup.dataset_id)
    os.makedirs(dataset_dir, exist_ok=True)

    df_file_path = os.path.join(dataset_dir, 'final.csv')
    df_new.to_csv(df_file_path)


def _minute_feature_columns(dataset_id):
    """Return the base (minute-less) stat column names for *dataset_id*.

    'sm_accum_odds' datasets use accumulated per-side columns; 'sm_comp_odds'
    datasets use sub/ratio comparative columns. Any other id yields [].
    """
    features_trends = [
        'goals', 'corners', 'on_target', 'off_target', 'attacks',
        'dangerous_attacks'
    ]
    features_cards = ['yellow_cards', 'red_cards']
    features_ratio = ['possession']
    locales = ['home', 'away']

    columns = []
    if 'sm_accum_odds' in dataset_id:
        for feature in features_trends:
            for loc in locales:
                columns.append('_'.join(['accum_trends', feature, loc]))
        for feature in features_cards:
            for loc in locales:
                columns.append('_'.join(['accum_cards', feature, loc]))
        for feature in features_ratio:
            columns.append('_'.join(['ratio_trends', feature]))
    elif 'sm_comp_odds' in dataset_id:
        for feature in features_trends + features_ratio:
            for comp in ['sub', 'ratio']:
                columns.append('_'.join([comp, 'trends', feature]))
        for feature in features_cards:
            for comp in ['sub', 'ratio']:
                columns.append('_'.join([comp, 'cards', feature]))
    return columns