Example 1
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table, len_X_train = Xs[MAIN_TABLE_NAME], len(Xs[MAIN_TABLE_NAME])
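        # Stack train and test so features are computed on both consistently;
        # after the index map below, row labels look like "train_0" / "test_123".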
        main_table = pd.concat([main_table, X_test],
                               keys=['train', 'test'],
                               sort=True)
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        # clean_df is used here for its in-place effect; the return value is ignored
        clean_df(Xs[MAIN_TABLE_NAME])
        X = merge_table(Xs, self.config)
        clean_df(X)
        selected_features = list(
            self.selected_features_0
        ) + self.time_feature_list + self.mul_feature_list
        X = feature_engineer_rewrite(X.filter(selected_features), self.config)

        # keep only the test rows, which were appended after the train rows
        X = X.iloc[len_X_train:]
        X.sort_index(inplace=True)
        if FEATURE_SELECTION_SWITCH:
            X = X[self.selected_features_1]
        result = predict(X, self.config)

        del self.tables, X_test
        # gc.collect()

        return pd.Series(result)
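
A minimal, self-contained sketch of the concat-and-split pattern this example relies on (pandas only; make_features is a hypothetical stand-in for the feature-engineering step):

    import pandas as pd

    def predict_pattern(X_train, X_test, make_features):
        # Stack train and test so features are computed consistently on both.
        X = pd.concat([X_train, X_test], keys=['train', 'test'], sort=True)
        X.index = X.index.map(lambda x: f"{x[0]}_{x[1]}")
        X = make_features(X)
        # Slice the test rows back out and restore their original order.
        X_tst = X[X.index.str.startswith('test')]
        X_tst.index = X_tst.index.map(lambda x: int(x.split('_')[1]))
        return X_tst.sort_index()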
Example 2
    def predict(self, X_test, time_remain):
        self.Time_data_info['time_ramain_so_far'] = time_remain

        start_feature = time.time()

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]

        log("Merge train and test tables...")
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        log("Feature engineering...")
        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        X = clean_df(X)
        X = feature_engineer(X, self.config)

        # Recover the train rows via their "train_<i>" index prefix and restore
        # the original integer order before fitting.
        X_train = X[X.index.str.startswith("train")]
        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        y_train = self.targets

        end_feature = time.time()

        self.Time_data_info['time_for_feature_engineering'] = end_feature - start_feature

        self.Time_data_info['time_ramain_so_far'] -= self.Time_data_info[
            'time_for_feature_engineering']

        print("TIME info:", self.Time_data_info)

        # train model
        log("Training...")
        train_start = time.time()

        timetrain(X_train, y_train, self.config, self.Time_data_info)

        train_end = time.time()

        self.Time_data_info['time_ramain_so_far'] -= train_end - train_start
        self.Time_data_info['time_for_model_train'] = train_end - train_start

        print("TIME info:", self.Time_data_info)

        # predict
        log("Predicting...")
        X_test = X[X.index.str.startswith("test")]
        X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
        X_test.sort_index(inplace=True)
        result = predict(X_test, self.config)

        return pd.Series(result)
Example 3
    def predict(self, X_test, time_remain):

        ##--------Calculate sample size----------
        '''main_table=self.tables[MAIN_TABLE_NAME]
        print(main_table.shape[0])
        print(X_test.shape[0])
        return None'''

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        ## Clean tables
        clean_tables(Xs)
        #remove_trivial_features_in_tables(Xs)

        ## Merge tables and remove trivial features
        X = merge_table(Xs, self.config)
        clean_df(X)
        feature_engineer(X, self.config)
        ### ----------Temporarily remove multi-categorical features from related tables----------
        X.drop([c for c in X.columns if c.startswith("mul_")],
               axis=1,
               inplace=True)
        #print(X.columns)
        #input()
        ### ----------End-----------
        remove_trivial_features(X)

        ## Add frequency (value-count) features for categorical columns
        cat_features = [
            col for col in X.columns
            if "c_" in col and "ROLLING" not in col and "cnt" not in col
        ]
        X, _ = cat_value_counts(X, cat_features)

        ## Split train and test data
        X_train = X[X.index.str.startswith("train")]
        X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)

        ## Training process
        train_with_time_control(X_train, self.y, self.config)

        ## Testing process
        result = predict(X, self.config)

        return pd.Series(result)
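
cat_value_counts is a project-specific helper; a rough sketch of the frequency-encoding idea it appears to implement (assumption: each categorical value is mapped to its occurrence count):

    import pandas as pd

    def frequency_encode(X, cat_cols):
        # Replace each categorical value with how often it occurs in its column.
        for col in cat_cols:
            X[col + '_cnt'] = X[col].map(X[col].value_counts())
        return X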
Example 4
    def predict(self, X_test, time_remain):

        Xs = self.Xs

        from feature_for_test import baseline_features_test

        # Reuse the one-hot encoders fitted during training when available;
        # otherwise fall back to plain baseline features.
        if self.one_hot_features is not None:
            X_test = baseline_features_test(Xs, X_test, self.config,
                                            self.m_features, self.mlbs,
                                            self.one_hot_models)
        else:
            X_test = baseline_features_test(Xs, X_test, self.config, [], None, None)

        result = predict(X_test, self.config)

        return pd.Series(result)
Example 5
    def predict(self, X_test, time_remain):

        time_manager = TimeManager(self.config, time_remain)
        print(f"prediction remaining time: {time_remain}")
        print('', flush=True)
        Xs = self.tables
        # main_table, len_X_train = Xs[MAIN_TABLE_NAME], len(Xs[MAIN_TABLE_NAME])
        # main_table = pd.concat([main_table, X_test], keys=['train', 'test'], sort=True)
        # time_manager.check("concat X_train and X_test")
        # main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        # Xs[MAIN_TABLE_NAME] = main_table
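        # This variant feature-engineers the test set on its own instead of
        # concatenating it with the training table (see the commented block above).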
        Xs[MAIN_TABLE_NAME] = X_test
        clean_df(Xs[MAIN_TABLE_NAME])
        time_manager.check("clean main table")

        X = merge_table(Xs, self.config)
        time_manager.check("merge table")

        clean_df(X)
        time_manager.check("clean data before learning")
        print('', flush=True)

        if FEATURE_SELECTION_SWITCH:
            selected_features = (list(self.selected_features_0) + self.time_feature_list
                                 + self.mul_feature_list + self.num_feature_list)
        else:
            selected_features = (self.time_feature_list + self.mul_feature_list
                                 + self.num_feature_list)
        X = feature_engineer_rewrite(X.filter(selected_features), self.config, time_manager)
        time_manager.check("exit feature engineering")
        print('', flush=True)

        # X = X[X.index.str.startswith("test")]
        # X = X.iloc[len_X_train:, ]
        X.sort_index(inplace=True)
        time_manager.check("X sorting")
        if FEATURE_SELECTION_SWITCH:
            test_data_feature_selection(X, self.selected_features_1)
            X = X[self.selected_features_1]
            time_manager.check("test data feature selection")

        print('', flush=True)
        result = predict(X, self.config)
        time_manager.check("prediction")
        print('', flush=True)

        return pd.Series(result)
Example 6
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)
        feature_engineer(X, self.config)

        cat_features = [
            col for col in X.columns
            if "c_" in col and "ROLLING" not in col and "cnt" not in col
        ]

        X, _ = cat_value_counts(X, cat_features)

        X_train = X[X.index.str.startswith("train")]

        X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)

        result = None
        for _ in range(3):
            train(X_train, self.y, self.config)
            tmp = predict(X, self.config)
            if result is None:  # identity check: tmp may be a NumPy array
                result = tmp
            else:
                result = result + tmp

        result = result / 3.0

        return pd.Series(result)
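
The accumulate-then-divide loop above is a small ensemble; note the identity check, since result == None would broadcast elementwise on NumPy arrays. In miniature:

    import numpy as np

    result = None
    for _ in range(3):
        tmp = np.random.rand(4)  # stand-in for predict(...)
        result = tmp if result is None else result + tmp
    result /= 3.0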
Example 7
    def predict(self, X_test, time_remain):

        Xs = self.Xs

        from feature_for_test import baseline_features_test, cat_value_counts, feature_selection_test

        X_test = baseline_features_test(Xs, X_test, self.config)

        features_from_base = feature_selection_test(
            X_test, self.feature_selection_models,
            int(len(X_test.columns) / 5))

        X_test = cat_value_counts(X_test, self.cat_dict_counts)

        # Align indexes so the column-wise concat pairs rows correctly.
        X_test.index = features_from_base.index

        X_test = pd.concat([X_test, features_from_base], axis=1)

        result = predict(X_test, self.config)

        return pd.Series(result)
Example 8
    def predict(self, X_test, time_remain):
        timer = Timer()

        # -------- trace mem ----------------------
        #tracemalloc.start(3)
        # -------- trace mem ----------------------

        gc.collect()

        # ----- set mem for feature resume -------
        MemoryManager.set_avl_sys_mem()
        # ----- end set mem for feature resume -------
        #print(self.Xs[CONSTANT.MAIN_TABLE_NAME]['t_01'].min(), self.Xs[CONSTANT.MAIN_TABLE_NAME]['t_01'].max())
        #print(X_test['t_01'].min(), X_test['t_01'].max())

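        # Sort the test rows by the time column for feature generation, but keep
        # the inverse permutation so predictions can be restored to the original
        # row order at the end (result = result[index]).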
        X_test.reset_index(drop=True, inplace=True)
        if self.config['time_col'] in X_test.columns:
            X_test.sort_values(self.config['time_col'], inplace=True)
        index = X_test.index
        X_test.reset_index(drop=True, inplace=True)
        index = np.argsort(index)

        #print(f'X_test preprocess memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        X_main = self.Xs[CONSTANT.MAIN_TABLE_NAME]  # train main

        self.Xs[CONSTANT.MAIN_TABLE_NAME] = X_test
        del X_test
        self.convertX(self.Xs, self.config, False)
        self.clean_data(self.Xs, isTrain=False)

        #print(f'X_test convert memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        train_len = X_main.shape[0]
        self.Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([X_main, self.Xs[CONSTANT.MAIN_TABLE_NAME]], axis=0).reset_index(
            drop=True)

        #print(f'concate memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        del X_main
        gc.collect()

        #print(f'del memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        X = self.feature_iter_Xs.feature_resume(self.Xs, self.y, isTrain=True)  # first-order features, per table
        #print(f'Xs feature memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        Xs_name_list = list(self.Xs.keys())
        for name in Xs_name_list:
            del self.Xs[name]
        del self.Xs
        gc.collect()

        #print(f'del Xs memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        # zcm's change: drop multi-categorical (MC) features
        self.prep_class.drop_mulcat_features(X)
        gc.collect()
        # end zcm's change: drop MC features

        CONSTANT.TRAIN_LEN = train_len
        self.feature_iter.feature_resume(X, self.y, isTrain=True)  # all features, on the merged table

        #print(f'X feature memory trace (now, peak): {tracemalloc.get_traced_memory()}')
        print(f'X mem after resume: {X.memory_usage().sum()}')

        self.prep_class.drop_features(X)
        gc.collect()
        #print(f'drop memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        print(f'final X mem: {X.memory_usage().sum()}')

        train(X.iloc[0:train_len], self.y, self.config, timer)
        gc.collect()

        #print(f'train memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        X_test = X.iloc[train_len:].reset_index(drop=True)
        del X
        gc.collect()
        result = predict(X_test, self.config)
        #print(f'predict memory trace (now, peak): {tracemalloc.get_traced_memory()}')

        result = result[index]
        return pd.Series(result)
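
The index bookkeeping at the top is worth isolating: sorting by time reorders the rows, and np.argsort of the captured index yields the inverse permutation that restores the original order. A minimal demonstration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'t': [3, 1, 2], 'v': [30, 10, 20]})
    df.sort_values('t', inplace=True)   # rows now ordered by 't'
    inv = np.argsort(df.index)          # inverse permutation
    preds = df['v'].to_numpy()          # "predictions" in time-sorted order
    print(preds[inv])                   # [30 10 20] -- original row order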
Example 9
        overall_time_budget = overall_time_budget + time_budget
        time_spent = time.time() - start

        vprint(verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget - time_spent))
        if time_spent >= time_budget:
            vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task")
            execution_success = False
            continue

        # ========= Creating a model, knowing its assigned task from D.info['task'].
        # The model can also select its hyper-parameters based on other elements of info.
        vprint(verbose, "======== Creating model ==========")

        import automl as mpd
        mpd.predict(D, output_dir, start, time_budget, basename, running_on_codalab)
    '''
    if zipme and overall_time_budget == 0:
        vprint(verbose, "========= Zipping this directory to prepare for submit ==============")
        data_io.zipdir(submission_filename + '.zip', ".")
    '''
    overall_time_spent = time.time() - overall_start
    if execution_success:
        vprint(verbose, "[+] Done")
        vprint(verbose, "[+] Overall time spent %5.2f sec " % overall_time_spent +
               "::  Overall time budget %5.2f sec" % overall_time_budget)
    else:
        vprint(verbose, "[-] Done, but some tasks aborted because time limit exceeded")
        vprint(verbose, "[-] Overall time spent %5.2f sec " % overall_time_spent +
               " > Overall time budget %5.2f sec" % overall_time_budget)
    print("overall end", time.ctime())
    if running_on_codalab:
Example 10
    def predict(self, X_test, time_remain):
        timer = Timer()
        timer.set(time_remain)
        with timer.time_limit('PreProcess'):
            # fetch information of test dataset
            self.config[TEST_DATA_LENGTH] = len(X_test)
            self.config['test_time'] = self._fectch_time_range(X_test)
            self.config[STAGE] = 'test'

            Xs = self.tables
            main_table = pd.concat([Xs[MAIN_TABLE_NAME], X_test],
                                   axis=0,
                                   copy=False)
            main_table.reset_index(drop=True, inplace=True)

            del Xs[MAIN_TABLE_NAME]
            Xs[MAIN_TABLE_NAME] = main_table

            pre_process(Xs, self.config)
            clean_tables(Xs)
            pre_feature_extract(Xs)
            pre_tables_memory_cut(Xs)

            X = merge_table(Xs, self.config)
            # free the per-table frames now that they are merged
            del self.tables, Xs
            gc.collect()

            self.null_count_sum(X, self.config)
            clean_df(X, fill_time=True)
            # compress data for memory problem
            X = table_memory_cut(X)

            # feature engineering
            print('overall X size', X.shape)
            X, add_feature = feature_engineer(X, self.config)

            # memory issue (~11 GB): re-compress both frames before concatenating
            X = table_memory_cut(X)
            add_feature = table_memory_cut(add_feature)
            X = pd.concat([X, add_feature], axis=1, copy=False)
            del add_feature
            print(X.shape)

            # split the concatenated frame back into train+val and test parts
            X_train_val = X.iloc[:self.config[TRAIN_DATA_LENGTH]]
            y_train_val = self.train_label
            X_test = X.iloc[self.config[TRAIN_DATA_LENGTH]:]

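            # hold out the last 20% of the training rows as a validation set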
            train_len = int(self.config[TRAIN_DATA_LENGTH] * 0.8)
            valid_len = self.config[TRAIN_DATA_LENGTH] - train_len
            self.config[TRAIN_LEN_OF_TRAIN_VAL] = train_len
            self.config[VAL_LEN_OF_TRAIN_VAL] = valid_len
            del X
            gc.collect()

            # feature processing: label-count features for categorical and multi-value columns
            all_label_count_feature_list = cat_Lable_Cnt_Fun(
                X_train_val, y_train_val, X_test, self.config)
            all_mutlicat_feature_data_list = Mv_Label_Cnt_Func(
                X_train_val, y_train_val, X_test, self.config)

            if (all_label_count_feature_list is None) and (all_mutlicat_feature_data_list is None):
                X_train = X_train_val.iloc[:train_len]
                y_train = self.train_label[:train_len]
                X_val = X_train_val.iloc[train_len:]
                y_val = self.train_label[train_len:]
            else:
                all_feature_list = []
                if all_label_count_feature_list is not None:
                    all_feature_list += all_label_count_feature_list
                if all_mutlicat_feature_data_list is not None:
                    all_feature_list += all_mutlicat_feature_data_list

                add_feature_data = pd.concat(all_feature_list,
                                             axis=1,
                                             copy=False)
                add_feature_data.sort_index(inplace=True)

                del all_label_count_feature_list, all_mutlicat_feature_data_list, all_feature_list
                gc.collect()

                X_train = pd.concat(
                    [X_train_val[:train_len], add_feature_data[:train_len]],
                    axis=1, copy=False)
                X_val = pd.concat(
                    [X_train_val[train_len:self.config[TRAIN_DATA_LENGTH]],
                     add_feature_data[train_len:self.config[TRAIN_DATA_LENGTH]]],
                    axis=1, copy=False)
                y_train = self.train_label[:train_len]
                y_val = self.train_label[train_len:]

                X_test = pd.concat(
                    [X_test, add_feature_data[self.config[TRAIN_DATA_LENGTH]:]],
                    axis=1, copy=False)

                del X_train_val, y_train_val, add_feature_data, self.train_label
                gc.collect()

        train_columns = train(X_train, X_val, y_train, y_val, self.config,
                              timer.remain)
        del X_train, X_val, y_train, y_val
        gc.collect()

        result = predict(X_test[train_columns], self.config)

        return pd.Series(result)
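
table_memory_cut is project code; a hedged sketch of the dtype-downcasting approach such memory-reduction helpers usually take:

    import pandas as pd

    def downcast_frame(df):
        # Downcast numeric columns to the smallest dtype that fits the values.
        for col in df.select_dtypes('integer').columns:
            df[col] = pd.to_numeric(df[col], downcast='integer')
        for col in df.select_dtypes('float').columns:
            df[col] = pd.to_numeric(df[col], downcast='float')
        return df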
Example 11
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]

        main_table['y_sorted'] = self.y
        main_table.sort_values(self.ts_col, inplace=True)
        #y_trn = main_table.y_sorted.copy()
        #main_table.drop('y_sorted', axis=1, inplace=True)

        #main_table['data_type'] = 'train'
        #X_test['data_type'] = 'test'
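        # Sentinel label: mark test rows with -1 so train and test can be split
        # apart again after the merged feature engineering below.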
        X_test['y_sorted'] = -1
        main_table = pd.concat([main_table, X_test],
                               ignore_index=True).reset_index()

        del X_test
        gc.collect()

        # main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        # main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

        Xs[MAIN_TABLE_NAME] = main_table
        log('memory usage of main_table: {:.2f}MB'.format(
            df_memory_usage(main_table) / 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)

        del Xs, main_table
        gc.collect()

        log('memory usage of X: {:.2f}MB'.format(df_memory_usage(X) / 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

        self.cat_cols = sorted(
            [c for c in X.columns if c.startswith(CATEGORY_PREFIX)])
        self.mcat_cols = sorted(
            [c for c in X.columns if c.startswith(MULTI_CAT_PREFIX)])
        self.num_cols = sorted(
            [c for c in X.columns if c.startswith(NUMERICAL_PREFIX)])
        self.ts_cols = sorted(
            [c for c in X.columns if c.startswith(TIME_PREFIX)])

        X = self.feature_engineer(X, train=True)

        # X_trn = X[X.index.str.startswith("train")]
        # X_trn.index = X_trn.index.map(lambda x: int(x.split('_')[1]))
        X_trn = X[X['y_sorted'] != -1]
        y_trn = X_trn.y_sorted.copy()
        X_trn = X_trn.drop('y_sorted', axis=1)

        # X_tst = X[X.index.str.startswith("test")]
        # X_tst.index = X_tst.index.map(lambda x: int(x.split('_')[1]))
        X_tst = X[X['y_sorted'] == -1]
        X_tst = X_tst.drop('y_sorted', axis=1)

        X_tst.sort_index(inplace=True)

        del X
        gc.collect()

        log('memory usage of X_trn: {:.2f}MB'.format(
            df_memory_usage(X_trn) / 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

        train(X_trn, y_trn, self.config)
        del X_trn, y_trn
        gc.collect()

        log('memory usage of X_tst: {:.2f}MB'.format(
            df_memory_usage(X_tst) / 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))
        result = predict(X_tst, self.config)
        del X_tst
        gc.collect()

        return pd.Series(result)
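
The sentinel pattern in miniature (safe whenever -1 cannot occur as a real label, e.g. binary targets):

    import pandas as pd

    train = pd.DataFrame({'a': [1, 2]}); train['y'] = [0, 1]
    test = pd.DataFrame({'a': [3]});     test['y'] = -1
    both = pd.concat([train, test], ignore_index=True)

    X_trn = both[both['y'] != -1].drop(columns='y')
    X_tst = both[both['y'] == -1].drop(columns='y')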
Example 12
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_time_index = main_table[["t_01"]].sort_values("t_01")
        # catLabel_dict = process_cat_label(main_table, self.lables.loc[main_table.index]) # modified By 05.30
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table
        clean_tables(Xs, self.config, fill=True)
        main_table = Xs[MAIN_TABLE_NAME]

        main_cat_cols = [
            col for col in main_table.columns
            if (col.startswith("c_") or col.startswith("m_"))
            and len(main_table[col].unique()) > 1
        ]
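        # total_num_fea is a budget counter: derived-feature generation stops
        # once roughly 150 features have been added (checked below).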
        total_num_fea = 0
        catFea_dict, total_num_fea = process_main_cat(
            main_table, main_cat_cols, total_num_fea)  # categorical-derived features built from the main table itself
        print("total_num Fea:", total_num_fea)
        catFea_dicts = []
        relation_catFea_dicts = []
        relation_time_dicts = []
        relation_catFea_dicts2 = []
        if total_num_fea < 150:  # the main table's own derived features are still few; more can be added
            for relation in self.config['relations']:
                tableA = relation["table_A"]
                l_type = relation["type"].split("_")[0]
                tableB = relation["table_B"]
                r_type = relation["type"].split("_")[2]
                key = relation["key"][0]
                if tableA == "main" and l_type == "many" and r_type == "one":  #and "t_01" not in Xs[tableB].columns:  # fairly ad hoc; needs rework later
                    '''
                    temp_main_cat = main_table[main_cat_cols]
                    relation_num_cols = [col for col in Xs[tableB].columns if col.startswith("n_")]
                    temp_tableB_num = Xs[tableB][[key]+relation_num_cols]
                    temp_tableB_num = temp_tableB_num.set_index(key)
                    temp_main_cat = temp_main_cat.join(temp_tableB_num, on=key)
                    temp_dict, total_num_fea = process_main_cat_v2(temp_main_cat, main_cat_cols, key, tableB, total_num_fea) # main's categoricals, relation's numericals
                    catFea_dicts.append(temp_dict)
                    if total_num_fea > 150: break
                    '''
                    Xs[tableB].drop_duplicates([key], inplace=True)
                    relation_cat_cols = [
                        col for col in Xs[tableB].columns
                        if (col.startswith("c_") or col.startswith("m_"))
                        and len(Xs[tableB][col].unique()) > 1
                    ]
                    temp_tableB_cat = Xs[tableB][relation_cat_cols]
                    if key in main_table and key in temp_tableB_cat:
                        temp_main_num = main_table[[key]]
                        temp_tableB_cat = temp_tableB_cat.set_index(key)
                        temp_main_num = temp_main_num.join(temp_tableB_cat,
                                                           on=key)
                        relation_temp_dict, total_num_fea = process_relation_cat(
                            temp_main_num, relation_cat_cols, key, tableB,
                            total_num_fea)  # relation's categoricals, main's numericals
                        #relation_catFea_dicts.append(relation_temp_dict)
                        relation_catFea_dicts = relation_catFea_dicts + relation_temp_dict
                        # if total_num_fea > 150: break
                        '''
                        temp_tableB_cat = Xs[tableB][relation_cat_cols]
                        relation_temp_dict2, total_num_fea = process_relation_cat_v2(temp_tableB_cat, relation_cat_cols, key,
                                                                                 tableB,
                                                                                 total_num_fea)
                        relation_catFea_dicts2.append(relation_temp_dict2)
                        '''

                    relation_time_cols = [
                        col for col in Xs[tableB].columns
                        if col.startswith("t_")
                    ]
                    if len(relation_time_cols) > 0:
                        if key in Xs[tableB] and key in main_table and "t_01" in main_table:
                            temp_tableB_time = Xs[tableB][[key] +
                                                          relation_time_cols]
                            temp_tableB_time.columns = [
                                col + "_in_" +
                                tableB if col.startswith("t_") else col
                                for col in temp_tableB_time.columns
                            ]
                            temp_main_time = main_table[[key] + ["t_01"]]
                            temp_tableB_time = temp_tableB_time.set_index(key)
                            temp_main_time = temp_main_time.join(
                                temp_tableB_time, on=key)
                            temp_main_time.drop(key, axis=1, inplace=True)
                            #print("time_test v1")
                            #print(temp_main_time.head())
                            temp_main_time = process_relation_time(
                                temp_main_time)
                            relation_time_dicts.append(temp_main_time)
                    '''
                    temp_tableB = Xs[tableB].set_index(key)
                    temp_main_key = main_table[[key]]
                    temp_main_key = temp_main_key.join(temp_tableB, on=key)
                    relation_temp_dict2, total_num_fea = process_relation_cat_v2(temp_main_key, relation_cat_cols, key,
                                                                                 tableB, total_num_fea)
                    del temp_main_key
                    del temp_tableB
                    relation_catFea_dicts2.append(relation_temp_dict2)
                    if total_num_fea > 150: break
                    '''
        '''
        #if len(relation_time_dicts) > 0:
        main_time_col=[col for col in main_table.columns if col.startswith("t_")]
        temp_main_time = main_table[main_time_col]
        for col in main_time_col:
            temp_main_time["n_weekday_" + col], temp_main_time["n_hour_" + col], temp_main_time["n_day_" + col]=zip(*temp_main_time[col].map(trans2basicInfo))
            # temp_main_time["n_weekday_" + col] = temp_main_time[col].apply(trans2weekday)
            # temp_main_time["n_hour_" + col] = temp_main_time[col].apply(trans2hour)
            # temp_main_time["n_day_" + col] = temp_main_time[col].apply(trans2day)
            if not col.startswith("t_0"):
                temp_main_time["n_interval_" + col] = (temp_main_time[col] - temp_main_time["t_01"]).map(trans2interval)
        temp_main_time.drop(main_time_col, axis=1, inplace=True)
        relation_time_dicts.append(temp_main_time)
        print("Processing Trans to main time")
        '''

        # Xs[MAIN_TABLE_NAME] = main_table
        # clean_tables(Xs, self.config, fill=True)
        merge_table_v2(Xs, self.config)
        #clean_tables(Xs)
        X = FT_process(Xs, self.config)
        del Xs
        del self.tables
        del main_table
        #print(X.shape)
        '''
        for catLabel in catLabel_dict:
            # print(catLabel_dict[catLabel].head())
            if catLabel in X.columns:
                X = X.join(catLabel_dict[catLabel], on=catLabel)
        '''
        t1 = time.time()
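        # Attach the precomputed per-category feature frames to X column-wise.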
        useful_catFea = [
            catFea_dict[catFea] for catFea in catFea_dict
            if catFea in X.columns
        ]
        X = pd.concat([X] + useful_catFea, axis=1)
        print("processing process_main_cat")
        '''
        for catFea in catFea_dict:
            if catFea in X.columns:
                #print(catFea_dict[catFea].head())
                X = X.join(catFea_dict[catFea], on=catFea)
                print("processing process_main_cat")
            #print(X.head())
        '''
        del catFea_dict
        '''
        for catFea_dict2 in catFea_dicts:
            for catFea in catFea_dict2:
                if catFea in X.columns:
                    #print(catFea_dict2[catFea].head())
                    X = X.join(catFea_dict2[catFea], on=catFea)
                    print("processing process_main_cat_v2")
                    #print(X.head())
        del catFea_dicts
        '''
        '''
        for relation_catFea_dict in relation_catFea_dicts:
            for relation_catFea in relation_catFea_dict:
                #print(relation_catFea_dict[relation_catFea].head())
                if relation_catFea in X.columns:
                    z=yield(relation_catFea_dict[relation_catFea])
                    # X = X.join(relation_catFea_dict[relation_catFea], on=relation_catFea)
                    print("processing process_relation_cat")
                    #print(X.head())
        '''
        X = pd.concat([X] + relation_catFea_dicts, axis=1)
        del relation_catFea_dicts

        if len(relation_time_dicts) > 0:
            X = pd.concat([X] + relation_time_dicts, axis=1)
            print("processing process_relation_time")
            #print(X.shape)
            #print(X.head())
            del relation_time_dicts
        '''
        for relation_catFea_dict2 in relation_catFea_dicts2:
            for relation_catFea in relation_catFea_dict2:
                #print(relation_catFea_dict2[relation_catFea].head())
                if relation_catFea in X.columns:
                    X = X.join(relation_catFea_dict2[relation_catFea], on=relation_catFea)
                    print("processing process_relation_cat_v2")
                    #print(X.head())
        del relation_catFea_dicts2
        '''
        t2 = time.time()
        print("cat join cost time: ", t2 - t1)
        #print(X.head())
        X.columns = [
            "m_" + c if (".m_" in c) and ("MEAN" not in c) and
            ("SUM" not in c) and ("COUNT" not in c) and
            ("N_UNIQUE" not in c) and ("N_TIME" not in c) else c
            for c in X.columns
        ]
        X.columns = [
            "c_" + c if (".c_" in c) and ("MEAN" not in c) and
            ("SUM" not in c) and ("COUNT" not in c) and
            ("N_UNIQUE" not in c) and ("N_TIME" not in c) else c
            for c in X.columns
        ]
        X.columns = [
            "n_" + c if not c.startswith("n_") and not c.startswith("m_")
            and not c.startswith("c_") and not c.startswith("t_") else c
            for c in X.columns
        ]
        #print(X.columns)
        print("Column Number:", len(X.columns))

        clean_df(X, "no_table", self.config)
        feature_engineer(X, self.config, len(X.columns), self.lables)

        X_train = X[X.index.str.startswith("train")]
        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        #train(X_train, self.lables.loc[X_train.index], self.config)
        train(X_train.loc[main_time_index.index],
              self.lables.loc[main_time_index.index], self.config)  # rows ordered by time
        del main_time_index

        X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        result = predict(X, self.config)

        return pd.Series(result)
Example 13
        vprint(
            verbose, "[+] Remaining time after reading data %5.2f sec" %
            (time_budget - time_spent))
        if time_spent >= time_budget:
            vprint(verbose,
                   "[-] Sorry, time budget exceeded, skipping this task")
            execution_success = False
            continue

        # ========= Creating a model, knowing its assigned task from D.info['task'].
        # The model can also select its hyper-parameters based on other elements of info.
        vprint(verbose, "======== Creating model ==========")

        import automl as mpd
        mpd.predict(D, output_dir, start, time_budget, basename,
                    running_on_codalab)
    '''        
    if zipme and overall_time_budget == 0:
        vprint( verbose,  "========= Zipping this directory to prepare for submit ==============")
        data_io.zipdir(submission_filename + '.zip', ".")
    '''
    overall_time_spent = time.time() - overall_start
    if execution_success:
        vprint(verbose, "[+] Done")
        vprint(
            verbose, "[+] Overall time spent %5.2f sec " % overall_time_spent +
            "::  Overall time budget %5.2f sec" % overall_time_budget)
    else:
        vprint(verbose,
               "[-] Done, but some tasks aborted because time limit exceeded")
        vprint(
Example 14
        time_spent = time.time() - start
        vprint(verbose, "[+] Remaining time after reading data %5.2f sec" % (time_budget - time_spent))
        if time_spent >= time_budget:
            vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task")
            execution_success = False
            continue

        # ========= Creating a model, knowing its assigned task from D.info['task'].
        # The model can also select its hyper-parameters based on other elements of info.
        vprint(verbose, "======== Creating model ==========")

        # djajetic, 2015 - the organizers' original placeholder code is skipped
        # (commented out below) and replaced with automl.py; the code leans on the
        # "standard" organizer-provided data loader and management

        import automl
        automl.predict(D, output_dir, basename)

        '''
        M = MyAutoML(D.info, verbose, debug_mode)
        print M

        # ========= Iterating over learning cycles and keeping track of time
        # Preferably use a method that iteratively improves the model and
        # regularly saves predictions results gradually getting better
        # until the time budget is exceeded.
        # The example model we provide just votes on an increasingly
        # large number of "base estimators".
        time_spent = time.time() - start
        vprint(verbose, "[+] Remaining time after building model %5.2f sec" % (time_budget - time_spent))
        if time_spent >= time_budget:
            vprint(verbose, "[-] Sorry, time budget exceeded, skipping this task")
Example 15
    def predict(self, X_test, time_remain):
        self.Time_data_info['time_ramain_so_far'] = time_remain

        start_feature = time.time()

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]

        #index = main_table.sort_values(by=self.config['time_col']).index
        #split = int(0.6*len(index))
        #train_index, test_index = index[:split], index[split:]

        #log(f"Merge train and test tables...")
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        log("Feature engineering...")
        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        X = clean_df(X)
        X = feature_engineer(X, self.config)

        X_train = X[X.index.str.startswith("train")]
        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        y_train = self.targets

        end_feature = time.time()

        self.Time_data_info['time_for_feature_engineering'] = end_feature - start_feature

        self.Time_data_info['time_ramain_so_far'] -= self.Time_data_info[
            'time_for_feature_engineering']

        #self.Time_data_info['data_cols_for_hp'] = X.shape[1]
        #self.Time_data_info['data_rows_for_hp'] = X.shape[0]
        print("TIME info:", self.Time_data_info)

        # train model
        log("Training...")
        train_start = time.time()
        #train(X_train.iloc[train_index], y_train.iloc[train_index], self.config)

        timetrain(X_train, y_train, self.config, self.Time_data_info)

        #train with time limitation
        #timetrain(X_train.iloc[train_index], y_train.iloc[train_index], self.config, self.Time_data_info)

        train_end = time.time()

        self.Time_data_info['time_ramain_so_far'] -= train_end - train_start
        self.Time_data_info['time_for_model_train'] = train_end - train_start

        print("TIME info:", self.Time_data_info)

        #r = predict(X_train.iloc[test_index], self.config)
        #r = timepredict(X_train.iloc[test_index], self.config)

        #print('Test auc: ', roc_auc_score(y_train.iloc[test_index], r))

        #importance = self.config["model"].feature_importance(importance_type='split')
        #feature_name = np.array(self.config["model"].feature_name())
        #feature_importance = pd.DataFrame({'feature_importance': feature_name[np.argsort(-importance)], 'importnace':-np.sort(-importance)})
        #feature_importance.to_csv('feature_importance.csv', index=False)

        # predict
        log("Predicting...")
        X_test = X[X.index.str.startswith("test")]
        X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
        X_test.sort_index(inplace=True)
        result = predict(X_test, self.config)

        return pd.Series(result)
Example 16
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]  #.iloc[0:4000]
        #X_test = X_test#.iloc[0:4000]
        #self.y = self.y#.iloc[0:4000]
        if int(self.config["time_budget"]) > 2000:
            from data_sample import data_sample
            main_table, self.y = data_sample(main_table, self.y, ratio=1)
            # main_table = Xs[MAIN_TABLE_NAME].iloc[-1000000:]
            # self.y = self.y.iloc[-1000000:]

        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        clean_tables(Xs)

        X = merge_table(Xs, self.config)

        clean_df(X)
        feature_engineer(X, self.config)

        ###-------------------- cat feature -----------------------###
        cat_features = [
            col for col in X.columns
            if "ROLLING" not in col and "c_" in col
        ]

        X, _ = cat_value_counts(X, cat_features)
        ###--------------------------------------------------------###

        ###------------------- data sample ------------------###
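        # Subsample training rows when the time budget is tight; the sampling
        # strategy differs between small (<=300 s) and medium (<2000 s) budgets.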

        if int(self.config["time_budget"]) <= 300:

            X_train = X[X.index.str.startswith("train")]
            X_test = X[X.index.str.startswith("test")]
            from data_sample import data_sample
            X_train, self.y = data_sample(X_train, self.y, flag=True)

            X = pd.concat([X_train, X_test], keys=['train', 'test'])
        elif int(self.config["time_budget"]) < 2000:
            X_train = X[X.index.str.startswith("train")]
            X_test = X[X.index.str.startswith("test")]
            from data_sample import data_sample
            X_train, self.y = data_sample(X_train, self.y)

            X = pd.concat([X_train, X_test], keys=['train', 'test'])

        #X.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

        ###------------------- mul onehot feature -----------------###
        m_features = [
            col for col in X.columns
            if ("ROLLING" not in col) and ("mul_feature_" in col)
        ]

        # if len(self.mlbs)>0 or  self.mlbs is not None:
        #     m_features = list(self.mlbs.keys())
        # else:
        #     m_features = []

        one_hot_features = None
        one_hot_models = None
        mlbs = None

        one_hot_features_m = None

        from feature_expansion import onehot_feature_selection_m

        if len(m_features) > 0 and int(self.config["time_budget"]) > 100:
            one_hot_features_m, one_hot_models, mlbs = onehot_feature_selection_m(
                X,
                self.y,
                m_features,
                feature_num_everyiter=len(m_features),
                selection=True)
            X.drop(m_features, inplace=True, axis=1)

        elif len(m_features) > 0:
            X.drop(m_features, inplace=True, axis=1)

        ###-------------------------------------------------###

        ###------------------- onehot encoder ------------------###

        from feature_expansion import onehot_feature_selection
        one_hot_features = None
        if len(cat_features) > 0 and int(self.config["time_budget"]) > 4000:
            one_hot_features, one_hot_models, mlbs = onehot_feature_selection(
                X,
                self.y,
                cat_features,
                feature_num_everyiter=len(cat_features),
                selection=True)
            for cat_col in cat_features:
                if cat_col not in mlbs:
                    X.drop(cat_col, inplace=True, axis=1)

        ###-----------------------concat--------------------###

        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        if one_hot_features is not None:
            X = hstack([X, one_hot_features]).tocsr()

        if one_hot_features_m is not None:
            X = hstack([X, one_hot_features_m]).tocsr()

        ###-------------------------------------------------###

        # ###------------------drop mul_feature---------------###
        # m_features = []
        # for feature in X.columns:
        #     if "mul_feature_" in feature:
        #         m_features.append(feature)
        #
        # X.drop(m_features,inplace=True,axis=1)
        # ###-------------------------------------------------###

        # X is now a sparse matrix: the first len(y) rows are train, the rest test.
        X_train = X[0:self.y.shape[0]]
        X = X[self.y.shape[0]:]

        result = None

        if 300 < int(self.config["time_budget"]) < 2000:
            # medium budget: average three train/predict runs (simple bagging)
            for _ in range(3):
                train(X_train, self.y, self.config)
                tmp = predict(X, self.config)
                if result is None:
                    result = tmp
                else:
                    result = result + tmp

            result = result / 3.0
        else:
            train(X_train, self.y, self.config)
            result = predict(X, self.config)

        return pd.Series(result)
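
A compact illustration of the dense-to-sparse handoff used above: the frame becomes a CSR matrix and the one-hot blocks are appended column-wise (row counts must match):

    import numpy as np
    from scipy.sparse import csr_matrix, hstack

    X = csr_matrix(np.arange(6).reshape(3, 2))
    onehot = csr_matrix(np.eye(3))
    X = hstack([X, onehot]).tocsr()
    print(X.shape)  # (3, 5)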