Example no. 1
0
    def predict(self, X_test, time_remain):

        Xs = self.tables
        self.istrain = False

        Xs[MAIN_TABLE_NAME] = X_test

        clean_tables(Xs)
        start = time.time()

        X = merge_table(Xs, self.config)

        self.time['merging_test'] = time.time() - start

        clean_df(X)

        
        #feature_engineer(X, self.config, self.dropcols,self.numericmap, self.istrain,self.square_cubic_transform,self.skewness)
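        # istrain=False presumably makes these transforms reuse the maps fitted
        # in fit() (dropcols, numericmap) instead of re-fitting them on test data.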

        transform_numeric(X, self.dropcols, self.numericmap, self.istrain,
                          self.square_cubic_transform, self.skewness)
        transform_categorical_hash(X, self.dropcols, self.istrain)
        
        start = time.time()
        result = self.model.predict(X, self.diff_info, self.start_time)

        self.time['result_predict'] = time.time() - start

        return pd.Series(result)
Example no. 2
0
    def predict(self, X_test, time_remain):
        self.Time_data_info['time_remain_so_far'] = time_remain

        start_feature = time.time()

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]

        log(f"Merge train and test tables...")
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        log(f"Feature engineering...")
        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        X = clean_df(X)
        X = feature_engineer(X, self.config)

        X_train = X[X.index.str.startswith("train")]
        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        y_train = self.targets

        end_feature = time.time()

        self.Time_data_info['time_for_feature_engineering'] = (end_feature -
                                                               start_feature)

        self.Time_data_info['time_remain_so_far'] = self.Time_data_info[
            'time_remain_so_far'] - self.Time_data_info[
                'time_for_feature_engineering']

        print(f"TIME info:", self.Time_data_info)

        # train model
        log(f"Training...")
        train_start = time.time()

        timetrain(X_train, y_train, self.config, self.Time_data_info)

        train_end = time.time()

        self.Time_data_info['time_remain_so_far'] = self.Time_data_info[
            'time_remain_so_far'] - (train_end - train_start)
        self.Time_data_info['time_for_model_train'] = (train_end - train_start)

        print("TIME info:", self.Time_data_info)

        # predict
        log(f"Predicting...")
        X_test = X[X.index.str.startswith("test")]
        X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
        X_test.sort_index(inplace=True)
        result = predict(X_test, self.config)

        return pd.Series(result)
Example no. 3
0
def baseline_features_test(Xs, X_test, config, m_features, mlbs, one_hot_model):

    main_table = Xs[MAIN_TABLE_NAME]
    main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
    main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

    Xs[MAIN_TABLE_NAME] = main_table
    clean_tables(Xs)

    X = merge_table(Xs, config)

    clean_df(X)

    from feature_for_test import multi_features_for_test
    X = X[X.index.str.startswith("test")]

    feature_engineer(X, config)
    new_features = None

    if len(m_features) > 0 and int(config["time_budget"]) > 300:
        new_features = multi_features_for_test(X, m_features, mlbs, one_hot_model)
        #new_features.index = X.index
        X.drop(m_features, inplace=True, axis=1)
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
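        # Convert the dense frame to CSR and append the sparse one-hot
        # multi-value features column-wise; tocsr() keeps row slicing cheap.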
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        X = hstack([X,new_features]).tocsr()
        print("------------------")
        print(X.shape)
        #X = pd.concat([X, new_features], axis=1)

    elif len(m_features) > 0:
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        X.drop(m_features, inplace=True, axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)

    else:
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)

    return X
Example no. 4
0
    def fit(self, Xs, y, time_remain):
        print('', flush=True)
        time_manager = TimeManager(self.config, time_remain)
        duration = 0
        # self.tables = copy.deepcopy(Xs)
        self.tables = Xs
        clean_tables(self.tables)
        duration += 2*time_manager.check("clean tables")

        if DROP_OUTLIER:
            # the percentage of outliers dropped is around 15% to 20%
            inlier_label = drop_outlier(self.tables[MAIN_TABLE_NAME])
            self.tables[MAIN_TABLE_NAME] = self.tables[MAIN_TABLE_NAME][inlier_label == 1].reset_index(drop=True)
            y = y[inlier_label == 1].reset_index(drop=True)
            duration += time_manager.check("drop outlier")

        X = merge_table(self.tables, self.config)
        duration += 2*time_manager.check("merge table")

        clean_df(X)
        duration += 2*time_manager.check("clean data before learning")

        self.time_feature_list = [c for c in X if c.startswith(TIME_PREFIX)]
        self.mul_feature_list = [c for c in X if c.startswith(MULTI_CAT_PREFIX)]
        self.num_feature_list = [c for c in X if c.startswith(NUMERICAL_PREFIX)]

        print('', flush=True)

        if FEATURE_SELECTION_SWITCH:
            _, self.selected_features_0 = feature_selection(
                X.drop(columns=self.time_feature_list + self.mul_feature_list + self.num_feature_list),
                y, self.config, FEATURE_RATIO_1)
            time_manager.check("first feature selection")
            selected_features = list(self.selected_features_0) + self.time_feature_list + self.mul_feature_list + self.num_feature_list
        else:
            selected_features = self.time_feature_list + self.mul_feature_list + self.num_feature_list

        st = time.time()
        X = feature_engineer_rewrite(X.filter(selected_features), self.config, time_manager)
        duration += 2*(time.time() - st)
        time_manager.check("exit feature engineering")

        if FEATURE_SELECTION_SWITCH:
            X, self.selected_features_1 = feature_selection(X, y, self.config, FEATURE_RATIO_2)
            duration += time_manager.check("second feature selection")
        print('', flush=True)
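
        # The 2x-weighted durations above approximate how long predict() will
        # take to repeat this pipeline; 1.57 looks like an empirical safety factor.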

        self.config["prediction_estimated"] = 1.57 * duration
        print(f"estimated prediction time: {self.config['prediction_estimated']}")
        train(X, y, self.config, time_manager)
        time_manager.check("model training")
        print('', flush=True)
Example no. 5
0
    def predict(self, X_test, time_remain):

        ##--------Calculate sample size----------
        '''main_table=self.tables[MAIN_TABLE_NAME]
        print(main_table.shape[0])
        print(X_test.shape[0])
        return None'''

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        ## Clean tables
        clean_tables(Xs)
        #remove_trivial_features_in_tables(Xs)

        ## Merge tables and remove trivial features
        X = merge_table(Xs, self.config)
        clean_df(X)
        feature_engineer(X, self.config)
        ### ----------Temporarily remove multi-categorical features from related tables----------
        X.drop([c for c in X.columns if c.startswith("mul_")],
               axis=1,
               inplace=True)
        #print(X.columns)
        #input()
        ### ----------End-----------
        remove_trivial_features(X)

        ## Add number frequency feature
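        # Frequency-encode the plain categorical columns (skipping engineered
        # ROLLING/cnt columns); cat_value_counts presumably appends per-value
        # counts as numeric features.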
        cat_features = []
        for col in X.columns:
            if "c_" in col and "ROLLING" not in col and "cnt" not in col:
                cat_features.append(col)
        X, _ = cat_value_counts(X, cat_features)

        ## Split train and test data
        X_train = X[X.index.str.startswith("train")]
        X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)

        ## Training process
        train_with_time_control(X_train, self.y, self.config)

        ## Testing process
        result = predict(X, self.config)

        return pd.Series(result)
Example no. 6
0
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)
        feature_engineer(X, self.config)

        cat_features = []

        for col in X.columns:
            if "c_" in col and "ROLLING" not in col and "cnt" not in col:
                cat_features.append(col)

        X, _ = cat_value_counts(X, cat_features)

        X_train = X[X.index.str.startswith("train")]

        X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
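
        # Train three models on the same data and average their predictions;
        # train() is presumably non-deterministic, so averaging reduces variance.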

        result = None
        for i in range(3):
            train(X_train, self.y, self.config)
            tmp = predict(X, self.config)
            if result is None:
                result = tmp
            else:
                result = result + tmp

        result = result / 3.0

        return pd.Series(result)
Example no. 7
0
    def fit(self, Xs, y, time_remain):

        # self.tables = copy.deepcopy(Xs)
        self.tables = Xs
        if DATA_DOWNSAMPLING_SWITCH:
            self.tables[MAIN_TABLE_NAME], y = data_downsampling(
                self.tables[MAIN_TABLE_NAME], y, self.config)
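
        # os.system("free -m") snapshots system memory before each heavy stage;
        # table merging and feature engineering are the usual peaks.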
        print("before clean table mem used")
        os.system("free -m")
        clean_tables(self.tables)
        print("before merge table mem used")
        os.system("free -m")
        # gc.collect()
        X = merge_table(self.tables, self.config)
        # gc.collect()
        print("before engineer mem used")
        os.system("free -m")
        self.time_feature_list = [c for c in X if c.startswith(TIME_PREFIX)]
        self.mul_feature_list = [
            c for c in X if c.startswith(MULTI_CAT_PREFIX)
        ]
        clean_df(X)

        if FEATURE_SELECTION_SWITCH:
            _, self.selected_features_0 = feature_selection(
                X.drop(columns=self.time_feature_list + self.mul_feature_list),
                y, self.config)
            selected_features = list(
                self.selected_features_0
            ) + self.time_feature_list + self.mul_feature_list
        else:
            selected_features = self.time_feature_list + self.mul_feature_list

        X = feature_engineer_rewrite(X.filter(selected_features), self.config)
        # clean_df(X)

        if FEATURE_SELECTION_SWITCH:
            X, self.selected_features_1 = feature_selection(X, y, self.config)

        # gc.collect()
        print("before feature selection mem used")
        os.system("free -m")

        print("before train mem used")
        os.system("free -m")
        train(X, y, self.config)
Example no. 8
0
    def fit(self, Xs, y, time_remain):

        self.tables = copy.deepcopy(Xs)

        self.dropcols = []
        self.istrain = True
        self.numericmap = {}
        self.square_cubic_transform = True
        self.skewness = True

        clean_tables(Xs)
        enc = OneHotEncoder(handle_unknown='ignore')
        self.ohe = enc
      
        start = time.time()
        X = merge_table(Xs, self.config)
        self.time['merging_train'] = time.time() - start
        clean_df(X)
      
        start = time.time()
        #feature_engineer(X, self.config, self.dropcols, self.numericmap, self.istrain, self.square_cubic_transform, self.skewness)
        transform_numeric(X, self.dropcols, self.numericmap, self.istrain,
                          self.square_cubic_transform, self.skewness)
        transform_categorical_hash(X, self.dropcols, self.istrain)

        self.time['feature_engineer'] = time.time() - start
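
        # Build the datainfo dict Model_NIPS expects: 'loaded_feat_types' holds
        # column positions grouped as [datetime, numerical, categorical].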
      

        numerical_list = list()
        date_time = list()
        categorical = list()

        for term, col in enumerate(X.columns):
            if (X[col].dtype == "int64") or (X[col].dtype == "float64"):
                numerical_list.append(term)
            if X[col].dtype == "datetime64[ns]":
                date_time.append(term)
            if X[col].dtype.name == "category":
                categorical.append(term)

        datainfo = {}
        datainfo['loaded_feat_types'] = list()
        datainfo['loaded_feat_types'].append(date_time)
        datainfo['loaded_feat_types'].append(numerical_list)
        datainfo['loaded_feat_types'].append(categorical)
        datainfo['time_budget'] = self.config['time_budget']

        self.diff_info = datainfo

        self.training_data = X
        self.model = Model_NIPS(datainfo)
        start = time.time()
        self.model.fit(X, y, datainfo)

        self.time['fitting'] = time.time() - start
Example no. 9
0
def baseline_features(Xs, y, config):

    clean_tables(Xs)
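
    # Attach the label and an epoch-seconds timestamp to the main table so both
    # survive the time-ordered sort and the merge; they are dropped after merging.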
    stampcol = Xs[CONSTANT.MAIN_TABLE_NAME][config["time_col"]].apply(
        lambda x: int(x.timestamp()))
    main_table = Xs[CONSTANT.MAIN_TABLE_NAME]
    main_table["label"] = y
    main_table["timestamp"] = stampcol
    main_table.sort_values("timestamp", inplace=True)

    tmp_columns = main_table.columns
    main_table = pd.DataFrame(main_table.values)

    main_table.columns = tmp_columns

    #main_table = main_table.iloc[0:40000]
    Xs[CONSTANT.MAIN_TABLE_NAME] = main_table

    y = main_table["label"]
    stampcol = main_table["timestamp"]
    X = merge_table(Xs, config)

    print(X.columns)

    X.drop(["label", "timestamp"], axis=1, inplace=True)

    clean_df(X)
    feature_engineer(X, config)

    cat_feature_map = {}
    for col in X.columns:
        if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            cat_feature_map[col] = set(X[col])

    feature_names = X.columns

    m_features = []
    for feature in feature_names:
        if "mul_feature_" in feature:
            m_features.append(feature)

    one_hot_features = None
    one_hot_models = None
    mlbs = None
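
    # One-hot expansion of multi-value features is expensive, so it is only
    # attempted under a very large time budget; otherwise the columns are dropped.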

    if len(m_features) > 0 and int(config["time_budget"]) > 200000:
        one_hot_features, one_hot_models, mlbs = onehot_feature_selection_m(
            X, y, m_features, feature_num_everyiter=len(m_features))
        X.drop(m_features, inplace=True, axis=1)
        #X = pd.concat([X, one_hot_features], axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        X = hstack([X, one_hot_features]).tocsr()
    elif len(m_features) > 0:
        X.drop(m_features, inplace=True, axis=1)
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
    else:
        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)

    print("---------------------------------")
    print(X.shape)

    #X.drop(m_features,inplace=True,axis=1)

    # one_hot_features=None
    # one_hot_models = None

    # import random
    # X_tmp = [X]
    # y_tmp = [y]
    # for i in range(5):
    #     cols = list(X.columns)
    #     random.shuffle(cols)
    #     cols_tmp = cols[0:int(len(cols)*0.5)]
    #     X_tmp.append(X[cols_tmp])
    #     y_tmp.append(y)
    #
    # y = pd.concat(y_tmp,axis=0)
    # X = pd.concat(X_tmp, axis=0)

    return X, y, feature_names, cat_feature_map, stampcol, one_hot_features, one_hot_models, m_features, mlbs
Example no. 10
0
    def predict(self, X_test, time_remain):

        ##--------Calculate sample size----------
        '''main_table=self.tables[MAIN_TABLE_NAME]
        print(main_table.shape[0])
        print(X_test.shape[0])
        return None'''

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        ## -------------Sort X_train according to the timestamp-------------------
        time_col = self.config['time_col']
        '''print(main_table[time_col])
        print(main_table[time_col].dtype)
        input()'''
        main_table.sort_values(time_col, inplace=True)
        index = main_table.index
        self.y = self.y.reindex(index)
        main_table.reset_index(inplace=True, drop=True)
        self.y.reset_index(inplace=True, drop=True)
        #print(main_table.index)
        #print(self.y.index)
        #input()
        #print(main_table.columns)
        #print(self.y.columns)
        #input()

        if self.test_mode:
            train_ratio = 0.8
            train_size = int(train_ratio * main_table.shape[0])
            X_test = main_table[train_size:]
            main_table = main_table[0:train_size]
            y_test = self.y[train_size:]
            self.y = self.y[0:train_size]

        ##----------concat and merge tables------------------------

        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        ## Clean tables
        clean_tables(Xs)
        #remove_trivial_features_in_tables(Xs)

        ## Merge tables and remove trivial features
        print(main_table.index)
        X = merge_table(Xs, self.config)
        train_index = X.index.str.startswith("train")
        test_index = X.index.str.startswith("test")

        ##------convert m_ features to c_---------##
        new_columns = []
        mul_features = []
        for col in X.columns:
            if "m_" in col and "mul_" not in col:
                new_columns.append("c_" + col)
            elif "mul_" in col:
                mul_features.append(col)
                new_columns.append(col)
            else:
                new_columns.append(col)
        X.columns = new_columns

        print(X.columns)
        clean_df(X)
        print(X.shape)

        ##-------------- Add number frequency feature ----------------
        cat_features = []
        for col in X.columns:
            #if "c_" in col and "ROLLING" not in col and "cnt" not in col:
            if col.split('_')[0] == 'c':
                cat_features.append(col)
        print("cat_features", cat_features)
        X, _ = cat_value_counts(X, cat_features)
        print(X.shape)

        #print(X.dtypes)

        # ##-------------- Reserve multi-cat features ------------------
        # all_features = X.columns
        # tmp_c = None
        # mul_features = []
        #
        # for c in all_features:
        #     if c.split('_')[0] == 'm':
        #         tmp_c = X[c].copy()
        #         tmp_c.fillna("0",inplace=True)
        #         tmp_c = tmp_c.apply(lambda x: str(x).split(","))
        #         X["mul_"+tmp_c.name] = tmp_c
        #         mul_features.append("mul_"+tmp_c.name)
        #         tmp_c = tmp_c.apply(lambda x: int(x[0]))
        #         tmp_c.name = f"{CONSTANT.CATEGORY_PREFIX}{tmp_c.name}"
        #         X = pd.concat([X, tmp_c], axis=1)
        #         print(c)
        #
        # #input()
        # if not (tmp_c is None):
        #     del tmp_c
        # #print(X.columns)
        # print(X.shape)
        # #input()

        ###--------------Change data type------------------
        X.drop([self.config['time_col']], axis=1, inplace=True)
        X = normalize_categorical_features(X)
        for c in X.columns:
            if c.split('_')[0] == 'n':
                X[c] = X[c].astype('float32')
            elif c.split('_')[0] == 'c':
                #X[c] = X[c].apply(lambda x: int(x))
                X[c] = X[c].astype('int32')
            elif c.split('_')[0] == 't':
                X[c] = X[c].values.astype('float32')
            elif c.split('_')[0] == 'm':
                continue
            elif c.split('_')[0] == 'mul':
                continue
            else:
                raise ValueError('Undefined column type: %s' % c)
        print(X.shape)

        ##---------------features split--------------------

        main_features = []
        for feature in X.columns:
            if "mul_" not in feature:
                main_features.append(feature)
        print(main_features)

        print(X.shape)
        ##--------------Remove trivial features-------------------
        remove_trivial_features(X[main_features])
        print(X.shape)

        # ##---------------Multi-cat features--------------------
        # multi_cat_features=[]
        # for c in X.columns:
        #     if c.split('_')[0] == 'm':
        #         multi_cat_features.append(c)
        #
        # if len(multi_cat_features) > 0:
        #     X_multi_cat = X[multi_cat_features]
        #     X.drop(multi_cat_features, axis=1, inplace=True)

        ###-------------Train lightgbm to get an initial result-------------
        result = None
        num_trials = 0
        skip_multi_cat = False

        selection = False
        one_hot_features_m = None

        mlbs_m = None
        mlbs = None
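
        # Main loop: repeatedly sample, train LightGBM, and sum the predictions,
        # stopping once <50s remain; skip_multi_cat disables the one-hot step
        # when a full pass no longer fits in the budget.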

        for i in range(1000):
            random_state = np.random.RandomState(i)

            num_trials = num_trials + 1
            if self.config.time_left() < 50:
                num_trials = num_trials - 1
                break

            X_train = X[X.index.str.startswith("train")]
            X_test = X[X.index.str.startswith("test")]

            ##-------------data sample ----------------#
            from data_sample import data_sample

            if i == 0:

                X_train_sample, y_train_sample = data_sample(
                    X_train, self.y, p_n_ratio=1, ratio=1, random_state_seed=i)

                mul_size = 0
                for mul_feature in mul_features:
                    mul_count_data_tmp = X_train_sample[mul_feature].apply(
                        lambda x: len(x))
                    #print(mul_feature,mul_count_data_tmp.sum(axis=0))
                    mul_size = mul_size + mul_count_data_tmp.sum(axis=0)
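
                # Size the sampling ratio so the training matrix stays near 6e7
                # stored values: dense non-zeros plus multi-value token counts.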

                size_for_train = 60000000

                train_size = csr_matrix(
                    X_train_sample[main_features]).nonzero()[0].size + mul_size
                p_n_ratio = size_for_train / train_size

            X_train_sample, y_train_sample = data_sample(X_train,
                                                         self.y,
                                                         random_state_seed=i,
                                                         p_n_ratio=p_n_ratio,
                                                         ratio=1)
            #print(y_train_sample)
            #input()

            if self.config.time_left() < 50:
                num_trials = num_trials - 1
                break

            print(X_train_sample.shape)
            X_tmp = pd.concat([X_train_sample, X_test])

            print("train test_train shape:train", X_tmp.shape)
            print("train test_train shape:test",
                  X_tmp[0:y_train_sample.shape[0]].shape)
            print("train test_train shape:y",
                  X_tmp[y_train_sample.shape[0]:].shape)

            from feature_expansion import onehot_feature_selection_m

            main_body_start_time = time.time()

            if i >= 0:

                if len(mul_features) > 0:
                    # if i>1 :
                    #     selection=False
                    #     one_hot_features_m, _, mlbs_m = onehot_feature_selection_m(X_tmp, y_train_sample, mlbs_m.keys(),
                    #                                                                             feature_num_everyiter=len(
                    #                                                                                 mlbs_m.keys()),
                    #                                                                             selection=selection)
                    #
                    # else:
                    #     selection = False
                    first_iter_flag = (i == 0)

                    if skip_multi_cat:
                        one_hot_features_m, mul_features = onehot_encoding_without_fit(
                            X_tmp, mul_features, mlbs_m, config=self.config)
                    else:
                        one_hot_features_m, one_hot_models, mlbs_m, mul_features = onehot_feature_selection_m(
                            X_tmp,
                            y_train_sample,
                            mul_features,
                            config=self.config,
                            is_first_iter=first_iter_flag,
                            feature_num_everyiter=len(mul_features),
                            selection=selection)

                    if self.config.time_left() < 50:
                        num_trials = num_trials - 1
                        break

                    if one_hot_features_m is not None:
                        one_hot_features_m = csr_matrix(one_hot_features_m,
                                                        dtype=np.float32)
                        one_hot_features_m = one_hot_features_m[
                            0:y_train_sample.shape[0], :]

                        print("mul_features shape sparse:",
                              one_hot_features_m.shape)

            ###--------------------data concat--------------------###

            X_train_sample = X_tmp[0:y_train_sample.shape[0]]
            X_test = X_tmp[y_train_sample.shape[0]:]
            X_train_sample = X_train_sample[main_features]
            X_train_sample = csr_matrix(X_train_sample)
            y_train_sample = y_train_sample.values

            if one_hot_features_m is not None:
                X_train_sample = hstack([X_train_sample,
                                         one_hot_features_m]).tocsr()

            #if self.config.time_left() < 60:
            #    num_trails = num_trails - 1
            #    break
            if result is None:
                #model = train_and_predict_with_time_control_basic(X_train_sample, X_test, y_train_sample, self.config, random_state=random_state)
                model = train_lightgbm(X_train_sample,
                                       y_train_sample,
                                       self.config,
                                       random_state=random_state,
                                       mode="timestamp")
                del X_train_sample
                result = lightgbm_predict_by_split(model, X_test,
                                                   main_features, mlbs_m,
                                                   mul_features, self.config)

                whole_process_time = time.time() - main_body_start_time
                print("Time for the whole process time is: %.4f" %
                      whole_process_time)
                if result is None:
                    mul_features = list()

            else:
                #model= train_and_predict_with_time_control_basic(X_train_sample, X_test, y_train_sample, self.config, random_state=random_state)
                model = train_lightgbm(X_train_sample,
                                       y_train_sample,
                                       self.config,
                                       random_state=random_state,
                                       mode="timestamp")
                del X_train_sample
                print("timeleft", self.config.time_left())
                if self.config.time_left() < 50 or model is None:
                    num_trials = num_trials - 1
                    break
                result_tmp = lightgbm_predict_by_split(model, X_test,
                                                       main_features, mlbs_m,
                                                       mul_features,
                                                       self.config)

                if self.config.time_left() < 50 or result_tmp is None:
                    num_trials = num_trials - 1
                    break

                result = result_tmp + result
            '''
            if i>0:
                ###-------------- get sparse training and testing matrix---------
                numeric_table = X_tmp[main_features]
                numeric_table = min_max_func(numeric_table)
                numeric_table = csr_matrix(numeric_table)
                X_train_sample = numeric_table[0:y_train_sample.shape[0],:]

                if len(cat_features) > 0 :

                    if i > 1:
                        selection = False
                        one_hot_features, _, mlbs, le_models = onehot_feature_selection(X_tmp, y_train_sample,
                                                                                        mlbs.keys(),
                                                                                        feature_num_everyiter=len(
                                                                                            mlbs.keys()),
                                                                                        selection=selection)
                    else:

                        one_hot_features, one_hot_models, mlbs, le_models = onehot_feature_selection(X_tmp, y_train_sample,
                                                                                                     cat_features,
                                                                                                     feature_num_everyiter=len(
                                                                                                         cat_features),
                                                                                                     selection=selection)


                if one_hot_features is not None:
                    X_train_sample = hstack([X_train_sample, one_hot_features]).tocsr()

                if one_hot_features_m is not None:
                    X_train_sample = hstack([X_train_sample, one_hot_features_m]).tocsr()

                train_fastfm_batch(X_train_sample, y_train_sample, self.config)

            '''
            ####---------------If there is not enough time for the whole process, skip multi-cat-------------
            if self.config.time_left() < whole_process_time:
                if skip_multi_cat:
                    print("Time is not enough even using basic features!")
                    break
                else:
                    skip_multi_cat = True

            print("Time remaining: %.4f" % self.config.time_left())
            print("-" * 50)
            if self.config.time_left() < 50:
                break

        if result is None:
            print("Time is not enough for a complete iteration!")
            result = np.zeros(X_test.shape[0])
        else:
            result = result / num_trials
            print(result)

        ###-------------Ensemble-------------------------
        flag = False
        if flag:

            ###--------------Multi-cat features processing-----------
            if len(multi_cat_features) > 0:
                X_multi_cat_sparse = None
                for c in multi_cat_features:
                    sparse_out = get_tfidf_vector(X_multi_cat[c],
                                                  max_features=100,
                                                  sparse=True)
                    if X_multi_cat_sparse is None:
                        X_multi_cat_sparse = sparse_out
                    else:
                        X_multi_cat_sparse = hstack(
                            [X_multi_cat_sparse, sparse_out]).tocsr()

                    ## A warning appears here, but it has no effect.
                    X_multi_cat.drop([c], axis=1, inplace=True)
            print("Time remaining: %.4f" % self.config.time_left())
            print("-" * 50)

            ###-------------- get sparse training and testing matrix---------
            numeric_table = X[[
                c for c in X.columns if c.startswith(CONSTANT.NUMERICAL_PREFIX)
                or c.startswith(CONSTANT.TIME_PREFIX)
            ]]
            X = X[[
                c for c in X.columns
                if not (c.startswith(CONSTANT.NUMERICAL_PREFIX)
                        or c.startswith(CONSTANT.TIME_PREFIX))
            ]]
            numeric_table = (numeric_table - numeric_table.min()) / (
                numeric_table.max() - numeric_table.min())
            enc = OneHotEncoder(sparse=True,
                                dtype=np.float32,
                                categories="auto")
            X = enc.fit_transform(X)
            X = hstack((X, numeric_table.values), dtype=np.float32).tocsr()

            del numeric_table
            del enc

            if len(multi_cat_features) > 0:
                X = hstack([X, X_multi_cat_sparse]).tocsr()
                del X_multi_cat_sparse
            print("Time remaining: %.4f" % self.config.time_left())
            print("-" * 50)

            ###--------------- train FM and merge result -------------------
            weight = 1.0
            result *= weight
            #result += (1 - weight)*train_fastfm_batch(X[train_index], X[test_index], self.y.values, self.config)

        ## Training process
        #train_with_time_control(X_train, self.y, self.config)

        ## Testing process
        #result = predict(X, self.config)

        ###-------------Train and Predict--------------------
        #result = train_and_predict(X, self.y, self.config)
        #result = train_and_predict_with_concept_drift_zhiqiang(X, self.y, self.config)

        # Cannot install autosklearn
        '''from autosklearn import classification
        import sklearn
        X, X_test, y, y_test = sklearn.model_selection.train_test_split(X, self.y, test_size=0.2, random_state=1)
        automl = classification.AutoSklearnClassifier()
        automl.fit(X, y)
        pred = automl.predict(X_test)
        score = roc_auc_score(y_test.values, pred)
        print("Test auc on hold-out dataset: %.4f"%score)
        result=None'''

        if self.test_mode:
            score = roc_auc_score(y_test.values, result)
            print("Test auc on hold-out dataset: %.4f" % score)
            result = None

        return pd.Series(result)
Example no. 11
0
    def predict(self, X_test, time_remain):
        timer = Timer()
        timer.set(time_remain)
        with timer.time_limit('PreProcess'):
            # fetch information of test dataset
            self.config[TEST_DATA_LENGTH] = len(X_test)
            self.config['test_time'] = self._fectch_time_range(X_test)
            self.config[STAGE] = 'test'

            Xs = self.tables
            main_table = pd.concat([Xs[MAIN_TABLE_NAME], X_test],
                                   axis=0,
                                   copy=False)
            main_table.reset_index(drop=True, inplace=True)

            del Xs[MAIN_TABLE_NAME]
            Xs[MAIN_TABLE_NAME] = main_table

            pre_process(Xs, self.config)
            clean_tables(Xs)
            pre_feature_extract(Xs)
            pre_tables_memory_cut(Xs)

            X = merge_table(Xs, self.config)
            # free intermediate data
            del self.tables, Xs
            gc.collect()

            self.null_count_sum(X, self.config)
            clean_df(X, fill_time=True)
            # compress data to reduce memory pressure
            X = table_memory_cut(X)

            # feature engineering
            print('overall X size', X.shape)
            X, add_feature = feature_engineer(X, self.config)

            # memory problem: ~11 GB
            X = table_memory_cut(X)
            add_feature = table_memory_cut(add_feature)
            X = pd.concat([X, add_feature], axis=1, copy=False)
            del add_feature
            print(X.shape)
            # re-compress data

            # test set split
            X_train_val = X.iloc[:self.config[TRAIN_DATA_LENGTH]]
            y_train_val = self.train_label
            X_test = X.iloc[self.config[TRAIN_DATA_LENGTH]:]
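
            # Rows are ordered train-then-test, so slicing at TRAIN_DATA_LENGTH
            # splits them; the last 20% of train becomes the validation set.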

            train_len = int(self.config[TRAIN_DATA_LENGTH] * 0.8)
            valid_len = self.config[TRAIN_DATA_LENGTH] - train_len
            self.config[TRAIN_LEN_OF_TRAIN_VAL] = train_len
            self.config[VAL_LEN_OF_TRAIN_VAL] = valid_len
            del X
            gc.collect()

            # feature processing
            all_label_count_feature_list = cat_Lable_Cnt_Fun(
                X_train_val, y_train_val, X_test, self.config)
            all_mutlicat_feature_data_list = Mv_Label_Cnt_Func(
                X_train_val, y_train_val, X_test, self.config)

            if (all_label_count_feature_list is None) and (
                    all_mutlicat_feature_data_list is None):
                X_train = X_train_val.iloc[:train_len]
                y_train = self.train_label[:train_len]
                X_val = X_train_val.iloc[train_len:]
                y_val = self.train_label[train_len:]
            else:
                all_feature_list = []
                if all_label_count_feature_list is not None:
                    all_feature_list += all_label_count_feature_list
                if all_mutlicat_feature_data_list is not None:
                    all_feature_list += all_mutlicat_feature_data_list

                add_feature_data = pd.concat(all_feature_list,
                                             axis=1,
                                             copy=False)
                add_feature_data.sort_index(inplace=True)

                del all_label_count_feature_list, all_mutlicat_feature_data_list, all_feature_list
                gc.collect()

                X_train = pd.concat(
                    [X_train_val[:train_len], add_feature_data[:train_len]],
                    axis=1,
                    copy=False)
                X_val = pd.concat([
                    X_train_val[train_len:self.config[TRAIN_DATA_LENGTH]],
                    add_feature_data[train_len:self.config[TRAIN_DATA_LENGTH]]
                ],
                                  axis=1,
                                  copy=False)
                y_train = self.train_label[:train_len]
                y_val = self.train_label[train_len:]

                X_test = pd.concat([
                    X_test, add_feature_data[self.config[TRAIN_DATA_LENGTH]:]
                ],
                                   axis=1,
                                   copy=False)

                del X_train_val, y_train_val, add_feature_data, self.train_label
                gc.collect()

        train_columns = train(X_train, X_val, y_train, y_val, self.config,
                              timer.remain)
        del X_train, X_val, y_train, y_val
        gc.collect()

        result = predict(X_test[train_columns], self.config)

        return pd.Series(result)
Example no. 12
0
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]

        main_table['y_sorted'] = self.y
        main_table.sort_values(self.ts_col, inplace=True)
        #y_trn = main_table.y_sorted.copy()
        #main_table.drop('y_sorted', axis=1, inplace=True)

        #main_table['data_type'] = 'train'
        #X_test['data_type'] = 'test'
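        # -1 acts as a sentinel label for the test rows so train and test can be
        # separated again after joint feature engineering (labels assumed >= 0).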
        X_test['y_sorted'] = -1
        main_table = pd.concat([main_table, X_test],
                               ignore_index=True).reset_index()

        del X_test
        gc.collect()

        # main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        # main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

        Xs[MAIN_TABLE_NAME] = main_table
        log('memory usage of main_table: {:.2f}MB'.format(
            df_memory_usage(main_table) // 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)

        del Xs, main_table
        gc.collect()

        log('memory usage of X: {:.2f}MB'.format(df_memory_usage(X) // 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

        self.cat_cols = sorted(
            [c for c in X.columns if c.startswith(CATEGORY_PREFIX)])
        self.mcat_cols = sorted(
            [c for c in X.columns if c.startswith(MULTI_CAT_PREFIX)])
        self.num_cols = sorted(
            [c for c in X.columns if c.startswith(NUMERICAL_PREFIX)])
        self.ts_cols = sorted(
            [c for c in X.columns if c.startswith(TIME_PREFIX)])

        X = self.feature_engineer(X, train=True)

        # X_trn = X[X.index.str.startswith("train")]
        # X_trn.index = X_trn.index.map(lambda x: int(x.split('_')[1]))
        X_trn = X[X['y_sorted'] != -1]
        y_trn = X_trn.y_sorted.copy()
        X_trn = X_trn.drop('y_sorted', axis=1)

        # X_tst = X[X.index.str.startswith("test")]
        # X_tst.index = X_tst.index.map(lambda x: int(x.split('_')[1]))
        X_tst = X[X['y_sorted'] == -1]
        X_tst = X_tst.drop('y_sorted', axis=1)

        X_tst.sort_index(inplace=True)

        del X
        gc.collect()

        log('memory usage of X_trn: {:.2f}MB'.format(
            df_memory_usage(X_trn) // 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))

        train(X_trn, y_trn, self.config)
        del X_trn, y_trn
        gc.collect()

        log('memory usage of X_tst: {:.2f}MB'.format(
            df_memory_usage(X_tst) // 1e6))
        log('memory usage of process: {:.2f}MB'.format(get_process_memory()))
        result = predict(X_tst, self.config)
        del X_tst
        gc.collect()

        return pd.Series(result)
Example no. 13
0
    def predict(self, X_test, time_remain):

        time_1 = time.time()

        Xs = self.tables
        main_table_tmp = Xs[MAIN_TABLE_NAME]
        main_table = pd.concat([main_table_tmp, X_test],
                               keys=['train', 'test'])
        # main_table = pd.concat([X_test], keys=['test'])

        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        del main_table_tmp
        del X_test
        gc.collect()

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)

        del Xs
        gc.collect()

        ##############################################################################
        ##############################################################################

        print(
            "########################################################################\n"
            "#              select feature                                          #\n"
            "########################################################################\n"
        )

        X_to_select = X[X.index.str.startswith("train")]

        big_df_memory = X_to_select.memory_usage().sum()
        big_df_len = X_to_select.shape[0]

        sample_num = int(len(self.y) / 10)

        part_X, part_y = data_sample_new(X_to_select, self.y, sample_num)

        del X_to_select
        # del y
        gc.collect()

        part_X = part_X.reset_index(drop=True)
        part_y = part_y.reset_index(drop=True)
        tmp_part_X, \
        self.two_order_cols, \
        self.two_group_cols, \
        self.mv_encs, \
        self.c_one_order_cols, \
        self.c_two_order_cols, \
        self.c_two_order_group_cnt_cols, \
        self.c_two_order_n_groupby_cat_cols, \
        self.n_minus_mean_cols, \
        max_numb_cols_to_select, \
        fe_model \
            = feature_engineer(part_X, self.config, part_y,
                               two_order_cols=self.two_order_cols,
                               two_group_cols=self.two_group_cols,
                               mv_encs=self.mv_encs,
                               c_one_order_cols=self.c_one_order_cols,
                               c_two_order_cols=self.c_two_order_cols,
                               c_two_order_group_cnt_cols=self.c_two_order_group_cnt_cols,
                               c_two_order_n_groupby_cat_cols=self.c_two_order_n_groupby_cat_cols,
                               n_minus_mean_cols=self.n_minus_mean_cols,
                               cols_selected=self.cols_selected,
                               big_df_memory=big_df_memory,
                               big_df_len=big_df_len,
                               fe_model=None
                               )

        tmp_part_X_d, self.cols_selected = feature_selector(
            tmp_part_X,
            part_y,
            max_numb_cols_to_select=max_numb_cols_to_select)

        # print("#" * 50)
        # print(part_X.memory_usage())
        #
        # print(tmp_part_X_d.memory_usage())
        #
        # part_X_mem_use_b = part_X.memory_usage().sum()
        # tmp_part_X_mem_use_b = tmp_part_X.memory_usage().sum()
        # tmp_part_X_d_mem_use_b = tmp_part_X_d.memory_usage().sum()
        # print(part_X_mem_use_b)
        # print(tmp_part_X_d_mem_use_b)
        # print(tmp_part_X_d_mem_use_b / part_X_mem_use_b)
        # print(tmp_part_X_mem_use_b / part_X_mem_use_b)
        #
        # part_X_mem_use_g = part_X.memory_usage().sum() / (1024 ** 3)
        # tmp_part_X__d_mem_use_g = tmp_part_X_d.memory_usage().sum() / (1024 ** 3)
        # print(part_X_mem_use_g)
        # print(tmp_part_X__d_mem_use_g)
        # print(tmp_part_X__d_mem_use_g / part_X_mem_use_g)
        # print("#" * 50)

        self.mv_encs = None

        del tmp_part_X
        del tmp_part_X_d
        del part_X
        del part_y

        gc.collect()

        print(
            "########################################################################\n"
            "#      after feature selection, use all of the data to train          #\n"
            "########################################################################\n"
        )
        ##############################################################################
        ##############################################################################


        X, \
        self.two_order_cols, \
        self.two_group_cols, \
        self.mv_encs, \
        self.c_one_order_cols, \
        self.c_two_order_cols, \
        self.c_two_order_group_cnt_cols, \
        self.c_two_order_n_groupby_cat_cols, \
        self.n_minus_mean_cols, \
        max_numb_cols_to_select, \
        fe_model \
            = feature_engineer(X, self.config,
                               two_order_cols=self.two_order_cols,
                               two_group_cols=self.two_group_cols,
                               mv_encs=self.mv_encs,
                               c_one_order_cols=self.c_one_order_cols,
                               c_two_order_cols=self.c_two_order_cols,
                               c_two_order_group_cnt_cols=self.c_two_order_group_cnt_cols,
                               c_two_order_n_groupby_cat_cols=self.c_two_order_n_groupby_cat_cols,
                               n_minus_mean_cols=self.n_minus_mean_cols,
                               cols_selected=self.cols_selected,
                               fe_model=fe_model
                               )

        X = X[self.cols_selected]

        print(X.columns.tolist())
        print(self.cols_selected)

        X_train = X[X.index.str.startswith("train")]
        X = X[X.index.str.startswith("test")]

        gc.collect()

        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        gc.collect()

        time_2 = time.time()

        time_left_to_train = time_remain - (time_2 - time_1)
        tmp_time = time_left_to_train
        run_flag = True
        a_time = 0
        train_count = 0

        train_num = 0
        run_num = 1
        # while run_flag:
        change_flag = True
        print(tmp_time)
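        # Adaptive ensemble schedule: time one sample+train round, then choose
        # how many more rounds fit (25 if ample, 3 typical, 2 tight, 0 to stop).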
        while run_num > 0:

            for i in range(1):
                t_1 = time.time()
                part_X, part_y = data_sample_for_train(X_train, self.y)
                print("*" * 10)
                print(len(part_y))
                print("*" * 10)
                train(part_X, part_y, self.config)
                t_2 = time.time()

                a_time = t_2 - t_1

                time_left_to_train = time_left_to_train - a_time
                print('a_time: ', a_time, 'time_left_to_train: ',
                      time_left_to_train)

                if tmp_time / a_time > 60:
                    if change_flag:
                        run_num = 25
                        print("###25###")
                elif tmp_time / a_time < 5 and tmp_time > 3 * a_time:
                    if change_flag:
                        run_num = 2
                        print("###2###")

                elif time_left_to_train <= 3 * a_time:
                    run_num = 0
                    print("###stop###")

                elif time_left_to_train < 50:
                    run_num = 0
                    print("###stop###")

                else:
                    if change_flag:
                        run_num = 3
                        print("###3###")

                change_flag = False
                run_num = run_num - 1

                # if a_time * 5 + 30 >= time_left_to_train:
                #     run_flag = False
                # train_count = train_count + 1
                # if train_count > 25:
                #     run_flag = False
                # if train_count < 4:
                #     run_flag = True
                # if time_left_to_train / a_time < 3:
                #     run_flag = False

        # train(X_train, self.y, self.config)

        gc.collect()

        del X_train
        gc.collect()

        # X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)

        gc.collect()

        result = predict(X, self.config)

        return pd.Series(result)
Example no. 14
0
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_time_index = main_table[["t_01"]].sort_values("t_01")
        # catLabel_dict = process_cat_label(main_table, self.lables.loc[main_table.index]) # modified By 05.30
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table
        clean_tables(Xs, self.config, fill=True)
        main_table = Xs[MAIN_TABLE_NAME]

        main_cat_cols = [
            col for col in main_table.columns
            if (col.startswith("c_") or col.startswith("m_"))
            and len(main_table[col].unique()) > 1
        ]
        total_num_fea = 0
        catFea_dict, total_num_fea = process_main_cat(
            main_table, main_cat_cols, total_num_fea)  # derive extra categorical features for main from the main table itself
        print("total_num Fea:", total_num_fea)
        catFea_dicts = []
        relation_catFea_dicts = []
        relation_time_dicts = []
        relation_catFea_dicts2 = []
        if total_num_fea < 150:  # the main table's derived features are still too few; more can be added
            for relation in self.config['relations']:
                tableA = relation["table_A"]
                l_type = relation["type"].split("_")[0]
                tableB = relation["table_B"]
                r_type = relation["type"].split("_")[2]
                key = relation["key"][0]
                if tableA == "main" and l_type == "many" and r_type == "one":  #and "t_01" not in Xs[tableB].columns:  # 这里比较定制,后期需要改
                    '''
                    temp_main_cat = main_table[main_cat_cols]
                    relation_num_cols = [col for col in Xs[tableB].columns if col.startswith("n_")]
                    temp_tableB_num = Xs[tableB][[key]+relation_num_cols]
                    temp_tableB_num = temp_tableB_num.set_index(key)
                    temp_main_cat = temp_main_cat.join(temp_tableB_num, on=key)
                    temp_dict, total_num_fea = process_main_cat_v2(temp_main_cat, main_cat_cols, key, tableB, total_num_fea) # main's categoricals, relation's numericals
                    catFea_dicts.append(temp_dict)
                    if total_num_fea > 150: break
                    '''
                    Xs[tableB].drop_duplicates([key], inplace=True)
                    relation_cat_cols = [
                        col for col in Xs[tableB].columns
                        if (col.startswith("c_") or col.startswith("m_"))
                        and len(Xs[tableB][col].unique()) > 1
                    ]
                    temp_tableB_cat = Xs[tableB][relation_cat_cols]
                    if key in main_table and key in temp_tableB_cat:
                        temp_main_num = main_table[[key]]
                        temp_tableB_cat = temp_tableB_cat.set_index(key)
                        temp_main_num = temp_main_num.join(temp_tableB_cat,
                                                           on=key)
                        relation_temp_dict, total_num_fea = process_relation_cat(
                            temp_main_num, relation_cat_cols, key, tableB,
                            total_num_fea)  # relation's categoricals, main's numericals
                        #relation_catFea_dicts.append(relation_temp_dict)
                        relation_catFea_dicts = relation_catFea_dicts + relation_temp_dict
                        # if total_num_fea > 150: break
                        '''
                        temp_tableB_cat = Xs[tableB][relation_cat_cols]
                        relation_temp_dict2, total_num_fea = process_relation_cat_v2(temp_tableB_cat, relation_cat_cols, key,
                                                                                 tableB,
                                                                                 total_num_fea)
                        relation_catFea_dicts2.append(relation_temp_dict2)
                        '''

                    relation_time_cols = [
                        col for col in Xs[tableB].columns
                        if col.startswith("t_")
                    ]
                    if len(relation_time_cols) > 0:
                        if key in Xs[
                                tableB] and key in main_table and "t_01" in main_table:
                            temp_tableB_time = Xs[tableB][[key] +
                                                          relation_time_cols]
                            temp_tableB_time.columns = [
                                col + "_in_" +
                                tableB if col.startswith("t_") else col
                                for col in temp_tableB_time.columns
                            ]
                            temp_main_time = main_table[[key] + ["t_01"]]
                            temp_tableB_time = temp_tableB_time.set_index(key)
                            temp_main_time = temp_main_time.join(
                                temp_tableB_time, on=key)
                            temp_main_time.drop(key, axis=1, inplace=True)
                            #print("time_test v1")
                            #print(temp_main_time.head())
                            temp_main_time = process_relation_time(
                                temp_main_time)
                            relation_time_dicts.append(temp_main_time)
                    '''
                    temp_tableB = Xs[tableB].set_index(key)
                    temp_main_key = main_table[[key]]
                    temp_main_key = temp_main_key.join(temp_tableB, on=key)
                    relation_temp_dict2, total_num_fea = process_relation_cat_v2(temp_main_key, relation_cat_cols, key,
                                                                                 tableB, total_num_fea)
                    del temp_main_key
                    del temp_tableB
                    relation_catFea_dicts2.append(relation_temp_dict2)
                    if total_num_fea > 150: break
                    '''
        '''
        #if len(relation_time_dicts) > 0:
        main_time_col=[col for col in main_table.columns if col.startswith("t_")]
        temp_main_time = main_table[main_time_col]
        for col in main_time_col:
            temp_main_time["n_weekday_" + col], temp_main_time["n_hour_" + col], temp_main_time["n_day_" + col]=zip(*temp_main_time[col].map(trans2basicInfo))
            # temp_main_time["n_weekday_" + col] = temp_main_time[col].apply(trans2weekday)
            # temp_main_time["n_hour_" + col] = temp_main_time[col].apply(trans2hour)
            # temp_main_time["n_day_" + col] = temp_main_time[col].apply(trans2day)
            if not col.startswith("t_0"):
                temp_main_time["n_interval_" + col] = (temp_main_time[col] - temp_main_time["t_01"]).map(trans2interval)
        temp_main_time.drop(main_time_col, axis=1, inplace=True)
        relation_time_dicts.append(temp_main_time)
        print("Processing Trans to main time")
        '''

        # Xs[MAIN_TABLE_NAME] = main_table
        # clean_tables(Xs, self.config, fill=True)
        merge_table_v2(Xs, self.config)
        #clean_tables(Xs)
        X = FT_process(Xs, self.config)
        del Xs
        del self.tables
        del main_table
        #print(X.shape)
        '''
        for catLabel in catLabel_dict:
            # print(catLabel_dict[catLabel].head())
            if catLabel in X.columns:
                X = X.join(catLabel_dict[catLabel], on=catLabel)
        '''
        t1 = time.time()
        useful_catFea = [
            catFea_dict[catFea] for catFea in catFea_dict
            if catFea in X.columns
        ]
        X = pd.concat([X] + useful_catFea, axis=1)
        print("processing process_main_cat")
        '''
        for catFea in catFea_dict:
            if catFea in X.columns:
                #print(catFea_dict[catFea].head())
                X = X.join(catFea_dict[catFea], on=catFea)
                print("processing process_main_cat")
            #print(X.head())
        '''
        del catFea_dict
        '''
        for catFea_dict2 in catFea_dicts:
            for catFea in catFea_dict2:
                if catFea in X.columns:
                    #print(catFea_dict2[catFea].head())
                    X = X.join(catFea_dict2[catFea], on=catFea)
                    print("processing process_main_cat_v2")
                    #print(X.head())
        del catFea_dicts
        '''
        '''
        for relation_catFea_dict in relation_catFea_dicts:
            for relation_catFea in relation_catFea_dict:
                #print(relation_catFea_dict[relation_catFea].head())
                if relation_catFea in X.columns:
                    z=yield(relation_catFea_dict[relation_catFea])
                    # X = X.join(relation_catFea_dict[relation_catFea], on=relation_catFea)
                    print("processing process_relation_cat")
                    #print(X.head())
        '''
        X = pd.concat([X] + relation_catFea_dicts, axis=1)
        del relation_catFea_dicts

        if len(relation_time_dicts) > 0:
            X = pd.concat([X] + relation_time_dicts, axis=1)
            print("processing process_relation_time")
            #print(X.shape)
            #print(X.head())
            del relation_time_dicts
        '''
        for relation_catFea_dict2 in relation_catFea_dicts2:
            for relation_catFea in relation_catFea_dict2:
                #print(relation_catFea_dict2[relation_catFea].head())
                if relation_catFea in X.columns:
                    X = X.join(relation_catFea_dict2[relation_catFea], on=relation_catFea)
                    print("processing process_relation_cat_v2")
                    #print(X.head())
        del relation_catFea_dicts2
        '''
        t2 = time.time()
        print("cat join cost time: ", t2 - t1)
        #print(X.head())
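        # Restore the type prefixes that merging buried behind table
        # qualifiers: a merged column such as "table.m_col" gets its "m_" or
        # "c_" prefix back unless it is an aggregate (MEAN/SUM/COUNT/
        # N_UNIQUE/N_TIME), and any column still lacking a prefix is treated
        # as numeric and prefixed with "n_".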
        X.columns = [
            "m_" + c if (".m_" in c) and ("MEAN" not in c) and
            ("SUM" not in c) and ("COUNT" not in c) and
            ("N_UNIQUE" not in c) and ("N_TIME" not in c) else c
            for c in X.columns
        ]
        X.columns = [
            "c_" + c if (".c_" in c) and ("MEAN" not in c) and
            ("SUM" not in c) and ("COUNT" not in c) and
            ("N_UNIQUE" not in c) and ("N_TIME" not in c) else c
            for c in X.columns
        ]
        X.columns = [
            "n_" + c if not c.startswith("n_") and not c.startswith("m_")
            and not c.startswith("c_") and not c.startswith("t_") else c
            for c in X.columns
        ]
        #print(X.columns)
        print("Column Number:", len(X.columns))

        clean_df(X, "no_table", self.config)
        feature_engineer(X, self.config, len(X.columns), self.lables)

        X_train = X[X.index.str.startswith("train")]
        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        #train(X_train, self.lables.loc[X_train.index], self.config)
        train(X_train.loc[main_time_index.index],
              self.lables.loc[main_time_index.index], self.config)  # rows sorted by time
        del main_time_index

        X = X[X.index.str.startswith("test")]
        X.index = X.index.map(lambda x: int(x.split('_')[1]))
        X.sort_index(inplace=True)
        result = predict(X, self.config)

        return pd.Series(result)
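
Every predict() in this listing leans on the same indexing trick: train and test rows are concatenated with keys=['train', 'test'], the MultiIndex is flattened to string labels ("train_0", "test_0", ...), features are engineered once over the joint frame, and the two partitions are recovered with index.str.startswith. A minimal, self-contained sketch of that round trip (column names are illustrative):

import pandas as pd

train_df = pd.DataFrame({"n_x": [1.0, 2.0]})   # hypothetical numeric feature
test_df = pd.DataFrame({"n_x": [3.0]})

# Concatenate with keys, then flatten ('train', 0) -> "train_0".
both = pd.concat([train_df, test_df], keys=["train", "test"])
both.index = both.index.map(lambda x: f"{x[0]}_{x[1]}")

both["n_x2"] = both["n_x"] ** 2                # any joint feature engineering

# Split back and restore the original integer row indices.
X_train = both[both.index.str.startswith("train")]
X_train.index = X_train.index.map(lambda x: int(x.split("_")[1]))
X_train.sort_index(inplace=True)

X_test = both[both.index.str.startswith("test")]
X_test.index = X_test.index.map(lambda x: int(x.split("_")[1]))
X_test.sort_index(inplace=True)
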
Example n. 15
    def predict(self, X_test, time_remain):
        self.Time_data_info['time_ramain_so_far'] = time_remain

        start_feature = time.time()

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]

        #index = main_table.sort_values(by=self.config['time_col']).index
        #split = int(0.6*len(index))
        #train_index, test_index = index[:split], index[split:]

        #log(f"Merge train and test tables...")
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        log(f"Feature engineering...")
        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        X = clean_df(X)
        X = feature_engineer(X, self.config)

        X_train = X[X.index.str.startswith("train")]
        X_train.index = X_train.index.map(lambda x: int(x.split('_')[1]))
        X_train.sort_index(inplace=True)
        y_train = self.targets

        end_feature = time.time()

        self.Time_data_info['time_for_feature_engineering'] = end_feature - start_feature

        self.Time_data_info['time_ramain_so_far'] -= self.Time_data_info['time_for_feature_engineering']

        #self.Time_data_info['data_cols_for_hp'] = X.shape[1]
        #self.Time_data_info['data_rows_for_hp'] = X.shape[0]
        print(f"TIME info:", self.Time_data_info)

        # train model
        log(f"Training...")
        train_start = time.time()
        #train(X_train.iloc[train_index], y_train.iloc[train_index], self.config)

        timetrain(X_train, y_train, self.config, self.Time_data_info)

        #train with time limitation
        #timetrain(X_train.iloc[train_index], y_train.iloc[train_index], self.config, self.Time_data_info)

        train_end = time.time()

        self.Time_data_info['time_ramain_so_far'] -= train_end - train_start
        self.Time_data_info['time_for_model_train'] = train_end - train_start

        print("TIME info:", self.Time_data_info)

        #r = predict(X_train.iloc[test_index], self.config)
        #r = timepredict(X_train.iloc[test_index], self.config)

        #print('Test auc: ', roc_auc_score(y_train.iloc[test_index], r))

        #importance = self.config["model"].feature_importance(importance_type='split')
        #feature_name = np.array(self.config["model"].feature_name())
        #feature_importance = pd.DataFrame({'feature_importance': feature_name[np.argsort(-importance)], 'importnace':-np.sort(-importance)})
        #feature_importance.to_csv('feature_importance.csv', index=False)

        # predict
        log(f"Predicting...")
        X_test = X[X.index.str.startswith("test")]
        X_test.index = X_test.index.map(lambda x: int(x.split('_')[1]))
        X_test.sort_index(inplace=True)
        result = predict(X_test, self.config)

        return pd.Series(result)
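
The Time_data_info bookkeeping in Example n. 15 is plain wall-clock accounting: each phase is timed with time.time() and its duration is subtracted from the remaining budget so that timetrain can adapt to whatever time is left. A stripped-down sketch of the same accounting, keeping the example's own key names (including the "time_ramain_so_far" spelling); run_phase is a hypothetical helper, not part of the repo:

import time

def run_phase(budget, phase_name, fn, *args):
    """Run fn, record its duration under phase_name, shrink the budget."""
    start = time.time()
    out = fn(*args)
    elapsed = time.time() - start
    budget['time_for_' + phase_name] = elapsed
    budget['time_ramain_so_far'] -= elapsed
    return out

budget = {'time_ramain_so_far': 600.0}  # e.g. 10 minutes left
X = run_phase(budget, 'feature_engineering', lambda: "features")  # placeholder work
print("TIME info:", budget)
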
Example n. 16
    def predict(self, X_test, time_remain):

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        if int(self.config["time_budget"]) > 2000:
            from data_sample import data_sample
            main_table, self.y = data_sample(main_table, self.y, ratio=1)
            # main_table = Xs[MAIN_TABLE_NAME].iloc[-1000000:]
            # self.y = self.y.iloc[-1000000:]

        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        clean_tables(Xs)

        X = merge_table(Xs, self.config)

        clean_df(X)
        feature_engineer(X, self.config)

        ###-------------------- cat feature -----------------------###
        cat_features = [
            col for col in X.columns
            if "ROLLING" not in col and "c_" in col
        ]

        X, _ = cat_value_counts(X, cat_features)
        ###--------------------------------------------------------###

        ###------------------- data sample ------------------###

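        # Budget-tiered sampling: the tightest budgets (<= 300s) call
        # data_sample with flag=True (presumably a more aggressive mode),
        # mid-range budgets (< 2000s) use the default sampling, and larger
        # budgets were already downsampled above with ratio=1.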
        if int(self.config["time_budget"]) <= 300:

            X_train = X[X.index.str.startswith("train")]
            X_test = X[X.index.str.startswith("test")]
            from data_sample import data_sample
            X_train, self.y = data_sample(X_train, self.y, flag=True)

            X = pd.concat([X_train, X_test], keys=['train', 'test'])
        elif int(self.config["time_budget"]) < 2000:
            X_train = X[X.index.str.startswith("train")]
            X_test = X[X.index.str.startswith("test")]
            from data_sample import data_sample
            X_train, self.y = data_sample(X_train, self.y)

            X = pd.concat([X_train, X_test], keys=['train', 'test'])

        #X.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")

        ###------------------- mul onehot feature -----------------###
        m_features = [
            col for col in X.columns
            if ("ROLLING" not in col) and ("mul_feature_" in col)
        ]

        # if len(self.mlbs)>0 or  self.mlbs is not None:
        #     m_features = list(self.mlbs.keys())
        # else:
        #     m_features = []

        one_hot_features = None
        one_hot_models = None
        mlbs = None

        one_hot_features_m = None

        from feature_expansion import onehot_feature_selection_m

        if len(m_features) > 0 and int(self.config["time_budget"]) > 100:
            one_hot_features_m, one_hot_models, mlbs = onehot_feature_selection_m(
                X,
                self.y,
                m_features,
                feature_num_everyiter=len(m_features),
                selection=True)
            X.drop(m_features, inplace=True, axis=1)

        elif len(m_features) > 0:
            X.drop(m_features, inplace=True, axis=1)

        ###-------------------------------------------------###

        ###------------------- onehot encoder ------------------###

        from feature_expansion import onehot_feature_selection
        one_hot_features = None
        if len(cat_features) > 0 and int(self.config["time_budget"]) > 4000:
            one_hot_features, one_hot_models, mlbs = onehot_feature_selection(
                X,
                self.y,
                cat_features,
                feature_num_everyiter=len(cat_features),
                selection=True)
            for cat_col in cat_features:
                if cat_col not in mlbs:
                    X.drop(cat_col, inplace=True, axis=1)

        ###-----------------------concat--------------------###

        from scipy.sparse import hstack, csr_matrix
        X = csr_matrix(X)
        if one_hot_features is not None:
            X = hstack([X, one_hot_features]).tocsr()

        if one_hot_features_m is not None:
            X = hstack([X, one_hot_features_m]).tocsr()

        ###-------------------------------------------------###

        # ###------------------drop mul_feature---------------###
        # m_features = []
        # for feature in X.columns:
        #     if "mul_feature_" in feature:
        #         m_features.append(feature)
        #
        # X.drop(m_features,inplace=True,axis=1)
        # ###-------------------------------------------------###

        X_train = X[0:self.y.shape[0]]
        X = X[self.y.shape[0]:]

        result = None

        if int(self.config["time_budget"]) < 2000 and int(
                self.config["time_budget"]) > 300:
            for i in range(0, 3):
                train(X_train, self.y, self.config)
                tmp = predict(X, self.config)
                if result is None:
                    result = tmp
                    continue
                else:
                    result = result + tmp

            result = result / float(3)
        else:
            train(X_train, self.y, self.config)
            result = predict(X, self.config)

        return pd.Series(result)
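
For mid-range budgets, Example n. 16 trains three times and averages the three prediction vectors, a cheap variance-reduction ensemble that only helps when training is non-deterministic (random seeds, subsampling). The pattern in isolation, with hypothetical train_once/predict_with stand-ins for the repo's train/predict:

import numpy as np

rng = np.random.default_rng(0)

def train_once():
    # Stand-in for a stochastic training run; returns a toy "model" (a scale).
    return rng.normal(loc=1.0, scale=0.1)

def predict_with(model, n=5):
    return np.full(n, 0.5) * model

n_runs = 3
result = None
for _ in range(n_runs):
    tmp = predict_with(train_once())
    result = tmp if result is None else result + tmp
result = result / float(n_runs)   # averaged ensemble prediction
print(result)
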
Example n. 17
    def predict(self, X_test, time_remain):

        ### calculate time range
        '''Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        print(main_table.columns)
        input()
        min_train_time = np.min(main_table[[c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
        max_train_time = np.max(main_table[[c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
        min_test_time = np.min(X_test[[c for c in X_test.columns if c.startswith(CONSTANT.TIME_PREFIX)]])
        max_test_time = np.max(X_test[[c for c in X_test.columns if c.startswith(CONSTANT.TIME_PREFIX)]])

        print("minimum time in training dataset %s"%str(min_train_time))
        print("maximum time in training dataset %s"%str(max_train_time))
        print("minimum time in testing dataset %s"%str(min_test_time))
        print("maximum time in testing dataset %s"%str(max_test_time))
        return None'''

        ### test concept drift
        '''Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        #main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        #main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        main_table = pd.concat([main_table, self.y], axis=1)
        time_feature = [c for c in main_table.columns if c.startswith(CONSTANT.TIME_PREFIX)]
        main_table = main_table.sort_values(time_feature)
        number_test = int(main_table.shape[0]*0.2)
        X_test = main_table.tail(number_test)
        X_test.index = range(X_test.shape[0])
        main_table = main_table.head(main_table.shape[0] - number_test)
        main_table.index = range(main_table.shape[0])


        min_train_time = np.min(main_table[time_feature])
        max_train_time = np.max(main_table[time_feature])
        min_test_time = np.min(X_test[time_feature])
        max_test_time = np.max(X_test[time_feature])

        print("minimum time in training dataset %s"%str(min_train_time))
        print("maximum time in training dataset %s"%str(max_train_time))
        print("minimum time in testing dataset %s"%str(min_test_time))
        print("maximum time in testing dataset %s"%str(max_test_time))

        y_test = X_test[X_test.columns[-1]]
        X_test = X_test[X_test.columns[0:-1]]
        y_train = main_table[main_table.columns[-1]]
        main_table = main_table[main_table.columns[0:-1]]

        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)
        feature_engineer(X, self.config)

        cat_features = []
        for col in X.columns:
            if "c_" in col and "ROLLING" not in col and "cnt" not in col:
                cat_features.append(col)
        X, _ = cat_value_counts(X, cat_features)

        X_train = X[X.index.str.startswith("train")]
        X_test = X[X.index.str.startswith("test")]

        train(X_train, y_train, self.config)
        result = predict(X_test, self.config)

        fpr, tpr, thresholds=metrics.roc_curve(y_test.values, result, pos_label=1)
        print("test auc is %.4f"%(metrics.auc(fpr, tpr)))
        return None'''

        Xs = self.tables
        main_table = Xs[MAIN_TABLE_NAME]
        main_table = pd.concat([main_table, X_test], keys=['train', 'test'])
        main_table.index = main_table.index.map(lambda x: f"{x[0]}_{x[1]}")
        Xs[MAIN_TABLE_NAME] = main_table

        clean_tables(Xs)
        X = merge_table(Xs, self.config)
        clean_df(X)
        feature_engineer(X, self.config)

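        # Drop near-constant ("trivial") columns: a feature whose max-min
        # range falls below the threshold carries almost no information.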
        diff = X.max() - X.min()
        threshold = 1e-6
        X = X[X.columns[diff > threshold]]
        print("There are %d columns of trivial features" %
              (diff.shape[0] - X.shape[1]))
        '''cat_features = []

        for col in X.columns:
            if "c_" in col and "ROLLING" not in col and "cnt" not in col:
                cat_features.append(col)'''

        #X, _ = cat_value_counts(X, cat_features)

        #X = pd.get_dummies(X, columns = X.columns, sparse=True)
        #cumulative_shift, X = oneHotEncoding(X)
        #self.config["cumulative_shift"] = cumulative_shift

        X_train, X, one_hot_features, all_features = oneHotEncodingCSRMatrix(X)
        #cumulative_shift = X.shape[1]
        self.config["cumulative_shift"] = all_features
        y = self.y.values
        result = None

        #X_train = X[X.index.str.startswith("train")]
        #train(X_train, y, self.config)

        #X = X[X.index.str.startswith("test")]
        #X.index = X.index.map(lambda x: int(x.split('_')[1]))
        #X.sort_index(inplace=True)
        #result = predict(X, self.config)

        #result = train_fm_keras(X_train, X, y, self.config, one_hot_features)
        #input()
        result = train_fm_keras_batch(X_train, X, y, self.config,
                                      one_hot_features)

        #result = train_and_predict(X_train, y, X, self.config, one_hot_features)
        '''tf.reset_default_graph()
        from tensorflow.python.summary.writer import writer_cache
        #print(writer_cache.FileWriterCache.get('./models/eval'))
        writer_cache.FileWriterCache.clear()

        input()

        os.system("rm -r ./models/*")'''
        '''os.system("rm -r ./models/model.*")
        os.system("rm -r ./models/check*")
        os.system("rm -r ./models/graph.*")
        os.system("rm -r ./models/eval/*")'''

        return pd.Series(result)
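
Examples n. 16 and n. 17 converge on the same representation: the engineered dense columns are converted to CSR and horizontally stacked with sparse one-hot blocks, which is the wide sparse form the train_fm_keras_batch call apparently consumes. A hedged sketch of that assembly, using scikit-learn's OneHotEncoder in place of the repo's oneHotEncodingCSRMatrix helper (whose exact behavior is not shown here); all variable names are illustrative:

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder

num = np.array([[0.1], [0.7], [0.3]])           # dense numeric features
cat = np.array([["a"], ["b"], ["a"]])           # one categorical column

enc = OneHotEncoder(handle_unknown="ignore")    # emits a sparse matrix
one_hot = enc.fit_transform(cat)

X = hstack([csr_matrix(num), one_hot]).tocsr()  # wide sparse design matrix
print(X.shape)                                  # (3, 1 + number of categories)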