    def go(self):
        y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
        for obs_field in self.obs_fields:
            if obs_field not in self.dfAll.columns:
                self.logger.info("Skip %s" % obs_field)
                continue
            obs_corpus = self.dfAll[obs_field].values
            ext = self.generator(obs_corpus, None, *self.param_list)
            x = ext.transform()
            if isinstance(ext.__name__(), list):
                for i, feat_name in enumerate(ext.__name__()):
                    dim = 1
                    fname = "%s_%s_%dD" % (feat_name, obs_field, dim)
                    pkl_utils._save(
                        os.path.join(self.feat_dir,
                                     fname + config.FEAT_FILE_SUFFIX), x[:, i])
                    corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" %
                                     (fname, dim, corr))
            else:
                dim = np_utils._dim(x)
                fname = "%s_%s_%dD" % (ext.__name__(), obs_field, dim)
                pkl_utils._save(
                    os.path.join(self.feat_dir,
                                 fname + config.FEAT_FILE_SUFFIX), x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" %
                                     (fname, dim, corr))
                elif self.force_corr:
                    for j in range(dim):
                        corr = np_utils._corr(x[:TRAIN_SIZE, j], y_train)
                        self.logger.info("%s (%d/%dD): corr = %.6f" %
                                         (fname, j + 1, dim, corr))
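
The helpers np_utils._dim and np_utils._corr are not shown on this page. A minimal sketch consistent with how they are called above (the names come from the repo, but this guard behavior is an assumption, not the actual implementation):

import numpy as np
from scipy.stats import pearsonr

def _dim(x):
    # a flat vector counts as one feature; otherwise report the column count
    return 1 if x.ndim == 1 else x.shape[1]

def _corr(x, y_train):
    # Pearson correlation of one feature column against the target,
    # guarded so constant inputs yield 0.0 instead of NaN
    x = np.asarray(x, dtype=float).ravel()
    if np.std(x) == 0 or np.std(y_train) == 0:
        return 0.0
    return pearsonr(x, y_train)[0]
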
Example #2
    def go(self):
        y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
        for obs_field in self.obs_fields:
            if obs_field not in self.dfAll.columns:
                self.logger.info("Skip %s" % obs_field)
                continue
            obs_corpus = self.dfAll[obs_field].values
            for target_field in self.target_fields:
                if target_field not in self.dfAll.columns:
                    self.logger.info("Skip %s" % target_field)
                    continue
                target_corpus = self.dfAll[target_field].values
                ext = self.generator(obs_corpus, target_corpus, *self.param_list)
                x = ext.transform()
                if isinstance(ext.__name__(), list):
                    for i, feat_name in enumerate(ext.__name__()):
                        dim = 1
                        fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
                        pkl_utils._save(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x[:, i])
                        corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                        self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                else:
                    dim = np_utils._dim(x)
                    fname = "%s_%s_x_%s_%dD" % (ext.__name__(), obs_field, target_field, dim)
                    pkl_utils._save(os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
                    if dim == 1:
                        corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                        self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                    elif self.force_corr:
                        for j in range(dim):
                            corr = np_utils._corr(x[:TRAIN_SIZE, j], y_train)
                            self.logger.info("%s (%d/%dD): corr = %.6f" % (fname, j + 1, dim, corr))
Example #3
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after lemmatization
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
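
QueryQuality is defined elsewhere in the repo; from its use above it takes two processed versions of the same queries and emits a 1-D feature. A hedged sketch, with the similarity measure being an assumption rather than the repo's actual metric:

import difflib
import numpy as np

class QueryQuality:
    # sketch only; the real implementation may use a different measure
    def __init__(self, obs_corpus, target_corpus):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus

    def __name__(self):
        return "QueryQuality"

    def transform(self):
        # how similar each query stays across two preprocessing stages
        return np.asarray([
            difflib.SequenceMatcher(None, str(a), str(b)).ratio()
            for a, b in zip(self.obs_corpus, self.target_corpus)
        ])
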
Example #4
    def go(self):
        y_train = self.dfAll['relevance'].values
        for obs_field in self.obs_fields:
            obs_transformer, obs_field_transformed = make_transformer(
                self.dfAll, obs_field)
            if obs_transformer is None:
                self.logger.info("Skip %s" % (obs_field))
                continue

            for target_field in self.target_fields:
                target_transformer, target_field_transformed = make_transformer(
                    self.dfAll, target_field)
                if target_transformer is None:
                    self.logger.info("Skip %s" % (target_field))
                    continue

                deduplicator = self.make_deduplicator(
                    obs_field_transformed, target_field_transformed)
                obs_corpus, target_corpus = deduplicator.deduplicate()
                obs_trans = obs_transformer(obs_corpus)
                target_trans = target_transformer(target_corpus)
                estimator = self.generator(obs_trans, target_trans,
                                           *self.param_list)
                x = deduplicator.reduplicate(obs_corpus, target_corpus,
                                             estimator.transform())

                if isinstance(estimator.__name__(), list):
                    for i, feat_name in enumerate(estimator.__name__()):
                        self.save_feature(feat_name, obs_field, target_field,
                                          1, x[:, i], y_train)
                else:
                    dim = np_utils._dim(x)
                    self.save_feature(estimator.__name__(), obs_field,
                                      target_field, dim, x, y_train)
                # Release memory between iterations. Not sure if necessary yet,
                # but noticed some strange memory usage so trying this out
                del obs_corpus
                del obs_trans
                del target_corpus
                del target_trans
                del x
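
The deduplicator in Example #4 computes features once per unique (obs, target) pair and broadcasts them back to the full row order. The real object is built from column names via self.make_deduplicator; the sketch below takes the values directly for brevity, and its internals (the np.unique inverse-index trick) are an assumption:

import numpy as np

class PairDeduplicator:
    def __init__(self, obs_values, target_values):
        keys = ["%s\x00%s" % (a, b) for a, b in zip(obs_values, target_values)]
        _, unique_idx, self.inverse = np.unique(
            keys, return_index=True, return_inverse=True)
        self.obs_unique = np.asarray(obs_values, dtype=object)[unique_idx]
        self.target_unique = np.asarray(target_values, dtype=object)[unique_idx]

    def deduplicate(self):
        return self.obs_unique, self.target_unique

    def reduplicate(self, obs_corpus, target_corpus, features):
        # map per-unique-pair features back onto every original row
        return np.asarray(features)[self.inverse]
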
Example #5
    def go(self):
        y_train = self.dfAll["relevance"].values
        for obs_field in self.obs_fields:
            obs_transformer, obs_field_transformed = make_transformer(
                self.dfAll, obs_field)
            if obs_transformer is None:
                self.logger.info("Skip %s" % (obs_field))
                continue

            deduplicator = self.make_deduplicator(obs_field_transformed, None)
            obs_corpus, _ = deduplicator.deduplicate()
            obs_trans = obs_transformer(obs_corpus)
            estimator = self.generator(obs_trans, None, *self.param_list)
            x = deduplicator.reduplicate(obs_corpus, None,
                                         estimator.transform())

            if isinstance(estimator.__name__(), list):
                for i, feat_name in enumerate(estimator.__name__()):
                    self.save_feature(feat_name, obs_field, 1, x[:, i],
                                      y_train)
            else:
                dim = np_utils._dim(x)
                self.save_feature(estimator.__name__(), obs_field, dim, x,
                                  y_train)
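
make_transformer is also external to this page. From the call sites it returns a (callable, transformed_field_name) pair, with None signalling that the field should be skipped. A hypothetical no-op version of that contract:

def make_transformer(df, field):
    # sketch of the assumed contract, not the repo's implementation
    if field not in df.columns:
        return None, None  # caller logs "Skip ..." and moves on
    # identity transform; a real one might stem, lemmatize, or re-tokenize
    return (lambda corpus: corpus), field
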
Example #6
    def combine(self):

        dfAll = pkl_utils._load(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y_train = dfAll["relevance"].values[:TRAIN_SIZE]

        ## for basic features
        feat_cnt = 0
        self.logger.info("Run for basic...")
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                x = self.load_feature(config.FEAT_DIR, fname)
                x = np.nan_to_num(x)
                # nan_to_num above has already replaced NaNs, so this check never fires
                if np.isnan(x).any():
                    self.logger.info("%s nan"%fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                            fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d"%(fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll = pd.concat([dfAll, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cnt += 1
                self.feature_names_basic.append(fname)
                if dim == 1:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim, corr))
                else:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim))
        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        ## basic
        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
        self.y_train = dfTrain["relevance"].values.astype(float)
        dfTrain.drop(["id","relevance"], axis=1, inplace=True)
        self.X_train = dfTrain.values.astype(float)

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)
        dfTest.drop(["id","relevance"], axis=1, inplace=True)
        self.X_test = dfTest.values.astype(float)

        ## all
        first = True
        feat_cv_cnt = 0
        dfAll_cv_all = dfAll_raw.copy()
        feature_dir = "%s/All" % (config.FEAT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan"%fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                            fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll_cv_all[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d"%(fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cv_cnt += 1
                self.feature_names_cv.append(fname)
                if dim == 1:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim, corr))
                else:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            X_tmp = dfAll_cv_all.drop(["id","relevance"], axis=1).values.astype(float)
            self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
            self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
        else:
            self.X_train_cv_all = None
        feat_cnt += feat_cv_cnt

        ## for cv features
        first = True
        for run in range(1,self.n_iter+1):
            feat_cv_cnt = 0
            dfAll_cv = dfAll_raw.copy()
            feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    fname = file_name.split(".")[0]
                    if (fname not in self.feature_dict) or (fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cv_cnt == 0:
                        self.logger.info("Run %d"%run)
                    x = self.load_feature(feature_dir, fname)
                    x = np.nan_to_num(x)
                    if np.isnan(x).any():
                        self.logger.info("%s nan"%fname)
                        continue
                    # apply feature transform
                    mandatory = self.feature_dict[fname][0]
                    transformer = self.feature_dict[fname][1]
                    x = transformer.fit_transform(x)
                    dim = np_utils._dim(x)
                    if dim == 1:
                        dfAll_cv[fname] = x
                    else:
                        columns = ["%s_%d"%(fname, x) for x in range(dim)]
                        df = pd.DataFrame(x, columns=columns)
                        dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                    feat_cv_cnt += 1
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
            if feat_cv_cnt > 0:
                dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
                dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
                X_tmp = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float)
                if run == 1:
                    self.X_train_cv = np.zeros((X_tmp.shape[0], X_tmp.shape[1], self.n_iter), dtype=float)
                self.X_train_cv[:,:,run-1] = X_tmp
        if feat_cv_cnt == 0:
            self.X_train_cv = None
            self.basic_only = 1

        # report final results
        if self.basic_only:
            self.logger.info("Overall Shape: %d x %d"%(len(self.y_train), self.X_train.shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train), self.X_train.shape[1]+self.X_train_cv_all.shape[1])) 
        self.logger.info("Done combinning.")

        return self
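
combine() expects feature_dict entries of the form fname -> (mandatory, transformer), where the transformer only needs a fit_transform method. A hedged usage sketch; the feature names are illustrative, and only the constructor keywords are taken from the code below:

class IdentityTransformer:
    # minimal stand-in; real entries might plug in scalers or other transforms
    def fit_transform(self, x):
        return x

feature_dict = {
    "DocLen_search_term_1D": (False, IdentityTransformer()),  # droppable if corr is low
    "TFIDF_Sim_search_term_x_product_title_1D": (True, IdentityTransformer()),  # mandatory
}
cb = Combiner(feature_dict=feature_dict, feature_name="basic",
              feature_suffix=".pkl", corr_threshold=0.01)
cb.combine()
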
Example #7
    def combine(self):
        # combine meta features
        if self.meta_feature_dict:
            cb = Combiner(feature_dict=self.meta_feature_dict, 
                        feature_name=self.feature_name, 
                        feature_suffix=".pkl", 
                        corr_threshold=self.corr_threshold)
            cb.combine()
            self.X_train_basic = cb.X_train
            self.X_test_basic = cb.X_test
            self.feature_names_basic = cb.feature_names_basic
            self.feature_names.extend(cb.feature_names)
        else:
            self.X_train_basic = None
            self.X_test_basic = None

        # combine other features
        dfAll = pkl_utils._load(config.INFO_DATA)

        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)

        ## all
        first = True
        feat_cnt = 0
        feature_dir = "%s/All" % (config.OUTPUT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                fname = file_name.split(".")[2]
                if fname not in self.feature_list:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                # load prediction
                x = self.load_feature(feature_dir, "test.pred."+fname)
                x = np.nan_to_num(x)
                dim = np_utils._dim(x)
                dfTest[fname] = x
                feat_cnt += 1
                self.feature_names_cv.append(fname)
                self.feature_names.append(fname)
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
                # load probability if any
                try:
                    x = self.load_feature(feature_dir, "test.proba."+fname, 
                                        columns=None, columns_pattern="proba")
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    for i in range(dim):
                        dfTest["%s_proba%d"%(fname, i)] = x[:,i]
                    self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_list), fname, dim))
                    self.feature_names.extend(["%s_proba%d"%(fname, i) for i in range(dim)])
                except Exception:
                    pass

        dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.X_test = dfTest.drop(["id","relevance"], axis=1).values.astype(float)
        if self.meta_feature_dict:
            self.X_test = np.hstack([self.X_test_basic, self.X_test])

        ## for cv features
        first = True
        for run in range(1,self.n_iter+1):
            feat_cnt = 0
            idx1 = splitter_level1[run-1][1]
            idx2 = splitter_level2[run-1][1]
            if self.feature_level == 2:
                idx = idx1
            elif self.feature_level == 3:
                idx = [idx1[i] for i in idx2]
            self.splitter_prev[run-1] = idx
            dfTrain_cv = dfTrain.iloc[idx].copy()
            feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                    fname = file_name.split(".")[2]
                    if (fname not in self.feature_list) or (fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cnt == 0:
                        self.logger.info("Run %d"%run)
                    # load prediction
                    x = self.load_feature(feature_dir, "valid.pred."+fname)
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    # also include level-1 models' predictions
                    if x.shape[0] > len(idx):
                        x = x[idx2]
                    dfTrain_cv[fname] = x
                    feat_cnt += 1
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_list), fname, dim))
                    # load probability if any
                    try:
                        x = self.load_feature(feature_dir, "valid.proba."+fname, 
                                            columns=None, columns_pattern="proba")
                        x = np.nan_to_num(x)
                        dim = np_utils._dim(x)
                        # also include level-1 models' predictions
                        if x.shape[0] > len(idx):
                            x = x[idx2]
                        for i in range(dim):
                            dfTrain_cv["%s_proba%d"%(fname, i)] = x[:,i]
                        self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_list), fname, dim))
                    except Exception:
                        pass

            dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            if run == 1:
                self.y_train_cv = [0]*self.n_iter
                self.X_train_cv = [0]*self.n_iter
            self.y_train_cv[run-1] = dfTrain_cv["relevance"].values.astype(float)
            self.X_train_cv[run-1] = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float)

        if self.has_basic:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train_cv[self.n_iter-1]), self.X_train_basic.shape[1] + self.X_train_cv[self.n_iter-1].shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train_cv[self.n_iter-1]), self.X_train_cv[self.n_iter-1].shape[1]))
        self.logger.info("Done combinning.")
        
        return self
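
splitter_level1 and splitter_level2 are assumed to hold one (train_idx, valid_idx) pair per run, with level-2 indices being positions inside the level-1 valid fold; that is why feature_level == 3 composes the two. A tiny worked example of the index arithmetic (the arrays are made up):

import numpy as np

splitter_level1 = [(np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7]))]
splitter_level2 = [(np.array([0, 1]), np.array([2, 3]))]

run = 1
idx1 = splitter_level1[run - 1][1]  # level-1 valid rows: 4, 5, 6, 7
idx2 = splitter_level2[run - 1][1]  # positions within that fold: 2, 3
idx = [idx1[i] for i in idx2]       # absolute rows 6 and 7 for feature_level == 3
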
Example #8
    def combine(self):

        dfAll = pkl_utils._load(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y_train = dfAll["relevance"].values[:TRAIN_SIZE]

        ## for basic features
        feat_cnt = 0
        self.logger.info("Run for basic...")
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                x = self.load_feature(config.FEAT_DIR, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info(
                            "Drop: {} ({}D) (abs corr = {}, < threshold = {})".
                            format(fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d" % (fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll = pd.concat([dfAll, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cnt += 1
                self.feature_names_basic.append(fname)
                if dim == 1:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".
                        format(feat_cnt, len(self.feature_dict.keys()), fname,
                               dim, corr))
                else:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_dict.keys()), fname,
                            dim))
        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        ## basic
        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
        self.y_train = dfTrain["relevance"].values.astype(float)
        dfTrain.drop(["id", "relevance"], axis=1, inplace=True)
        self.X_train = dfTrain.values.astype(float)

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)
        dfTest.drop(["id", "relevance"], axis=1, inplace=True)
        self.X_test = dfTest.values.astype(float)

        ## all
        first = True
        feat_cv_cnt = 0
        dfAll_cv_all = dfAll_raw.copy()
        feature_dir = "%s/All" % (config.FEAT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info(
                            "Drop: {} ({}D) (abs corr = {}, < threshold = {})".
                            format(fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll_cv_all[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d" % (fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cv_cnt += 1
                self.feature_names_cv.append(fname)
                if dim == 1:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".
                        format(feat_cnt + feat_cv_cnt,
                               len(self.feature_dict.keys()), fname, dim,
                               corr))
                else:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt + feat_cv_cnt,
                            len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            X_tmp = dfAll_cv_all.drop(["id", "relevance"],
                                      axis=1).values.astype(float)
            self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
            self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
        else:
            self.X_train_cv_all = None
        feat_cnt += feat_cv_cnt

        ## for cv features
        first = True
        for run in range(1, self.n_iter + 1):
            feat_cv_cnt = 0
            dfAll_cv = dfAll_raw.copy()
            feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    fname = file_name.split(".")[0]
                    if (fname not in self.feature_dict) or (
                            fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cv_cnt == 0:
                        self.logger.info("Run %d" % run)
                    x = self.load_feature(feature_dir, fname)
                    x = np.nan_to_num(x)
                    if np.isnan(x).any():
                        self.logger.info("%s nan" % fname)
                        continue
                    # apply feature transform
                    mandatory = self.feature_dict[fname][0]
                    transformer = self.feature_dict[fname][1]
                    x = transformer.fit_transform(x)
                    dim = np_utils._dim(x)
                    if dim == 1:
                        dfAll_cv[fname] = x
                    else:
                        columns = ["%s_%d" % (fname, x) for x in range(dim)]
                        df = pd.DataFrame(x, columns=columns)
                        dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                    feat_cv_cnt += 1
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt + feat_cv_cnt,
                            len(self.feature_dict.keys()), fname, dim))
            if feat_cv_cnt > 0:
                dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
                dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
                X_tmp = dfTrain_cv.drop(["id", "relevance"],
                                        axis=1).values.astype(float)
                if run == 1:
                    self.X_train_cv = np.zeros(
                        (X_tmp.shape[0], X_tmp.shape[1], self.n_iter),
                        dtype=float)
                self.X_train_cv[:, :, run - 1] = X_tmp
        if feat_cv_cnt == 0:
            self.X_train_cv = None
            self.basic_only = 1

        # report final results
        if self.basic_only:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train), self.X_train.shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train), self.X_train.shape[1] +
                              self.X_train_cv_all.shape[1]))
        self.logger.info("Done combinning.")

        return self
Example #9
    def combine(self):
        # combine meta features
        if self.meta_feature_dict:
            cb = Combiner(feature_dict=self.meta_feature_dict,
                          feature_name=self.feature_name,
                          feature_suffix=".pkl",
                          corr_threshold=self.corr_threshold)
            cb.combine()
            self.X_train_basic = cb.X_train
            self.X_test_basic = cb.X_test
            self.feature_names_basic = cb.feature_names_basic
            self.feature_names.extend(cb.feature_names)
        else:
            self.X_train_basic = None
            self.X_test_basic = None

        # combine other features
        dfAll = pkl_utils._load(config.INFO_DATA)

        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)

        ## all
        first = True
        feat_cnt = 0
        feature_dir = "%s/All" % (config.OUTPUT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                fname = file_name.split(".")[2]
                if fname not in self.feature_list:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                # load prediction
                x = self.load_feature(feature_dir, "test.pred." + fname)
                x = np.nan_to_num(x)
                dim = np_utils._dim(x)
                dfTest[fname] = x
                feat_cnt += 1
                self.feature_names_cv.append(fname)
                self.feature_names.append(fname)
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
                # load probability if any
                try:
                    x = self.load_feature(feature_dir,
                                          "test.proba." + fname,
                                          columns=None,
                                          columns_pattern="proba")
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    for i in range(dim):
                        dfTest["%s_proba%d" % (fname, i)] = x[:, i]
                    self.logger.info(
                        "Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_list), fname, dim))
                    self.feature_names.extend(
                        ["%s_proba%d" % (fname, i) for i in range(dim)])
                except Exception:
                    pass

        dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.X_test = dfTest.drop(["id", "relevance"],
                                  axis=1).values.astype(float)
        if self.meta_feature_dict:
            self.X_test = np.hstack([self.X_test_basic, self.X_test])

        ## for cv features
        first = True
        for run in range(1, self.n_iter + 1):
            feat_cnt = 0
            idx1 = splitter_level1[run - 1][1]
            idx2 = splitter_level2[run - 1][1]
            if self.feature_level == 2:
                idx = idx1
            elif self.feature_level == 3:
                idx = [idx1[i] for i in idx2]
            self.splitter_prev[run - 1] = idx
            dfTrain_cv = dfTrain.iloc[idx].copy()
            feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                    fname = file_name.split(".")[2]
                    if (fname not in self.feature_list) or (
                            fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cnt == 0:
                        self.logger.info("Run %d" % run)
                    # load prediction
                    x = self.load_feature(feature_dir, "valid.pred." + fname)
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    # also include level-1 models' predictions
                    if x.shape[0] > len(idx):
                        x = x[idx2]
                    dfTrain_cv[fname] = x
                    feat_cnt += 1
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_list), fname, dim))
                    # load probability if any
                    try:
                        x = self.load_feature(feature_dir,
                                              "valid.proba." + fname,
                                              columns=None,
                                              columns_pattern="proba")
                        x = np.nan_to_num(x)
                        dim = np_utils._dim(x)
                        # also include level-1 models' predictions
                        if x.shape[0] > len(idx):
                            x = x[idx2]
                        for i in range(dim):
                            dfTrain_cv["%s_proba%d" % (fname, i)] = x[:, i]
                        self.logger.info(
                            "Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                                feat_cnt, len(self.feature_list), fname, dim))
                    except Exception:
                        pass

            dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            if run == 1:
                self.y_train_cv = [0] * self.n_iter
                self.X_train_cv = [0] * self.n_iter
            self.y_train_cv[run -
                            1] = dfTrain_cv["relevance"].values.astype(float)
            self.X_train_cv[run - 1] = dfTrain_cv.drop(
                ["id", "relevance"], axis=1).values.astype(float)

        if self.has_basic:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train_cv[self.n_iter - 1]),
                              self.X_train_basic.shape[1] +
                              self.X_train_cv[self.n_iter - 1].shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train_cv[self.n_iter - 1]),
                              self.X_train_cv[self.n_iter - 1].shape[1]))
        self.logger.info("Done combinning.")

        return self
Example #10
    def combine(self):
        dfAll = table_utils._read(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y = dfAll['relevance'].values

        feat_cnt = 0
        self.logger.info('Run for basic...')
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if config.FEAT_FILE_SUFFIX not in file_name:
                continue
            fname = os.path.splitext(file_name)[0]
            if fname not in self.feature_dict:
                continue
            x = self.load_feature(config.FEAT_DIR, fname)
            x = np.nan_to_num(x)
            # Still necessary?
            if np.isnan(x).any():
                self.logger.info("%s nan" % (fname))
                continue
            # Apply feature transformers (?)
            mandatory = self.feature_dict[fname][0]
            transformer = self.feature_dict[fname][1]
            x = transformer.fit_transform(x)
            dim = np_utils._dim(x)
            if dim == 1:
                corr = np_utils._corr(x, y)
                if not mandatory and (np.isnan(corr)
                                      or abs(corr) < self.corr_threshold):
                    self.logger.info(
                        "Drop: {} ({}D) (abs_corr = {}, < threshold {})".
                        format(fname, dim, abs(corr), self.corr_threshold))
                    continue
                dfAll[fname] = x
                self.feature_names.append(fname)
            else:
                columns = ["%s_%d" % (fname, x) for x in range(dim)]
                df = pd.DataFrame(x, columns=columns)
                dfAll = pd.concat([dfAll, df], axis=1)
                self.feature_names.extend(columns)
            feat_cnt += 1
            self.feature_names_basic.append(fname)
            if dim == 1:
                self.logger.info(
                    "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim,
                        corr))
            else:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim))

        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.y = dfAll["relevance"].values.astype(float)
        self.weights = dfAll['weight'].values
        self.query_ids = dfAll['norm_query_id'].values
        dfAll.drop(["relevance", "weight", "norm_query_id"],
                   axis=1,
                   inplace=True)
        self.X = dfAll.values.astype(float)

        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y), self.X.shape[1]))
        self.logger.info("Done combining")