Example #1
 def _inner_train(self, model: models.Doc2Vec,
                  sentences: List[TaggedDocument], sentence_ids: List[int]):
     for x in range(250):
         # for x in range(60):
         logger.info("epoch:{}".format(x))
         model.train(sentences, total_examples=model.corpus_count, epochs=1)
         if x % 50 != 0:
             # if x % 1 != 0:
             continue
         # Sanity check: for 100 random documents, re-infer a vector and
         # record where the document's own tag ranks among its neighbours.
         ranks = []
         for doc_id in random.sample(sentence_ids, 100):
             inferred_vector = model.infer_vector(sentences[doc_id].words)
             sims = model.docvecs.most_similar([inferred_vector],
                                               topn=len(model.docvecs))
             candidate_ids = [doc_id_candidate for doc_id_candidate, sim in sims]
             rank = candidate_ids.index(sentences[doc_id].tags[0])
             ranks.append(rank)
         counter = collections.Counter(ranks)
         result = "epoch:{}, rank value:{}, counter:{}".format(
             x, counter[0], counter)
         logger.info(result)
         if counter[0] >= self._passing_precision:
             break
     datetime_pattern = "%Y%m%d%H%M"
     datetime_str = datetime.now().strftime(datetime_pattern)
     file_name_format = "doc2vec_{}.model"
     file_name = file_name_format.format(datetime_str)
     model.save(file_name)
     return model
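
For context, _inner_train expects one TaggedDocument per text plus a parallel list of ids. The corpus builder is not among these examples (a _corpus_to_sentences helper is referenced in test_train below); a minimal sketch of what it could look like, assuming whitespace tokenization and the positional index as the single tag, which is what the rank check above relies on:

from typing import List, Tuple

from gensim.models.doc2vec import TaggedDocument


def corpus_to_sentences(texts: List[str]) -> Tuple[List[TaggedDocument], List[int]]:
    # One TaggedDocument per text; the positional index doubles as the tag,
    # so tags[0] == doc_id holds for the self-retrieval rank check.
    sentences = [TaggedDocument(words=text.split(), tags=[i])
                 for i, text in enumerate(texts)]
    return sentences, list(range(len(sentences)))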
Example #2
 def load_test(self):
     test_df = pd.read_csv(self._test_file_name)
     logger.info("Test file rows and columns are : {}".format(
         test_df.shape))
     test_id = test_df["item_id"].values
     test_X = test_df.drop(cols_to_drop, axis=1)
     test_X = self._convert(test_X)
     test_X = test_X.fillna(-1)
     return test_X, test_id
Example #3
 def load_train(self):
     train_df = pd.read_csv(self._train_file_name)
     logger.info("Train file rows and columns are : {}".format(
         train_df.shape))
     train_y = train_df["deal_probability"].values
     train_X = train_df.drop(cols_to_drop + ["deal_probability"], axis=1)
     train_X = self._convert(train_X)
     train_X = train_X.fillna(-1)
     return train_X, train_y
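
Both loaders call self._convert, which is not among these examples. Since the frames are later fed to LightGBM, it presumably turns string columns into numbers; a minimal sketch under that assumption, using scikit-learn label encoding (the exact column handling is a guess):

import pandas as pd
from sklearn.preprocessing import LabelEncoder


def convert(df: pd.DataFrame) -> pd.DataFrame:
    """Encode object-typed columns as integer codes for LightGBM."""
    df = df.copy()
    for col in df.select_dtypes(include="object").columns:
        # astype(str) also maps NaN to the string "nan" so the encoder
        # accepts it; numeric NaNs are left for the caller's fillna(-1).
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df

Note that fitting separate encoders for the train and test frames, as the two loaders would do here, can assign inconsistent codes to the same category; fitting one encoder per column on the concatenated values avoids that.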
Example #4
 def test_train(self):
     model = models.Doc2Vec.load('doc2vec.model')
     sentences = self._corpus_to_sentences()
     sentence_ids = range(len(sentences))
     ranks = []
     for doc_id in random.sample(sentence_ids, 100):
         inferred_vector = model.infer_vector(sentences[doc_id].words)
         sims = model.docvecs.most_similar([inferred_vector],
                                           topn=len(model.docvecs))
         candidate_ids = [doc_id_candidate for doc_id_candidate, sim in sims]
         rank = candidate_ids.index(sentences[doc_id].tags[0])
         ranks.append(rank)
     counter = collections.Counter(ranks)
     logger.info(counter)
     logger.info("rank value:{}".format(counter[0]))
Example #5
 def summarize_text(self):
     feature_df = pd.read_csv(
         os.path.join(self.data_directory, self.summary_with_image_file))
     text_df = pd.read_csv(
         os.path.join(self.data_directory, self.text_file_name))
     text_df["abc"] = "abc"
     summary_df = pd.merge(feature_df, text_df, on="item_id", how="left")
     if len(summary_df[summary_df["abc"] != "abc"]["item_id"].values) != 0:
         import pdb
         pdb.set_trace()
         raise ValueError()
     summary_df = summary_df.drop(["abc"], axis=1)
     summary_df.fillna(-1, inplace=True)
     summary_df.to_csv(self.summary_with_image_and_text_file, index=False)
     logger.info("end")
Example #6
def _main():
    data_loader = DataLoader()
    train_X, train_y = data_loader.load_train()
    test_X, test_id = data_loader.load_test()

    train_model = TrainModel()
    predicted_test = train_model.exec(train_X, train_y, test_X)

    # Making a submission file #
    predicted_test[predicted_test > 1] = 1
    predicted_test[predicted_test < 0] = 0
    sub_df = pd.DataFrame({"item_id": test_id})
    sub_df["deal_probability"] = predicted_test

    file_name_format = "lgb_{}.csv"
    file_name = file_name_format.format(datetime_str)
    sub_df.to_csv(file_name, index=False)
    logger.info("end")
Example #7
def run_lightgbm(train_X, train_y, test_X):
    param_grid = {
        'max_depth': [-1, 8],
        'num_leaves': [150, 200],
        'learning_rate': [0.1],
        'n_estimators': [1500]
    }
    """
    param_grid = {
        'max_depth': [-1],
        'num_leaves': [31, 62],
        'learning_rate': [0.1],
        'n_estimators': [150]
    }
    """
    model = lightgbm.LGBMRegressor(objective="regression")
    gbm = GridSearchCV(model,
                       param_grid,
                       cv=3,
                       scoring="neg_mean_squared_error",
                       n_jobs=1)
    # TODO specify category columns
    gbm.fit(train_X, train_y)

    means = gbm.cv_results_['mean_test_score']
    stds = gbm.cv_results_['std_test_score']

    best_params_file_name_format = "best_params_{}.txt"
    best_params_file_name = best_params_file_name_format.format(datetime_str)
    with open(best_params_file_name, encoding="utf-8", mode="w") as f:
        f.write(str(gbm.best_estimator_))
        f.write("\n")
        f.write("best params: {}".format(gbm.best_params_))
    for mean, std, params in zip(means, stds, gbm.cv_results_['params']):
        logger.info("    mean:{}, std:{}, params:{}".format(mean, std, params))
    logger.info("best params: {}".format(gbm.best_params_))

    model = lightgbm.LGBMRegressor(objective="regression", **gbm.best_params_)
    model.fit(X=train_X, y=train_y)
    predicted_test_y = model.predict(test_X)
    # To load a previously saved booster:
    # bst = lightgbm.Booster(model_file='model.txt')
    return predicted_test_y
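
The trailing comment hints at reloading a booster from a text file. A minimal save/load round trip with LightGBM's native format, on made-up data (the file name is arbitrary):

import lightgbm
import numpy as np

X = np.random.rand(100, 4)
y = np.random.rand(100)

model = lightgbm.LGBMRegressor(objective="regression", n_estimators=10)
model.fit(X, y)

# Persist the underlying booster in LightGBM's text format ...
model.booster_.save_model("model.txt")
# ... and reload it later without the scikit-learn wrapper.
bst = lightgbm.Booster(model_file="model.txt")
predicted = bst.predict(X)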
Example #8
# validation_index = -200
# validation_index = -200000
# dev_X = train_X.iloc[:validation_index,:]
# val_X = train_X.iloc[validation_index:,:]
# dev_y = train_y[:validation_index]
# val_y = train_y[validation_index:]

test_X, test_id = data_loader.load_test()

# Training the model #
predicted_test = run_lightgbm(train_X, train_y, test_X)

# Making a submission file #
predicted_test[predicted_test > 1] = 1
predicted_test[predicted_test < 0] = 0
sub_df = pd.DataFrame({"item_id": test_id})
sub_df["deal_probability"] = predicted_test

file_name_format = "baseline_lightgbm_{}.csv"
file_name = file_name_format.format(datetime_str)
sub_df.to_csv(file_name, index=False)
logger.info("end")

# TODO
# see discussion
# see kernel
# see parameter tuning
# ensembling
# image processing
# analyze description, title, param1, param2, param3
Example #9
    def exec(self, train_X, train_y, evaluation_X):
        """
        LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=8, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
       n_jobs=-1, num_leaves=150, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=1)
        """
        all_params = {
            "objective": ["regression"],
            "metric": ["rmse"],
            "num_iterations": [1500],
            "learning_rate": [0.01],
            "num_leaves": [200],
            "max_depth": [-1],
            "min_data_in_leaf": [100],
            "max_bin": [1023]
        }
        """
        all_params = {"objective": ["regression"],
                      "metric": ["rmse"],
                      "num_iterations": [1500],
                      "learning_rate": [0.01],
                      "num_leaves": [200],
                      "max_depth": [-1],
                      "min_data_in_leaf": [100],
                      "max_bin": [1023]
                     }

        all_params = {"objective": ["regression"],
                      "metric": ["rmse"],
                      "num_iterations": [100, 1000, 1500],
                      "learning_rate": [0.1, 0.01],
                      "num_leaves": [31, 93, 150],
                      "max_depth": [-1],
                      "min_data_in_leaf": [1, 20, 100],
                      "max_bin": [63, 255, 511]
                      }
        """
        """
        all_params = {"objective": ["regression"],
                      "metric": ["rmse"],
                      "num_iterations": [100, 1000]
                      }
        """
        min_score = 100
        min_params = None
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
        # The cross-validation body below is commented out; with the
        # single-candidate grid above, this loop simply adopts that set.
        for params in ParameterGrid(all_params):
            logger.info('params: {}'.format(params))
            min_params = params
        """

            loss_function_scores = list()
            best_iterations = list()
            for train_idx, valid_idx in cv.split(train_X, train_y):
                trn_x = train_X.iloc[train_idx, :]
                val_x = train_X.iloc[valid_idx, :]
                trn_y = train_y[train_idx]
                val_y = train_y[valid_idx]

                train_set = lgb.Dataset(trn_x, label=trn_y, categorical_feature=cat_vars)
                test_set = lgb.Dataset(val_x, label=val_y, categorical_feature=cat_vars)
                train_params = copy.deepcopy(params)
                model = lgb.train(params=train_params, train_set=train_set,
                                  valid_sets=[test_set], early_stopping_rounds=100)

                predicted_val = model.predict(val_x)
                rmse = np.sqrt(mean_squared_error(y_true=val_y, y_pred=predicted_val))
                loss_function_scores.append(rmse)
                best_iterations.append(model.best_iteration)
                logger.info("   loss: {}".format(rmse))

            # params['num_iteration'] = int(np.mean(best_iterations))
            model_score = np.mean(loss_function_scores)
            logger.info("   mean loss: {}, params:{}".format(model_score, params))
            if min_score > model_score:
                min_score = model_score
                params.update(train_params)
                min_params = params
            logger.info('current min score: {}, params: {}'.format(min_score, min_params))

        logger.info('minimum score: {}'.format(min_score))
        logger.info('minimum params: {}'.format(min_params))
        """

        train_set = lgb.Dataset(train_X,
                                label=train_y,
                                categorical_feature=cat_vars)
        model = lgb.train(params=min_params, train_set=train_set)
        model_file_name = "model_{}.pkl".format(datetime_str)
        with open(model_file_name, 'wb') as f:
            pickle.dump(model, f, -1)
        # with open(model_file_name, 'rb') as f:
        #     model = pickle.load(f)

        # NOTE: this RMSE is computed on the training data itself, so it is
        # an optimistic estimate of the generalization error.
        predicted_val = model.predict(train_X)
        rmse = np.sqrt(mean_squared_error(y_true=train_y,
                                          y_pred=predicted_val))
        logger.info("final loss: {}".format(rmse))

        predicted_test = model.predict(data=evaluation_X)
        return predicted_test
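
One caveat if the commented-out loop is ever revived: StratifiedKFold expects discrete class labels and raises on a continuous target like deal_probability, so plain KFold is the regression-appropriate splitter. A sketch of the fold loop under that substitution, on made-up arrays:

import numpy as np
from sklearn.model_selection import KFold

train_X = np.random.rand(300, 5)
train_y = np.random.rand(300)  # continuous target, like deal_probability

cv = KFold(n_splits=3, shuffle=True, random_state=0)
for train_idx, valid_idx in cv.split(train_X):
    trn_x, val_x = train_X[train_idx], train_X[valid_idx]
    trn_y, val_y = train_y[train_idx], train_y[valid_idx]
    # ... train on (trn_x, trn_y) and score on (val_x, val_y) per fold ...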
Example #10
        return predicted_test


def _main():
    data_loader = DataLoader()
    train_X, train_y = data_loader.load_train()
    test_X, test_id = data_loader.load_test()

    train_model = TrainModel()
    predicted_test = train_model.exec(train_X, train_y, test_X)

    # Making a submission file #
    predicted_test[predicted_test > 1] = 1
    predicted_test[predicted_test < 0] = 0
    sub_df = pd.DataFrame({"item_id": test_id})
    sub_df["deal_probability"] = predicted_test

    file_name_format = "lgb_{}.csv"
    file_name = file_name_format.format(datetime_str)
    sub_df.to_csv(file_name, index=False)
    logger.info("end")


if __name__ == '__main__':
    logger.info("start>>>>>>>>>>>>>>>>>>>>>")
    try:
        _main()
    except Exception as e:
        logger.error("Unexpected error has occurred.", exc_info=e)
    logger.info("end>>>>>>>>>>>>>>>>>>>>>")