def _inner_train(self, model: models.Doc2Vec,
                 sentences: List[TaggedDocument],
                 sentence_ids: List[str]):
    """Train ``model`` one epoch at a time, evaluate periodically, then save it.

    The model is trained for up to 250 single-epoch passes. On every 50th
    pass (epochs 0, 50, 100, ...) a random sample of documents is re-inferred
    and ranked against all stored document vectors; the number of sampled
    documents whose own tag comes back at rank 0 is logged. Training stops
    early once that count reaches ``self._passing_precision``.

    Afterwards the model is saved under the relative file name
    ``doc2vec_<YYYYmmddHHMM>.model`` and returned.

    Args:
        model: Doc2Vec model to train in place.
        sentences: Tagged documents used both for training and evaluation.
        sentence_ids: Ids to sample evaluation documents from. Each id is
            used as an index into ``sentences``.
            NOTE(review): annotated as ``List[str]`` although the ids are
            used as indices; confirm against the caller.

    Returns:
        The trained model.
    """
    max_epochs = 250        # a shorter 60-epoch run was also tried
    check_interval = 50     # evaluate on every 50th epoch only
    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more than the ids available.
    sample_size = min(100, len(sentence_ids))

    for x in range(max_epochs):
        logger.info("epoch:{}".format(x))
        model.train(sentences, total_examples=model.corpus_count, epochs=1)
        if x % check_interval != 0:
            continue

        ranks = []
        for doc_id in random.sample(sentence_ids, sample_size):
            inferred_vector = model.infer_vector(sentences[doc_id].words)
            # Rank every stored document vector against the inferred one.
            sims = model.docvecs.most_similar([inferred_vector],
                                              topn=len(model.docvecs))
            ranked_tags = [doc_id_candidate for doc_id_candidate, sim in sims]
            ranks.append(ranked_tags.index(sentences[doc_id].tags[0]))

        counter = collections.Counter(ranks)
        result = "epoch:{}, rank value:{}, counter:{}".format(
            x, counter[0], counter)
        logger.info(result)
        # counter[0] is a raw count of sampled documents ranked first,
        # not a ratio.
        if counter[0] >= self._passing_precision:
            break

    datetime_pattern = "%Y%m%d%H%M"
    datetime_str = datetime.now().strftime(datetime_pattern)
    file_name = "doc2vec_{}.model".format(datetime_str)
    model.save(file_name)
    return model
def load_test(self):
    """Read the test CSV and return ``(features, item_ids)``.

    The features are the test frame without the columns listed in
    ``cols_to_drop``, passed through ``self._convert`` and with missing
    values replaced by -1. The ids are the values of the ``item_id`` column.
    """
    frame = pd.read_csv(self._test_file_name)
    logger.info("Test file rows and columns are : {}".format(frame.shape))

    item_ids = frame["item_id"].values
    features = self._convert(frame.drop(cols_to_drop, axis=1)).fillna(-1)
    return features, item_ids
def load_train(self):
    """Read the training CSV and return ``(features, targets)``.

    The targets are the values of the ``deal_probability`` column. The
    features are the training frame without that column and without the
    columns listed in ``cols_to_drop``, passed through ``self._convert`` and
    with missing values replaced by -1.
    """
    frame = pd.read_csv(self._train_file_name)
    logger.info("Train file rows and columns are : {}".format(frame.shape))

    targets = frame["deal_probability"].values
    unused_columns = cols_to_drop + ["deal_probability"]
    features = self._convert(frame.drop(unused_columns, axis=1)).fillna(-1)
    return features, targets
def test_train(self):
    """Evaluate the saved ``doc2vec.model`` by self-similarity ranking.

    Loads the model from the relative path ``doc2vec.model``, builds the
    tagged sentences with ``self._corpus_to_sentences()``, then for a random
    sample of documents infers a vector and looks up the rank of the
    document's own tag among all stored document vectors. The distribution
    of ranks and the number of documents ranked first are logged.
    """
    model = models.Doc2Vec.load('doc2vec.model')
    sentences = self._corpus_to_sentences()
    sentence_ids = range(len(sentences))
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample at the corpus size.
    sample_size = min(100, len(sentence_ids))

    ranks = []
    for doc_id in random.sample(sentence_ids, sample_size):
        inferred_vector = model.infer_vector(sentences[doc_id].words)
        # Rank every stored document vector against the inferred one.
        sims = model.docvecs.most_similar([inferred_vector],
                                          topn=len(model.docvecs))
        ranked_tags = [doc_id_candidate for doc_id_candidate, sim in sims]
        ranks.append(ranked_tags.index(sentences[doc_id].tags[0]))

    counter = collections.Counter(ranks)
    logger.info(counter)
    logger.info("rank value:{}".format(counter[0]))
def summarize_text(self):
    """Left-join the text features onto the image summary and write a CSV.

    Reads ``self.summary_with_image_file`` and ``self.text_file_name`` from
    ``self.data_directory``, merges them on ``item_id`` (left join, keeping
    every row of the feature file), fills missing values with -1 and writes
    the result to ``self.summary_with_image_and_text_file``.

    Raises:
        ValueError: If any ``item_id`` of the feature file has no matching
            row in the text file.
    """
    feature_df = pd.read_csv(
        os.path.join(self.data_directory, self.summary_with_image_file))
    text_df = pd.read_csv(
        os.path.join(self.data_directory, self.text_file_name))

    # Sentinel column: after the left join it is only populated on rows that
    # found a partner in text_df, so any other value marks an unmatched row.
    text_df["abc"] = "abc"
    summary_df = pd.merge(feature_df, text_df, on="item_id", how="left")

    unmatched_ids = summary_df.loc[summary_df["abc"] != "abc",
                                   "item_id"].values
    if len(unmatched_ids) != 0:
        # Fail with a descriptive error instead of dropping into a debugger,
        # which would hang a non-interactive run.
        raise ValueError(
            "{} item_id(s) have no matching text row, e.g. {}".format(
                len(unmatched_ids), list(unmatched_ids[:5])))

    summary_df = summary_df.drop(["abc"], axis=1)
    summary_df.fillna(-1, inplace=True)
    # NOTE(review): unlike the inputs, the output path is not joined with
    # self.data_directory - confirm this is intended.
    summary_df.to_csv(self.summary_with_image_and_text_file, index=False)
    logger.info("end")
def _main():
    """Load the data, train the model and write the submission CSV.

    Predictions are clamped to the range [0, 1] before being written,
    together with the test ``item_id`` values, to ``lgb_<datetime_str>.csv``.
    """
    loader = DataLoader()
    train_X, train_y = loader.load_train()
    test_X, test_id = loader.load_test()

    predictions = TrainModel().exec(train_X, train_y, test_X)

    # Making a submission file: clamp predictions to [0, 1] in place.
    predictions[predictions > 1] = 1
    predictions[predictions < 0] = 0

    submission = pd.DataFrame({"item_id": test_id})
    submission["deal_probability"] = predictions
    submission.to_csv("lgb_{}.csv".format(datetime_str), index=False)
    logger.info("end")
def run_lightgbm(train_X, train_y, test_X):
    """Grid-search an ``LGBMRegressor`` and predict on ``test_X``.

    Runs a 3-fold cross-validated grid search (scored by negative mean
    squared error) over ``param_grid``, writes the best estimator and its
    parameters to ``best_params_<datetime_str>.txt``, logs the mean and
    standard deviation of the test score for every parameter combination,
    and returns the predictions of the best estimator.

    Args:
        train_X: Training features.
        train_y: Training targets.
        test_X: Features to predict on.

    Returns:
        Predictions for ``test_X``.
    """
    # A much smaller grid (max_depth=[-1], num_leaves=[31, 62],
    # learning_rate=[0.1], n_estimators=[150]) was used for quick runs.
    param_grid = {
        'max_depth': [-1, 8],
        'num_leaves': [150, 200],
        'learning_rate': [0.1],
        'n_estimators': [1500],
    }

    # refit=True (the default, spelled out because the prediction below
    # relies on it) retrains the best parameter combination on the whole
    # training set once the search has finished.
    gbm = GridSearchCV(lightgbm.LGBMRegressor(objective="regression"),
                       param_grid, cv=3,
                       scoring="neg_mean_squared_error", n_jobs=1,
                       refit=True)
    # TODO specify category columns
    gbm.fit(train_X, train_y)

    best_params_file_name = "best_params_{}.txt".format(datetime_str)
    with open(best_params_file_name, encoding="utf-8", mode="w") as f:
        f.write(str(gbm.best_estimator_))
        f.write("\n")
        f.write("best params: {}".format(gbm.best_params_))

    cv_results = gbm.cv_results_
    for mean, std, params in zip(cv_results['mean_test_score'],
                                 cv_results['std_test_score'],
                                 cv_results['params']):
        logger.info(" mean:{}, std:{}, params:{}".format(mean, std, params))
    logger.info("best params: {}".format(gbm.best_params_))

    # best_estimator_ is already fitted on the full training data with the
    # best parameters, so a second, identical fit is unnecessary.
    return gbm.best_estimator_.predict(test_X)
# validation_index = -200 # validation_index = -200000 # dev_X = train_X.iloc[:validation_index,:] # val_X = train_X.iloc[validation_index:,:] # dev_y = train_y[:validation_index] # val_y = train_y[validation_index:] test_X, test_id = data_loader.load_test() # Training the model # predicted_test = run_lightgbm(train_X, train_y, test_X) # Making a submission file # predicted_test[predicted_test > 1] = 1 predicted_test[predicted_test < 0] = 0 sub_df = pd.DataFrame({"item_id": test_id}) sub_df["deal_probability"] = predicted_test file_name_format = "baseline_lightgbm_{}.csv" file_name = file_name_format.format(datetime_str) sub_df.to_csv(file_name, index=False) logger.info("end") # TODO # see discussion # see kernel # see パラメーターチューニング # アンサンブル # 画像処理 # description, title, param1, param2, param3解析
def exec(self, train_X, train_y, evaluation_X):
    """Train a LightGBM model on the full training set and predict.

    Every parameter combination of ``all_params`` is logged and the last one
    is used to train a booster on all of ``train_X`` / ``train_y`` (with
    ``cat_vars`` as categorical features). The booster is pickled to
    ``model_<datetime_str>.pkl``, its RMSE on the training data is logged as
    "final loss", and its predictions for ``evaluation_X`` are returned.

    Args:
        train_X: Training features.
        train_y: Training targets.
        evaluation_X: Features to predict on.

    Returns:
        Predictions for ``evaluation_X``.
    """
    # Estimator configuration recorded by the author for reference:
    #   LGBMRegressor(boosting_type='gbdt', class_weight=None,
    #       colsample_bytree=1.0, learning_rate=0.1, max_depth=8,
    #       min_child_samples=20, min_child_weight=0.001,
    #       min_split_gain=0.0, n_estimators=1000, n_jobs=-1,
    #       num_leaves=150, objective='regression', random_state=None,
    #       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
    #       subsample_for_bin=200000, subsample_freq=1)
    #
    # Wider grids tried earlier: num_iterations [100, 1000, 1500],
    # learning_rate [0.1, 0.01], num_leaves [31, 93, 150],
    # min_data_in_leaf [1, 20, 100], max_bin [63, 255, 511].
    all_params = {
        "objective": ["regression"],
        "metric": ["rmse"],
        "num_iterations": [1500],
        "learning_rate": [0.01],
        "num_leaves": [200],
        "max_depth": [-1],
        "min_data_in_leaf": [100],
        "max_bin": [1023],
    }

    # A 3-fold cross-validated search with early stopping used to pick the
    # combination with the lowest mean RMSE here. It is disabled, so the
    # last combination of the grid is used as-is; with the current grid
    # there is exactly one combination.
    # NOTE(review): the disabled search split with StratifiedKFold on the
    # regression target - presumably KFold is wanted if it is re-enabled.
    min_params = None
    for params in ParameterGrid(all_params):
        logger.info('params: {}'.format(params))
        min_params = params

    train_set = lgb.Dataset(train_X, label=train_y,
                            categorical_feature=cat_vars)
    model = lgb.train(params=min_params, train_set=train_set)

    model_file_name = "model_{}.pkl".format(datetime_str)
    with open(model_file_name, 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

    # Training-set RMSE: a sanity check only, not a validation score.
    predicted_val = model.predict(train_X)
    rmse = np.sqrt(mean_squared_error(y_true=train_y, y_pred=predicted_val))
    logger.info("final loss: {}".format(rmse))

    predicted_test = model.predict(data=evaluation_X)
    return predicted_test
return predicted_test def _main(): data_loader = DataLoader() train_X, train_y = data_loader.load_train() test_X, test_id = data_loader.load_test() train_model = TrainModel() predicted_test = train_model.exec(train_X, train_y, test_X) # Making a submission file # predicted_test[predicted_test > 1] = 1 predicted_test[predicted_test < 0] = 0 sub_df = pd.DataFrame({"item_id": test_id}) sub_df["deal_probability"] = predicted_test file_name_format = "lgb_{}.csv" file_name = file_name_format.format(datetime_str) sub_df.to_csv(file_name, index=False) logger.info("end") if __name__ == '__main__': logger.info("start>>>>>>>>>>>>>>>>>>>>>") try: _main() except Exception as e: logger.error("Unexpected error has occurred.", exc_info=e) logger.info("end>>>>>>>>>>>>>>>>>>>>>")