class PredictRNNv3Threshold(_PredictRNNv3):

    threshold = luigi.FloatParameter(default=0.29)

    @property
    def model_name(self):
        model_name = super().model_name
        model_name += '_threshold_{}'.format(self.threshold)
        return model_name

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        orders_path = self.requires()['orders'].output().path
        order_ids, inputs, _ = self._load_data(orders_path)

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        scores = model.predict(inputs, batch_size=self.batch_size, verbose=1).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': inputs['product'], 'score': scores})
        scores = scores[scores.score > self.threshold].sort_values('score', ascending=False)

        predictions = {}
        for order_id in order_ids:
            # Coerce the keys to plain ints too, so the dict survives ujson.dump
            predictions[int(order_id)] = []
        for row in scores.itertuples(index=False):
            # ujson fails when it tries to serialize the numpy int values
            predictions[int(row.order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
class RNNv5(object):

    product_history = luigi.IntParameter(default=91)
    embedding_dim = luigi.IntParameter(default=10)
    lstm_size = luigi.IntParameter(default=25)
    lstm_layers = luigi.IntParameter(default=2)
    hidden_layers = luigi.IntParameter(default=2)
    hidden_nonlinearily = luigi.Parameter(default='leaky_relu')
    dropout = luigi.FloatParameter(default=0.2)

    random_seed = luigi.IntParameter(default=3996193, significant=False)
    global_orders_ratio = luigi.FloatParameter(default=1.0, significant=False)
    validation_orders_ratio = luigi.FloatParameter(default=0.2, significant=False)
    batch_size = luigi.IntParameter(default=4096, significant=False)
    negative_factor = luigi.IntParameter(default=2, significant=False)
    epochs = luigi.IntParameter(default=1000, significant=False)

    @property
    def model_name(self):
        params = [
            self.product_history, self.embedding_dim, self.lstm_size,
            self.lstm_layers, self.hidden_layers, self.hidden_nonlinearily,
            self.dropout,
        ]
        model_name = 'rnn_v5_{}'.format('_'.join(str(p).lower() for p in params))
        return model_name

    def _init_random_state(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        torch.manual_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

    def _build_model(self):
        model = Model(
            embedding_dim=self.embedding_dim,
            lstm_size=self.lstm_size,
            lstm_layers=self.lstm_layers,
            hidden_layers=self.hidden_layers,
            hidden_nonlinearily=self.hidden_nonlinearily,
            dropout=self.dropout)
        if torch.cuda.is_available():
            model = model.cuda()
        print(model)
        return model
class PredictRNNv3ReorderSizeKnown(_PredictRNNv3):

    @staticmethod
    def _count_reordered_products(order):
        k = 0
        for product in order['products']:
            if product['reordered']:
                k += 1
        return k

    def _determine_reorder_size(self):
        assert self.mode == 'evaluation'
        num_reordered = {}
        orders_path = self.requires()['orders'].output().path
        with open(orders_path) as orders_file:
            for line in orders_file:
                user_data = ujson.loads(line)
                order_id = int(user_data['last_order']['order_id'])
                num_reordered[order_id] = self._count_reordered_products(user_data['last_order'])
        return num_reordered

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        orders_path = self.requires()['orders'].output().path
        order_ids, inputs, _ = self._load_data(orders_path)

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        scores = model.predict(inputs, batch_size=self.batch_size, verbose=0).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': inputs['product'], 'score': scores})

        reorder_size = self._determine_reorder_size()

        predictions = {}
        for order_id in set(order_ids):
            # Coerce the keys to plain ints too, so the dict survives ujson.dump
            predictions[int(order_id)] = []
            df = scores[scores.order_id == order_id].nlargest(reorder_size[order_id], 'score')
            for row in df.itertuples(index=False):
                # ujson fails when it tries to serialize the numpy int values
                predictions[int(order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
class PredictMLPv2ExpectedF1(_PredictMLPv2):

    @property
    def model_name(self):
        model_name = super().model_name
        model_name += '_expected_f1'
        return model_name

    def _determine_reorder_size(self, scores):
        reorder_size = {}
        grouped = scores.groupby('order_id')
        P_values = grouped['score'].apply(list)
        with multiprocessing.Pool() as pool:
            results = pool.map(maximize_expected_f1, P_values)
        # Pair each result with its order_id via P_values.index, which is
        # guaranteed to be in the same order as the inputs handed to pool.map
        for order_id, (best_k, max_f1) in zip(P_values.index, results):
            reorder_size[order_id] = best_k
        return reorder_size

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        orders_path = self.requires()['orders'].output().path
        _, order_ids, product_ids, inputs, _ = self._load_data(orders_path)

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        scores = model.predict(inputs, batch_size=self.batch_size, verbose=0).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': product_ids, 'score': scores})

        reorder_size = self._determine_reorder_size(scores)

        predictions = {}
        for order_id in set(order_ids):
            # Coerce the keys to plain ints too, so the dict survives ujson.dump
            predictions[int(order_id)] = []
            df = scores[scores.order_id == order_id].nlargest(reorder_size[order_id], 'score')
            for row in df.itertuples(index=False):
                # ujson fails when it tries to serialize the numpy int values
                predictions[int(order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
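# `maximize_expected_f1` is defined elsewhere in the project; the sketch below
# is an assumption about what it plausibly computes, not the actual
# implementation. Treating each score as an independent reorder probability,
# predicting the top-k products gives a plug-in expected F1 of roughly
# 2 * sum(top-k p) / (k + sum(p)); scanning k and keeping the maximizer matches
# the (best_k, max_f1) tuple unpacked above.
def _maximize_expected_f1_sketch(P):
    p = sorted(P, reverse=True)
    total = sum(p)
    best_k, max_f1 = 0, 0.0
    partial = 0.0
    for k, p_k in enumerate(p, start=1):
        partial += p_k
        expected_f1 = 2.0 * partial / (k + total)
        if expected_f1 > max_f1:
            best_k, max_f1 = k, expected_f1
    return best_k, max_f1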
class FitRNNv3(RNNv3, FitModel):

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        # Split the orders into training and validation sets and write them to separate files
        orders_path = self.requires()['orders'].output().path
        training_fd = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        validation_fd = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        with open(orders_path) as input_fd:
            for line in input_fd:
                if self.global_orders_ratio >= 1 or self.random.uniform() <= self.global_orders_ratio:
                    if self.random.uniform() <= self.validation_orders_ratio:
                        validation_fd.write(line)
                    else:
                        training_fd.write(line)
        validation_fd.flush()
        training_fd.flush()

        _, validation_inputs, validation_predictions = self._load_data(validation_fd.name)
        training_generator, training_steps_per_epoch = \
            self._create_data_generator(training_fd.name, self.max_prior_orders, self.batch_size)

        model = self._build_model()
        model.summary()

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10),
            ModelCheckpoint(os.path.abspath(self.output().path), verbose=1,
                            save_weights_only=True, save_best_only=True),
        ]
        class_weight = compute_class_weight('balanced', [0, 1], validation_predictions)
        class_weight = dict(enumerate(class_weight))
        model.fit_generator(
            training_generator, training_steps_per_epoch,
            validation_data=(validation_inputs, validation_predictions),
            callbacks=callbacks, class_weight=class_weight,
            epochs=self.epochs, verbose=1)

        validation_fd.close()
        training_fd.close()
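# Side note on the class weights above: 'balanced' rescales each class by
# n_samples / (n_classes * count(class)). A minimal standalone check (this
# snippet is illustrative, not part of the pipeline): with 90% negatives the
# weights come out to roughly {0: 0.56, 1: 5.0}, which is the dict handed to
# fit_generator.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight(class_weight='balanced',
                               classes=np.array([0, 1]),
                               y=np.array([0] * 9 + [1]))
print(dict(enumerate(weights)))  # {0: 0.5555..., 1: 5.0}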
def z_standard_normal(num, random_state=None):
    """Standard normal float32s by the Ziggurat method."""
    # Extra random samples because the Ziggurat rejects some
    zig_fac = 0.025
    if random_state is None:
        random_state = RandomState(seed=123)
    # Space to store the randoms
    out = empty(num, dtype=float32)
    i0 = 0
    while i0 < num:
        n_needed = num - i0
        # How many random integers do we need for n_needed random normals?
        # Mean + binomial estimate for 1 std. dev.
        # (c.f. ignore (1 - zig_fac) and get a Poisson estimate)
        pad = n_needed * zig_fac + (n_needed * zig_fac * (1 - zig_fac))**0.5
        guess_n_rand = int(n_needed + pad + 1)
        if i0 > 0:
            print('Not enough random integers for normals, increasing')
            print('Need {:,} random numbers'.format(n_needed),
                  'guess {:,}'.format(guess_n_rand), 'i0=', i0)
        # (np.frombuffer is the non-deprecated equivalent of fromstring here)
        ri = fromstring(random_state.bytes(4 * guess_n_rand), dtype=uint32)
        # Consume ri as needed; zigg may return fewer than n_needed normals
        # if it exhausts the integer stream, in which case the loop retries
        res = zigg(n_needed, ri)
        out[i0:i0 + res.size] = res
        i0 += res.size
    return out
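# Usage sketch for z_standard_normal (assumes zigg() consumes the uint32
# stream as described above): draws are reproducible given the RandomState,
# and the first two sample moments should come out near (0, 1).
rng = RandomState(seed=2024)
samples = z_standard_normal(1_000_000, random_state=rng)
print(samples.mean(), samples.std())  # expect approximately 0.0 and 1.0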
def init_random_state(random_seed):
    rng = RandomState(random_seed)
    np.random.seed(int.from_bytes(rng.bytes(4), byteorder=sys.byteorder))
    torch.manual_seed(int.from_bytes(rng.bytes(4), byteorder=sys.byteorder))
    return rng
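# Usage sketch: deriving the numpy and torch seeds from one master seed means
# a single integer reproduces every downstream draw, and the returned
# RandomState keeps issuing randomness from its own stream.
rng = init_random_state(3996193)
permutation = rng.permutation(10)  # drawn from the seeded generator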
class NGram(base.BaseModel):

    def main(self):
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['comment_text'].apply(unidecode)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['comment_text'].apply(unidecode)

        vectorizer = self.build_vectorizer(train_df, test_df)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')
            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            self.predict(models, vectorizer, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(models, vectorizer, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))

    def train(self, fold_num, vectorizer, train_df, val_df):
        X_train = vectorizer.transform(train_df['comment_text'])
        models = {}
        for label in common.LABELS:
            logger.info('Training the %s model', label)
            y_train = train_df[label].values
            model = LogisticRegression(
                solver='sag', penalty='l2', C=self.params['C'],
                tol=1e-8, max_iter=1000,
                random_state=self.random_state, verbose=1)
            model.fit(X_train, y_train)
            models[label] = model
        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        joblib.dump((vectorizer, models), path)
        return models

    def predict(self, models, vectorizer, df, output_path):
        X = vectorizer.transform(df['comment_text'])
        output = defaultdict(list)
        for label in common.LABELS:
            model = models[label]
            yhat = model.predict_proba(X)[:, 1]
            output[label].extend(yhat)
        predictions = pd.DataFrame.from_dict(output)
        predictions = predictions[common.LABELS]
        predictions.insert(0, 'id', df['id'].values)
        predictions.to_csv(output_path, index=False)

    def build_vectorizer(self, train_df, test_df):
        logger.info('Learning the vocabulary')
        vectorizer = TfidfVectorizer(
            strip_accents='unicode',
            analyzer=self.params['analyzer'],
            min_df=self.params['min_df'],
            ngram_range=(1, self.params['max_ngram']),
            max_features=self.params['max_features'],
            stop_words='english',
            sublinear_tf=True)
        train_text = train_df['comment_text']
        test_text = test_df['comment_text']
        all_text = pd.concat([train_text, test_text])
        vectorizer.fit(all_text)
        logger.info('The vocabulary has %s words (%s ignored as stopwords)',
                    len(vectorizer.vocabulary_), len(vectorizer.stop_words_))
        return vectorizer
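# common.stratified_kfold is shared by every model here; judging by its use,
# it yields (fold_num, train_ids, val_ids) for 10 folds with fold_num starting
# at 1. The sketch below is an assumption about common's internals, not its
# actual code: stratify on the concatenated binary label pattern so rare label
# combinations are spread evenly across folds.
from sklearn.model_selection import StratifiedKFold

def stratified_kfold_sketch(df, random_seed, labels=None, n_splits=10):
    labels = labels if labels is not None else common.LABELS
    # One stratification key per row, e.g. '010001' for a 6-label pattern
    key = df[labels].astype(str).apply(''.join, axis=1)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(df, key), start=1):
        yield fold_num, set(df['id'].iloc[train_idx]), set(df['id'].iloc[val_idx])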
class XGB(base.BaseModel):

    def main(self):
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            # Without this return, a finished model would be retrained
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        preprocessed_data = preprocessing.load(self.params)
        vectorizer = self.build_vectorizer(preprocessed_data)

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['id'].map(preprocessed_data)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['id'].map(preprocessed_data)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')
            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            self.predict(models, vectorizer, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(models, vectorizer, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))

    def train(self, fold_num, vectorizer, train_df, val_df):
        X_train = vectorizer.transform(train_df['comment_text'])
        X_val = vectorizer.transform(val_df['comment_text'])
        models = {}
        for label in common.LABELS:
            logger.info('Training the %s model', label)
            y_train, y_val = train_df[label].values, val_df[label].values
            model = xgb.XGBClassifier(
                n_estimators=10000,  # determined by early stopping
                objective='binary:logistic',
                max_depth=self.params['max_depth'],
                min_child_weight=self.params['min_child_weight'],
                subsample=self.params['subsample'],
                colsample_bytree=self.params['colsample_bytree'],
                learning_rate=self.params['learning_rate'],
                random_state=self.random_seed,
                n_jobs=mp.cpu_count())
            model.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='auc',
                      early_stopping_rounds=self.params['patience'])
            models[label] = model
        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        joblib.dump((vectorizer, models), path)
        return models

    def predict(self, models, vectorizer, df, output_path):
        X = vectorizer.transform(df['comment_text'])
        output = defaultdict(list)
        for label in common.LABELS:
            model = models[label]
            yhat = model.predict_proba(X, ntree_limit=model.best_ntree_limit)[:, 1]
            output[label].extend(yhat)
        predictions = pd.DataFrame.from_dict(output)
        predictions = predictions[common.LABELS]
        predictions.insert(0, 'id', df['id'].values)
        predictions.to_csv(output_path, index=False)

    def build_vectorizer(self, preprocessed_data):
        logger.info('Learning the vocabulary')
        vectorizer = TfidfVectorizer(min_df=self.params['min_df'])
        vectorizer.fit(preprocessed_data.values())
        logger.info('The vocabulary has %s words (%s ignored as stopwords)',
                    len(vectorizer.vocabulary_), len(vectorizer.stop_words_))
        return vectorizer
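# Hypothetical invocation (a sketch, assuming common/preprocessing are set
# up), using the 'xgb' hyperparameters listed in LabelStacking.model_params
# below:
xgb_params = {'vocab_size': 300000, 'max_len': 1000, 'min_df': 5,
              'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.4,
              'colsample_bytree': 0.5, 'learning_rate': 0.1, 'patience': 50}
XGB(xgb_params, random_seed=base.RANDOM_SEED).main()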
class LabelStacking(object):

    model_cls = {
        'lstm': LSTM,
        'gru': GRU,
        'dpcnn': DPCNN,
        'gcnn': GCNN,
        'char-ngram': NGram,
        'word-ngram': NGram,
        'mlp': MLP,
        'xgb': XGB,
    }

    model_params = {
        'gcnn': {
            'vocab_size': 100000, 'max_len': 300, 'vectors': 'glove.42B.300d',
            'num_blocks': 1, 'num_layers': 2, 'num_channels': 128,
            'kernel_size': 3, 'dense_layers': 0, 'dense_dropout': 0.5,
            'batch_size': 64, 'lr_high': 1.0, 'lr_low': 0.2,
        },
        'lstm': {
            'vocab_size': 30000, 'max_len': 300, 'vectors': 'glove.42B.300d',
            'rnn_size': 500, 'rnn_dropout': 0.2, 'dense_layers': 1,
            'dense_dropout': 0.5, 'batch_size': 128, 'lr_high': 0.5,
            'lr_low': 0.01,
        },
        'dpcnn': {
            'vocab_size': 50000, 'max_len': 400, 'vectors': 'glove.42B.300d',
            'conv_blocks': 1, 'conv_dropout': 0.1, 'dense_layers': 1,
            'dense_dropout': 0.5, 'batch_size': 256, 'lr_high': 0.01,
            'lr_low': 0.001,
        },
        'char-ngram': {
            'analyzer': 'char', 'min_df': 5, 'max_ngram': 5,
            'max_features': 100000, 'C': 1.0,
        },
        'word-ngram': {
            'analyzer': 'word', 'min_df': 5, 'max_ngram': 2,
            'max_features': 50000, 'C': 1.0,
        },
        'mlp': {
            'vocab_size': 100000, 'max_len': 600, 'vectors': 'glove.42B.300d',
            'hidden_layers': 2, 'hidden_units': 600, 'input_dropout': 0.1,
            'hidden_dropout': 0.5, 'batch_size': 512, 'lr_high': 0.3,
            'lr_low': 0.1,
        },
        'xgb': {
            'vocab_size': 300000, 'max_len': 1000, 'min_df': 5,
            'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.4,
            'colsample_bytree': 0.5, 'learning_rate': 0.1, 'patience': 50,
        },
        'gru': {
            'vocab_size': 100000, 'max_len': 300,
            'vectors': 'glove.twitter.27B.200d', 'annotation_dropout': 0.1,
            'prediction_dropout': 0.3, 'batch_size': 256, 'lr_high': 0.5,
            'lr_low': 0.1,
        },
    }

    def __init__(self, label, params, random_seed):
        self.label = label
        self.params = params
        self.random_seed = random_seed
        self.output_dir = os.path.join(common.OUTPUT_DIR, 'label_stacking',
                                       str(self.random_seed), self.label,
                                       common.params_str(self.params))
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def main(self):
        t_start = datetime.now()
        logger.info(' label_stacking / {} '.format(self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        test_df = common.load_data('test')
        train_df = common.load_data('train')

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')

            logger.info('Loading the training and validation data for the %s model', self.label)
            X_train = self.load_inputs(train_ids, 'train')
            X_val = self.load_inputs(val_ids, 'train')
            y_train = train_df.loc[train_df['id'].isin(train_ids)].sort_values('id')
            y_train = y_train[self.label].values
            y_val = train_df[train_df['id'].isin(val_ids)].sort_values('id')
            y_val = y_val[self.label].values

            logger.info('Training the %s model', self.label)
            model = self.train(fold_num, self.label, X_train, y_train, X_val, y_val)

            logger.info('Generating the out-of-fold predictions')
            y_model = self.predict(model, X_val)
            val_pred = pd.DataFrame({'id': sorted(list(val_ids)), self.label: y_model})
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            val_pred.to_csv(path, index=False)

            logger.info('Generating the test predictions')
            X_test = self.load_inputs(test_df['id'].values, 'test')
            y_model = self.predict(model, X_test)
            test_pred = pd.DataFrame({'id': test_df['id'], self.label: y_model})
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            test_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id', self.label])
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Elapsed time - {}'.format(datetime.now() - t_start))

    def load_inputs(self, ids, dataset):
        X = []
        for name in self.params['models']:
            model = self.model_cls[name](self.model_params[name], random_seed=base.RANDOM_SEED)
            df = pd.read_csv(os.path.join(model.output_dir, f'{dataset}.csv'))
            df = df[df['id'].isin(ids)]
            df = df[['id'] + common.LABELS].sort_values('id')
            X.append(df[common.LABELS].values)
        X = np.hstack(X)
        return X

    def train(self, fold_num, label, X_train, y_train, X_val, y_val):
        model = xgb.XGBClassifier(
            n_estimators=10000,  # determined by early stopping
            objective='binary:logistic',
            max_depth=self.params['max_depth'],
            min_child_weight=self.params['min_child_weight'],
            subsample=self.params['subsample'],
            colsample_bytree=self.params['colsample_bytree'],
            learning_rate=self.params['learning_rate'],
            random_state=self.random_seed,
            n_jobs=mp.cpu_count())
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  early_stopping_rounds=self.params['patience'])
        self.save_model(fold_num, label, model)
        return model

    def save_model(self, fold_num, label, model):
        model_file = os.path.join(self.output_dir, f'fold{fold_num}_{label}.pickle')
        joblib.dump(model, model_file)

    def load_model(self, fold_num, label):
        model_file = os.path.join(self.output_dir, f'fold{fold_num}_{label}.pickle')
        model = joblib.load(model_file)
        return model

    def predict(self, model, X):
        output = model.predict_proba(X, ntree_limit=model.best_ntree_limit)[:, 1]
        return output
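# Hypothetical driver (a sketch): the stacker is fit once per label, each an
# XGBoost model over the concatenated out-of-fold predictions of the base
# models. The stacking hyperparameter values here are placeholders, not tuned
# settings from the project.
stacking_params = {'models': ['lstm', 'gru', 'dpcnn', 'char-ngram', 'word-ngram'],
                   'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8,
                   'colsample_bytree': 0.8, 'learning_rate': 0.1, 'patience': 50}
for label in common.LABELS:
    LabelStacking(label, stacking_params, random_seed=base.RANDOM_SEED).main()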
class RNNv4(object):

    product_history = luigi.IntParameter(default=91)
    scoring_dim = luigi.IntParameter(default=10)
    hidden_layers = luigi.IntParameter(default=2)
    hidden_nonlinearily = luigi.Parameter(default='leaky_relu')
    dropout = luigi.FloatParameter(default=0.0)

    random_seed = luigi.IntParameter(default=3996193, significant=False)
    global_orders_ratio = luigi.FloatParameter(default=0.00001, significant=False)
    validation_orders_ratio = luigi.FloatParameter(default=0.1, significant=False)
    target_orders_ratio = luigi.FloatParameter(default=0.1, significant=False)
    epochs = luigi.IntParameter(default=1000, significant=False)

    @property
    def model_name(self):
        params = [
            self.product_history, self.scoring_dim, self.hidden_layers,
            self.hidden_nonlinearily, self.dropout,
        ]
        model_name = 'rnn_v4_{}'.format('_'.join(str(p).lower() for p in params))
        return model_name

    def _init_random_state(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        torch.manual_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

    def _iter_user_data(self, orders_path, shuffle=False):
        with (open_shuffled(orders_path) if shuffle else open(orders_path)) as orders_file:
            for line in orders_file:
                user_data = ujson.loads(line)
                yield user_data

    def _generate_example(self, prior_orders, last_order):
        recently_ordered = set()
        product_history = []
        for order_num, order in enumerate(prior_orders):
            weekday = order['day_of_week']
            # Cyclic encoding of the hour onto the unit circle. Note that
            # dividing by 23 maps hour 0 and hour 23 to the same point;
            # dividing by 24 would keep all 24 hours distinct.
            hour_sin = np.sin(2 * np.pi * order['hour_of_day'] / 23)
            hour_cos = np.cos(2 * np.pi * order['hour_of_day'] / 23)
            hour = np.array([hour_sin, hour_cos])
            for product in order['products']:
                product_history.append({
                    'weekday': weekday,
                    'hour': hour,
                    'department': int(PRODUCT_ID_TO_DEPT_ID[product['product_id']] - 1),
                    'aisle': int(PRODUCT_ID_TO_AISLE_ID[product['product_id']] - 1),
                    'product': int(product['product_id'] - 1),
                })
                recently_ordered.add(product['product_id'])

        weekday = last_order['day_of_week']
        hour_sin = np.sin(2 * np.pi * last_order['hour_of_day'] / 23)
        hour_cos = np.cos(2 * np.pi * last_order['hour_of_day'] / 23)
        hour = np.array([hour_sin, hour_cos])

        next_products = []
        next_products_targets = []
        reordered = set()
        for product in last_order['products']:
            if product['reordered'] and product['product_id'] in recently_ordered:
                reordered.add(product['product_id'])
        for product_id in recently_ordered:
            next_products.append({
                'weekday': weekday,
                'hour': hour,
                'department': int(PRODUCT_ID_TO_DEPT_ID[product_id] - 1),
                'aisle': int(PRODUCT_ID_TO_AISLE_ID[product_id] - 1),
                'product': int(product_id - 1),
            })
            next_products_targets.append(int(product_id in reordered))

        return product_history, next_products, next_products_targets

    def _generate_examples(self, orders_path, target_orders=None, shuffle=False):
        for user_data in self._iter_user_data(orders_path, shuffle=shuffle):
            user_orders = user_data['prior_orders'].copy()
            user_orders.append(user_data['last_order'])
            # Determine the number of target orders to include for this user.
            user_target_orders = target_orders
            if not user_target_orders:
                user_target_orders = int(np.ceil(self.target_orders_ratio * len(user_orders)))
            for last_order_index in reversed(range(1, len(user_orders))):
                last_order = user_orders[last_order_index]
                prior_orders = []
                days_count = last_order['days_since_prior_order']
                for order in reversed(user_orders[:last_order_index]):
                    prior_orders.insert(0, order)
                    if order['days_since_prior_order'] is not None:
                        # There is at least another order, stop if it will go over the limit
                        days_count += order['days_since_prior_order']
                        if days_count >= self.product_history:
                            break
                yield self._generate_example(prior_orders, last_order)
                user_target_orders -= 1
                if user_target_orders == 0:
                    break

    def _format_as_tensors(self, product_history, next_products, next_products_targets):

        def create_tensor(tensor_type, orders, field):
            return Variable(tensor_type([p[field] for p in orders]), requires_grad=False)

        product_history_tensor = {
            'weekday': create_tensor(LongTensor, product_history, 'weekday').view(1, -1),
            'hour': create_tensor(FloatTensor, product_history, 'hour').view(1, -1, 2),
            'department': create_tensor(LongTensor, product_history, 'department').view(1, -1),
            'aisle': create_tensor(LongTensor, product_history, 'aisle').view(1, -1),
            'product': create_tensor(LongTensor, product_history, 'product').view(1, -1),
        }
        next_products_tensor = {
            'weekday': create_tensor(LongTensor, next_products, 'weekday').view(-1, 1),
            'hour': create_tensor(FloatTensor, next_products, 'hour').view(-1, 2),
            'department': create_tensor(LongTensor, next_products, 'department').view(-1, 1),
            'aisle': create_tensor(LongTensor, next_products, 'aisle').view(-1, 1),
            'product': create_tensor(LongTensor, next_products, 'product').view(-1, 1),
        }
        next_products_targets_tensor = Variable(FloatTensor(next_products_targets),
                                                requires_grad=False)
        return product_history_tensor, next_products_tensor, next_products_targets_tensor

    def _load_model(self):
        # Wire the scoring_dim parameter through (the original hardcoded 10,
        # which silently ignored the Luigi parameter declared above)
        model = Model(weekday_dim=2, department_dim=3, aisle_dim=5, product_dim=10,
                      scoring_dim=self.scoring_dim,
                      hidden_layers=self.hidden_layers,
                      hidden_nonlinearily=self.hidden_nonlinearily,
                      dropout=self.dropout)
        return model
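# Worked check of the cyclic hour encoding used above (illustrative snippet):
# mapping the hour onto the unit circle keeps late-night and early-morning
# orders close together. It also makes the divide-by-23 quirk visible.
import numpy as np

for hour in (0, 6, 12, 23):
    print(hour,
          round(np.sin(2 * np.pi * hour / 23), 3),
          round(np.cos(2 * np.pi * hour / 23), 3))
# hour 0  -> (0.0, 1.0)
# hour 23 -> (-0.0, 1.0), i.e. the same point as hour 0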
class BaseModel(object):

    def __init__(self, params, random_seed):
        self.name = self.__class__.__name__.lower()
        self.params = params
        self.random_seed = random_seed
        self.output_dir = os.path.join(common.OUTPUT_DIR, self.name,
                                       str(self.random_seed),
                                       common.params_str(self.params))
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def main(self):
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))
        torch.manual_seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        preprocessed_data = self.load_preprocessed_data()
        self.fields, self.vocab = self.build_fields_and_vocab(preprocessed_data)

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['id'].map(preprocessed_data)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['id'].map(preprocessed_data)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')
            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            model = self.train(fold_num, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            self.predict(model, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(model, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))

    def load_preprocessed_data(self):
        preprocessed_data = preprocessing.load(self.params)
        return preprocessed_data

    def build_fields_and_vocab(self, preprocessed_data):
        text_field = Field(pad_token='<PAD>', unk_token=None,
                           batch_first=True, include_lengths=True)
        labels_field = Field(sequential=False, use_vocab=False,
                             tensor_type=torch.FloatTensor)
        fields = [('text', text_field), ('labels', labels_field)]

        # Build the vocabulary
        datasets = []
        for dataset in ['train', 'test']:
            df = common.load_data(dataset)
            df['comment_text'] = df['id'].map(preprocessed_data)
            datasets.append(CommentsDataset(df, fields))
        text_field.build_vocab(*datasets)
        vocab = text_field.vocab
        assert vocab.stoi['<PAD>'] == 0

        # Fill in missing words with the mean of the existing vectors
        vectors = pretrained_aliases[self.params['vectors']]()
        vectors_sum = np.zeros((vectors.dim, ))
        vectors_count = 0
        for token in vocab.itos:
            if token in vectors.stoi:
                vectors_sum += vectors[token].numpy()
                vectors_count += 1
        mean_vector = torch.FloatTensor(vectors_sum / vectors_count).unsqueeze(0)

        def getitem(self, token):
            return self.vectors[self.stoi[token]] if token in self.stoi else mean_vector

        Vectors.__getitem__ = getitem
        vocab.load_vectors(vectors)

        return fields, vocab

    def train(self, fold_num, train_df, val_df):
        train_iter = self.build_train_iterator(train_df)
        _, val_iter = self.build_prediction_iterator(val_df)
        logger.info('Training on {:,} examples, validating on {:,} examples'.format(
            len(train_iter.dataset), len(val_iter.dataset)))

        # Train the model keeping the word embeddings frozen until the validation AUC
        # stops improving, then unfreeze the embeddings and fine-tune the entire
        # model with a lower learning rate. Use SGD with warm restarts.
        model = self.build_model()
        model.embedding.weight.requires_grad = False
        parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
        model_size = sum([np.prod(p.size()) for p in parameters])
        logger.info('Optimizing {:,} parameters:\n{}'.format(model_size, model))

        run = epoch = 0
        lr_max = self.params['lr_high']
        optimizer = optim.SGD(parameters, lr=lr_max, momentum=0.9)
        t_max = 10
        best_val_auc = 0

        while True:
            run += 1
            # grad_norms = []
            t_cur, lr = 0, lr_max
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            logger.info('Starting run {} - t_max {}'.format(run, t_max))

            for t_index in range(t_max):
                epoch += 1
                loss_sum = 0
                model.train()
                t = tqdm(train_iter, ncols=79)
                for batch_index, batch in enumerate(t):
                    # Update the learning rate
                    t_cur = t_index + batch_index / len(train_iter)
                    lr = lr_max * (1 + math.cos(math.pi * t_cur / t_max)) / 2
                    t.set_postfix(t_cur='{:.4f}'.format(t_cur), lr='{:.6f}'.format(lr))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                    # Forward and backward pass
                    optimizer.zero_grad()
                    loss = self.calculate_loss(model, batch)
                    loss.backward()
                    # grad_vector = [p.grad.data.view(-1) for p in parameters]
                    # grad_norms.append(torch.cat(grad_vector).norm())
                    self.update_parameters(model, optimizer, loss)
                    loss_sum += loss.data[0]

                loss = loss_sum / len(train_iter)
                logger.info('Epoch {} - run {} - t_cur {}/{} - lr {:.6f} - loss {:.6f}'
                            .format(epoch, run, int(math.ceil(t_cur)), t_max, lr, loss))
                # https://arxiv.org/abs/1212.0901
                # logger.info('Average norm of the gradient - {:.6f}'.format(np.mean(grad_norms)))

            # Run ended - evaluate early stopping
            val_auc = self.evaluate_model(model, val_iter)
            if val_auc > best_val_auc:
                logger.info('Saving best model - val_auc {:.6f}'.format(val_auc))
                self.save_model(fold_num, model)
                best_val_auc = val_auc
            else:
                logger.info('Stopping - val_auc {:.6f}'.format(val_auc))
                if self.params['lr_low'] == 0 or model.embedding.weight.requires_grad:
                    # Fine-tuning disabled or it just finished
                    break
                else:
                    model = self.load_model(fold_num)
                    model.embedding.weight.requires_grad = True
                    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
                    model_size = sum([np.prod(p.size()) for p in parameters])
                    logger.info('Fine-tuning {:,} parameters - best_val_auc {:.6f}'
                                .format(model_size, best_val_auc))
                    run = 0
                    lr_max = self.params['lr_low']
                    optimizer = optim.SGD(parameters, lr=lr_max, momentum=0.9)
                    t_max = 1

        logger.info('Final model - best_val_auc {:.6f}'.format(best_val_auc))
        model = self.load_model(fold_num)
        return model

    def predict(self, model, df, output_path):
        model.eval()
        predictions = []
        pred_id, pred_iter = self.build_prediction_iterator(df)
        for batch in pred_iter:
            (text, text_lengths), _ = batch.text, batch.labels
            output = model(text, text_lengths)
            predictions.append(output.data.cpu())
        predictions = torch.cat(predictions).numpy()
        predictions = pd.DataFrame(predictions, columns=common.LABELS)
        predictions.insert(0, 'id', pred_id)
        predictions.to_csv(output_path, index=False)

    def build_train_iterator(self, df):
        raise NotImplementedError

    def build_prediction_iterator(self, df):
        raise NotImplementedError

    def build_model(self):
        raise NotImplementedError

    def calculate_loss(self, model, batch):
        (text, text_lengths), labels = batch.text, batch.labels
        output = model(text, text_lengths)
        loss = F.binary_cross_entropy(output, labels)
        return loss

    def update_parameters(self, model, optimizer, loss):
        optimizer.step()

    def evaluate_model(self, model, batch_iter):
        model.eval()
        labels, predictions = [], []
        for batch in batch_iter:
            text, text_lengths = batch.text
            labels.append(batch.labels.data.cpu())
            output = model(text, text_lengths)
            predictions.append(output.data.cpu())
        labels = torch.cat(labels).numpy()
        predictions = torch.cat(predictions).numpy()
        auc = roc_auc_score(labels, predictions, average='macro')
        return auc

    def save_model(self, fold_num, model):
        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        torch.save(model.state_dict(), path)

    def load_model(self, fold_num):
        model = self.build_model()
        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        model.load_state_dict(torch.load(path))
        return model
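# The training loop above implements SGD with warm restarts (SGDR,
# https://arxiv.org/abs/1608.03983): within a run of t_max epochs the learning
# rate decays from lr_max to 0 along a half-cosine, then resets at the next
# run. A minimal standalone sketch of the same schedule:
import math

def sgdr_lr(lr_max, t_cur, t_max):
    return lr_max * (1 + math.cos(math.pi * t_cur / t_max)) / 2

for t in range(0, 11):
    print(t, round(sgdr_lr(0.5, t, 10), 4))  # 0.5 at t=0, decaying to 0.0 at t=10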
class PredictMLPv2ThresholdVariable(_PredictMLPv2):

    @property
    def model_name(self):
        model_name = super().model_name
        model_name += '_threshold_variable'
        return model_name

    def _determine_reorder_thresholds(self, model, scores):
        orders_path = self.requires()['orders'].output().path
        all_user_ids, all_order_ids, all_product_ids, all_inputs, all_targets = \
            self._load_data(orders_path, num_orders_per_user=self.num_orders_per_user)

        target_order_ids = set(scores.order_id)
        user_id_to_target_order_id = {}
        for i in range(len(all_user_ids)):
            if all_order_ids[i] in target_order_ids:
                user_id_to_target_order_id[all_user_ids[i]] = all_order_ids[i]

        # Keep only the historical (non-target) orders
        mask = np.array([order_id not in target_order_ids for order_id in all_order_ids])
        for k in all_inputs.keys():
            all_inputs[k] = all_inputs[k][mask]
        all_predictions = model.predict(all_inputs, batch_size=self.batch_size, verbose=0).flatten()

        results = pd.DataFrame({
            'user_id': list(itertools.compress(all_user_ids, mask)),
            'order_id': list(itertools.compress(all_order_ids, mask)),
            'product_id': list(itertools.compress(all_product_ids, mask)),
            'prediction': all_predictions,
            'target': all_targets[mask],
        })

        # Find the best threshold value for each previous order by each user
        best_thresholds = defaultdict(list)
        grouped = results.groupby(['user_id', 'order_id'])
        for (user_id, order_id), group in grouped:
            product_ids = np.array(group['product_id'])
            reordered = set(product_ids[np.array(group['target']) > 0])
            probability = np.array(group['prediction'])
            if not reordered:
                best_threshold = probability.max()
            else:
                best_threshold, best_f1 = None, None
                for threshold in probability:
                    predicted = set(product_ids[probability >= threshold])
                    tp = len(predicted & reordered)
                    precision = tp / len(predicted)
                    recall = tp / len(reordered)
                    f1 = 2.0 * (precision * recall) / (precision + recall) if precision or recall else 0.0
                    if best_f1 is None or f1 > best_f1:
                        best_threshold = threshold
                        best_f1 = f1
            best_thresholds[user_id].append(best_threshold)

        # Select the average threshold for each user
        reorder_thresholds = {}
        for user_id in user_id_to_target_order_id:
            order_id = user_id_to_target_order_id[user_id]
            reorder_thresholds[order_id] = np.mean(best_thresholds[user_id])
        return reorder_thresholds

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        orders_path = self.requires()['orders'].output().path
        user_ids, order_ids, product_ids, inputs, _ = self._load_data(orders_path)
        scores = model.predict(inputs, batch_size=self.batch_size, verbose=0).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': product_ids, 'score': scores})

        reorder_thresholds = self._determine_reorder_thresholds(model, scores)

        predictions = {}
        for order_id in set(order_ids):
            # Coerce the keys to plain ints too, so the dict survives ujson.dump
            predictions[int(order_id)] = []
            df = scores[scores.order_id == order_id]
            df = df[df.score >= reorder_thresholds[order_id]]
            for row in df.itertuples(index=False):
                # ujson fails when it tries to serialize the numpy int values
                predictions[int(order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
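# Toy walk-through of the per-order threshold search above (illustrative
# snippet mirroring the loop in _determine_reorder_thresholds): with scores
# [0.9, 0.6, 0.3] and only the first product actually reordered, thresholding
# at 0.9 yields precision 1, recall 1, F1 1.0, so 0.9 is kept as the best
# threshold for that historical order.
import numpy as np

probability = np.array([0.9, 0.6, 0.3])
product_ids = np.array([11, 22, 33])
reordered = {11}
best_threshold, best_f1 = None, None
for threshold in probability:
    predicted = set(product_ids[probability >= threshold])
    tp = len(predicted & reordered)
    precision, recall = tp / len(predicted), tp / len(reordered)
    f1 = 2.0 * (precision * recall) / (precision + recall) if precision or recall else 0.0
    if best_f1 is None or f1 > best_f1:
        best_threshold, best_f1 = threshold, f1
print(best_threshold, best_f1)  # 0.9 1.0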