Example #1
class PredictRNNv3Threshold(_PredictRNNv3):

    threshold = luigi.FloatParameter(default=0.29)

    @property
    def model_name(self):
        model_name = super().model_name
        model_name += '_threshold_{}'.format(self.threshold)
        return model_name

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        orders_path = self.requires()['orders'].output().path
        order_ids, inputs, _ = self._load_data(orders_path)

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        scores = model.predict(inputs, batch_size=self.batch_size, verbose=1).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': inputs['product'], 'score': scores})
        scores = scores[scores.score > self.threshold].sort_values('score', ascending=False)

        predictions = {}
        for order_id in order_ids:
            predictions[order_id] = []
        for row in scores.itertuples(index=False):
            # ujson fails when it tries to serialize the numpy int values
            predictions[int(row.order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
Example #2
class RNNv5(object):

    product_history = luigi.IntParameter(default=91)
    embedding_dim = luigi.IntParameter(default=10)
    lstm_size = luigi.IntParameter(default=25)
    lstm_layers = luigi.IntParameter(default=2)
    hidden_layers = luigi.IntParameter(default=2)
    hidden_nonlinearily = luigi.Parameter(default='leaky_relu')
    dropout = luigi.FloatParameter(default=0.2)

    random_seed = luigi.IntParameter(default=3996193, significant=False)
    global_orders_ratio = luigi.FloatParameter(default=1.0, significant=False)
    validation_orders_ratio = luigi.FloatParameter(default=0.2, significant=False)
    batch_size = luigi.IntParameter(default=4096, significant=False)
    negative_factor = luigi.IntParameter(default=2, significant=False)
    epochs = luigi.IntParameter(default=1000, significant=False)

    @property
    def model_name(self):
        params = [
            self.product_history,
            self.embedding_dim,
            self.lstm_size,
            self.lstm_layers,
            self.hidden_layers,
            self.hidden_nonlinearily,
            self.dropout,
        ]
        model_name = 'rnn_v5_{}'.format('_'.join(str(p).lower() for p in params))
        return model_name

    def _init_random_state(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        torch.manual_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

    def _build_model(self):
        model = Model(
            embedding_dim=self.embedding_dim,
            lstm_size=self.lstm_size,
            lstm_layers=self.lstm_layers,
            hidden_layers=self.hidden_layers,
            hidden_nonlinearily=self.hidden_nonlinearily,
            dropout=self.dropout)

        if torch.cuda.is_available():
            model = model.cuda()

        print(model)

        return model
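
With the defaults declared above, model_name resolves to one string that encodes the significant hyperparameters:

# Worked example using the parameter defaults shown in this class:
# params = [91, 10, 25, 2, 2, 'leaky_relu', 0.2]
# model_name == 'rnn_v5_91_10_25_2_2_leaky_relu_0.2'
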
Example #3
class PredictRNNv3ReorderSizeKnown(_PredictRNNv3):

    @staticmethod
    def _count_reordered_products(order):
        k = 0
        for product in order['products']:
            if product['reordered']:
                k += 1
        return k

    def _determine_reorder_size(self):
        assert self.mode == 'evaluation'
        num_reordered = {}
        orders_path = self.requires()['orders'].output().path
        with open(orders_path) as orders_file:
            for line in orders_file:
                user_data = ujson.loads(line)
                order_id = int(user_data['last_order']['order_id'])
                num_reordered[order_id] = self._count_reordered_products(user_data['last_order'])
        return num_reordered

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        orders_path = self.requires()['orders'].output().path
        order_ids, inputs, _ = self._load_data(orders_path)

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        scores = model.predict(inputs, batch_size=self.batch_size, verbose=0).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': inputs['product'], 'score': scores})

        reorder_size = self._determine_reorder_size()

        predictions = {}
        for order_id in set(order_ids):
            predictions[order_id] = []
            df = scores[scores.order_id == order_id].nlargest(reorder_size[order_id], 'score')
            for row in df.itertuples(index=False):
                # ujson fails when it tries to serialize the numpy int values
                predictions[int(order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
Example #4
class PredictMLPv2ExpectedF1(_PredictMLPv2):

    @property
    def model_name(self):
        model_name = super().model_name
        model_name += '_expected_f1'
        return model_name

    def _determine_reorder_size(self, scores):
        reorder_size = {}
        grouped = scores.groupby('order_id')
        P_values = grouped['score'].apply(list)
        with multiprocessing.Pool() as pool:
            results = pool.map(maximize_expected_f1, P_values)
        for order_id, (best_k, max_f1) in zip(grouped.groups.keys(), results):
            reorder_size[order_id] = best_k
        return reorder_size

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        orders_path = self.requires()['orders'].output().path
        _, order_ids, product_ids, inputs, _ = self._load_data(orders_path)

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        scores = model.predict(inputs, batch_size=self.batch_size, verbose=0).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': product_ids, 'score': scores})
        reorder_size = self._determine_reorder_size(scores)

        predictions = {}
        for order_id in set(order_ids):
            predictions[order_id] = []
            df = scores[scores.order_id == order_id].nlargest(reorder_size[order_id], 'score')
            for row in df.itertuples(index=False):
                # ujson fails when it tries to serialize the numpy int values
                predictions[int(order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
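
The maximize_expected_f1 helper called from _determine_reorder_size is not shown in this example. A minimal sketch, assuming the common independence approximation E[F1@k] ~ 2 * sum(top-k scores) / (k + sum(all scores)); the project's actual implementation may differ (e.g. by also considering predicting no reorders at all):

import numpy as np

def maximize_expected_f1(probabilities):
    # Hypothetical sketch: choose the cutoff k that maximizes an approximate
    # expected F1. Under an independence assumption, E[TP@k] is the sum of the
    # k largest probabilities and the expected number of positives is the sum
    # of all probabilities, giving E[F1@k] ~ 2 * E[TP@k] / (k + E[positives]).
    p = np.sort(np.asarray(probabilities, dtype=float))[::-1]
    expected_positives = p.sum()
    best_k, max_f1 = 0, 0.0
    cumulative_tp = 0.0
    for k in range(1, len(p) + 1):
        cumulative_tp += p[k - 1]
        expected_f1 = 2.0 * cumulative_tp / (k + expected_positives)
        if expected_f1 > max_f1:
            best_k, max_f1 = k, expected_f1
    return best_k, max_f1
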
Example #5
class FitRNNv3(RNNv3, FitModel):

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        # Split the orders into training and validation sets and write them to separate files
        orders_path = self.requires()['orders'].output().path
        training_fd = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        validation_fd = tempfile.NamedTemporaryFile(mode='w+', delete=True)
        with open(orders_path) as input_fd:
            for line in input_fd:
                if self.global_orders_ratio >= 1 or self.random.uniform() <= self.global_orders_ratio:
                    if self.random.uniform() <= self.validation_orders_ratio:
                        validation_fd.write(line)
                    else:
                        training_fd.write(line)
        validation_fd.flush()
        training_fd.flush()

        _, validation_inputs, validation_predictions = self._load_data(validation_fd.name)
        training_generator, training_steps_per_epoch = \
            self._create_data_generator(training_fd.name, self.max_prior_orders, self.batch_size)

        model = self._build_model()
        model.summary()

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10),
            ModelCheckpoint(os.path.abspath(self.output().path), verbose=1, save_weights_only=True, save_best_only=True),
        ]
        class_weight = compute_class_weight('balanced', [0, 1], validation_predictions)
        class_weight = dict(enumerate(class_weight))
        model.fit_generator(training_generator, training_steps_per_epoch,
                            validation_data=(validation_inputs, validation_predictions),
                            callbacks=callbacks, class_weight=class_weight,
                            epochs=self.epochs, verbose=1)

        validation_fd.close()
        training_fd.close()
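
compute_class_weight('balanced', ...) above weights each class by n_samples / (n_classes * count(class)), so the rarer positive class gets the larger weight. A small illustration with made-up labels (the keyword-argument call style assumes a recent scikit-learn):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Illustrative labels: 90 negatives, 10 positives.
y = np.array([0] * 90 + [1] * 10)
weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
# weights ~= [0.556, 5.0]; passed to Keras as {0: 0.556, 1: 5.0}
class_weight = dict(enumerate(weights))
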
Example #6
def z_standard_normal(num, random_state=None):
    """
    Standard normal float32s by the Ziggurat method
    """

    # Extra random samples because the Ziggurat rejects some
    zig_fac = 0.025

    if random_state is None:
        random_state = RandomState(seed=123)

    # space to store the randoms
    out = empty(num, dtype=float32)
    i0 = 0

    while i0 < num:
        n_needed = num - i0

        # How many random integers do we need for n_needed random normals?
        pad = n_needed * zig_fac + (
            n_needed * zig_fac * (1 - zig_fac)
        )**0.5  # mean + one binomial std. dev. (dropping the (1 - zig_fac) factor gives the Poisson estimate)
        guess_n_rand = int(n_needed + pad + 1)
        if i0 > 0:
            print('Not enough random integers for normals, increasing')
            print('Need {:,} random numbers'.format(n_needed),
                  'guess {:,}'.format(guess_n_rand), 'i0=', i0)

        ri = fromstring(random_state.bytes(4 * guess_n_rand), dtype=uint32)

        # consume ri as needed
        res = zigg(n_needed, ri)

        out[i0:i0 + res.size] = res
        i0 += res.size

    return out
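
The padding estimate above requests roughly zig_fac extra uniforms per needed normal, plus one binomial standard deviation of slack. A quick numeric check of that formula:

# Worked check of guess_n_rand for a hypothetical request of one million normals:
num, zig_fac = 1_000_000, 0.025
mean_extra = num * zig_fac                          # 25,000 expected rejections
std_extra = (num * zig_fac * (1 - zig_fac)) ** 0.5  # ~156, one binomial std. dev.
guess_n_rand = int(num + mean_extra + std_extra + 1)  # ~1,025,157 uint32 draws
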
Example #7
def init_random_state(random_seed):
    rng = RandomState(random_seed)
    np.random.seed(int.from_bytes(rng.bytes(4), byteorder=sys.byteorder))
    torch.manual_seed(int.from_bytes(rng.bytes(4), byteorder=sys.byteorder))
    return rng
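
The pattern shared by these examples is to draw independent 4-byte seeds from one master RandomState, so a single integer reproduces NumPy, PyTorch (or TensorFlow) and any project-level randomness. A usage sketch, assuming the init_random_state function above is in scope:

import numpy as np
import torch

# Hypothetical usage: everything below is reproducible from the one seed.
rng = init_random_state(3996193)
project_noise = rng.normal(size=8)      # project-level RandomState
torch_weights = torch.randn(8)          # seeded via torch.manual_seed
numpy_mask = np.random.rand(8) < 0.5    # seeded via np.random.seed
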
Example #8
class NGram(base.BaseModel):

    def main(self):
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['comment_text'].apply(unidecode)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['comment_text'].apply(unidecode)

        vectorizer = self.build_vectorizer(train_df, test_df)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')

            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            self.predict(models, vectorizer, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(models, vectorizer, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))

    def train(self, fold_num, vectorizer, train_df, val_df):
        X_train = vectorizer.transform(train_df['comment_text'])

        models = {}
        for label in common.LABELS:
            logger.info('Training the %s model', label)
            y_train = train_df[label].values
            model = LogisticRegression(
                solver='sag',
                penalty='l2',
                C=self.params['C'],
                tol=1e-8,
                max_iter=1000,
                random_state=self.random_state,
                verbose=1)
            model.fit(X_train, y_train)
            models[label] = model

        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        joblib.dump((vectorizer, models), path)
        return models

    def predict(self, models, vectorizer, df, output_path):
        X = vectorizer.transform(df['comment_text'])
        output = defaultdict(list)
        for label in common.LABELS:
            model = models[label]
            yhat = model.predict_proba(X)[:, 1]
            output[label].extend(yhat)
        predictions = pd.DataFrame.from_dict(output)
        predictions = predictions[common.LABELS]
        predictions.insert(0, 'id', df['id'].values)
        predictions.to_csv(output_path, index=False)

    def build_vectorizer(self, train_df, test_df):
        logger.info('Learning the vocabulary')

        vectorizer = TfidfVectorizer(
            strip_accents='unicode',
            analyzer=self.params['analyzer'],
            min_df=self.params['min_df'],
            ngram_range=(1, self.params['max_ngram']),
            max_features=self.params['max_features'],
            stop_words='english',
            sublinear_tf=True)

        train_text = train_df['comment_text']
        test_text = test_df['comment_text']
        all_text = pd.concat([train_text, test_text])
        vectorizer.fit(all_text)
        logger.info('The vocabulary has %s words (%s ignored as stopwords)', 
                    len(vectorizer.vocabulary_), len(vectorizer.stop_words_))

        return vectorizer
Example #9
class XGB(base.BaseModel):
    def main(self):
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name,
                                       self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(
            int.from_bytes(self.random_state.bytes(4),
                           byteorder=sys.byteorder))

        preprocessed_data = preprocessing.load(self.params)
        vectorizer = self.build_vectorizer(preprocessed_data)

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['id'].map(preprocessed_data)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['id'].map(preprocessed_data)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')

            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            models = self.train(fold_num, vectorizer, fold_train_df,
                                fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir,
                                f'fold{fold_num}_validation.csv')
            self.predict(models, vectorizer, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(models, vectorizer, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir,
                                f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))

    def train(self, fold_num, vectorizer, train_df, val_df):
        X_train = vectorizer.transform(train_df['comment_text'])
        X_val = vectorizer.transform(val_df['comment_text'])

        models = {}
        for label in common.LABELS:
            logger.info('Training the %s model', label)
            y_train, y_val = train_df[label].values, val_df[label].values

            model = xgb.XGBClassifier(
                n_estimators=10000,  # determined by early stopping
                objective='binary:logistic',
                max_depth=self.params['max_depth'],
                min_child_weight=self.params['min_child_weight'],
                subsample=self.params['subsample'],
                colsample_bytree=self.params['colsample_bytree'],
                learning_rate=self.params['learning_rate'],
                random_state=self.random_seed,
                n_jobs=mp.cpu_count())

            model.fit(X_train,
                      y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='auc',
                      early_stopping_rounds=self.params['patience'])

            models[label] = model

        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        joblib.dump((vectorizer, models), path)
        return models

    def predict(self, models, vectorizer, df, output_path):
        X = vectorizer.transform(df['comment_text'])
        output = defaultdict(list)
        for label in common.LABELS:
            model = models[label]
            yhat = model.predict_proba(X, ntree_limit=model.best_ntree_limit)[:, 1]
            output[label].extend(yhat)
        predictions = pd.DataFrame.from_dict(output)
        predictions = predictions[common.LABELS]
        predictions.insert(0, 'id', df['id'].values)
        predictions.to_csv(output_path, index=False)

    def build_vectorizer(self, preprocessed_data):
        logger.info('Learning the vocabulary')
        vectorizer = TfidfVectorizer(min_df=self.params['min_df'])
        vectorizer.fit(preprocessed_data.values())
        logger.info('The vocabulary has %s words (%s ignored as stopwords)',
                    len(vectorizer.vocabulary_), len(vectorizer.stop_words_))
        return vectorizer
Example #10
class LabelStacking(object):

    model_cls = {
        'lstm': LSTM,
        'gru': GRU,
        'dpcnn': DPCNN,
        'gcnn': GCNN,
        'char-ngram': NGram,
        'word-ngram': NGram,
        'mlp': MLP,
        'xgb': XGB,
    }

    model_params = {
        'gcnn': {
            'vocab_size': 100000,
            'max_len': 300,
            'vectors': 'glove.42B.300d',
            'num_blocks': 1,
            'num_layers': 2,
            'num_channels': 128,
            'kernel_size': 3,
            'dense_layers': 0,
            'dense_dropout': 0.5,
            'batch_size': 64,
            'lr_high': 1.0,
            'lr_low': 0.2,
        },
        'lstm': {
            'vocab_size': 30000,
            'max_len': 300,
            'vectors': 'glove.42B.300d',
            'rnn_size': 500,
            'rnn_dropout': 0.2,
            'dense_layers': 1,
            'dense_dropout': 0.5,
            'batch_size': 128,
            'lr_high': 0.5,
            'lr_low': 0.01,
        },
        'dpcnn': {
            'vocab_size': 50000,
            'max_len': 400,
            'vectors': 'glove.42B.300d',
            'conv_blocks': 1,
            'conv_dropout': 0.1,
            'dense_layers': 1,
            'dense_dropout': 0.5,
            'batch_size': 256,
            'lr_high': 0.01,
            'lr_low': 0.001,
        },
        'char-ngram': {
            'analyzer': 'char',
            'min_df': 5,
            'max_ngram': 5,
            'max_features': 100000,
            'C': 1.0,
        },
        'word-ngram': {
            'analyzer': 'word',
            'min_df': 5,
            'max_ngram': 2,
            'max_features': 50000,
            'C': 1.0,
        },
        'mlp': {
            'vocab_size': 100000,
            'max_len': 600,
            'vectors': 'glove.42B.300d',
            'hidden_layers': 2,
            'hidden_units': 600,
            'input_dropout': 0.1,
            'hidden_dropout': 0.5,
            'batch_size': 512,
            'lr_high': 0.3,
            'lr_low': 0.1,
        },
        'xgb': {
            'vocab_size': 300000,
            'max_len': 1000,
            'min_df': 5,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.4,
            'colsample_bytree': 0.5,
            'learning_rate': 0.1,
            'patience': 50,
        },
        'gru': {
            'vocab_size': 100000,
            'max_len': 300,
            'vectors': 'glove.twitter.27B.200d',
            'annotation_dropout': 0.1,
            'prediction_dropout': 0.3,
            'batch_size': 256,
            'lr_high': 0.5,
            'lr_low': 0.1,
        },
    }

    def __init__(self, label, params, random_seed):
        self.label = label
        self.params = params
        self.random_seed = random_seed

        self.output_dir = os.path.join(common.OUTPUT_DIR, 'label_stacking',
                                       str(self.random_seed), self.label,
                                       common.params_str(self.params))
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def main(self):
        t_start = datetime.now()
        logger.info(' label_stacking / {} '.format(self.random_seed).center(
            62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        self.random_state = RandomState(self.random_seed)
        np.random.seed(
            int.from_bytes(self.random_state.bytes(4),
                           byteorder=sys.byteorder))

        test_df = common.load_data('test')
        train_df = common.load_data('train')

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')

            logger.info(
                'Loading the training and validation data for the %s model',
                self.label)
            X_train = self.load_inputs(train_ids, 'train')
            X_val = self.load_inputs(val_ids, 'train')
            y_train = train_df.loc[train_df['id'].isin(train_ids)].sort_values(
                'id')
            y_train = y_train[self.label].values
            y_val = train_df[train_df['id'].isin(val_ids)].sort_values('id')
            y_val = y_val[self.label].values

            logger.info('Training the %s model', self.label)
            model = self.train(fold_num, self.label, X_train, y_train, X_val,
                               y_val)

            logger.info('Generating the out-of-fold predictions')
            y_model = self.predict(model, X_val)
            val_pred = pd.DataFrame({
                'id': sorted(list(val_ids)),
                self.label: y_model
            })
            path = os.path.join(self.output_dir,
                                f'fold{fold_num}_validation.csv')
            val_pred.to_csv(path, index=False)

            logger.info('Generating the test predictions')
            X_test = self.load_inputs(test_df['id'].values, 'test')
            y_model = self.predict(model, X_test)
            test_pred = pd.DataFrame({
                'id': test_df['id'],
                self.label: y_model
            })
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            test_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id', self.label])
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Elapsed time - {}'.format(datetime.now() - t_start))

    def load_inputs(self, ids, dataset):
        X = []
        for name in self.params['models']:
            model = self.model_cls[name](self.model_params[name],
                                         random_seed=base.RANDOM_SEED)
            df = pd.read_csv(os.path.join(model.output_dir, f'{dataset}.csv'))
            df = df[df['id'].isin(ids)]
            df = df[['id'] + common.LABELS].sort_values('id')
            X.append(df[common.LABELS].values)
        X = np.hstack(X)
        return X

    def train(self, fold_num, label, X_train, y_train, X_val, y_val):

        model = xgb.XGBClassifier(
            n_estimators=10000,  # determined by early stopping
            objective='binary:logistic',
            max_depth=self.params['max_depth'],
            min_child_weight=self.params['min_child_weight'],
            subsample=self.params['subsample'],
            colsample_bytree=self.params['colsample_bytree'],
            learning_rate=self.params['learning_rate'],
            random_state=self.random_seed,
            n_jobs=mp.cpu_count())

        model.fit(X_train,
                  y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  early_stopping_rounds=self.params['patience'])

        self.save_model(fold_num, label, model)
        return model

    def save_model(self, fold_num, label, model):
        model_file = os.path.join(self.output_dir,
                                  f'fold{fold_num}_{label}.pickle')
        joblib.dump(model, model_file)

    def load_model(self, fold_num, label):
        model_file = os.path.join(self.output_dir,
                                  f'fold{fold_num}_{label}.pickle')
        model = joblib.load(model_file)
        return model

    def predict(self, model, X):
        output = model.predict_proba(X, ntree_limit=model.best_ntree_limit)[:, 1]
        return output
Example #11
class RNNv4(object):

    product_history = luigi.IntParameter(default=91)
    scoring_dim = luigi.IntParameter(default=10)
    hidden_layers = luigi.IntParameter(default=2)
    hidden_nonlinearily = luigi.Parameter(default='leaky_relu')
    dropout = luigi.FloatParameter(default=0.0)

    random_seed = luigi.IntParameter(default=3996193, significant=False)
    global_orders_ratio = luigi.FloatParameter(default=0.00001,
                                               significant=False)
    validation_orders_ratio = luigi.FloatParameter(default=0.1,
                                                   significant=False)
    target_orders_ratio = luigi.FloatParameter(default=0.1, significant=False)
    epochs = luigi.IntParameter(default=1000, significant=False)

    @property
    def model_name(self):
        params = [
            self.product_history,
            self.scoring_dim,
            self.hidden_layers,
            self.hidden_nonlinearily,
            self.dropout,
        ]
        model_name = 'rnn_v4_{}'.format('_'.join(
            str(p).lower() for p in params))
        return model_name

    def _init_random_state(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(
            int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        torch.manual_seed(
            int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

    def _iter_user_data(self, orders_path, shuffle=False):
        with (open_shuffled(orders_path)
              if shuffle else open(orders_path)) as orders_file:
            for line in orders_file:
                user_data = ujson.loads(line)
                yield user_data

    def _generate_example(self, prior_orders, last_order):
        recently_ordered = set()

        product_history = []
        for order_num, order in enumerate(prior_orders):
            weekday = order['day_of_week']
            hour_sin = np.sin(2 * np.pi * order['hour_of_day'] / 23)
            hour_cos = np.cos(2 * np.pi * order['hour_of_day'] / 23)
            hour = np.array([hour_sin, hour_cos])

            for product in order['products']:
                product_history.append({
                    'weekday': weekday,
                    'hour': hour,
                    'department': int(PRODUCT_ID_TO_DEPT_ID[product['product_id']] - 1),
                    'aisle': int(PRODUCT_ID_TO_AISLE_ID[product['product_id']] - 1),
                    'product': int(product['product_id'] - 1),
                })
                recently_ordered.add(product['product_id'])

        weekday = last_order['day_of_week']
        hour_sin = np.sin(2 * np.pi * last_order['hour_of_day'] / 23)
        hour_cos = np.cos(2 * np.pi * last_order['hour_of_day'] / 23)
        hour = np.array([hour_sin, hour_cos])

        next_products = []
        next_products_targets = []

        reordered = set()
        for product in last_order['products']:
            if product['reordered'] and product['product_id'] in recently_ordered:
                reordered.add(product['product_id'])

        for product_id in recently_ordered:
            next_products.append({
                'weekday': weekday,
                'hour': hour,
                'department': int(PRODUCT_ID_TO_DEPT_ID[product_id] - 1),
                'aisle': int(PRODUCT_ID_TO_AISLE_ID[product_id] - 1),
                'product': int(product_id - 1),
            })
            next_products_targets.append(int(product_id in reordered))

        return product_history, next_products, next_products_targets

    def _generate_examples(self,
                           orders_path,
                           target_orders=None,
                           shuffle=False):
        for user_data in self._iter_user_data(orders_path, shuffle=shuffle):
            user_orders = user_data['prior_orders'].copy()
            user_orders.append(user_data['last_order'])

            # Determine the number of target orders to include for this user.
            user_target_orders = target_orders
            if not user_target_orders:
                user_target_orders = int(
                    np.ceil(self.target_orders_ratio * len(user_orders)))

            for last_order_index in reversed(range(1, len(user_orders))):
                last_order = user_orders[last_order_index]
                prior_orders = []
                days_count = last_order['days_since_prior_order']
                for order in reversed(user_orders[:last_order_index]):
                    prior_orders.insert(0, order)
                    if order['days_since_prior_order'] is not None:
                        # There is at least one earlier order; stop once the accumulated days reach the history limit
                        days_count += order['days_since_prior_order']
                        if days_count >= self.product_history:
                            break
                yield self._generate_example(prior_orders, last_order)
                user_target_orders -= 1
                if user_target_orders == 0:
                    break

    def _format_as_tensors(self, product_history, next_products,
                           next_products_targets):
        def create_tensor(tensor_type, orders, field):
            return Variable(tensor_type([p[field] for p in orders]),
                            requires_grad=False)

        product_history_tensor = {
            'weekday': create_tensor(LongTensor, product_history, 'weekday').view(1, -1),
            'hour': create_tensor(FloatTensor, product_history, 'hour').view(1, -1, 2),
            'department': create_tensor(LongTensor, product_history, 'department').view(1, -1),
            'aisle': create_tensor(LongTensor, product_history, 'aisle').view(1, -1),
            'product': create_tensor(LongTensor, product_history, 'product').view(1, -1),
        }

        next_products_tensor = {
            'weekday': create_tensor(LongTensor, next_products, 'weekday').view(-1, 1),
            'hour': create_tensor(FloatTensor, next_products, 'hour').view(-1, 2),
            'department': create_tensor(LongTensor, next_products, 'department').view(-1, 1),
            'aisle': create_tensor(LongTensor, next_products, 'aisle').view(-1, 1),
            'product': create_tensor(LongTensor, next_products, 'product').view(-1, 1),
        }

        next_products_targets_tensor = Variable(
            FloatTensor(next_products_targets), requires_grad=False)

        return product_history_tensor, next_products_tensor, next_products_targets_tensor

    def _load_model(self):

        model = Model(weekday_dim=2,
                      department_dim=3,
                      aisle_dim=5,
                      product_dim=10,
                      scoring_dim=10,
                      hidden_layers=self.hidden_layers,
                      hidden_nonlinearily=self.hidden_nonlinearily,
                      dropout=self.dropout)

        return model
Example #12
File: base.py Project: nptit/kaggle
class BaseModel(object):
    def __init__(self, params, random_seed):
        self.name = self.__class__.__name__.lower()
        self.params = params
        self.random_seed = random_seed

        self.output_dir = os.path.join(common.OUTPUT_DIR, self.name,
                                       str(self.random_seed),
                                       common.params_str(self.params))
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)

    def main(self):
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name,
                                       self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(
            int.from_bytes(self.random_state.bytes(4),
                           byteorder=sys.byteorder))
        torch.manual_seed(
            int.from_bytes(self.random_state.bytes(4),
                           byteorder=sys.byteorder))

        preprocessed_data = self.load_preprocessed_data()
        self.fields, self.vocab = self.build_fields_and_vocab(
            preprocessed_data)

        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['id'].map(preprocessed_data)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['id'].map(preprocessed_data)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')

            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            model = self.train(fold_num, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir,
                                f'fold{fold_num}_validation.csv')
            self.predict(model, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(model, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir,
                                f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))

    def load_preprocessed_data(self):
        preprocessed_data = preprocessing.load(self.params)
        return preprocessed_data

    def build_fields_and_vocab(self, preprocessed_data):
        text_field = Field(pad_token='<PAD>',
                           unk_token=None,
                           batch_first=True,
                           include_lengths=True)
        labels_field = Field(sequential=False,
                             use_vocab=False,
                             tensor_type=torch.FloatTensor)
        fields = [('text', text_field), ('labels', labels_field)]

        # Build the vocabulary
        datasets = []
        for dataset in ['train', 'test']:
            df = common.load_data(dataset)
            df['comment_text'] = df['id'].map(preprocessed_data)
            datasets.append(CommentsDataset(df, fields))
        text_field.build_vocab(*datasets)
        vocab = text_field.vocab
        assert vocab.stoi['<PAD>'] == 0

        # Fill in missing words with the mean of the existing vectors
        vectors = pretrained_aliases[self.params['vectors']]()
        vectors_sum = np.zeros((vectors.dim, ))
        vectors_count = 0
        for token in vocab.itos:
            if token in vectors.stoi:
                vectors_sum += vectors[token].numpy()
                vectors_count += 1
        mean_vector = torch.FloatTensor(vectors_sum /
                                        vectors_count).unsqueeze(0)

        def getitem(self, token):
            return self.vectors[self.stoi[token]] if token in self.stoi else mean_vector

        Vectors.__getitem__ = getitem

        vocab.load_vectors(vectors)

        return fields, vocab

    def train(self, fold_num, train_df, val_df):
        train_iter = self.build_train_iterator(train_df)
        _, val_iter = self.build_prediction_iterator(val_df)
        logger.info(
            'Training on {:,} examples, validating on {:,} examples'.format(
                len(train_iter.dataset), len(val_iter.dataset)))

        # Train the model keeping the word embeddings frozen until the validation AUC
        # stops improving, then unfreeze the embeddings and fine-tune the entire
        # model with a lower learning rate. Use SGD with warm restarts.
        model = self.build_model()
        model.embedding.weight.requires_grad = False
        parameters = list(filter(lambda p: p.requires_grad,
                                 model.parameters()))
        model_size = sum([np.prod(p.size()) for p in parameters])
        logger.info('Optimizing {:,} parameters:\n{}'.format(
            model_size, model))
        run = epoch = 0
        lr_max = self.params['lr_high']
        optimizer = optim.SGD(parameters, lr=lr_max, momentum=0.9)
        t_max = 10
        best_val_auc = 0

        while True:
            run += 1
            # grad_norms = []
            t_cur, lr = 0, lr_max
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            logger.info('Starting run {} - t_max {}'.format(run, t_max))
            for t_index in range(t_max):
                epoch += 1
                loss_sum = 0
                model.train()
                t = tqdm(train_iter, ncols=79)
                for batch_index, batch in enumerate(t):
                    # Update the learning rate
                    t_cur = t_index + batch_index / len(train_iter)
                    lr = lr_max * (1 + math.cos(math.pi * t_cur / t_max)) / 2
                    t.set_postfix(t_cur='{:.4f}'.format(t_cur),
                                  lr='{:.6f}'.format(lr))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    # Forward and backward pass
                    optimizer.zero_grad()
                    loss = self.calculate_loss(model, batch)
                    loss.backward()
                    # grad_vector = [p.grad.data.view(-1) for p in parameters]
                    # grad_norms.append(torch.cat(grad_vector).norm())
                    self.update_parameters(model, optimizer, loss)
                    loss_sum += loss.data[0]
                loss = loss_sum / len(train_iter)
                logger.info(
                    'Epoch {} - run {} - t_cur {}/{} - lr {:.6f} - loss {:.6f}'
                    .format(epoch, run, int(math.ceil(t_cur)), t_max, lr,
                            loss))

                # https://arxiv.org/abs/1212.0901
                # logger.info('Average norm of the gradient - {:.6f}'.format(np.mean(grad_norms)))

            # Run ended - evaluate early stopping
            val_auc = self.evaluate_model(model, val_iter)
            if val_auc > best_val_auc:
                logger.info(
                    'Saving best model - val_auc {:.6f}'.format(val_auc))
                self.save_model(fold_num, model)
                best_val_auc = val_auc
            else:
                logger.info('Stopping - val_auc {:.6f}'.format(val_auc))
                if self.params['lr_low'] == 0 or model.embedding.weight.requires_grad:
                    # Fine-tuning disabled or it just finished
                    break
                else:
                    model = self.load_model(fold_num)
                    model.embedding.weight.requires_grad = True
                    parameters = list(
                        filter(lambda p: p.requires_grad, model.parameters()))
                    model_size = sum([np.prod(p.size()) for p in parameters])
                    logger.info(
                        'Fine-tuning {:,} parameters - best_val_auc {:.6f}'.
                        format(model_size, best_val_auc))
                    run = 0
                    lr_max = self.params['lr_low']
                    optimizer = optim.SGD(parameters, lr=lr_max, momentum=0.9)
                    t_max = 1

        logger.info('Final model - best_val_auc {:.6f}'.format(best_val_auc))
        model = self.load_model(fold_num)
        return model

    def predict(self, model, df, output_path):
        model.eval()
        predictions = []
        pred_id, pred_iter = self.build_prediction_iterator(df)
        for batch in pred_iter:
            (text, text_lengths), _ = batch.text, batch.labels
            output = model(text, text_lengths)
            predictions.append(output.data.cpu())
        predictions = torch.cat(predictions).numpy()

        predictions = pd.DataFrame(predictions, columns=common.LABELS)
        predictions.insert(0, 'id', pred_id)
        predictions.to_csv(output_path, index=False)

    def build_train_iterator(self, df):
        raise NotImplementedError

    def build_prediction_iterator(self, df):
        raise NotImplementedError

    def build_model(self):
        raise NotImplementedError

    def calculate_loss(self, model, batch):
        (text, text_lengths), labels = batch.text, batch.labels
        output = model(text, text_lengths)
        loss = F.binary_cross_entropy(output, labels)
        return loss

    def update_parameters(self, model, optimizer, loss):
        optimizer.step()

    def evaluate_model(self, model, batch_iter):
        model.eval()
        labels, predictions = [], []
        for batch in batch_iter:
            text, text_lengths = batch.text
            labels.append(batch.labels.data.cpu())
            output = model(text, text_lengths)
            predictions.append(output.data.cpu())
        labels = torch.cat(labels).numpy()
        predictions = torch.cat(predictions).numpy()
        auc = roc_auc_score(labels, predictions, average='macro')
        return auc

    def save_model(self, fold_num, model):
        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        torch.save(model.state_dict(), path)

    def load_model(self, fold_num):
        model = self.build_model()
        path = os.path.join(self.output_dir, f'fold{fold_num}.pickle')
        model.load_state_dict(torch.load(path))
        return model
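
The inner loop of train() above implements SGD with warm restarts: within each run the learning rate follows a cosine decay from lr_max down to zero, and each restart resets it to lr_max (SGDR, https://arxiv.org/abs/1608.03983). The schedule in isolation:

import math

def sgdr_lr(lr_max, t_cur, t_max):
    # Cosine-annealed learning rate used inside each run above.
    return lr_max * (1 + math.cos(math.pi * t_cur / t_max)) / 2

# e.g. with the LSTM's lr_high = 0.5 and t_max = 10:
# sgdr_lr(0.5, 0, 10)  -> 0.50  (start of the run)
# sgdr_lr(0.5, 5, 10)  -> 0.25  (halfway)
# sgdr_lr(0.5, 10, 10) -> 0.00  (end of the run; the next run restarts at lr_max)
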
Example #13
class PredictMLPv2ThresholdVariable(_PredictMLPv2):

    @property
    def model_name(self):
        model_name = super().model_name
        model_name += '_threshold_variable'
        return model_name

    def _determine_reorder_thresholds(self, model, scores):
        orders_path = self.requires()['orders'].output().path
        all_user_ids, all_order_ids, all_product_ids, all_inputs, all_targets = \
            self._load_data(orders_path, num_orders_per_user=self.num_orders_per_user)

        target_order_ids = set(scores.order_id)
        user_id_to_target_order_id = {}
        for i in range(len(all_user_ids)):
            if all_order_ids[i] in target_order_ids:
                user_id_to_target_order_id[all_user_ids[i]] = all_order_ids[i]
        mask = np.array([order_id not in target_order_ids for order_id in all_order_ids])

        for k in all_inputs.keys():
            all_inputs[k] = all_inputs[k][mask]
        all_predictions = model.predict(all_inputs, batch_size=self.batch_size, verbose=0).flatten()

        results = pd.DataFrame({
            'user_id': list(itertools.compress(all_user_ids, mask)),
            'order_id': list(itertools.compress(all_order_ids, mask)),
            'product_id': list(itertools.compress(all_product_ids, mask)),
            'prediction': all_predictions,
            'target': all_targets[mask],
        })

        # Find the best threshold value for each previous order by each user
        best_thresholds = defaultdict(list)
        grouped = results.groupby(['user_id', 'order_id'])
        for (user_id, order_id), group in grouped:
            product_ids = np.array(group['product_id'])
            reordered = set(product_ids[np.array(group['target']) > 0])
            probability = np.array(group['prediction'])
            if not reordered:
                best_threshold = probability.max()
            else:
                best_threshold, best_f1 = None, None
                for threshold in probability:
                    predicted = set(product_ids[probability >= threshold])
                    tp = len(predicted & reordered)
                    precision = tp / len(predicted)
                    recall = tp / len(reordered)
                    f1 = 2.0 * (precision * recall) / (precision + recall) if precision or recall else 0.0
                    if best_f1 is None or f1 > best_f1:
                        best_threshold = threshold
                        best_f1 = f1
            best_thresholds[user_id].append(best_threshold)

        # Select the average threshold for each user
        reorder_thresholds = {}
        for user_id in user_id_to_target_order_id:
            order_id = user_id_to_target_order_id[user_id]
            reorder_thresholds[order_id] = np.mean(best_thresholds[user_id])

        return reorder_thresholds

    def run(self):
        self.random = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))
        tf.set_random_seed(int.from_bytes(self.random.bytes(4), byteorder=sys.byteorder))

        model = self._build_model()
        model.load_weights(self.input()['model'].path)
        model.summary()

        orders_path = self.requires()['orders'].output().path
        user_ids, order_ids, product_ids, inputs, _ = self._load_data(orders_path)
        scores = model.predict(inputs, batch_size=self.batch_size, verbose=0).flatten()
        scores = pd.DataFrame({'order_id': order_ids, 'product_id': product_ids, 'score': scores})

        reorder_thresholds = self._determine_reorder_thresholds(model, scores)

        predictions = {}
        for order_id in set(order_ids):
            predictions[order_id] = []
            df = scores[scores.order_id == order_id]
            df = df[df.score >= reorder_thresholds[order_id]]
            for row in df.itertuples(index=False):
                # ujson fails when it tries to serialize the numpy int values
                predictions[int(order_id)].append(int(row.product_id))

        with self.output().open('w') as fd:
            ujson.dump(predictions, fd)
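
The per-order threshold search inside _determine_reorder_thresholds can be read as a standalone routine: every predicted probability is tried as the cut-off and the one with the highest F1 against the truly reordered products is kept. A minimal restatement of that loop, under the same conventions as the code above:

import numpy as np

def best_f1_threshold(probabilities, reordered_mask):
    # Sketch of the search above: probabilities are the model scores for one
    # past order, reordered_mask marks the products that were actually reordered.
    probabilities = np.asarray(probabilities, dtype=float)
    reordered_mask = np.asarray(reordered_mask, dtype=bool)
    if not reordered_mask.any():
        return probabilities.max()  # highest cut-off: predict nothing for this order
    best_threshold, best_f1 = None, -1.0
    for threshold in probabilities:
        predicted = probabilities >= threshold
        tp = np.sum(predicted & reordered_mask)
        precision = tp / predicted.sum()
        recall = tp / reordered_mask.sum()
        f1 = 2 * precision * recall / (precision + recall) if tp else 0.0
        if f1 > best_f1:
            best_threshold, best_f1 = threshold, f1
    return best_threshold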