Example #1
def preprocess_features(df_train, df_test):
    categoricals = POLICY_FIXED_CATEGORICALS + EXTRA_CATEGORICALS
    numericals = list(
        set(df_train.columns) - set(["Next_Premium"] + categoricals))
    # One-hot Encoding
    encoder = OneHotEncoder(min_obs=0)
    matrix = encoder.fit_transform(df_train[categoricals]).todense()
    onehot_categoricals = [f"categorical_{i}" for i in range(matrix.shape[1])]
    df_tmp = pd.DataFrame(matrix,
                          columns=onehot_categoricals,
                          index=df_train.index)
    df_train = pd.concat([df_train[numericals + ["Next_Premium"]], df_tmp],
                         axis=1)
    matrix = encoder.transform(df_test[categoricals]).todense()
    df_tmp = pd.DataFrame(matrix,
                          columns=onehot_categoricals,
                          index=df_test.index)
    df_test = pd.concat([df_test[numericals + ["Next_Premium"]], df_tmp],
                        axis=1)
    # Normalize
    scaler = StandardScaler(copy=True)
    scaler.fit(
        pd.concat([
            df_train[numericals + onehot_categoricals],
            df_test[numericals + onehot_categoricals]
        ],
                  axis=0).values)
    df_train[numericals + onehot_categoricals] = scaler.transform(
        df_train[numericals + onehot_categoricals].values)
    df_test[numericals + onehot_categoricals] = scaler.transform(
        df_test[numericals + onehot_categoricals].values)
    df_train["Next_Premium"] = df_train["Next_Premium"] / 1000
    return df_train, df_test
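A minimal usage sketch for this helper, assuming pandas is imported and that POLICY_FIXED_CATEGORICALS, EXTRA_CATEGORICALS and the OneHotEncoder/StandardScaler used above are already in scope; the pickle file names are hypothetical:

import pandas as pd

# hypothetical cached feature tables (e.g. produced by feature_engineering below)
df_train = pd.read_pickle("cache/train_features.pd")
df_test = pd.read_pickle("cache/test_features.pd")

df_train, df_test = preprocess_features(df_train, df_test)
print(df_train.shape, df_test.shape)
print(df_train["Next_Premium"].describe())  # the target is rescaled to thousands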
    def __init__(
        self,
        path_to_data='data/strokes-py3.npy',
        path_to_sentences='data/sentences.txt',
        clean_text=True,
        allow_multiple=False,
    ):
        self.path_to_data = path_to_data
        self.path_to_sentences = path_to_sentences
        self.clean_text = clean_text
        self.encoder = OneHotEncoder(allow_multiple=allow_multiple)
    def __init__(
        self,
        path_to_data='data/strokes-py3.npy',
        path_to_sentences='data/sentences.txt',
        clean_text=True,
        allow_multiple=False,
    ):
        self.path_to_data = path_to_data
        self.path_to_sentences = path_to_sentences
        self.clean_text = clean_text
        self.encoder = OneHotEncoder()
        # pad with [1, 0, 0] so the pen is lifted at padded positions
        self.padding = [[1, 0, 0]]
Example #4
    def __init__(self,
                 algorithm='gd',
                 multiclass='ovr',
                 lr=0.01,
                 epochs=1000,
                 threshold=1e-3):
        """Initialize logistic regression model

        Parameters
        ----------
        algorithm: str
            Training algorithm ('gd' or 'sgd')
        multiclass: str
            The way to handle multiclass targets ('ovr' or None)
        lr: float
            Learning rate
        epochs: int
            Maximum number of training iterations
        threshold: float
            If the sum of the absolute differences between the parameters at
            iteration t+1 and at iteration t is lower than this value, we
            assume convergence has been reached

        """
        if algorithm not in self.SUPPORTED_ALGORITHMS:
            raise ValueError(f'Algorithm "{algorithm}" not supported')

        if multiclass not in self.SUPPORTED_MULTICLASS:
            raise ValueError(f'Multiclass method "{multiclass}" not supported')

        self._config = {
            'algorithm': algorithm,
            'multiclass': multiclass,
            'epochs': int(epochs),
            'threshold': float(threshold),
            'lr': float(lr)
        }

        self._models = []
        self._encoder = OneHotEncoder()
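A small construction sketch for this initializer; it assumes the full LogisticRegression class shown in Example #7 below, where SUPPORTED_ALGORITHMS and SUPPORTED_MULTICLASS are defined:

clf = LogisticRegression(algorithm='sgd', lr=0.05, epochs=2000)

# an unsupported training algorithm is rejected immediately
try:
    LogisticRegression(algorithm='newton')
except ValueError as err:
    print(err)  # Algorithm "newton" not supported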
class Data(ABC):
    def __init__(
        self,
        path_to_data='data/strokes-py3.npy',
        path_to_sentences='data/sentences.txt',
        clean_text=True,
        allow_multiple=False,
    ):
        self.path_to_data = path_to_data
        self.path_to_sentences = path_to_sentences
        self.clean_text = clean_text
        self.encoder = OneHotEncoder(allow_multiple=allow_multiple)

    @property
    def strokes(self):
        if not hasattr(self, '_strokes'):
            strokes = np.load(self.path_to_data, allow_pickle=True)
            self._strokes = strokes
        return tf.identity(self._strokes)

    @property
    def sentences(self):
        if not hasattr(self, '_sentences'):
            with open(self.path_to_sentences) as f:
                texts = f.read()

            if self.clean_text:
                texts = texts.replace(' ', '')
                texts = re.sub(r'[^.,a-zA-Z!?\-\'"\n]', '#', texts)

            texts = texts.split('\n')
            self._sentences = self.encoder.fit_transform(texts)
        return self._sentences.copy()

    @abstractmethod
    def batch_generator(self, sequence_lenght, batch_size=10, shuffle=True):
        raise NotImplementedError
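Data is abstract, so it has to be subclassed before use. A minimal sketch, assuming the default data files exist and that OneHotEncoder here is the project's own text encoder rather than scikit-learn's:

class StrokeData(Data):
    def batch_generator(self, sequence_lenght, batch_size=10, shuffle=True):
        # hypothetical trivial generator: a single batch of raw, unpadded data
        yield self.strokes[:batch_size], self.sentences[:batch_size]


data = StrokeData()
encoded = data.sentences  # one one-hot encoded entry per line of sentences.txt
print(len(encoded))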
def train(device, args, data_path="data/"):
    """
    """
    random_seed = 42
    # seed RNGs for reproducibility
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    writer = SummaryWriter(log_dir=args.logdir, comment="")

    model_path = args.logdir + ("/unconditional_models/"
                                if args.uncond else "/conditional_models/")
    os.makedirs(model_path, exist_ok=True)

    strokes = np.load(data_path + "strokes.npy", encoding="latin1",
                      allow_pickle=True)
    sentences = ""
    with open(data_path + "sentences.txt") as f:
        sentences = f.readlines()
    sentences = [snt.replace("\n", "") for snt in sentences]
    # Should the newline symbol be kept and modeled instead of being stripped?

    MAX_STROKE_LEN = 800
    strokes, sentences, MAX_SENTENCE_LEN = filter_long_strokes(
        strokes, sentences, MAX_STROKE_LEN, max_index=args.n_data)
    # print("Max sentence len after filter is: {}".format(MAX_SENTENCE_LEN))

    # dimension of one-hot representation
    N_CHAR = 57
    oh_encoder = OneHotEncoder(sentences, n_char=N_CHAR)
    with open("data/one_hot_encoder.pkl", "wb") as f:
        pickle.dump(oh_encoder, f)
    sentences_oh = [s.to(device) for s in oh_encoder.one_hot(sentences)]

    # normalize strokes data and convert to pytorch tensors
    strokes = normalize_data(strokes)
    # plot_stroke(strokes[1])
    tstrokes = [torch.from_numpy(stroke).to(device) for stroke in strokes]

    # pytorch dataset
    dataset = HandWritingData(sentences_oh, tstrokes)

    # validating the padding lengths
    assert dataset.strokes_padded_len <= MAX_STROKE_LEN
    assert dataset.sentences_padded_len == MAX_SENTENCE_LEN

    # train - validation split
    train_split = 0.95
    train_size = int(train_split * len(dataset))
    validn_size = len(dataset) - train_size
    dataset_train, dataset_validn = torch.utils.data.random_split(
        dataset, [train_size, validn_size])

    dataloader_train = DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=False)  # last batch may be smaller than batch_size
    dataloader_validn = DataLoader(dataset_validn,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   drop_last=False)

    common_model_structure = {
        "memory_cells": 400,
        "n_gaussians": 20,
        "num_layers": 2
    }
    model = (HandWritingRNN(**common_model_structure).to(device)
             if args.uncond else HandWritingSynthRNN(
                 n_char=N_CHAR,
                 n_gaussians_window=10,
                 kappa_factor=0.05,
                 **common_model_structure,
             ).to(device))
    print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0)
    # optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-2,
    #                                   weight_decay=0, momentum=0)

    if args.resume is None:
        model.init_params()
    else:
        model.load_state_dict(torch.load(args.resume, map_location=device))
        print("Resuming trainig on {}".format(args.resume))
        # resume_optim_file = args.resume.split(".pt")[0] + "_optim.pt"
        # if os.path.exists(resume_optim_file):
        #     optimizer = torch.load(resume_optim_file, map_location=device)

    scheduler = ReduceLROnPlateau(optimizer,
                                  mode="min",
                                  factor=0.1**0.5,
                                  patience=10,
                                  verbose=True)

    best_batch_loss = 1e7
    for epoch in range(200):

        train_losses = []
        validation_iters = []
        validation_losses = []
        for i, (c_seq, x, masks, c_masks) in enumerate(dataloader_train):

            # make batch_first = false
            x = x.permute(1, 0, 2)
            masks = masks.permute(1, 0)

            # remove the last point (a dummy zero point was already prepended in the data)
            inp_x = x[:-1]  # shape : (T, B, 3)
            masks = masks[:-1]  # shape: (B, T)
            # c_seq.shape: (B, MAX_SENTENCE_LEN, n_char), c_masks.shape: (B, MAX_SENTENCE_LEN)
            inputs = (inp_x, c_seq, c_masks)
            if args.uncond:
                inputs = (inp_x, )

            e, log_pi, mu, sigma, rho, *_ = model(*inputs)

            # remove first point from x to make it y
            loss = criterion(x[1:], e, log_pi, mu, sigma, rho, masks)
            train_losses.append(loss.detach().cpu().numpy())

            optimizer.zero_grad()

            loss.backward()

            # clip gradients for stability (may not be strictly necessary)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)

            optimizer.step()

            # do logging
            print("{},\t".format(loss))
            if i % 10 == 0:
                writer.add_scalar("Every_10th_batch_loss", loss,
                                  epoch * len(dataloader_train) + i)

            # save as best model if loss is better than previous best
            if loss.item() < best_batch_loss:
                best_batch_loss = loss.item()
                model_file = (
                    model_path +
                    f"handwriting_{('un' if args.uncond else '')}cond_best.pt")
                torch.save(model.state_dict(), model_file)
                optim_file = model_file.split(".pt")[0] + "_optim.pt"
                torch.save(optimizer, optim_file)

        epoch_avg_loss = np.array(train_losses).mean()
        scheduler.step(epoch_avg_loss)

        # ======================== do the per-epoch logging ========================
        writer.add_scalar("Avg_loss_for_epoch", epoch_avg_loss, epoch)
        print(f"Average training-loss for epoch {epoch} is: {epoch_avg_loss}")

        model_file = (
            model_path +
            f"handwriting_{('un' if args.uncond else '')}cond_ep{epoch}.pt")
        torch.save(model.state_dict(), model_file)
        optim_file = model_file.split(".pt")[0] + "_optim.pt"
        torch.save(optimizer, optim_file)

        # generate samples from model
        sample_count = 3
        sentences = ["welcome to lyrebird"
                     ] + ["abcd efgh vicki"] * (sample_count - 1)
        sentences = [s.to(device) for s in oh_encoder.one_hot(sentences)]

        if args.uncond:
            generated_samples = model.generate(600,
                                               batch=sample_count,
                                               device=device)
        else:
            generated_samples, attn_vars = model.generate(sentences,
                                                          device=device)

        figs = []
        # save PNG files of the generated samples
        for i in range(sample_count):
            f = plot_stroke(
                generated_samples[:, i, :].cpu().numpy(),
                save_name=args.logdir +
                "/training_imgs/{}cond_ep{}_{}.png".format(
                    ("un" if args.uncond else ""), epoch, i),
            )
            figs.append(f)

        for i, f in enumerate(figs):
            writer.add_figure(f"samples/image_{i}", f, epoch)

        if not args.uncond:
            figs_phi = plot_phi(attn_vars["phi_list"])
            figs_kappa = plot_attn_scalar(attn_vars["kappa_list"])
            for i, (f_phi, f_kappa) in enumerate(zip(figs_phi, figs_kappa)):
                writer.add_figure(f"attention/phi_{i}", f_phi, epoch)
                writer.add_figure(f"attention/kappa_{i}", f_kappa, epoch)
Example #7
class LogisticRegression:
    SUPPORTED_ALGORITHMS = ('gd', 'sgd')
    SUPPORTED_MULTICLASS = ('ovr', None)

    def __init__(self,
                 algorithm='gd',
                 multiclass='ovr',
                 lr=0.01,
                 epochs=1000,
                 threshold=1e-3):
        """Initialize logistic regression model

        Parameters
        ----------
        algorithm: str
            Training algorithm ('gd' or 'sgd')
        multiclass: str
            The way to handle multiclass targets ('ovr' or None)
        lr: float
            Learning rate
        epochs: int
            Maximum number of training iterations
        threshold: float
            If the sum of the absolute differences between the parameters at
            iteration t+1 and at iteration t is lower than this value, we
            assume convergence has been reached

        """
        if algorithm not in self.SUPPORTED_ALGORITHMS:
            raise ValueError(f'Algorithm "{algorithm}" not supported')

        if multiclass not in self.SUPPORTED_MULTICLASS:
            raise ValueError(f'Multiclass method "{multiclass}" not supported')

        self._config = {
            'algorithm': algorithm,
            'multiclass': multiclass,
            'epochs': int(epochs),
            'threshold': float(threshold),
            'lr': float(lr)
        }

        self._models = []
        self._encoder = OneHotEncoder()

    @staticmethod
    def _sigmoid(x):
        """Link function mapping the features space into the target space"""
        return 1 / (1 + np.exp(-x))

    @classmethod
    def _model(cls, X, b):
        """Logistic regression model"""
        return cls._sigmoid(np.dot(X, b))

    @classmethod
    def _gradient(cls, X, b, y):
        """Gradient of the log likelihood of the parameters"""
        return np.dot(X.T, (y - cls._model(X, b)))

    @classmethod
    def _log_likelihood(cls, X, b, y):
        """Log likelihood of the parameters"""
        return np.mean(y * np.log(cls._model(X, b)) +
                       (1 - y) * np.log(1 - cls._model(X, b)))

    @staticmethod
    def _accuracy_score(y_pred, y_true):
        if len(y_pred) != len(y_true):
            raise ValueError(
                'Prediction and ground truth vectors must have the same dimension'
            )
        return (y_pred == y_true).sum() / y_pred.size

    @staticmethod
    def _intercept(X):
        """Add intercept (column of ones) to the features matrix X"""
        return np.c_[np.ones(X.shape[0]), X]

    @property
    def is_multiclass(self):
        return len(self._models) > 0

    def fit(self, X, y, **kwargs):
        """Main fit method.

        This method handles multiclass target vectors by evaluating their
        cardinality.

        Parameters
        ----------
        X: np.array
            Training data
        y: np.array
            Target
        kwargs:
            Keyword parameters of the _fit method

        Returns
        -------
        Nothing
        """
        # Reset object
        self._models = []

        if len(np.unique(y)) > 2:
            # There are more than 2 classes in the target column
            if self._config['multiclass'] == 'ovr':
                # Encode the target column
                y = self._encoder.fit_transform(y)
                # Train one model per class
                for name, data in zip(self._encoder.categories, y.T):
                    print(f'Training model for class "{name}"')

                    model = LogisticRegression(**self._config)
                    model._fit(X, data, **kwargs)

                    # Store trained model
                    self._models.append(model)
        else:
            self._fit(X, y, **kwargs)

    def _fit(self, X, y, verbose=False):
        """Trains a logistic regression model.

        Parameters
        ----------
        X: np.array
            Training data
        y: np.array
            Target (shape (x, 1))
        verbose: bool
            Whether to print metrics during training

        Returns
        -------
        Nothing

        """
        # Add intercept column
        X = self._intercept(X)

        # Initialize weight vector
        self.beta = np.ones(X.shape[1])
        self.history = np.zeros((self._config['epochs'], 3))

        # Iterate until we reach convergence or maximum number of iterations
        for i in range(self._config['epochs']):
            # We save n-1 beta for convergence test
            beta = self.beta

            if self._config['algorithm'] == 'gd':
                # We take the whole dataset for each iteration
                indexes = np.arange(X.shape[0])
            elif self._config['algorithm'] == 'sgd':
                # We randomly take samples from the dataset
                indexes = np.random.choice(X.shape[0], 10)

            # Compute gradient on whole dataset and update weights
            # according to the learning rate
            self.beta = self.beta + (self._config['lr'] * self._gradient(
                X[indexes, :], self.beta, y[indexes]))
            self.log_likelihood = self._log_likelihood(X, self.beta, y)
            self.accuracy = self._accuracy_score(
                self._model(X, self.beta) > .5, y)

            # Store history
            self.history[i, :] = (i, self.log_likelihood, self.accuracy)

            # If we reached sufficient precision, let's exit the loop
            if np.sum(np.abs(beta - self.beta)) < self._config['threshold']:
                print(
                    f'Convergence reached in {i} iterations, exiting the loop ...'
                )
                break

            # Print info on the current iteration if needed
            if verbose and i % 10 == 0:
                print(
                    f'[{i:5}] Train accuracy: {self.accuracy:10.3%} | LL: {self.log_likelihood:.4f}'
                )

    def predict_proba(self, X):
        """Predicts target probability according to input data"""
        if self.is_multiclass:
            return np.array([m.predict_proba(X) for m in self._models]).T
        else:
            X = self._intercept(X)
            return self._model(X, self.beta)

    def predict(self, X, threshold=0.5, names=True):
        """Returns class predictions.

        Parameters
        ----------
        X: np.array
            Features matrix
        threshold: float ([0, 1])
            Decision boundary (used in the binary case)
        names: bool
            Whether to return class names rather than indices

        Returns
        -------
        np.array

        """
        if self.is_multiclass:
            res = self.predict_proba(X).argmax(axis=1)
        else:
            res = self.predict_proba(X) >= threshold

        return (np.array([self._encoder.categories[i]
                          for i in res]) if names else res)

    def plot(self):
        """Plots a summary graph of the fitting process."""
        if self.is_multiclass:
            for name, model in zip(self._encoder.categories, self._models):
                model._plot(f'Training statistics for class "{name}"')
        else:
            self._plot()

    def _plot(self, title=None):
        """Plots a summary graph of the fitting process."""
        if not hasattr(self, 'history'):
            raise ValueError('Please train the model first')

        labels = ['Train accuracy', 'LL']
        colors = ['r', 'g']
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
        title = (f'Logistic regression on {len(self.beta) - 1} features'
                 if not title else title)

        for ax, data, label, color in zip(
                axes, self.history[self.history[:, 0] != 0, 1:].T, labels,
                colors):
            ax.plot(np.arange(data.size), data, color=color)
            ax.legend([label])
            ax.set_xlabel('Iterations')

        fig.suptitle(title)
        plt.show()
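A minimal end-to-end sketch for the binary case (the 'ovr' path additionally relies on the project's own OneHotEncoder); the synthetic data is purely illustrative:

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 2))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = LogisticRegression(algorithm='gd', lr=0.001, epochs=1000)
clf.fit(X, y, verbose=True)

preds = clf.predict(X, names=False)  # boolean predictions in the binary case
print("train accuracy:", (preds == y).mean())
clf.plot()                           # log-likelihood / accuracy curves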
class Data(ABC):
    def __init__(
        self,
        path_to_data='data/strokes-py3.npy',
        path_to_sentences='data/sentences.txt',
        clean_text=True,
        allow_multiple=False,
    ):
        self.path_to_data = path_to_data
        self.path_to_sentences = path_to_sentences
        self.clean_text = clean_text
        self.encoder = OneHotEncoder()
        # pad with [1, 0, 0] so the pen is lifted at padded positions
        self.padding = [[1, 0, 0]]

    @property
    def strokes(self):
        if not hasattr(self, '_strokes'):
            strokes = np.load(self.path_to_data, allow_pickle=True)
            self._max_length = max(map(len, strokes))
            self._strokes = strokes
        return self._strokes.copy()

    @property
    def max_length(self):
        if not hasattr(self, '_max_length'):
            _ = self.strokes
        return self._max_length

    def prepare_text(self, text):
        _ = self.sentences
        text = re.sub(r'[^.,a-zA-Z!?\-\'" \n]', '#', text)
        text = text.split('\n')
        text = self.encoder.transform(text)[0]
        text = np.vstack(
            (text, self.char_padding * (self.char_length - text.shape[0])))
        return tf.dtypes.cast(text.reshape((1, ) + text.shape), float)

    @property
    def sentences(self):
        if not hasattr(self, '_sentences'):
            with open(self.path_to_sentences) as f:
                texts = f.read()

            if self.clean_text:
                texts = re.sub(r'[^.,a-zA-Z!?\-\'" \n]', '#', texts)

            texts = texts.split('\n')
            self._sentences = self.encoder.fit_transform(texts)
            self._char_length = max(map(len, self._sentences))
            self.char_padding = [[0] * len(self._sentences[0][0])]
        return self._sentences.copy()

    @property
    def char_length(self):
        if not hasattr(self, '_char_length'):
            _ = self.sentences
        return self._char_length

    @abstractmethod
    def batch_generator(self, sequence_lenght, batch_size=10):
        raise NotImplementedError
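prepare_text() pads a single encoded line up to char_length and reshapes it into a batch of one; a rough usage sketch through a hypothetical concrete subclass, assuming the default data files are present:

class StrokeData(Data):
    def batch_generator(self, sequence_lenght, batch_size=10):
        raise NotImplementedError  # not needed for this sketch


data = StrokeData()
encoded = data.prepare_text("hello world")
print(encoded.shape)  # roughly (1, data.char_length, alphabet size)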
if __name__ == '__main__':
    args = parser.parse_args()

    if not args.output.is_dir() or not args.output.exists():
        raise ValueError(f'{args.output} is not a valid directory')

    # Load and preprocess the data
    try:
        print(f"Opening {args.input.name} ...")
        data = pd.read_csv(args.input)

        # Fill NAs
        data = data.fillna(0)
        # Preprocessing
        encoder = OneHotEncoder()
        scaler = StandardScaler()
        # X
        categoricals = data.loc[:, ['Best Hand']].values
        categoricals = [encoder.fit_transform(c) for c in categoricals]
        categoricals = np.concatenate(categoricals, axis=1).T
        numerical = data.iloc[:, 6:].values
        numerical = scaler.fit_transform(numerical)
        X = np.concatenate([categoricals, numerical], axis=1)
        # y
        y = data.loc[:, 'Hogwarts House'].values
    except Exception as e:
        print(f"Unable to open input dataset: {e}")
        sys.exit(1)

    # Train the logistic regression
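    # Speculative continuation (the snippet is truncated here): it assumes the
    # LogisticRegression class from Example #7 is importable in this script.
    model = LogisticRegression(algorithm='gd', multiclass='ovr', epochs=1000)
    model.fit(X, y, verbose=True)
    model.plot()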
def feature_engineering(df_train, df_test, data_path="data/"):
    df_claims = pd.read_csv(pathlib.Path(data_path) / "claim_0702.csv")
    df_policy = pd.read_csv(pathlib.Path(data_path) / "policy_0702.csv")

    # Fill NA
    df_policy["fsex"] = df_policy["fsex"].fillna(" ")
    df_policy["fmarriage"] = df_policy["fmarriage"].fillna(" ")

    # Index policies
    df_uniq_policy = df_policy[["Policy_Number"]].drop_duplicates()
    df_uniq_policy["index"] = range(df_uniq_policy.shape[0])
    df_uniq_policy.set_index("Policy_Number", inplace=True)
    # # Index Feature
    # df_uniq_policy["overlap_test"] = (
    #     df_uniq_policy["index"] >= SPLIT_POINT).astype("int8")
    # Index Order Features
    split_points = [-1, 95943, 132800, 189940, 252695, 294932, 351272]
    df_uniq_policy["relative_position"] = 0
    for i in range(1, len(split_points)):
        mask = (df_uniq_policy["index"] <= split_points[i]) & (
            df_uniq_policy["index"] > split_points[i - 1])
        width = (split_points[i] - split_points[i - 1] - 1)
        df_uniq_policy.loc[mask, "relative_position"] = (
            (df_uniq_policy.loc[mask, "index"] - split_points[i - 1] - 1) /
            width)
    df_uniq_policy.to_pickle("cache/uniq_policy.pd")

    # Process claims
    coverage_mappings = {
        k: v
        for _, (v, k) in df_policy[[
            "Main_Insurance_Coverage_Group", "Insurance_Coverage"
        ]].drop_duplicates().iterrows()
    }
    df_claims["main_coverage_group"] = df_claims['Coverage'].apply(
        lambda x: coverage_mappings[x])
    # At Fault
    df_claims.loc[df_claims['At_Fault?'] > 100, 'At_Fault?'] = 100
    df_claims.loc[(df_claims['At_Fault?'] > 0) &
                  (df_claims['At_Fault?'] < 100), 'At_Fault?'] = 50
    # TODO: CHECK NA
    df_claims['At_Fault?'] = df_claims['At_Fault?'].fillna(0).astype(
        "int32").astype("category")
    # Clean Coverages
    encoder = LabelEncoder(min_obs=1000)
    tmp = encoder.fit_transform(df_claims[["Coverage"]])
    df_claims["Coverage"] = tmp["Coverage"].astype("int32")
    # Payments
    df_claims["Total_Paid"] = (df_claims["Paid_Loss_Amount"] +
                               df_claims["paid_Expenses_Amount"])
    df_claims["Total_Paid_at_Fault"] = (df_claims["At_Fault?"].values *
                                        df_claims["Total_Paid"].values)
    df_claims_per_policy = df_uniq_policy[["index"]].join(
        df_claims.groupby("Policy_Number")[[
            "Total_Paid", "Total_Paid_at_Fault", "Deductible"
        ]].sum(),
        how="right").fillna(0).drop("index", axis=1)
    df_claims_per_policy = df_claims_per_policy.join(
        df_claims.groupby("Policy_Number")[["Claim_Number"]].nunique(),
        how="left").fillna(0)
    df_claims_per_policy.rename(columns={
        "Claim_Number": "claims",
        "Total_Paid": "claim_total_paid",
        "Total_Paid_at_Fault": "claim_total_paid_at_fault",
        "Deductible": "claim_total_deductible"
    },
                                inplace=True)
    df_claims_per_policy = df_claims_per_policy.join(df_claims.groupby(
        "Policy_Number")["Claim_Number"].size().to_frame("claim_entries"),
                                                     how="left").fillna(0)
    # Per Coverage Group
    df_claims_main_coverage = df_claims.groupby([
        "Policy_Number", "main_coverage_group"
    ])[["Total_Paid", "Deductible"]].sum().unstack(-1, fill_value=0)
    df_claims_main_coverage.columns = [
        "claims_%s_%s" % (x, y)
        for x, y in zip(df_claims_main_coverage.columns.get_level_values(1),
                        df_claims_main_coverage.columns.get_level_values(0))
    ]
    df_claims_per_policy = df_claims_per_policy.join(df_claims_main_coverage,
                                                     how="left")
    # Relationship with Insured
    df_claims_relations = df_claims[[
        "Policy_Number", "Claim_Number", "Driver's_Relationship_with_Insured"
    ]].drop_duplicates().groupby([
        "Policy_Number", "Driver's_Relationship_with_Insured"
    ])["Claim_Number"].size().unstack(-1, fill_value=0)
    df_claims_relations.columns = [
        "relation_%s" % x for x in df_claims_relations.columns
    ]
    df_claims_per_policy = df_claims_per_policy.join(df_claims_relations,
                                                     how="left")
    # # Per Coverage
    # df_claims_coverage = df_claims.groupby(
    #     ["Policy_Number", "Coverage"]
    # )["Total_Paid"].agg(["count", "sum"]).unstack(-1, fill_value=0)
    # df_claims_coverage.columns = [
    #     "claims_coverage_%s_%s" % (x, y) for x, y in zip(
    #         df_claims_coverage.columns.get_level_values(1),
    #         df_claims_coverage.columns.get_level_values(0)
    #     )
    # ]
    # df_claims_per_policy = df_claims_per_policy.join(
    #     df_claims_coverage, how="left")
    # At Fault
    # df_claims_at_fault = df_claims[[
    #     "Policy_Number", "Claim_Number", "At_Fault?", "Total_Paid"
    # ]].drop_duplicates().groupby(
    #     ["Policy_Number", "At_Fault?"]
    # )["Total_Paid"].agg(["sum", "count"]).unstack(-1, fill_value=0)  # .unstack(-1, fill_value=0)
    # df_claims_at_fault.columns = [
    #     "claims_at_fault_%s_%s" % (x, y) for x, y in zip(
    #         df_claims_at_fault.columns.get_level_values(0),
    #         df_claims_at_fault.columns.get_level_values(1)
    #     )
    # ]
    # df_claims_per_policy = df_claims_per_policy.join(
    #     df_claims_at_fault, how="left")
    # df_claims_per_policy["claim_total_paid"] = np.log1p(
    #     df_claims_per_policy["claim_total_paid"])
    # df_claims_per_policy["claim_total_deductible"] = np.log1p(
    #     df_claims_per_policy["claim_total_deductible"])
    df_claims_per_policy["claims_車責_Total_Paid"] = np.log1p(
        df_claims_per_policy["claims_車責_Total_Paid"])
    df_claims_per_policy["claims_車損_Total_Paid"] = np.log1p(
        df_claims_per_policy["claims_車損_Total_Paid"])
    del df_claims_per_policy["claims_竊盜_Total_Paid"]

    print("\nClaims per policy")
    print(df_claims_per_policy.sample(10))

    # Claim Date Statistics
    df_claims["Date"] = df_claims["Accident_Date"].apply(
        lambda x: datetime.strptime(x, "%Y/%m"))
    df_claim_dates = df_claims.groupby("Policy_Number")["Date"].min().to_frame(
        "min_date")
    # df_claim_dates["min_month"] = df_claim_dates["min_date"].dt.month
    # df_claim_dates["min_year"] = df_claim_dates["min_date"].dt.year
    for col in ["min_date"]:
        df_claim_dates[col] = df_claim_dates[col].astype("int64")
        # df_claim_dates[col] = df_claim_dates[col].astype(
        #     "int64") / 10**9 / 60 / 60 / 24  # ns / s / m / d
    # fill policy
    df_claim_dates = df_claim_dates.join(
        df_uniq_policy[["index"]],
        how="right").sort_values("index").drop("index", axis=1)
    # df_claim_dates["min_month"] = df_claim_dates["min_month"].fillna(
    #     method="ffill").fillna(method="bfill")
    # df_claim_dates["min_year"].fillna(-1, inplace=True)
    # df_claim_dates["min_date"] = df_claim_dates["min_date"].fillna(
    #     method="ffill").fillna(method="bfill")
    df_claim_dates["min_date_smooth_1000"] = df_claim_dates[
        "min_date"].rolling(1000, min_periods=20, center=True).mean().fillna(
            method="bfill").fillna(method="ffill") / 10**9 / 60 / 60 / 24
    df_claim_dates["min_date_dayofyear"] = pd.to_datetime(
        df_claim_dates["min_date"].rolling(
            1000, min_periods=20, center=True).min().fillna(
                method="bfill").fillna(method="ffill")).dt.dayofyear
    # weights = np.concatenate(
    #     [np.arange(1, 501), np.arange(500, 0, -1)], axis=0)
    # weights = weights / np.sum(weights)
    # def f(x):
    #     return np.sum(weights * x)
    # df_claim_dates["min_date_weighted_2000"] = df_claim_dates["min_date"].rolling(
    #     2000, min_periods=20, center=False, win_type="blackman"
    # ).mean().fillna(method="bfill").fillna(method="ffill")
    # df_claim_dates["min_date_ewm_500"] = df_claim_dates["min_date"].ewm(
    #     halflife=500, min_periods=1
    # ).mean().fillna(method="ffill").fillna(method="bfill")
    del df_claim_dates["min_date"]
    # df_claim_dates.rename(columns={
    #     "min": "min_claim_date", "max": "max_claim_date"}, inplace=True)
    print("\n Claim Dates")
    print(df_claim_dates.sample(10))

    # Policy Premium Stats
    # df_policy["Premium_log"] = np.log1p(df_policy["Premium"])
    df_policy_premiums = df_policy.groupby("Policy_Number")["Premium"].agg(
        ["count", "min", "max", "sum"])
    df_policy_premiums.rename(columns={
        "count": "n_coverages",
        "sum": "total_premium",
        "min": "min_premium",
        "max": "max_premium"
    },
                              inplace=True)
    print("\nPolicy_Premium:")
    print(df_policy_premiums.sample(10))

    # Policy Premium Stats by Main Coverage
    df_policy_main_premiums = df_policy.groupby([
        "Policy_Number", "Main_Insurance_Coverage_Group"
    ])["Premium"].agg(["count", "sum"]).unstack(-1, fill_value=0)
    df_policy_main_premiums.columns = [
        "premium_%s_%s" % (x, y)
        for x, y in zip(df_policy_main_premiums.columns.get_level_values(1),
                        df_policy_main_premiums.columns.get_level_values(0))
    ]
    print("\nPolicy_Main_Premium:")
    print(df_policy_main_premiums.sample(10))

    # Clean Coverages
    encoder = LabelEncoder(min_obs=5000)
    tmp = encoder.fit_transform(df_policy[["Insurance_Coverage"]])
    df_policy["Insurance_Coverage"] = tmp["Insurance_Coverage"].astype("int32")

    # Policy Premium Stats by Coverage
    df_policy_coverage_premiums = df_policy.groupby([
        "Policy_Number", "Insurance_Coverage"
    ])["Premium"].agg(["count", "sum"]).unstack(-1, fill_value=0)
    df_policy_coverage_premiums.columns = [
        "premium_coverage_%s_%s" % (x, y) for x, y in zip(
            df_policy_coverage_premiums.columns.get_level_values(1),
            df_policy_coverage_premiums.columns.get_level_values(0))
    ]
    print("\nPolicy_Coverage_Premium:")
    print(df_policy_coverage_premiums.sample(10))

    # Other Policy Aggs
    df_policy["Total_Insured_Amount"] = (df_policy["Insured_Amount1"] +
                                         df_policy["Insured_Amount2"] +
                                         df_policy["Insured_Amount3"])
    df_policy_aggs = df_policy.groupby("Policy_Number")[[
        # , "Coverage_Deductible_if_applied"
        "Total_Insured_Amount",
        "Insured_Amount1",
        "Insured_Amount2",
        "Insured_Amount3",
    ]].sum()
    print("\nPolicy Aggs:")
    print(df_policy_aggs.sample(10))

    # Other Policy Aggs By Main Coverage
    df_policy_main_aggs = df_policy.groupby(
        ["Policy_Number", "Main_Insurance_Coverage_Group"])[[
            # "Coverage_Deductible_if_applied"
            "Insured_Amount1",
            "Insured_Amount2",
            "Insured_Amount3",
            "Total_Insured_Amount"
        ]].sum().unstack(-1, fill_value=0)
    df_policy_main_aggs.columns = [
        "%s_%s" % (x, y)
        for x, y in zip(df_policy_main_aggs.columns.get_level_values(1),
                        df_policy_main_aggs.columns.get_level_values(0))
    ]
    print("\nPolicy Aggs by Main:")
    print(df_policy_main_aggs.sample(10))

    # Encode Categoricals
    encoder = LabelEncoder(min_obs=7500)
    df_policy["same_bdate"] = (df_policy["ibirth"] == df_policy["dbirth"]) * 1
    df_policy_fixed_categoricals = df_policy[
        ["Policy_Number"] +
        POLICY_FIXED_CATEGORICALS].drop_duplicates().set_index("Policy_Number")
    df_policy_fixed_categoricals = encoder.fit_transform(
        df_policy_fixed_categoricals[POLICY_FIXED_CATEGORICALS])
    print("\nPolicy-fixed Categoricals")
    print(df_policy_fixed_categoricals.nunique())

    # Policy-fixed Numeric Categoricals
    df_policy_fixed_numericals = df_policy[[
        "Policy_Number", "Replacement_cost_of_insured_vehicle", "ibirth",
        "dbirth", "Engine_Displacement_(Cubic_Centimeter)",
        'Manafactured_Year_and_Month'
    ]].drop_duplicates().set_index("Policy_Number")
    # df_policy_fixed_numericals["differnt_birth"] = (
    #     df_policy_fixed_numericals["ibirth"] != df_policy_fixed_numericals["dbirth"])
    # del df_policy_fixed_numericals["dbirth"]
    df_policy_fixed_numericals['ibirth'] = df_policy_fixed_numericals[
        'ibirth'].str.extract('(19..)',
                              expand=True).fillna(value=1968).astype("int32")
    df_policy_fixed_numericals['dbirth'] = df_policy_fixed_numericals[
        'dbirth'].str.extract('(19..)',
                              expand=True).fillna(value=1968).astype("int32")

    # Prior Policy
    df_prior_policy = df_policy[["Policy_Number", "Prior_Policy_Number"
                                 ]].drop_duplicates().fillna("New")
    # df_prior_policy["first_time_policy"] = df_prior_policy["Prior_Policy_Number"] == "New"
    df_prior_policy = df_prior_policy.merge(
        df_policy_premiums[["total_premium", "n_coverages"]].reset_index(),
        left_on="Prior_Policy_Number",
        right_on="Policy_Number",
        how="left",
        suffixes=["", "_prev"]).set_index("Policy_Number").fillna(0)
    del df_prior_policy["Prior_Policy_Number"]
    del df_prior_policy["Policy_Number_prev"]
    df_prior_policy.rename(columns={
        "total_premium": "prev_total_premium",
        "n_coverages": "n_prev_coverages"
    },
                           inplace=True)

    # Coverage
    encoder = OneHotEncoder(min_obs=10000)
    df_coverage = df_policy[["Policy_Number"]].copy()
    sparse = encoder.fit_transform(df_policy[["Insurance_Coverage"]])
    column_names = [f"coverage_{i}" for i in range(sparse.shape[1])]
    df_coverage = pd.concat(
        [df_coverage,
         pd.DataFrame(sparse.todense(), columns=column_names)],
        axis=1).reset_index(drop=True)
    df_coverage = df_coverage.groupby("Policy_Number").sum()

    df_features = df_policy_premiums.join(
        df_claims_per_policy,
        how="left").fillna(0).join(df_policy_main_premiums).fillna(0).join(
            df_policy_coverage_premiums).join(df_policy_aggs).join(
                df_policy_main_aggs).join(df_policy_fixed_categoricals).join(
                    df_policy_fixed_numericals).join(df_prior_policy).join(
                        df_coverage).join(df_uniq_policy).join(df_claim_dates)

    # Meta Features
    df_features["premium_paid_ratio"] = np.nan_to_num(
        df_features["claim_total_paid"] / df_features["total_premium"])
    df_features["premium_paid_at_fault_ratio"] = np.nan_to_num(
        df_features["claim_total_paid_at_fault"] /
        df_features["total_premium"])
    assert df_policy_premiums.shape[0] == df_features.shape[0]
    print("Feature Shape:", df_features.shape)
    return df_features
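A hedged sketch of how this feature builder might be combined with preprocess_features() from Example #1; only claim_0702.csv and policy_0702.csv appear above, so the train/test label files here are hypothetical:

import pandas as pd

# hypothetical label tables containing Policy_Number and the Next_Premium target
df_train = pd.read_csv("data/training-set.csv")
df_test = pd.read_csv("data/testing-set.csv")

df_features = feature_engineering(df_train, df_test, data_path="data/")
df_train = df_train.join(df_features, on="Policy_Number")
df_test = df_test.join(df_features, on="Policy_Number")
df_train, df_test = preprocess_features(df_train, df_test)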