Example #1
    def predict_proba(self, dataset, device="cpu", idx=0):
        """Infer causal directions using the trained NCC pairwise model.

        Args:
            dataset (tuple): Pair of np.ndarray variables to classify
            device (str): Device to run the algorithm on (defaults to ``cdt.SETTINGS.default_device``)
            idx (int): Unused index argument

        Returns:
            np.ndarray: Causation score (1 if a->b, -1 if b->a)
        """
        a, b = dataset
        device = SETTINGS.get_default(device=device)
        if self.model is None:
            raise ValueError('Model has to be trained before making predictions')
        if len(np.array(a).shape) == 1:
            a = np.array(a).reshape((-1, 1))
            b = np.array(b).reshape((-1, 1))
        m = np.hstack((a, b))
        m = scale(m)
        m = m.astype('float32')
        m = th.from_numpy(m).t().unsqueeze(0)
        m = m.to(device)

        return (self.model(m).data.cpu().numpy() - .5) * 2
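A minimal usage sketch for this method, assuming it belongs to the trained NCC pairwise classifier from cdt (the import path and the prior fit call are assumptions):

import numpy as np
from cdt.causality.pairwise import NCC  # assumed host class of this method

# Synthetic pair where a plausibly causes b
a = np.random.uniform(size=500)
b = a ** 2 + 0.1 * np.random.normal(size=500)

ncc = NCC()
# ncc.fit(x_tr, y_tr) must run first; predict_proba raises ValueError otherwise
score = ncc.predict_proba((a, b))
print(score)  # > 0 suggests a -> b, < 0 suggests b -> a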
Example #2
    def predict_dataset(self, df, device=None, verbose=None):
        """
        Args:
            x_tr (pd.DataFrame): CEPC format dataframe containing the pairs
            epochs (int): number of train epochs
            learning rate (float): learning rate of Adam
            verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
            device (str): cuda or cpu device (defaults to ``cdt.SETTINGS.default_device``)

        Returns:
            pandas.DataFrame: dataframe containing the predicted causation coefficients
        """
        verbose, device = SETTINGS.get_default(('verbose', verbose),
                                               ('device', device))
        dataset = []
        for _, row in df.iterrows():
            a = row['A'].reshape((len(row['A']), 1))
            b = row['B'].reshape((len(row['B']), 1))
            m = np.hstack((a, b))
            m = m.astype('float32')
            m = th.from_numpy(m).t().unsqueeze(0)
            dataset.append(m)

        dataset = [m.to(device) for m in dataset]
        return pd.DataFrame((th.cat(
            [self.model(m) for m, _ in zip(dataset, trange(len(dataset)))],
            0).data.cpu().numpy() - .5) * 2)
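A sketch of the expected input: a CEPC-style dataframe whose 'A' and 'B' cells each hold one np.ndarray (the data and the trained `ncc` instance are illustrative):

import numpy as np
import pandas as pd

# One pair of 1-D sample arrays per row
pairs = pd.DataFrame({
    'A': [np.random.uniform(size=500) for _ in range(10)],
    'B': [np.random.uniform(size=500) for _ in range(10)],
})
scores = ncc.predict_dataset(pairs)  # one coefficient in (-1, 1) per row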
Example #3
    def __init__(self,
                 score='nonlinear',
                 cutoff=0.001,
                 variablesel=True,
                 selmethod='gamboost',
                 pruning=False,
                 prunmethod='gam',
                 njobs=None,
                 verbose=None):
        """Init the model and its available arguments."""
        if not RPackages.CAM:
            raise ImportError("R Package CAM is not available.")

        super(CAM_with_score, self).__init__()
        self.scores = {'nonlinear': 'SEMGAM', 'linear': 'SEMLIN'}
        self.var_selection = {
            'gamboost': 'selGamBoost',
            'gam': 'selGam',
            'lasso': 'selLasso',
            'linear': 'selLm',
            'linearboost': 'selLmBoost'
        }
        self.arguments = {
            '{FOLDER}': '/tmp/cdt_CAM/',
            '{FILE_TRAIN}': 'train_data.csv',
            '{FILE_VALID}': 'valid_data.csv',
            '{TARGETS_TRAIN}': 'targets_train.csv',
            '{TARGETS_VALID}': 'targets_valid.csv',
            '{SCORE}': 'SEMGAM',
            '{VARSEL}': 'TRUE',
            '{SELMETHOD}': 'selGamBoost',
            '{PRUNING}': 'TRUE',
            '{PRUNMETHOD}': 'selGam',
            '{NJOBS}': str(SETTINGS.NJOBS),
            '{CUTOFF}': str(0.001),
            '{VERBOSE}': 'FALSE',
            '{OUTPUT}': 'result.csv'
        }
        self.score = score
        self.cutoff = cutoff
        self.variablesel = variablesel
        self.selmethod = selmethod
        self.pruning = pruning
        self.prunmethod = prunmethod
        self.njobs = SETTINGS.get_default(njobs=njobs)
        self.verbose = SETTINGS.get_default(verbose=verbose)
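The `arguments` dict is a placeholder template for the underlying R script. A hedged sketch of how the constructor options would typically be written back into it before execution; the helper name and the exact mapping are assumptions based on the attribute names:

    def _update_arguments(self):
        # Hypothetical helper: map Python-side options onto the R placeholders
        self.arguments['{SCORE}'] = self.scores[self.score]
        self.arguments['{CUTOFF}'] = str(self.cutoff)
        self.arguments['{VARSEL}'] = str(self.variablesel).upper()
        self.arguments['{SELMETHOD}'] = self.var_selection[self.selmethod]
        self.arguments['{PRUNING}'] = str(self.pruning).upper()
        self.arguments['{PRUNMETHOD}'] = self.var_selection[self.prunmethod]
        self.arguments['{NJOBS}'] = str(self.njobs)
        self.arguments['{VERBOSE}'] = str(self.verbose).upper()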
Example #4
    def __init__(self, verbose=False):
        """Init the model and its available arguments."""
        if not RPackages.pcalg:
            raise ImportError("R Package pcalg is not available.")

        super().__init__()
        self.arguments = {'{FOLDER}': '/tmp/cdt_pc/',
                          '{FILE}': 'data.csv',
                          '{SKELETON}': 'FALSE',
                          '{GAPS}': 'fixedgaps.csv',
                          '{REGIMES}': 'regimes.csv',
                          '{TARGETS}': 'targets.csv',
                          '{VERBOSE}': 'FALSE',
                          '{ALPHA}': '1e-2',
                          '{OUTPUT}': 'result.csv'}
        self.verbose = SETTINGS.get_default(verbose=verbose)
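A hedged usage sketch, assuming this constructor belongs to cdt's PC wrapper (cdt.causality.graph.PC) and that R with the pcalg package is installed:

import pandas as pd
import networkx as nx
from cdt.causality.graph import PC  # assumed host class

data = pd.read_csv('data.csv')  # illustrative path
model = PC()
graph = model.predict(data)     # networkx graph over the dataframe columns
print(nx.adjacency_matrix(graph).todense())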
Example #5
    def generate(self, npairs, npoints=500, rescale=True, njobs=None):
        """Generate Causal pairs, such that one variable causes the other.

        Args:
            npairs (int): Number of pairs of variables to generate.
            npoints (int): Number of data points to generate.
            rescale (bool): Rescale the output with zero mean and unit variance.
            njobs (int): Number of parallel jobs to execute. Defaults to
                ``cdt.SETTINGS.NJOBS``.

        Returns:
            tuple: (pandas.DataFrame, pandas.DataFrame) data and corresponding
            labels. The data is in the ``(SampleID, a (numpy.ndarray), b
            (numpy.ndarray))`` format.
        """
        def generate_pair(npoints, label, rescale):
            root = self.initial_generator(npoints)[:, np.newaxis]
            cause = self.mechanism(1,
                                   npoints,
                                   self.noise,
                                   noise_coeff=self.noise_coeff)(root)
            effect = self.mechanism(
                1, npoints, self.noise,
                noise_coeff=self.noise_coeff)(cause).squeeze(1)
            cause = cause.squeeze(1)
            if rescale:
                cause = scale(cause)
                effect = scale(effect)
            return (cause, effect) if label == 1 else (effect, cause)

        # njobs is resolved here, but the pairs below are generated sequentially
        njobs = SETTINGS.get_default(njobs=njobs)
        self.labels = (np.random.randint(2, size=npairs) - .5) * 2
        output = [
            generate_pair(npoints, self.labels[i], rescale)
            for i in range(npairs)
        ]
        self.data = pd.DataFrame(output, columns=['A', 'B'])
        self.labels = pd.DataFrame(self.labels,
                                   dtype='int32',
                                   columns=['label'])
        return self.data, self.labels
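A usage sketch, assuming the surrounding class is cdt's CausalPairGenerator; the constructor argument shown is illustrative:

from cdt.data import CausalPairGenerator  # assumed host class

gen = CausalPairGenerator('linear')  # causal mechanism name is illustrative
data, labels = gen.generate(100, npoints=500)
print(data.head())    # columns 'A' and 'B', one np.ndarray per cell
print(labels.head())  # 'label' column of -1 / 1 values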
Example #6
    def predict_list(self, l, device=None, verbose=None):
        """
        Args:
            l (list): CEPC format list containing the pairs
            verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
            device (str): cuda or cpu device (defaults to ``cdt.SETTINGS.default_device``)

        Returns:
            list: list containing the predicted causation coefficients
        """
        verbose, device = SETTINGS.get_default(('verbose', verbose),
                                               ('device', device))
        return [self.model(m.t().unsqueeze(0)) for m in l]
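A sketch of the expected input: a list of (npoints, 2) tensors, one pair per element (the data is illustrative and the model is assumed trained):

import torch as th

pairs = [th.randn(500, 2) for _ in range(5)]  # each row is one (a, b) sample
outputs = ncc.predict_list(pairs)  # raw model outputs, one tensor per pair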
Example #7
    def train(self,
              X_tr,
              y_tr,
              X_val,
              y_val,
              epochs=50,
              batch_size=32,
              verbose=None,
              device='cpu',
              **kwargs):
        """Train the model, logging train and validation metrics each epoch."""
        verbose, device = SETTINGS.get_default(('verbose', verbose),
                                               ('device', device))
        model = self.model.to(device)
        y = th.Tensor(y_tr)
        y = y.to(device)
        dataset = [th.Tensor(x).t().to(device) for x in X_tr]
        dat = Dataset(dataset, y, device, batch_size)
        data_per_epoch = (len(dataset) // batch_size)

        self.model.eval()
        self.log_values(*self.compute_values(X_tr, y_tr, device), 'train')
        self.log_values(*self.compute_values(X_val, y_val, device),
                        'validation')

        with trange(epochs, desc="Epochs", disable=not verbose) as te:
            for _ in te:
                self.model.train()
                with trange(data_per_epoch,
                            desc="Batches of 2*{}".format(batch_size),
                            disable=not (verbose
                                         and batch_size == len(dataset))) as t:
                    output = []
                    labels = []
                    for batch, label in dat:
                        symmetric_batch, symmetric_label = th_enforce_symmetry(
                            batch, label, self.anti)
                        batch += symmetric_batch
                        label = th.cat((label, symmetric_label))
                        self.opt.zero_grad()
                        out = th.stack(
                            [model(m.t().unsqueeze(0)) for m in batch],
                            0).squeeze()
                        loss = self.criterion(out, label)
                        loss.backward()
                        output.append(expit(out.data.cpu()))
                        t.set_postfix(loss=loss.item())
                        self.opt.step()
                        labels.append(label.data.cpu())
                    # Binarize sigmoid outputs at .5 and compare with labels;
                    # flattening both tensors avoids accidental broadcasting.
                    preds = th.cat(output, 0).data.cpu().view(-1)
                    targets = th.cat(labels, 0).data.cpu().view(-1)
                    Acc = 1 - ((preds > .5).float() - targets).abs().mean().item()
                    te.set_postfix(Acc=Acc)

                self.model.eval()
                self.log_values(*self.compute_values(X_tr, y_tr, device),
                                'train')
                self.log_values(*self.compute_values(X_val, y_val, device),
                                'validation')
        return self.log_dict
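A hedged sketch of calling this loop, assuming an instance (`trainer` is a placeholder name) whose `model`, `opt`, `criterion`, and logging helpers are already initialized; each training example is a (2, npoints) array stacking the two variables:

import numpy as np

X_tr = [np.random.randn(2, 500) for _ in range(64)]
y_tr = np.random.randint(2, size=64).astype('float32')
X_val = [np.random.randn(2, 500) for _ in range(16)]
y_val = np.random.randint(2, size=16).astype('float32')

logs = trainer.train(X_tr, y_tr, X_val, y_val, epochs=10, batch_size=16)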
Example #8
    def _fit(self,
             x_tr,
             y_tr,
             epochs=50,
             batch_size=32,
             learning_rate=0.01,
             verbose=None,
             device='cpu',
             half=True):
        """Fit the NCC model.

        Args:
            x_tr (pd.DataFrame): CEPC format dataframe containing the pairs
            y_tr (pd.DataFrame or np.ndarray): labels associated to the pairs
            epochs (int): number of train epochs
            batch_size (int): size of batch
            learning_rate (float): learning rate of the optimizer
            verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
            device (str): cuda or cpu device (defaults to ``cdt.SETTINGS.default_device``)
            half (bool): halve the batch size, since symmetrized pairs double each batch
        """

        if half:
            batch_size //= 2
        if batch_size > len(x_tr):
            batch_size = len(x_tr)
        verbose, device = SETTINGS.get_default(('verbose', verbose),
                                               ('device', device))
        model = self.model
        opt = th.optim.RMSprop(model.parameters(), lr=learning_rate)
        criterion = nn.BCEWithLogitsLoss()
        model = model.to(device)
        y = th.Tensor(y_tr)
        y = y.to(device)
        dataset = [th.Tensor(x).t().to(device) for x in x_tr]
        da = Dataset(dataset, y, device, batch_size)
        data_per_epoch = (len(dataset) // batch_size)

        train_accuracy = []

        with trange(epochs, desc="Epochs", disable=not verbose) as te:
            for _ in te:

                with trange(data_per_epoch,
                            desc="Batches of 2*{}".format(batch_size),
                            disable=not (verbose
                                         and batch_size == len(dataset))) as t:
                    output = []
                    labels = []
                    for batch, label in da:
                        symmetric_batch, symmetric_label = th_enforce_symmetry(
                            batch, label)
                        batch += symmetric_batch
                        label = th.cat((label, symmetric_label))
                        opt.zero_grad()
                        out = th.stack(
                            [model(m.t().unsqueeze(0)) for m in batch],
                            0).squeeze(2)
                        loss = criterion(out, label)
                        loss.backward()
                        output.append(expit(out.data.cpu()))
                        t.set_postfix(loss=loss.item())
                        opt.step()
                        labels.append(label.data.cpu())
                    # Binarize sigmoid outputs at .5 and compare with labels;
                    # flattening both tensors avoids accidental broadcasting.
                    preds = th.cat(output, 0).data.cpu().view(-1)
                    targets = th.cat(labels, 0).data.cpu().view(-1)
                    Acc = 1 - ((preds > .5).float() - targets).abs().mean().item()
                    te.set_postfix(Acc=Acc)
                    train_accuracy.append(Acc)
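Both training loops call a `th_enforce_symmetry` helper that is not shown in these examples. A minimal sketch of what such a helper plausibly does; this is an assumption, not the library's actual implementation:

import torch as th

def th_enforce_symmetry(batch, label, anti=True):
    # Hypothetical sketch: reverse each (npoints, 2) pair so a->b becomes b->a
    symmetric_batch = [m.flip([1]) for m in batch]
    # Reversing a pair flips its 0/1 label when labels are anti-symmetric
    symmetric_label = 1 - label if anti else label
    return symmetric_batch, symmetric_label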