Example #1
def get_materiascursadas(request,
                         cod_carrera=None,
                         inicio=None,
                         fin=None,
                         subjectCode=None):

    provider = DataProvider()
    transformer = DataTransformer()
    manipulator = DataManipulator()

    # Extract the token from the request
    token = get_token(request)
    # Format the args
    fecha_inicio = inicio or request.args.get('inicio')
    fecha_fin = fin or request.args.get('fin')
    # A single degree program and a single plan are needed to compute credits
    carrera = cod_carrera or request.args.get('carrera')
    plan = request.args.get('plan')
    # Fetch the taken courses
    if carrera:
        cursadas_json = provider.get_materiascursadas(token, carrera)
    else:
        cursadas_json = provider.getTakenSubjects(token, subjectCode)
    cursadas_data = transformer.transform_materiascursadas_to_dataframe(
        cursadas_json)

    # Filter by period
    df = manipulator.filtrar_periodo(cursadas_data, fecha_inicio, fecha_fin)
    return df
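These helpers read their remaining arguments from a Flask-style `request`, so in practice they are called from route handlers. A minimal, hypothetical wiring sketch follows; the Flask app, route path, and JSON serialization are assumptions, not part of the original project:

# Hypothetical usage sketch; assumes Flask and get_materiascursadas in scope.
from flask import Flask, request

app = Flask(__name__)

@app.route('/carreras/<cod_carrera>/materiascursadas')
def materiascursadas_endpoint(cod_carrera):
    # 'inicio', 'fin' and 'plan' are picked up from the query string by the helper
    df = get_materiascursadas(request, cod_carrera=cod_carrera)
    return df.to_json(orient='records')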
Example #2
def recursantes_materia(cod_materia):
    token = get_token(request)

    cod_materia = cod_materia.zfill(5)
    carrera = request.args.get('carrera')
    fecha_fin = request.args.get('fecha')
    anio = fecha_fin.split('-')[0] if fecha_fin else None
    mes = fecha_fin.split('-')[1] if fecha_fin else None
    semestre = 1 if mes and int(mes) <= 6 else 2
    dm = DataManipulator()

    # Filter the enrolled students for the degree program and subject
    if carrera:
        inscriptos = DataProvider().get_inscriptos(token, carrera, anio,
                                                   semestre)
    else:
        inscriptos = DataProvider().getEnrolled(token, cod_materia, anio,
                                                semestre)
    inscriptos_df = DataTransformer().transform_materiascursadas_to_dataframe(
        inscriptos)

    # Filter the courses taken for the degree program and subject
    if carrera:
        cursadas = DataProvider().get_materiascursadas(token, carrera)
    else:
        cursadas = DataProvider().getTakenSubjects(token, cod_materia)
    cursadas_df = DataTransformer().transform_materiascursadas_to_dataframe(
        cursadas)

    recursantes = dm.get_recursantes(cursadas_df, inscriptos_df, cod_materia)
    return json.dumps([{
        "Legajo": key,
        "Cantidad": value
    } for key, value in recursantes.items()])
Example #3
def dispersion_notas(cod_materia):
    transformer = DataTransformer()
    provider = DataProvider()
    token = get_token(request)  # Extract the token from the request
    df = get_alumnos_de_materia_periodo(request, cod_materia)
    carrera = request.args.get('carrera')

    if carrera:
        alumnos_carrera_json = provider.get_alumnos_de_carrera(token, carrera)
    else:
        alumnos_carrera_json = provider.getCareerStudents(token)
    alumnos_carrera_df = transformer.transform_to_dataframe(
        alumnos_carrera_json)
    data = transformer.merge_materias_con_promedio(df, alumnos_carrera_df)
    # Iterate to build the final JSON
    resultado = []
    for row in data.itertuples():
        nota = getattr(row, 'nota')
        if nota:
            resultado.append({
                "Promedio": getattr(row, 'promedio'),
                "Alumno": getattr(row, 'alumno'),
                "Nota": nota
            })
    return json.dumps(resultado)
Example #4
def get_materiascursadas_plan(request, carrera=None):
    transformer = DataTransformer()

    cursadas_data = get_materiascursadas(request, carrera)
    plan_data = get_plan(request, carrera)
    data = transformer.merge_materias_con_plan(cursadas_data, plan_data)
    return data, cursadas_data, plan_data
Example #5
def detalle_aprobados(cod_materia):
    manipulator = DataManipulator()
    transformer = DataTransformer()
    df = get_alumnos_de_materia_periodo(request, cod_materia)
    df = manipulator.filtrar_aprobados(df)
    detalle_aprobados = manipulator.cantidades_formas_aprobacion(df)
    data = detalle_aprobados.to_dict()
    resultado = {}
    for nombre, valor in data.items():
        resultado[transformer.get_forma_aprobacion(nombre)] = valor
    return json.dumps([resultado])
Example #6
def alumnos_carrera(carrera):
    token = get_token(request)
    transformer = DataTransformer()
    json_data = DataProvider().get_alumnos_de_carrera(token, carrera)
    data = transformer.transform_to_dataframe(json_data)
    inscriptos = DataManipulator().inscriptos_por_carrera(data)['alumno']
    return json.dumps([{
        "nombre": transformer.transform_timestamp_to_semester(key),
        "cantidad": value
    } for key, value in inscriptos.items()])
Example #7
def train_model():
    train = pd.read_csv('data/train.csv', index_col=0)
    test = pd.read_csv('data/test.csv', index_col=0)

    transformer = DataTransformer().fit(pd.concat([train, test]))

    X_train = transformer.transform(train)
    y_train = train.Survived

    classifier = RandomForestClassifier(criterion='gini',
                                        n_estimators=1750,
                                        max_depth=7,
                                        min_samples_split=6,
                                        min_samples_leaf=6,
                                        max_features='auto',
                                        oob_score=True,
                                        random_state=SEED,
                                        n_jobs=-1,
                                        verbose=0)

    N = 5
    oob = 0

    scores, acc_scores = [], []

    skf = StratifiedKFold(n_splits=N, random_state=N, shuffle=True)

    for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        # Fitting the model
        classifier.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx])

        # Computing Validation AUC score
        val_fpr, val_tpr, val_thresholds = roc_curve(
            y_train.iloc[val_idx],
            classifier.predict_proba(X_train.iloc[val_idx])[:, 1])
        val_auc_score = auc(val_fpr, val_tpr)

        scores.append(val_auc_score)

        acc_scores.append(
            accuracy_score(y_train.iloc[val_idx],
                           classifier.predict(X_train.iloc[val_idx])))

        oob += classifier.oob_score_ / N
        logger.info('Fold {} OOB Score: {}'.format(fold,
                                                   classifier.oob_score_))

    logger.info('Average OOB Score: {}'.format(oob))
    logger.info('Average auc: {}'.format(np.mean(scores)))
    logger.info('Average accuracy: {}'.format(np.mean(acc_scores)))

    return classifier, transformer
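The returned (classifier, transformer) pair can then score held-out data. A minimal sketch, assuming the same 'data/test.csv' layout used above:

# Hypothetical follow-up to train_model(); the file layout is an assumption.
classifier, transformer = train_model()
test = pd.read_csv('data/test.csv', index_col=0)
X_test = transformer.transform(test)
predictions = classifier.predict(X_test)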
Example #8
    def setUp(self):
        self.manipulator = DataManipulator()
        transformer = DataTransformer()
        with open('source/tests/json/api_carreras_materiascursadas.json',
                  'r') as archivo_alumnos:
            data = json.loads(archivo_alumnos.read())
            df_materias = transformer.transform_materiascursadas_to_dataframe(
                data)
        with open('source/tests/json/plan_test.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            df_plan = transformer.transform_to_dataframe(data)
        self.dataframe = transformer.merge_materias_con_plan(
            df_materias, df_plan)
Example #9
def get_materiascursadas_promedio(request, carrera, inicio=None, fin=None):
    cursadas_data = get_materiascursadas(request, carrera, inicio, fin)

    transformer = DataTransformer()

    # Get the students of the degree program
    token = get_token(request)  # Extract the token from the request
    alumnos_carrera_json = DataProvider().get_alumnos_de_carrera(
        token, carrera)
    alumnos_carrera_df = transformer.transform_to_dataframe(
        alumnos_carrera_json)
    data = transformer.merge_materias_con_promedio(cursadas_data,
                                                   alumnos_carrera_df)
    return data
Example #10
def get_plan(request, carrera=None):
    provider = DataProvider()
    transformer = DataTransformer()

    # Extract the token from the request
    token = get_token(request)
    # Format the args
    fecha_inicio = request.args.get('inicio')
    fecha_fin = request.args.get('fin')
    # A single degree program and a single plan are needed to compute credits
    carrera = carrera or request.args.get('carrera')
    plan = request.args.get('plan')
    # Fetch the plan
    plan_json = provider.get_plan(token, carrera, plan)
    plan_data = transformer.transform_to_dataframe(plan_json)
    return plan_data
Example #11
def main():
    args = parse_args()
    config = json.load(open(args.config_file, 'r'))

    path_list = loadtxt(config['input'])
    assert 0 < len(path_list)

    save_dir = config['output']['save_dir']
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    print('Transformation parameters.')
    params = config['parameters']
    for key in params.keys():
        print('  => {}: {}'.format(key, params[key]))

    print('Data augmentation.')
    transformer = DataTransformer(config)
    x_times_sample = params['x_times']
    for i in range(x_times_sample):
        data = np.random.permutation(path_list)
        savefile = [
            os.path.join(save_dir, '{:04d}_x.pkl'.format(i)),
            os.path.join(save_dir, '{:04d}_y.pkl'.format(i))
        ]
        building(transformer, data, savefile)
        print('  => # of completed x times sample: {} / {}'.format(
            i + 1, x_times_sample),
              end='\r')
    print('\nDone')
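main() only requires that the config JSON expose 'input' (the path list consumed by loadtxt), 'output.save_dir', and a 'parameters' mapping containing at least 'x_times'. A hypothetical config, written from Python for illustration (all values are placeholders):

# Hypothetical config matching the keys read by main(); values are placeholders.
import json

config = {
    'input': 'data/path_list.txt',            # file consumed by loadtxt()
    'output': {'save_dir': 'data/augmented'},
    'parameters': {'x_times': 10},            # number of permuted copies
}
with open('config.json', 'w') as f:
    json.dump(config, f, indent=2)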
Example #12
    def setUp(self):
        self.manipulator = DataManipulator()
        self.transformer = DataTransformer()
        with open('source/tests/json/api_carreras_materiascursadas.json', 'r') as archivo_alumnos:
            data = json.loads(archivo_alumnos.read())
            self.df_materiascursadas = self.transformer.transform_materiascursadas_to_dataframe(data)

        with open('source/tests/json/api_carreras_planes_anio.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            self.df_plan = self.transformer.transform_to_dataframe(data)

        with open('source/tests/json/api_carrera_planes_anio_cantidad_materias_necesarias.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            self.cantidad_materias_necesarias = data["cantidad"]

        self.dataframe = self.transformer.merge_materias_con_plan(
            self.df_materiascursadas, self.df_plan)
Example #13
def promedios_alumno(legajo):
    merged_data, _, plan_data = get_materiascursadas_plan(request)
    manipulator = DataManipulator()
    scores = manipulator.get_scores_alumno(merged_data, legajo)
    return json.dumps([{
        "nombre": row["periodo_semestre"],
        "valor": row["score_periodo"]
    } for index, row in DataTransformer().transform_scores_unicos(
        scores).iterrows()])
Example #14
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """
    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(
                    functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(data[:, st:ed],
                                               torch.argmax(c[:, st_c:ed_c],
                                                            dim=1),
                                               reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(self,
            train_data,
            weekday_percentage,
            discrete_columns=tuple(),
            epochs=300,
            log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """

        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        discriminator = Discriminator(data_dim + self.cond_generator.n_opt,
                                      self.dis_dim).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9),
                                weight_decay=self.l2scale)
        optimizerD = optim.Adam(discriminator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        print("Number of training steps per epoch is {} for batch size {}\n".
              format(steps_per_epoch, self.batch_size))
        for epoch in range(epochs):
            for id_ in range(steps_per_epoch):
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                pen = discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

            print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                  (epoch + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                  flush=True)

            self.evaluate_model(epoch, weekday_percentage)

    def evaluate_model(self, epoch, weekday_percentage):
        # create some generated samples using the generator model
        gen_samples = self.sample(18000)
        gen_weekday_percentage = gen_samples['weekday'].value_counts(
            normalize=True)

        # sort series by the weekday name
        weekday_percentage = weekday_percentage.sort_index(ascending=True)
        gen_weekday_percentage = gen_weekday_percentage.sort_index(
            ascending=True)

        score = mean_squared_error(weekday_percentage, gen_weekday_percentage)
        print("Evaluation after epoch {} is {}\n".format(epoch + 1, score))

    def sample(self, n):
        """Sample data similar to the training data.

        Args:
            n (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is not None:
                c1 = torch.from_numpy(condvec).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)
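A minimal sketch of driving this synthesizer end to end. The file name and the 'weekday' column are illustrative assumptions; weekday_percentage is the normalized weekday distribution that evaluate_model() compares against:

# Hypothetical usage of the class above; input data is an assumption.
import pandas as pd

df = pd.read_csv('data/events.csv')  # illustrative input file
weekday_percentage = df['weekday'].value_counts(normalize=True)

synth = CTGANSynthesizer(batch_size=500)
synth.fit(df, weekday_percentage, discrete_columns=('weekday',), epochs=10)
samples = synth.sample(1000)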
Example #16
    def row_periodos(self, row):
        transformer = DataTransformer()
        row['fecha_periodo'] = transformer.fecha_periodo(row.fecha)
        row['periodo_semestre'] = transformer.periodo_semestre(
            row['fecha_periodo'])
        return row
Example #17
class AlumnoTest(unittest.TestCase):

    def setUp(self):
        self.manipulator = DataManipulator()
        self.transformer = DataTransformer()
        with open('source/tests/json/api_carreras_materiascursadas.json', 'r') as archivo_alumnos:
            data = json.loads(archivo_alumnos.read())
            self.df_materiascursadas = self.transformer.transform_materiascursadas_to_dataframe(data)

        with open('source/tests/json/api_carreras_planes_anio.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            self.df_plan = self.transformer.transform_to_dataframe(data)

        with open('source/tests/json/api_carrera_planes_anio_cantidad_materias_necesarias.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            self.cantidad_materias_necesarias = data["cantidad"]

        self.dataframe = self.transformer.merge_materias_con_plan(
            self.df_materiascursadas, self.df_plan)

    def test_porcentaje_nucleos(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        data = self.manipulator.porcentajes_aprobadas_areas(
            self.df_plan, materias_alumno)
        self.assertEqual(data['Inglés'], 50)  # Has 50% approved according to the mock

    def test_porcentaje_nucleos_dentro_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(materias_alumno, '2018-02-10', '2020-02-10')
        data = self.manipulator.porcentajes_aprobadas_areas(self.df_plan, filtradas_periodo)
        self.assertEqual(data['Inglés'], 50)  # Has 50% approved according to the mock

    def test_porcentaje_nucleos_fuera_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(materias_alumno, '2018-02-10', '2018-10-10')
        data = self.manipulator.porcentajes_aprobadas_areas(self.df_plan, filtradas_periodo)
        self.assertEqual(data['Inglés'], 0)  # Has 0% approved according to the mock

    def test_porcentaje_areas(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        data = self.manipulator.porcentajes_aprobadas_nucleos(
            self.df_plan, materias_alumno)
        self.assertEqual("%.2f" % data['I'], '33.33')  # Per the mock it is 33.333333%; limited to 2 decimals

    def test_porcentaje_areas_dentro_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(materias_alumno, '2018-02-10', '2020-10-10')
        data = self.manipulator.porcentajes_aprobadas_nucleos(
            self.df_plan, filtradas_periodo)
        self.assertEqual("%.2f" % data['I'], '33.33')  # Per the mock it is 33.333333%; limited to 2 decimals

    def test_porcentaje_areas_fuera_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(materias_alumno, '2018-02-10', '2018-10-10')
        data = self.manipulator.porcentajes_aprobadas_nucleos(
            self.df_plan, filtradas_periodo)
        self.assertEqual("%.2f" % data['I'], '0.00')  # Per the mock it is 0%

    def test_scores(self):
        scores = self.manipulator.get_scores_alumno(self.dataframe, "1")
        scores_unicos = self.transformer.transform_scores_unicos(scores)
        # With a 2, a 7 and an 8 in 2018, that semester's score should be (2+7+8)/3 = 5.67
        resultado = "%.2f" % scores_unicos[scores_unicos.periodo_semestre == '2018-S2'].score_periodo
        esperado = "5.67"
        self.assertEqual(resultado, esperado)

    def test_cantidad_aprobadas(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(
            self.dataframe, "1")
        cantidad_aprobadas = self.manipulator.cantidad_aprobadas(materias_alumno)  # Should be 2
        self.assertEqual(cantidad_aprobadas, 2)

    def test_porcentaje_carrera(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(
            self.dataframe, "1")
        cantidad_aprobadas = self.manipulator.cantidad_aprobadas(materias_alumno)  # Should be 2

        porcentaje = self.manipulator.porcentaje_aprobadas(cantidad_aprobadas, self.cantidad_materias_necesarias)  # (2/40)*100 = 5
        self.assertEqual(porcentaje, 5)
Example #18
# In[3]:

# np.random.seed(1)
# gmm = GaussianMixture(n_components=5, covariance_type='spherical')
# gmm.means_ = np.array([[10], [20], [60], [80], [110]])
# gmm.covariances_ = np.array([[3], [3], [2], [2], [1]]) ** 2
# gmm.weights_ = np.array([0.2, 0.2, 0.2, 0.2, 0.2])

# X = gmm.sample(2000)
# data = X[0]
# data = (data - min(X[0]))/(max(X[0])-min(X[0]))
# plt.hist(data, 200000, density=False, histtype='stepfilled', alpha=1)

train_data = pd.read_csv('data/testone.csv')
transformer = DataTransformer()
discrete_columns = tuple()
num_gen = train_data.shape[1]
transformer.fit(train_data, discrete_columns)
train_data = transformer.transform(train_data)

# In[4]:


class formerNet(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate five parameters and assign them as members.
        """
        super().__init__()
        self.former = torch.nn.Sequential(
            torch.nn.Linear(Z_dim, h_dim),
            # NOTE: the original snippet is cut off at this point; the
            # remaining layers of the Sequential stack are not recoverable
            # from this excerpt.
        )
Example #19
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """
    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(
                    functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(data[:, st:ed],
                                               torch.argmax(c[:, st_c:ed_c],
                                                            dim=1),
                                               reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(self,
            train_data,
            prefered_label,
            black_box_path,
            discrete_columns=tuple(),
            conditional_cols=None,
            epochs=300,
            log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """

        self.prefered_label = prefered_label
        self.blackbox_model = pickle.load(open(black_box_path, "rb"))

        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency,
            conditional_cols)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        discriminator = Discriminator(data_dim, self.dis_dim,
                                      1).to(self.device)

        conditonal_discriminator = Discriminator(1 + self.cond_generator.n_opt,
                                                 self.dis_dim).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9),
                                weight_decay=self.l2scale)
        optimizerD = optim.Adam(discriminator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9))
        optimizerconditionalD = optim.Adam(
            conditonal_discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in range(epochs):
            flip_loss_list = []
            real_flip_loss_list = []
            for id_ in range(steps_per_epoch):

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:

                    real_cat = real
                    fake_cat = fakeact

                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                if c1 is not None:
                    conditional_fake_cat = torch.cat([y_fake, c1], dim=1)
                    conditional_real_cat = torch.cat([y_real, c2], dim=1)

                else:
                    conditional_fake_cat = y_fake
                    conditional_real_cat = y_real

                conditional_y_fake = conditonal_discriminator(
                    conditional_fake_cat)
                conditional_y_real = conditonal_discriminator(
                    conditional_real_cat)

                pen = discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                condtional_pen = conditonal_discriminator.calc_gradient_penalty(
                    conditional_real_cat, conditional_fake_cat, self.device)
                loss_condtional_d = -(torch.mean(conditional_y_real) -
                                      torch.mean(conditional_y_fake))

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward(retain_graph=True)
                optimizerD.step()

                optimizerconditionalD.zero_grad()
                condtional_pen.backward(retain_graph=True)
                loss_condtional_d.backward()
                optimizerconditionalD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(fakeact)
                    conditional_y_fake = conditonal_discriminator(
                        torch.cat([y_fake, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)
                    conditional_y_fake = conditonal_discriminator(y_fake)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                fake_act_inv = self.transformer.inverse_transform(
                    fakeact.detach().cpu().numpy(), None)
                fake_act_inv = _factorize_categoricals(fake_act_inv,
                                                       discrete_columns)
                fake_act_inv = xgb.DMatrix(data=fake_act_inv)
                black_box_pred_prob = self.blackbox_model.predict(fake_act_inv)
                #black_box_pred_prob = torch.from_numpy(np.stack([1-black_box_pred_prob,black_box_pred_prob], axis = -1))
                #flip_loss = torch.nn.CrossEntropyLoss()(black_box_pred_prob, torch.tensor([self.prefered_label]).repeat(self.batch_size))
                flip_loss = sum(-np.log(black_box_pred_prob)) / self.batch_size

                real_inv = self.transformer.inverse_transform(
                    real.detach().cpu().numpy(), None)
                real_inv = _factorize_categoricals(real_inv, discrete_columns)
                real_inv = xgb.DMatrix(data=real_inv)
                real_pred_prob = self.blackbox_model.predict(real_inv)
                #black_box_pred_prob = torch.from_numpy(np.stack([1-black_box_pred_prob,black_box_pred_prob], axis = -1))
                #flip_loss = torch.nn.CrossEntropyLoss()(black_box_pred_prob, torch.tensor([self.prefered_label]).repeat(self.batch_size))
                real_flip_loss = sum(-np.log(real_pred_prob)) / self.batch_size

                loss_g = -torch.mean(
                    conditional_y_fake) + cross_entropy + 10 * flip_loss
                #print(f"Base Loss:{-torch.mean(conditional_y_fake)}, Conditional Loss:{cross_entropy}, 10Flip Loss:{flip_loss}")
                flip_loss_list.append(flip_loss)
                real_flip_loss_list.append(real_flip_loss)

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

            print(
                f"Generated flip loss {np.mean(flip_loss_list)}, Real flip loss {np.mean(real_flip_loss_list)}"
            )
            print("Condtional Cross Entropy Loss", cross_entropy)
            print(
                "Epoch %d, Loss G: %.4f, Loss D: %.4f, Loss Conditional D: %.4f"
                % (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu(),
                   loss_condtional_d.detach().cpu()),
                flush=True)

    def sample(self, n, col_index=None):
        """Sample data similar to the training data.
        Args:
            n (int):
                Number of rows to sample.
        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec, m1 = self.cond_generator.sample_zero(self.batch_size)
            m1 = torch.from_numpy(m1).to(self.device)
            if condvec is not None:
                c1 = condvec
                if col_index is not None:
                    c1 = np.zeros_like(c1)
                    c1[:, col_index] = 1
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())
            if condvec is not None:  # c1 is only defined when a condvec exists
                print(self._cond_loss(fake, c1, m1))

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)
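This variant additionally needs a pickled black-box classifier whose predictions drive the flip loss; given the xgb.DMatrix calls above, that model is assumed to be an xgboost model. A sketch under those assumptions (file names, label, and columns are placeholders):

# Hypothetical usage of the counterfactual variant above.
import pandas as pd

df = pd.read_csv('data/train.csv')  # illustrative input file
synth = CTGANSynthesizer()
synth.fit(df,
          prefered_label=1,                 # class the flip loss pushes toward
          black_box_path='models/xgb.pkl',  # pickled xgboost model
          discrete_columns=('sex', 'workclass'),  # placeholder columns
          epochs=10)
counterfactuals = synth.sample(1000)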
Example #20
class TransformerTest(unittest.TestCase):
    def setUp(self):
        self.transformer = DataTransformer()

    def test_periodo_segundo_semestre(self):
        fecha = '2020-12-12'
        semestre_esperado = '2020-S2'
        semestre_resultado = self.transformer.periodo_semestre(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_periodo_primer_semestre(self):
        fecha = '2020-05-12'
        semestre_esperado = '2020-S1'
        semestre_resultado = self.transformer.periodo_semestre(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_forma_aprobacion_encontrada(self):
        forma_aprobacion = 'P'
        esperado = 'Promocion'
        resultado = self.transformer.get_forma_aprobacion(forma_aprobacion)
        self.assertEqual(esperado, resultado)

    def test_forma_aprobacion_no_encontrada(self):
        """
            Si la forma de aprobación no existe, devuelve la pedida
        """
        forma_aprobacion = 'Pw'
        esperado = 'Pw'
        resultado = self.transformer.get_forma_aprobacion(forma_aprobacion)
        self.assertEqual(esperado, resultado)

    def test_fecha_periodo_primer_semestre(self):
        """
            If the date falls between month 4 and month 9 it belongs to the first semester,
            because second-semester grades may be closed as late as month 3.
        """
        fecha = '2020-05-12'
        semestre_esperado = '2020-06-30'
        semestre_resultado = self.transformer.fecha_periodo(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_fecha_periodo_segundo_semestre_anio_anterior(self):
        """
            If the date falls between month 1 and month 3,
            it belongs to the second semester of the previous year
        """
        fecha = '2020-02-12'
        semestre_esperado = '2019-12-31'
        semestre_resultado = self.transformer.fecha_periodo(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_fecha_periodo_segundo_semestre_mismo_anio(self):
        """
            If the date falls between month 10 and month 12,
            it belongs to the second semester of that same year
        """
        fecha = '2020-11-12'
        semestre_esperado = '2020-12-31'
        semestre_resultado = self.transformer.fecha_periodo(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_timestamp_to_semester(self):
        timestamp = '2020-07-01 12:12:12'
        esperado = '2020-S2'
        resultado = self.transformer.transform_timestamp_to_semester(timestamp)
        self.assertEqual(esperado, resultado)

    def test_timestamp_to_semester_1st(self):
        timestamp = '2020-01-01 12:12:12'
        esperado = '2020-S1'
        resultado = self.transformer.transform_timestamp_to_semester(timestamp)
        self.assertEqual(esperado, resultado)

    def test_date_to_semester(self):
        fecha = date(2020, 7, 1)
        esperado = '2020-S2'
        resultado = self.transformer.transform_date_to_semester(fecha)
        self.assertEqual(esperado, resultado)

    def test_date_to_semester_1st(self):
        fecha = date(2020, 1, 1)
        esperado = '2020-S1'
        resultado = self.transformer.transform_date_to_semester(fecha)
        self.assertEqual(esperado, resultado)

    def test_timestamp_to_datetime(self):
        timestamp = '2020-01-01 12:12:12'
        esperado = datetime(2020, 1, 1, 12, 12, 12)
        resultado = self.transformer.transform_timestamp_to_datetime(timestamp)
        self.assertEqual(esperado, resultado)

    def test_merge_materias_con_promedio(self):
        pass

    def test_merge_materias_con_plan(self):
        pass

    def test_transform_scores_unicos(self):
        pass

    def test_materiascursadas_to_dataframe(self):
        pass

    def test_transform_to_dataframe(self):
        pass
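Taken together, the fecha_periodo tests pin down the mapping: months 4-9 close on June 30 of the same year, months 10-12 on December 31 of the same year, and months 1-3 on December 31 of the previous year. A sketch consistent with these tests (a reconstruction, not the project's actual implementation):

# Hypothetical reconstruction of fecha_periodo from the tests above.
def fecha_periodo(fecha):
    anio, mes = (int(parte) for parte in fecha.split('-')[:2])
    if 4 <= mes <= 9:
        return '{}-06-30'.format(anio)       # first semester
    if mes >= 10:
        return '{}-12-31'.format(anio)       # second semester, same year
    return '{}-12-31'.format(anio - 1)       # second semester, previous year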
Example #21
    def setUp(self):
        self.transformer = DataTransformer()