def get_materiascursadas(request, cod_carrera=None, inicio=None, fin=None, subjectCode=None):
    provider = DataProvider()
    transformer = DataTransformer()
    manipulator = DataManipulator()
    # Pull the token from the request
    token = get_token(request)
    # Format the args
    fecha_inicio = inicio or request.args.get('inicio')
    fecha_fin = fin or request.args.get('fin')
    # It must be a single career and a single plan to compute credits
    carrera = cod_carrera or request.args.get('carrera')
    plan = request.args.get('plan')
    # Fetch the taken courses
    if carrera:
        cursadas_json = provider.get_materiascursadas(token, carrera)
    else:
        cursadas_json = provider.getTakenSubjects(token, subjectCode)
    cursadas_data = transformer.transform_materiascursadas_to_dataframe(cursadas_json)
    # Filter by period
    df = manipulator.filtrar_periodo(cursadas_data, fecha_inicio, fecha_fin)
    return df
def recursantes_materia(cod_materia):
    token = get_token(request)
    cod_materia = cod_materia.zfill(5)
    carrera = request.args.get('carrera')
    fecha_fin = request.args.get('fecha')
    anio = fecha_fin.split('-')[0] if fecha_fin else None
    mes = fecha_fin.split('-')[1] if fecha_fin else None
    semestre = 1 if mes and int(mes) <= 6 else 2
    dm = DataManipulator()
    # Filter the enrolled students by career and course
    if carrera:
        inscriptos = DataProvider().get_inscriptos(token, carrera, anio, semestre)
    else:
        inscriptos = DataProvider().getEnrolled(token, cod_materia, anio, semestre)
    inscriptos_df = DataTransformer().transform_materiascursadas_to_dataframe(inscriptos)
    # Filter the taken courses by career and course
    if carrera:
        cursadas = DataProvider().get_materiascursadas(token, carrera)
    else:
        cursadas = DataProvider().getTakenSubjects(token, cod_materia)
    cursadas_df = DataTransformer().transform_materiascursadas_to_dataframe(cursadas)
    recursantes = dm.get_recursantes(cursadas_df, inscriptos_df, cod_materia)
    return json.dumps([
        {"Legajo": key, "Cantidad": value}
        for key, value in recursantes.items()
    ])
def dispersion_notas(cod_materia):
    transformer = DataTransformer()
    provider = DataProvider()
    # Pull the token from the request
    token = get_token(request)
    df = get_alumnos_de_materia_periodo(request, cod_materia)
    carrera = request.args.get('carrera')
    if carrera:
        alumnos_carrera_json = provider.get_alumnos_de_carrera(token, carrera)
    else:
        alumnos_carrera_json = provider.getCareerStudents(token)
    alumnos_carrera_df = transformer.transform_to_dataframe(alumnos_carrera_json)
    data = transformer.merge_materias_con_promedio(df, alumnos_carrera_df)
    # Iterate to build the final JSON
    resultado = []
    for row in data.itertuples():
        nota = getattr(row, 'nota')
        if nota:
            resultado.append({
                "Promedio": getattr(row, 'promedio'),
                "Alumno": getattr(row, 'alumno'),
                "Nota": nota,
            })
    return json.dumps(resultado)
def get_materiascursadas_plan(request, carrera=None):
    transformer = DataTransformer()
    cursadas_data = get_materiascursadas(request, carrera)
    plan_data = get_plan(request, carrera)
    data = transformer.merge_materias_con_plan(cursadas_data, plan_data)
    return data, cursadas_data, plan_data
def detalle_aprobados(cod_materia):
    manipulator = DataManipulator()
    transformer = DataTransformer()
    df = get_alumnos_de_materia_periodo(request, cod_materia)
    df = manipulator.filtrar_aprobados(df)
    detalle_aprobados = manipulator.cantidades_formas_aprobacion(df)
    data = detalle_aprobados.to_dict()
    resultado = {}
    for nombre, valor in data.items():
        resultado[transformer.get_forma_aprobacion(nombre)] = valor
    return json.dumps([resultado])
def alumnos_carrera(carrera):
    token = get_token(request)
    transformer = DataTransformer()
    json_data = DataProvider().get_alumnos_de_carrera(token, carrera)
    data = transformer.transform_to_dataframe(json_data)
    inscriptos = DataManipulator().inscriptos_por_carrera(data)['alumno']
    return json.dumps([
        {"nombre": transformer.transform_timestamp_to_semester(key),
         "cantidad": value}
        for key, value in inscriptos.items()
    ])
def train_model():
    train = pd.read_csv('data/train.csv', index_col=0)
    test = pd.read_csv('data/test.csv', index_col=0)
    transformer = DataTransformer().fit(pd.concat([train, test]))
    X_train = transformer.transform(train)
    y_train = train.Survived
    classifier = RandomForestClassifier(criterion='gini',
                                        n_estimators=1750,
                                        max_depth=7,
                                        min_samples_split=6,
                                        min_samples_leaf=6,
                                        max_features='auto',
                                        oob_score=True,
                                        random_state=SEED,
                                        n_jobs=-1,
                                        verbose=0)
    N = 5
    oob = 0
    scores, acc_scores = [], []
    skf = StratifiedKFold(n_splits=N, random_state=N, shuffle=True)
    for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        # Fitting the model
        classifier.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
        # Computing validation AUC score
        val_fpr, val_tpr, val_thresholds = roc_curve(
            y_train.iloc[val_idx],
            classifier.predict_proba(X_train.iloc[val_idx])[:, 1])
        val_auc_score = auc(val_fpr, val_tpr)
        scores.append(val_auc_score)
        acc_scores.append(
            accuracy_score(y_train.iloc[val_idx],
                           classifier.predict(X_train.iloc[val_idx])))
        oob += classifier.oob_score_ / N
        logger.info('Fold {} OOB Score: {}'.format(fold, classifier.oob_score_))
    logger.info('Average OOB Score: {}'.format(oob))
    # Average AUC over all folds (the original averaged only the last fold's score)
    logger.info('Average auc: {}'.format(np.mean(scores)))
    logger.info('Average accuracy: {}'.format(np.mean(acc_scores)))
    return classifier, transformer
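# A minimal usage sketch for train_model() above, assuming the same data/
# layout and a DataTransformer with a transform() method; the submission
# column names are illustrative, not part of the original code.
if __name__ == '__main__':
    classifier, transformer = train_model()
    test = pd.read_csv('data/test.csv', index_col=0)
    X_test = transformer.transform(test)
    predictions = classifier.predict(X_test)
    pd.DataFrame({'PassengerId': test.index,
                  'Survived': predictions}).to_csv('submission.csv', index=False)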
def setUp(self):
    self.manipulator = DataManipulator()
    transformer = DataTransformer()
    with open('source/tests/json/api_carreras_materiascursadas.json', 'r') as archivo_alumnos:
        data = json.loads(archivo_alumnos.read())
        df_materias = transformer.transform_materiascursadas_to_dataframe(data)
    with open('source/tests/json/plan_test.json', 'r') as archivo_plan:
        data = json.loads(archivo_plan.read())
        df_plan = transformer.transform_to_dataframe(data)
    self.dataframe = transformer.merge_materias_con_plan(df_materias, df_plan)
def get_materiascursadas_promedio(request, carrera, inicio=None, fin=None):
    cursadas_data = get_materiascursadas(request, carrera, inicio, fin)
    transformer = DataTransformer()
    # Fetch the students of the career
    token = get_token(request)  # Pull the token from the request
    alumnos_carrera_json = DataProvider().get_alumnos_de_carrera(token, carrera)
    alumnos_carrera_df = transformer.transform_to_dataframe(alumnos_carrera_json)
    data = transformer.merge_materias_con_promedio(cursadas_data, alumnos_carrera_df)
    return data
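# A minimal sketch of exercising the helper above outside a live request,
# assuming the Flask app these view helpers belong to is importable as
# `app`; the query string, career code and dates are illustrative.
from flask import request

with app.test_request_context('/?inicio=2018-01-01&fin=2020-12-31'):
    promedios_df = get_materiascursadas_promedio(request, 'W', '2018-01-01', '2020-12-31')
    print(promedios_df.head())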
def get_plan(request, carrera=None):
    provider = DataProvider()
    transformer = DataTransformer()
    # Pull the token from the request
    token = get_token(request)
    # Format the args
    fecha_inicio = request.args.get('inicio')
    fecha_fin = request.args.get('fin')
    # It must be a single career and a single plan to compute credits
    carrera = carrera or request.args.get('carrera')
    plan = request.args.get('plan')
    # Fetch the plan
    plan_json = provider.get_plan(token, carrera, plan)
    plan_data = transformer.transform_to_dataframe(plan_json)
    return plan_data
def main():
    args = parse_args()
    with open(args.config_file, 'r') as f:
        config = json.load(f)
    path_list = loadtxt(config['input'])
    # Compare against the list's length, not the list itself
    assert 0 < len(path_list)

    save_dir = config['output']['save_dir']
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    print('Transformation parameters.')
    params = config['parameters']
    for key in params.keys():
        print(' => {}: {}'.format(key, params[key]))

    print('Data augmentation.')
    transformer = DataTransformer(config)
    x_times_sample = params['x_times']
    for i in range(x_times_sample):
        data = np.random.permutation(path_list)
        savefile = [
            os.path.join(save_dir, '{:04d}_x.pkl'.format(i)),
            os.path.join(save_dir, '{:04d}_y.pkl'.format(i))
        ]
        building(transformer, data, savefile)
        print(' => # of completed x times sample: {} / {}'.format(
            i + 1, x_times_sample), end='\r')
    print('\nDone')
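# A hypothetical example of the JSON config consumed by main() above. The
# keys mirror what the code reads ('input', 'output.save_dir',
# 'parameters.x_times'); the values are illustrative only.
config = {
    "input": "data/path_list.txt",         # text file consumed by loadtxt()
    "output": {"save_dir": "augmented/"},  # where the *_x.pkl / *_y.pkl pairs go
    "parameters": {"x_times": 4},          # number of permuted passes to generate
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)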
def promedios_alumno(legajo):
    merged_data, _, plan_data = get_materiascursadas_plan(request)
    manipulator = DataManipulator()
    scores = manipulator.get_scores_alumno(merged_data, legajo)
    return json.dumps([
        {"nombre": row["periodo_semestre"], "valor": row["score_periodo"]}
        for index, row in DataTransformer().transform_scores_unicos(scores).iterrows()
    ])
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different
    components are orchestrated together. For more details about the
    process, please check the [Modeling Tabular data using Conditional
    GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A
            Residual Layer will be created for each one of the values
            provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator
            Layers. A Linear Layer will be created for each one of the
            values provided. Defaults to (256, 256).
        l2scale (float):
            Weight decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """

    def __init__(self, embedding_dim=128, gen_dim=(256, 256),
                 dis_dim=(256, 256), l2scale=1e-6, batch_size=500):
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _apply_activate(self, data):
        # Apply the matching activation to each span of generator output:
        # tanh for continuous columns, gumbel-softmax for one-hot columns.
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0
        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        # Cross entropy between generated one-hot columns and the sampled
        # conditional vector, masked to the selected column.
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True
            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue
                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c
            else:
                assert 0
        loss = torch.stack(loss, dim=1)
        return (loss * m).sum() / data.size()[0]

    def fit(self, train_data, weekday_percentage, discrete_columns=tuple(),
            epochs=300, log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            weekday_percentage (pandas.Series):
                Per-weekday share of the real data, compared against the
                generated samples in ``evaluate_model``.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the
                Conditional Vector. If ``train_data`` is a Numpy array,
                this list should contain the integer indices of the
                columns. Otherwise, if it is a ``pandas.DataFrame``, this
                list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in
                conditional sampling. Defaults to ``True``.
        """
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)
        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt,
            self.gen_dim, data_dim).to(self.device)
        discriminator = Discriminator(
            data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(), lr=2e-4,
                                betas=(0.5, 0.9), weight_decay=self.l2scale)
        optimizerD = optim.Adam(discriminator.parameters(), lr=2e-4,
                                betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        print("Number of training steps per epoch is {} for batch size {}\n".format(
            steps_per_epoch, self.batch_size))
        for epoch in range(epochs):
            for id_ in range(steps_per_epoch):
                # Discriminator step
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                pen = discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                optimizerD.step()

                # Generator step
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

            print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                  (epoch + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                  flush=True)
            self.evaluate_model(epoch, weekday_percentage)

    def evaluate_model(self, epoch, weekday_percentage):
        # Create some generated samples using the generator model
        gen_samples = self.sample(18000)
        gen_weekday_percentage = gen_samples['weekday'].value_counts(normalize=True)
        # Sort both series by the weekday name
        weekday_percentage = weekday_percentage.sort_index(ascending=True)
        gen_weekday_percentage = gen_weekday_percentage.sort_index(ascending=True)
        score = mean_squared_error(weekday_percentage, gen_weekday_percentage)
        print("Evaluation after epoch {} is {}\n".format(epoch + 1, score))

    def sample(self, n):
        """Sample data similar to the training data.

        Args:
            n (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is not None:
                c1 = torch.from_numpy(condvec).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]
        return self.transformer.inverse_transform(data, None)
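# A minimal usage sketch for the synthesizer above, assuming a pandas
# DataFrame with a categorical 'weekday' column; the CSV path and column
# name are illustrative, not part of the original code.
data = pd.read_csv('data/events.csv')  # hypothetical input
weekday_percentage = data['weekday'].value_counts(normalize=True)

synthesizer = CTGANSynthesizer(batch_size=500)
synthesizer.fit(data, weekday_percentage,
                discrete_columns=('weekday',), epochs=300)
samples = synthesizer.sample(1000)  # synthetic rows in the original schema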
def row_periodos(self, row):
    transformer = DataTransformer()
    row['fecha_periodo'] = transformer.fecha_periodo(row.fecha)
    row['periodo_semestre'] = transformer.periodo_semestre(row['fecha_periodo'])
    return row
class AlumnoTest(unittest.TestCase):
    def setUp(self):
        self.manipulator = DataManipulator()
        self.transformer = DataTransformer()
        with open('source/tests/json/api_carreras_materiascursadas.json', 'r') as archivo_alumnos:
            data = json.loads(archivo_alumnos.read())
            self.df_materiascursadas = self.transformer.transform_materiascursadas_to_dataframe(data)
        with open('source/tests/json/api_carreras_planes_anio.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            self.df_plan = self.transformer.transform_to_dataframe(data)
        with open('source/tests/json/api_carrera_planes_anio_cantidad_materias_necesarias.json', 'r') as archivo_plan:
            data = json.loads(archivo_plan.read())
            self.cantidad_materias_necesarias = data["cantidad"]
        self.dataframe = self.transformer.merge_materias_con_plan(
            self.df_materiascursadas, self.df_plan)

    def test_porcentaje_nucleos(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        data = self.manipulator.porcentajes_aprobadas_areas(self.df_plan, materias_alumno)
        self.assertEqual(data['Inglés'], 50)  # 50% approved according to the mock

    def test_porcentaje_nucleos_dentro_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(
            materias_alumno, '2018-02-10', '2020-02-10')
        data = self.manipulator.porcentajes_aprobadas_areas(self.df_plan, filtradas_periodo)
        self.assertEqual(data['Inglés'], 50)  # 50% approved according to the mock

    def test_porcentaje_nucleos_fuera_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(
            materias_alumno, '2018-02-10', '2018-10-10')
        data = self.manipulator.porcentajes_aprobadas_areas(self.df_plan, filtradas_periodo)
        self.assertEqual(data['Inglés'], 0)  # 0% approved according to the mock

    def test_porcentaje_areas(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        data = self.manipulator.porcentajes_aprobadas_nucleos(self.df_plan, materias_alumno)
        # According to the mock it is 33.333333%; limited to 2 decimals
        self.assertEqual("%.2f" % data['I'], '33.33')

    def test_porcentaje_areas_dentro_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(
            materias_alumno, '2018-02-10', '2020-10-10')
        data = self.manipulator.porcentajes_aprobadas_nucleos(self.df_plan, filtradas_periodo)
        # According to the mock it is 33.333333%; limited to 2 decimals
        self.assertEqual("%.2f" % data['I'], '33.33')

    def test_porcentaje_areas_fuera_periodo(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(self.dataframe, "1")
        filtradas_periodo = self.manipulator.filtrar_periodo(
            materias_alumno, '2018-02-10', '2018-10-10')
        data = self.manipulator.porcentajes_aprobadas_nucleos(self.df_plan, filtradas_periodo)
        self.assertEqual("%.2f" % data['I'], '0.00')  # 0% according to the mock

    def test_scores(self):
        scores = self.manipulator.get_scores_alumno(self.dataframe, "1")
        scores_unicos = self.transformer.transform_scores_unicos(scores)
        # In 2018 the student got a 2, a 7 and an 8, so the score for that
        # semester should be (2+7+8)/3 = 5.67
        resultado = "%.2f" % scores_unicos[
            scores_unicos.periodo_semestre == '2018-S2'].score_periodo
        esperado = "5.67"
        self.assertEqual(resultado, esperado)

    def test_cantidad_aprobadas(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(
            self.dataframe, "1")
        cantidad_aprobadas = self.manipulator.cantidad_aprobadas(materias_alumno)
        self.assertEqual(cantidad_aprobadas, 2)  # Should be 2

    def test_porcentaje_carrera(self):
        materias_alumno = self.manipulator.filtrar_materias_de_alumno(
            self.dataframe, "1")
        cantidad_aprobadas = self.manipulator.cantidad_aprobadas(materias_alumno)  # Should be 2
        porcentaje = self.manipulator.porcentaje_aprobadas(
            cantidad_aprobadas, self.cantidad_materias_necesarias)
        self.assertEqual(porcentaje, 5)  # (2/40)*100 = 5
# In[3]:

# np.random.seed(1)
# gmm = GaussianMixture(n_components=5, covariance_type='spherical')
# gmm.means_ = np.array([[10], [20], [60], [80], [110]])
# gmm.covariances_ = np.array([[3], [3], [2], [2], [1]]) ** 2
# gmm.weights_ = np.array([0.2, 0.2, 0.2, 0.2, 0.2])
# X = gmm.sample(2000)
# data = X[0]
# data = (data - min(X[0])) / (max(X[0]) - min(X[0]))
# plt.hist(data, 200000, density=False, histtype='stepfilled', alpha=1)

train_data = pd.read_csv('data/testone.csv')
transformer = DataTransformer()
discrete_columns = tuple()
num_gen = train_data.shape[1]
transformer.fit(train_data, discrete_columns)
train_data = transformer.transform(train_data)

# In[4]:

class formerNet(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate five parameters and assign them
        as members.
        """
        super().__init__()
        self.former = torch.nn.Sequential(torch.nn.Linear(Z_dim, h_dim),
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different
    components are orchestrated together. For more details about the
    process, please check the [Modeling Tabular data using Conditional
    GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A
            Residual Layer will be created for each one of the values
            provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator
            Layers. A Linear Layer will be created for each one of the
            values provided. Defaults to (256, 256).
        l2scale (float):
            Weight decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """

    def __init__(self, embedding_dim=128, gen_dim=(256, 256),
                 dis_dim=(256, 256), l2scale=1e-6, batch_size=500):
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _apply_activate(self, data):
        # Apply the matching activation to each span of generator output:
        # tanh for continuous columns, gumbel-softmax for one-hot columns.
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0
        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        # Cross entropy between generated one-hot columns and the sampled
        # conditional vector, masked to the selected column.
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True
            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue
                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c
            else:
                assert 0
        loss = torch.stack(loss, dim=1)
        return (loss * m).sum() / data.size()[0]

    def fit(self, train_data, prefered_label, black_box_path,
            discrete_columns=tuple(), conditional_cols=None, epochs=300,
            log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            prefered_label (int):
                Label the black-box model should assign to generated rows.
            black_box_path (str):
                Path to the pickled black-box model used for the flip loss.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the
                Conditional Vector. If ``train_data`` is a Numpy array,
                this list should contain the integer indices of the
                columns. Otherwise, if it is a ``pandas.DataFrame``, this
                list should contain the column names.
            conditional_cols (list-like):
                Columns handed to the ConditionalGenerator.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in
                conditional sampling. Defaults to ``True``.
        """
        self.prefered_label = prefered_label
        with open(black_box_path, "rb") as f:
            self.blackbox_model = pickle.load(f)
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)
        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency,
            conditional_cols)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt,
            self.gen_dim, data_dim).to(self.device)
        discriminator = Discriminator(data_dim, self.dis_dim, 1).to(self.device)
        conditonal_discriminator = Discriminator(
            1 + self.cond_generator.n_opt, self.dis_dim).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(), lr=2e-4,
                                betas=(0.5, 0.9), weight_decay=self.l2scale)
        optimizerD = optim.Adam(discriminator.parameters(), lr=2e-4,
                                betas=(0.5, 0.9))
        optimizerconditionalD = optim.Adam(
            conditonal_discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in range(epochs):
            flip_loss_list = []
            real_flip_loss_list = []
            for id_ in range(steps_per_epoch):
                # Discriminator step
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    real_cat = real
                    fake_cat = fakeact
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                if c1 is not None:
                    conditional_fake_cat = torch.cat([y_fake, c1], dim=1)
                    conditional_real_cat = torch.cat([y_real, c2], dim=1)
                else:
                    conditional_fake_cat = y_fake
                    conditional_real_cat = y_real

                conditional_y_fake = conditonal_discriminator(conditional_fake_cat)
                conditional_y_real = conditonal_discriminator(conditional_real_cat)

                pen = discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))
                condtional_pen = conditonal_discriminator.calc_gradient_penalty(
                    conditional_real_cat, conditional_fake_cat, self.device)
                loss_condtional_d = -(torch.mean(conditional_y_real) -
                                      torch.mean(conditional_y_fake))

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward(retain_graph=True)
                optimizerD.step()

                optimizerconditionalD.zero_grad()
                condtional_pen.backward(retain_graph=True)
                loss_condtional_d.backward()
                optimizerconditionalD.step()

                # Generator step
                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(fakeact)
                    conditional_y_fake = conditonal_discriminator(
                        torch.cat([y_fake, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)
                    conditional_y_fake = conditonal_discriminator(y_fake)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                # Flip loss: how strongly the black box assigns the
                # preferred label to the generated rows
                fake_act_inv = self.transformer.inverse_transform(
                    fakeact.detach().cpu().numpy(), None)
                fake_act_inv = _factorize_categoricals(fake_act_inv, discrete_columns)
                fake_act_inv = xgb.DMatrix(data=fake_act_inv)
                black_box_pred_prob = self.blackbox_model.predict(fake_act_inv)
                # black_box_pred_prob = torch.from_numpy(np.stack([1 - black_box_pred_prob, black_box_pred_prob], axis=-1))
                # flip_loss = torch.nn.CrossEntropyLoss()(black_box_pred_prob, torch.tensor([self.prefered_label]).repeat(self.batch_size))
                flip_loss = sum(-np.log(black_box_pred_prob)) / self.batch_size

                # Same measurement on the real batch, for reference
                real_inv = self.transformer.inverse_transform(
                    real.detach().cpu().numpy(), None)
                real_inv = _factorize_categoricals(real_inv, discrete_columns)
                real_inv = xgb.DMatrix(data=real_inv)
                real_pred_prob = self.blackbox_model.predict(real_inv)
                real_flip_loss = sum(-np.log(real_pred_prob)) / self.batch_size

                loss_g = -torch.mean(conditional_y_fake) + cross_entropy + 10 * flip_loss
                # print(f"Base Loss:{-torch.mean(conditional_y_fake)}, Conditional Loss:{cross_entropy}, 10Flip Loss:{flip_loss}")
                flip_loss_list.append(flip_loss)
                real_flip_loss_list.append(real_flip_loss)

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

            print(f"Generated flip loss {np.mean(flip_loss_list)}, "
                  f"Real flip loss {np.mean(real_flip_loss_list)}")
            print("Conditional Cross Entropy Loss", cross_entropy)
            print("Epoch %d, Loss G: %.4f, Loss D: %.4f, Loss Conditional D: %.4f" %
                  (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu(),
                   loss_condtional_d.detach().cpu()),
                  flush=True)

    def sample(self, n, col_index=None):
        """Sample data similar to the training data.

        Args:
            n (int):
                Number of rows to sample.
            col_index (int, optional):
                If given, force the conditional vector onto this column.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec, m1 = self.cond_generator.sample_zero(self.batch_size)
            m1 = torch.from_numpy(m1).to(self.device)
            if condvec is not None:
                c1 = condvec
                if col_index is not None:
                    c1 = np.zeros_like(c1)
                    c1[:, col_index] = 1
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())
            print(self._cond_loss(fake, c1, m1))

        data = np.concatenate(data, axis=0)
        data = data[:n]
        return self.transformer.inverse_transform(data, None)
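# A minimal usage sketch for the black-box-guided synthesizer above,
# assuming a pickled XGBoost model on disk; the paths, column names and
# label value are illustrative, not part of the original code.
train_data = pd.read_csv('data/credit.csv')  # hypothetical input
synth = CTGANSynthesizer(batch_size=500)
synth.fit(train_data,
          prefered_label=1,                          # class to steer towards
          black_box_path='models/xgb_blackbox.pkl',  # hypothetical path
          discrete_columns=('sex', 'housing'),       # hypothetical columns
          epochs=100)
counterfactual_samples = synth.sample(500)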
class TransformerTest(unittest.TestCase):
    def setUp(self):
        self.transformer = DataTransformer()

    def test_periodo_segundo_semestre(self):
        fecha = '2020-12-12'
        semestre_esperado = '2020-S2'
        semestre_resultado = self.transformer.periodo_semestre(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_periodo_primer_semestre(self):
        fecha = '2020-05-12'
        semestre_esperado = '2020-S1'
        semestre_resultado = self.transformer.periodo_semestre(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_forma_aprobacion_encontrada(self):
        forma_aprobacion = 'P'
        esperado = 'Promocion'
        resultado = self.transformer.get_forma_aprobacion(forma_aprobacion)
        self.assertEqual(esperado, resultado)

    def test_forma_aprobacion_no_encontrada(self):
        """ If the approval type does not exist, the requested one is returned """
        forma_aprobacion = 'Pw'
        esperado = 'Pw'
        resultado = self.transformer.get_forma_aprobacion(forma_aprobacion)
        self.assertEqual(esperado, resultado)

    def test_fecha_periodo_primer_semestre(self):
        """ If the date falls between month 4 and month 9, it belongs to the
        first semester, because second-semester grades may be closed as
        late as month 3. """
        fecha = '2020-05-12'
        semestre_esperado = '2020-06-30'
        semestre_resultado = self.transformer.fecha_periodo(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_fecha_periodo_segundo_semestre_anio_anterior(self):
        """ If the date falls between month 1 and month 3, it belongs to the
        second semester of the previous year """
        fecha = '2020-02-12'
        semestre_esperado = '2019-12-31'
        semestre_resultado = self.transformer.fecha_periodo(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_fecha_periodo_segundo_semestre_mismo_anio(self):
        """ If the date falls between month 10 and month 12, it belongs to
        the second semester of that same year """
        fecha = '2020-11-12'
        semestre_esperado = '2020-12-31'
        semestre_resultado = self.transformer.fecha_periodo(fecha)
        self.assertEqual(semestre_esperado, semestre_resultado)

    def test_timestamp_to_semester(self):
        timestamp = '2020-07-01 12:12:12'
        esperado = '2020-S2'
        resultado = self.transformer.transform_timestamp_to_semester(timestamp)
        self.assertEqual(esperado, resultado)

    def test_timestamp_to_semester_1st(self):
        timestamp = '2020-01-01 12:12:12'
        esperado = '2020-S1'
        resultado = self.transformer.transform_timestamp_to_semester(timestamp)
        self.assertEqual(esperado, resultado)

    def test_date_to_semester(self):
        fecha = date(2020, 7, 1)
        esperado = '2020-S2'
        resultado = self.transformer.transform_date_to_semester(fecha)
        self.assertEqual(esperado, resultado)

    def test_date_to_semester_1st(self):
        fecha = date(2020, 1, 1)
        esperado = '2020-S1'
        resultado = self.transformer.transform_date_to_semester(fecha)
        self.assertEqual(esperado, resultado)

    def test_timestamp_to_datetime(self):
        timestamp = '2020-01-01 12:12:12'
        esperado = datetime(2020, 1, 1, 12, 12, 12)
        resultado = self.transformer.transform_timestamp_to_datetime(timestamp)
        self.assertEqual(esperado, resultado)

    def test_merge_materias_con_promedio(self):
        pass

    def test_merge_materias_con_plan(self):
        pass

    def test_transform_scores_unicos(self):
        pass

    def test_materiascursadas_to_dataframe(self):
        pass

    def test_transform_to_dataframe(self):
        pass