def test_float_integrity(self):
    # Run the cleaner with an empty rule list and check that the file it
    # writes matches the input file, both as read back by raw_csv.
    cleaned_path = BASE_DIR + '/output/clean_coordinates.csv'

    cleaner = DataCleaner(self.input_path)
    cleaner.clean_file([], cleaned_path)

    # Input is read first, then the freshly written output.
    self.assertEqual(raw_csv(self.input_path), raw_csv(cleaned_path))
def test_integration_case_1(self):
    # Clean the "integration" input with `rules`, writing the result to the
    # "temp_integration" output, then compare it with the stored
    # "integration" output.
    cleaner = DataCleaner(get_input("integration"))
    cleaner.clean_file(rules, get_output("temp_integration"))

    actual = pd.read_csv(get_output("temp_integration"))
    expected = pd.read_csv(get_output("integration"))

    # Column order is not checked, only that both files have the same set.
    self.assertEqual(set(actual.columns), set(expected.columns))

    # Compare column by column, passing both sides through nan_safe_list.
    for column in actual.columns:
        actual_values = nan_safe_list(actual[column])
        expected_values = nan_safe_list(expected[column])
        self.assertEqual(actual_values, expected_values)
# { # "field": "horario_de_atencion", # "replacements": {"LUN": ["lunes", "lun"], # "MAR": ["martes", "mar"], # "MIE": ["miercoles", "mie", u"miércoles"], # "JUE": ["jueves", "jue"], # "VIE": ["viernes", "vie"], # "SAB": ["sabado", "sab", "sábado","sáb"], # "DOM": ["domingo", "dom"], # "-": [" a "], # "_": [" y ", ","], # "": ["hs", "hs."], # "00:00-23:59": ["24"] # }, # "keep_original": True # } # ]} ] dc = DataCleaner(input_path) # No implementados aun van derecho con Pandas dc.df['coordenadas_latitud'] = dc.df.recurso.str.split("\s+", 1, expand=True)[0] dc.df['coordenadas_longitud'] = dc.df.recurso.str.split("\s+", 1, expand=True)[1] dc.df['mail'] = dc.df['mail'].str.lower() dc.df['sitio_web'] = dc.df.mail.str.findall('www[^ \s]+').str.join(",") dc.df['mail'] = dc.df.mail.str.findall('[a-z_0-9\.]+@[a-z_0-9\.]+').str.join( ",") dc.clean_file(rules, output_path)
def clean_file(input_path, output_path):
    """Clean the input data, creating a new cleaned file.

    Builds a DataCleaner for ``input_path`` and calls its ``clean_file``
    with ``RULES`` (defined outside this block) and ``output_path``.
    A progress message is printed before and after the cleaning step.
    """
    print("Comenzando limpieza...")
    DataCleaner(input_path).clean_file(RULES, output_path)
    print("Limpieza finalizada exitosamente!")
# NO FUNCIONA BIEN DEJO EL CAMPO COMO ESTA # {"reemplazar": [ # { # "field": "horario_de_atencion", # "replacements": {"LUN": ["lunes", "lun"], # "MAR": ["martes", "mar"], # "MIE": ["miercoles", "mie", u"miércoles"], # "JUE": ["jueves", "jue"], # "VIE": ["viernes", "vie"], # "SAB": ["sabado", "sab", "sábado","sáb"], # "DOM": ["domingo", "dom"], # "-": [" a "], # "_": [" y ", ","], # "": ["hs", "hs."], # "00:00-23:59": ["24"] # }, # "keep_original": True # } # ]} ] dc = DataCleaner(input_path) # No implementados aun van derecho con Pandas dc.df['coordenadas_latitud'] = dc.df.recurso.str.split("\s+", 1, expand=True)[0] dc.df['coordenadas_longitud'] = dc.df.recurso.str.split("\s+", 1, expand=True)[1] dc.df['mail'] = dc.df['mail'].str.lower() dc.df['sitio_web'] = dc.df.mail.str.findall('www[^ \s]+').str.join(",") dc.df['mail'] = dc.df.mail.str.findall('[a-z_0-9\.]+@[a-z_0-9\.]+').str.join(",") dc.clean_file(rules, output_path)