def __init__(self):
    self.stock_code = []
    self.stock_name = []
    file_name = './data/stockslist.txt'
    self.read_stock_data(file_name)

    # Vietnamese phrase fragments combined to form stock-trading sentences.
    self.help_subject = ["xem", "xem cho tôi", "cho tôi xem"]
    self.chu_ngu = [
        "tôi có nhu cầu ", "tao muốn", "", "mình cần", "tôi cần",
        "mình muốn", "đặt lênh"
    ]
    self.actions = [
        "mua", "bán", "chuyển nhượng", "sang tên", "đầu tư thêm",
        "gom", "thêm", "mua thêm"
    ]
    self.amounts = ["", "khối lượng ", "số lượng"]
    self.sub_amounts = ["", "cái", "cổ phiếu", "cổ"]
    self.words = ["tôi muốn", "bán", "mã", "khối lương", "giá"]
    self.price_prefix = ["giá", "", "với giá", "tại"]
    self.currency_unit = ["", "nghìn đồng", "vnđ", "nghìn"]
    self.prefix = ["nhận định", "tình hình", "thông tin", ""]
    self.suffix = ["biến động", "lên xuống"]
    self.quesword = ["thế nào", "ra sao", ""]
    self.infix = ["mã chứng khoán", "mã", "cổ phiếu", "mã cổ phiếu"]
    self.balance_word = ["", "còn dư", "dư"]
    self.stock_prefix = ["", "mã", "số"]
    self.conjunction = ["", "và"]
    self.advice_prefix = ["có", "nên", "có nên"]
    self.cash_prefix = ["tài khoản"]
    self.cash_infix = ["đuôi"]

    self.check_stopword = DataCleaner()
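# read_stock_data() is called above but not shown in this snippet. A
# hypothetical sketch, assuming ./data/stockslist.txt holds one "CODE,Name"
# pair per line (the real file format is not given here):
def read_stock_data(self, file_name):
    with open(file_name, encoding="utf8") as f:
        for line in f:
            parts = line.strip().split(",", 1)
            if len(parts) == 2:
                self.stock_code.append(parts[0].strip())
                self.stock_name.append(parts[1].strip())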
def test_get_encoding(self):
    input_path = BASE_DIR + '/input/non_unicode.csv'
    dc = DataCleaner(input_path)
    encoding = dc._get_file_encoding(input_path)
    self.assertNotEqual(encoding, 'utf-8')
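# Several snippets here rely on BASE_DIR and on get_input() / get_output()
# helpers that are not defined in this section. A hypothetical sketch, assuming
# they only build paths to fixture CSVs relative to the test module:
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def get_input(case_name):
    # Path of the raw fixture for a test case.
    return os.path.join(BASE_DIR, "input", case_name + ".csv")

def get_output(case_name):
    # Path of the expected (clean) fixture for a test case.
    return os.path.join(BASE_DIR, "output", case_name + ".csv")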
def test_get_api_response(self):
    """Performs a search for a territorial entity."""
    entity = 'localidad'
    data_test = {
        'localidades': [{
            'nombre': 'laferrere',
            'aplanar': True,
            'provincia': 'buenos aires',
            'max': 1
        }]
    }
    res_test = [{
        'localidades': [{
            u'departamento_nombre': u'La Matanza',
            u'tipo': u'Entidad (E)',
            u'centroide_lon': -58.592533,
            u'municipio_nombre': u'La Matanza',
            u'provincia_id': u'06',
            u'departamento_id': u'06427',
            u'id': u'06427010004',
            u'centroide_lat': -34.746838,
            u'provincia_nombre': u'Buenos Aires',
            u'nombre': u'GREGORIO DE LAFERRERE',
            u'municipio_id': u'060427'
        }]
    }]

    input_path = get_input('normalize_unidad_territorial')
    dc = DataCleaner(input_path)
    res = dc._get_api_response(entity, data_test)

    self.assertEqual(res_test, res)
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Comenzando limpieza...")

    dc = DataCleaner(input_path, encoding='latin1')
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)

    y = 2015
    dc.df.hasta = pd.to_datetime(dc.df.hasta, yearfirst=True)
    dc.df.desde = pd.to_datetime(dc.df.desde, yearfirst=True)

    # Rows in force during 2015: started in 2015, ended in 2015, or spanned it.
    gii = dc.df.desde.dt.year == y
    gif = dc.df.hasta.dt.year == y
    gis = (dc.df.desde.dt.year < y) & (dc.df.hasta.dt.year > y)
    givig = gii | gif | gis
    df1 = dc.df[givig]

    # Historic rows: everything that did not start in 2016.
    gin2016 = dc.df.desde.dt.year == 2016
    df2 = dc.df[~gin2016]

    df1.set_index(df1.columns[0]).to_csv(
        DEFAULT_OUTPUT_PATH_VIGENTE,
        encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR)
    df2.set_index(df2.columns[0]).to_csv(
        DEFAULT_OUTPUT_PATH1_HISTORICO,
        encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR)

    print("Limpieza finalizada exitosamente!")
def word_notin_vocab():
    # Load the trained word2vec vocabulary.
    # `read_trained_data`, `now`, `unknown_file_name` and Flask's `jsonify`
    # are expected to be defined/imported at module level.
    print('start getting /word')
    input_size = 16
    window_size = 2
    embedding_dim = 50
    batch_size_word2vec = 8
    file_to_save_word2vec_data = ('word2vec_ver6/ws-' + str(window_size) +
                                  '-embed-' + str(embedding_dim) +
                                  'batch_size-' + str(batch_size_word2vec) + '.pkl')
    vectors, word2int, int2word = read_trained_data(file_to_save_word2vec_data)

    # Read all sentences from the file of unknown utterances.
    texts = []
    print("Current day: %d" % now.day)
    print("Current year: %d" % now.year)
    print("Current month: %d" % now.month)
    with open(unknown_file_name, encoding="utf8") as file:
        for line in file:
            temp = line.split(",", 1)
            temp[1] = temp[1].lower()
            texts.append(temp[1])

    # Collect every word that is not in the word2vec vocabulary,
    # together with the sentence it came from.
    words_notin_vocab = []
    for text in texts:
        data_cleaner = DataCleaner(text)
        all_words = data_cleaner.separate_sentence()
        for word in all_words:
            if word not in word2int:
                words_notin_vocab.append([word, text])

    return jsonify(results=words_notin_vocab)
def test_string_peg_split(self):
    input_path = get_input("string_separable_complejo")
    output_path = get_output("string_separable_complejo")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    parsed_df = dc.string_peg_split(
        "solicitante",
        """
        allowed_char = anything:x ?(x not in '1234567890() ')
        nombre = ~('DNI') <allowed_char+>:n ws -> n.strip()
        number = <digit+>:num -> int(num)
        nom_comp = <nombre+>:nc -> nc.strip()
        cargo = '(' <nombre+>:c ')' -> c.strip()
        dni = ','? ws 'DNI' ws number:num -> num
        values = nom_comp:n ws cargo?:c ws dni?:d ws anything* -> [n, c, d]
        """,
        ["nombre", "cargo", "dni"])
    res_1 = nan_safe_list(parsed_df["solicitante_nombre"])
    res_2 = nan_safe_list(parsed_df["solicitante_cargo"])
    res_3 = nan_safe_list(parsed_df["solicitante_dni"])

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp_1 = nan_safe_list(df["solicitante_nombre"])
    exp_2 = nan_safe_list(df["solicitante_cargo"])
    exp_3 = nan_safe_list(df["solicitante_dni"])

    self.assertEqual(res_1, exp_1)
    self.assertEqual(res_2, exp_2)
    self.assertEqual(res_3, exp_3)
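# Several tests compare columns with a nan_safe_list() helper that is not
# defined in this section. A minimal sketch, assuming its only job is to make
# missing values comparable by mapping NaN to None:
import math

def nan_safe_list(series):
    # Convert a pandas Series to a plain list, replacing NaN with None so
    # that lists containing missing values can be compared with ==.
    return [None if isinstance(x, float) and math.isnan(x) else x
            for x in series]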
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Comenzando limpieza...")

    dc = DataCleaner(input_path, encoding='latin1')
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)

    y = 2015
    dc.df.hasta = pd.to_datetime(dc.df.hasta, yearfirst=True)
    dc.df.desde = pd.to_datetime(dc.df.desde, yearfirst=True)

    # Rows in force during 2015: started in 2015, ended in 2015, or spanned it.
    gii = dc.df.desde.dt.year == y
    gif = dc.df.hasta.dt.year == y
    gis = (dc.df.desde.dt.year < y) & (dc.df.hasta.dt.year > y)
    givig = gii | gif | gis
    df1 = dc.df[givig].copy()
    print("La cantidad de registros 2015 es: ")
    print(givig.sum())

    # Historic rows: everything that did not start in 2016.
    gin2016 = dc.df.desde.dt.year == 2016
    df2 = dc.df[~gin2016].copy()
    print("La cantidad de registros historicos es: ")
    print((~gin2016).sum())

    df1.to_csv(
        DEFAULT_OUTPUT_PATH_VIGENTE,
        encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR,
        index=False)
    df2.to_csv(
        DEFAULT_OUTPUT_PATH1_HISTORICO,
        encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR,
        index=False)

    print("Limpieza finalizada exitosamente!")
def tokenize(self):
    data_cleaner = DataCleaner(self.corpus)
    all_word, all_sentence_split = data_cleaner.clean_content()
    print('all_word')
    print(all_word)
    # print('all_sentence_split')
    # print(all_sentence_split)
    return all_word, all_sentence_split
def test_shapefile_to_geojson(self):
    output_path = BASE_DIR + '/output/localidades.geojson'
    dc = DataCleaner(self.input_path)
    dc.save(output_path)

    geojson_df = gpd.read_file(output_path, driver='GeoJSON')
    self.assertEqual(set(geojson_df.columns), set(dc.df.columns))
def test_shapefile_to_csv(self):
    output_path = BASE_DIR + '/output/localidades.csv'
    dc = DataCleaner(self.input_path)
    dc.save(output_path)

    csv_df = pd.read_csv(output_path)
    self.assertEqual(set(csv_df.columns), set(dc.df.columns))
def test_remover_columnas(self):
    input_path = get_input("nombre_propio")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    df = dc.remover_columnas(field)

    self.assertNotIn(field, df.columns)
def test_remover_filas_duplicadas(self):
    input_path = get_input("filas_duplicadas")
    output_path = get_output("filas_duplicadas")

    dc = DataCleaner(input_path)
    df = dc.remover_filas_duplicadas(all_fields=True)

    expected_df = DataCleaner(output_path).df
    self.assertTrue(df.equals(expected_df))
def test_nombre_propio_keep_original(self):
    input_path = get_input("nombre_propio")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    dc.nombre_propio(field, keep_original=True, inplace=True)

    self.assertIn("dependencia_normalizado", dc.df.columns)
def test_remover_filas_duplicadas_based_on_field(self):
    input_path = get_input("filas_duplicadas_con_id")
    output_path = get_output("filas_duplicadas_con_id")

    dc = DataCleaner(input_path)
    df = dc.remover_filas_duplicadas(all_fields=False, fields=["id"])

    expected_df = DataCleaner(output_path).df
    self.assertTrue(df.equals(expected_df))
def test_float_integrity(self):
    output_path = BASE_DIR + '/output/clean_coordinates.csv'
    dc = DataCleaner(self.input_path)
    dc.clean_file([], output_path)

    raw_input = raw_csv(self.input_path)
    raw_output = raw_csv(output_path)
    self.assertEqual(raw_input, raw_output)
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Comenzando limpieza...")

    dc = DataCleaner(input_path)
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    dc.save(output_path)

    print("Limpieza finalizada exitosamente!")
def test_fecha_completa_keep_original(self):
    input_path = get_input("fecha_completa")
    field = "fecha_completa_audiencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    dc.fecha_completa(field, "DD-MM-YYYY HH:mm", keep_original=True, inplace=True)

    self.assertIn("isodatetime_fecha_completa_audiencia", dc.df.columns)
def test_shapefile_to_kml(self):
    output_path = BASE_DIR + '/output/localidades.kml'
    dc = DataCleaner(self.input_path)
    dc.save(output_path)

    with open(output_path) as kml_file:
        kml = kml_file.read()
    assert kml.startswith('<?xml version="1.0" encoding="utf-8" ?>')
def test_integration_case_1(self):
    dc = DataCleaner(get_input("integration"))
    dc.clean_file(rules, get_output("temp_integration"))

    df = pd.read_csv(get_output("temp_integration"))
    df_exp = pd.read_csv(get_output("integration"))

    self.assertEqual(set(df.columns), set(df_exp.columns))
    for col in df.columns:
        self.assertEqual(nan_safe_list(df[col]), nan_safe_list(df_exp[col]))
def test_string_regex_substitute(self):
    input_path = get_input("regex_sub")
    output_path = get_output("regex_sub")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.string_regex_substitute("lugar_audiencia", r"\d+.*$", "")
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df["lugar_audiencia"])

    self.assertEqual(res, exp)
def apply_rules_to_dataset(csv_input, csv_output, dataset_file_rules, parse_options):
    with warnings.catch_warnings(record=True) as catched_warnings:
        dc = DataCleaner(csv_input, **parse_options)
        dc.clean(dataset_file_rules['data-cleaner-rules'])
        dc.df.set_index(dc.df.columns[0]).to_csv(
            csv_output,
            encoding=dc.OUTPUT_ENCODING,
            sep=dc.OUTPUT_SEPARATOR,
            quotechar=dc.OUTPUT_QUOTECHAR
        )
    return catched_warnings
def test_string_regex_substitute(self):
    input_path = get_input("regex_sub")
    output_path = get_output("regex_sub")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.string_regex_substitute("lugar_audiencia", r"\d+.*$", "")
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    print(series)
    exp = list(df["lugar_audiencia"])

    self.assertEqual(res, exp)
def setUp(self):
    """Creates a new database for the unit test to use."""
    app.config.from_pyfile('test_config.py')
    db.init_app(app)
    db.create_all()
    self.dataCleaner = DataCleaner(test_config.SQLALCHEMY_DATABASE_URI)
    self.app = app.test_client()
    return self.app
def test_fecha_separada(self):
    input_path = get_input("fecha_separada")
    output_path = get_output("fecha_separada")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.fecha_separada([["fecha_audiencia", "DD-MM-YYYY"],
                                ["hora_audiencia", "HH:mm"]], "audiencia")
    res = nan_to_empty_string_list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = nan_to_empty_string_list(df["isodatetime_audiencia"])

    self.assertEqual(res, exp)
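# The date tests use a nan_to_empty_string_list() helper that is also not
# defined here. A minimal sketch, assuming it simply maps missing values to
# empty strings before comparison:
import math

def nan_to_empty_string_list(series):
    # Convert a pandas Series to a plain list, turning NaN into "".
    return ["" if isinstance(x, float) and math.isnan(x) else x
            for x in series]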
def test_simplify_geometry(self):
    input_path = BASE_DIR + '/input/localidades/localidades.shp'
    original = BASE_DIR + '/output/localidades-original.csv'
    simplified = BASE_DIR + '/output/localidades-simplificado.csv'

    dc = DataCleaner(input_path)
    dc.save(original)  # CSV with the original geometry.

    dc = DataCleaner(input_path)
    dc.simplificar_geometria()
    dc.save(simplified)  # CSV with the simplified geometry.

    import filecmp
    files_are_equal = filecmp.cmp(original, simplified, shallow=False)
    self.assertFalse(files_are_equal)
def test_nombre_propio(self):
    input_path = get_input("nombre_propio")
    output_path = get_output("nombre_propio")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.nombre_propio(field)
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_fecha_simple_mes(self):
    input_path = get_input("fecha_mes")
    output_path = get_output("fecha_mes")
    field = "fecha_audiencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.fecha_simple(field, "MM-YYYY")
    res = nan_to_empty_string_list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = nan_to_empty_string_list(df["isodate_" + field])

    self.assertEqual(res, exp)
def test_reemplazar_string(self):
    input_path = get_input("reemplazar_string")
    output_path = get_output("reemplazar_string")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.reemplazar_string(field, {"Jaguarete": ["ABBA", "ABBBA"]})
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_mail_format(self):
    input_path = get_input("mail_format")
    output_path = get_output("mail_format")
    field = "mail"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.mail_format(field)
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_string_normal(self):
    input_path = get_input("string_normal")
    output_path = get_output("string_normal")
    field = "lugar_audiencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.string(field)
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_nombre_propio_lower_words(self):
    input_path = get_input("nombre_propio")
    output_path = get_output("nombre_propio_lower_words")
    field = "dependencia"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.nombre_propio(field, lower_words=["nación", "de", "la"],
                              keep_original=True, inplace=True)
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_reemplazar(self):
    input_path = get_input("reemplazar")
    output_path = get_output("reemplazar")
    field = "tipo"

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    series = dc.reemplazar(field, {"Servicios": ["Serv"],
                                   "Otros": ["Otro", "Loc"]})
    res = list(series)

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp = list(df[field])

    self.assertEqual(res, exp)
def test_build_data(self):
    """Builds a dictionary with territorial units."""
    entity = 'localidad'
    field = 'nombre'
    test_data = {
        'localidades': [{
            'nombre': 'laferrere',
            'aplanar': True,
            'max': 1
        }]
    }

    input_path = get_input('normalize_unidad_territorial')
    dc = DataCleaner(input_path)
    data = dc._build_data(field, entity, filters={})

    self.assertEqual(data, test_data)
def test_string_simple_split(self):
    input_path = get_input("string_separable_simple")
    output_path = get_output("string_separable_simple")

    # get the result of cleaning the csv
    dc = DataCleaner(input_path)
    parsed_df = dc.string_simple_split("sujeto_obligado",
                                       [", Cargo:", "Cargo:"],
                                       ["nombre", "cargo"])
    res_1 = nan_safe_list(parsed_df["sujeto_obligado_nombre"])
    res_2 = nan_safe_list(parsed_df["sujeto_obligado_cargo"])

    # load the clean csv to compare against
    df = pd.read_csv(output_path, encoding="utf-8")
    exp_1 = nan_safe_list(df["sujeto_obligado_nombre"])
    exp_2 = nan_safe_list(df["sujeto_obligado_cargo"])

    self.assertEqual(res_1, exp_1)
    self.assertEqual(res_2, exp_2)
def test_parsed_entity_level(self):
    """Pluralizes a territorial unit."""
    test_string = [("provincia", "provincias"),
                   ("departamento", "departamentos"),
                   ("municipio", "municipios"),
                   ("localidad", "localidades")]
    for (inp, outp) in test_string:
        self.assertEqual(DataCleaner._plural_entity_level(inp), outp)
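# _plural_entity_level() is only exercised by the test above. The sketch below
# is an assumed minimal implementation that satisfies exactly the pairs the
# test checks; it is not necessarily the library's real code:
def _plural_entity_level(entity_level):
    plurals = {
        "provincia": "provincias",
        "departamento": "departamentos",
        "municipio": "municipios",
        "localidad": "localidades",
    }
    return plurals[entity_level]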
def test_invalidate_filters(self):
    """Should detect invalid filters."""
    entity = 'departamento'
    test_filters = {
        "provincia_field": "provincia",
        "departamento_field": "provincia"
    }
    self.assertFalse(DataCleaner._validate_filters(entity, test_filters))
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py")
from data_cleaner import DataCleaner

# Read input file and create DataCleaner object
dc = DataCleaner(sqlCtx)
df = dc.read_csv("/home/ubuntu/csv/so_bq_questions.csv")

# Remove records that lack a question_id, questioner_id, question_body_length,
# questioner_reputation, or questioner_up_votes
df = dc.drop_na_values(
    dataframe=df,
    field_names=["question_id", "questioner_id", "question_body_length",
                 "questioner_reputation", "questioner_up_votes"])

# Fix data types
df = dc.fix_data_type(
    dataframe=df,
    field_names=["question_body_length", "question_codeblock_count", "answer_count",
                 "question_comment_count", "questioner_id", "questioner_up_votes",
                 "questioner_down_votes", "accepted_answer_id", "questioner_reputation",
                 "questioner_views", "max_answer_score"],
    data_type='int')
df = dc.fix_data_type(
    dataframe=df,
    field_names=["questioner_account_creation_date", "min_answer_creation_date"],
    data_type="timestamp")

df = dc.set_tag_count(dataframe=df, base_field="question_tags",
                      count_field="question_tags_count")
df = dc.set_years_between_dates(dataframe=df,
                                start_date="questioner_account_creation_date",
                                end_date="question_creation_date",
                                years_between_field="questioner_years_since_joining")
df = dc.fill_na(dataframe=df, field_name="question_favorite_count", fill_value=0)

# Create categorical feature question_view_quantile from question_view_count
df = dc.create_categorical_feature(dataframe=df, base_field="question_view_count",
                                   categorical_field="question_view_quantile",
                                   levels=10, increment=0)
df = dc.create_binary_feature(dataframe=df, base_field="question_favorite_count",
                              binary_field="question_favorited")
df = dc.create_binary_feature(dataframe=df, base_field="answer_count",
                              binary_field="has_answer")

df.select("answer_count", "has_answer").show(20)
    # DOES NOT WORK WELL, LEAVING THE FIELD AS IT IS
    # {"reemplazar": [
    #     {
    #         "field": "horario_de_atencion",
    #         "replacements": {"LUN": ["lunes", "lun"],
    #                          "MAR": ["martes", "mar"],
    #                          "MIE": ["miercoles", "mie", u"miércoles"],
    #                          "JUE": ["jueves", "jue"],
    #                          "VIE": ["viernes", "vie"],
    #                          "SAB": ["sabado", "sab", "sábado", "sáb"],
    #                          "DOM": ["domingo", "dom"],
    #                          "-": [" a "],
    #                          "_": [" y ", ","],
    #                          "": ["hs", "hs."],
    #                          "00:00-23:59": ["24"]
    #                          },
    #         "keep_original": True
    #     }
    # ]}
]

dc = DataCleaner(input_path)

# Not implemented as rules yet; applied directly with pandas.
dc.df['coordenadas_latitud'] = dc.df.recurso.str.split(r"\s+", n=1, expand=True)[0]
dc.df['coordenadas_longitud'] = dc.df.recurso.str.split(r"\s+", n=1, expand=True)[1]
dc.df['mail'] = dc.df['mail'].str.lower()
dc.df['sitio_web'] = dc.df.mail.str.findall(r'www[^ \s]+').str.join(",")
dc.df['mail'] = dc.df.mail.str.findall(r'[a-z_0-9\.]+@[a-z_0-9\.]+').str.join(",")

dc.clean_file(rules, output_path)
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Comenzando limpieza...")
    dc = DataCleaner(input_path)
    dc.clean_file(RULES, output_path)
    print("Limpieza finalizada exitosamente!")