Exemple #1
    def __init__(self):
        self.stock_code = []
        self.stock_name = []
        file_name = './data/stockslist.txt'
        self.read_stock_data(file_name)
        self.help_subject = ["xem", "xem cho tôi", "cho tôi xem"]
        self.chu_ngu = [
            "tôi có nhu cầu ", "tao muốn", "", "mình cần", "tôi cần",
            "mình muốn", "đặt lênh"
        ]
        self.actions = [
            "mua", "bán", "chuyển nhượng", "sang tên", "đầu tư thêm", "gom",
            "thêm", "mua thêm"
        ]
        self.amounts = ["", "khối lượng ", "số lượng"]
        self.sub_amounts = ["", "cái", "cổ phiếu", "cổ"]
        self.words = ["tôi muốn", "bán", "mã", "khối lương", "giá"]
        self.price_prefix = ["giá", "", "với giá", "tại"]
        self.currency_unit = ["", "nghìn đồng", "vnđ", "nghìn"]
        self.prefix = ["nhận định", "tình hình", "thông tin", ""]
        self.suffix = ["biến động", "lên xuống"]
        self.quesword = ["thế nào", "ra sao", ""]
        self.infix = ["mã chứng khoán", "mã", "cổ phiếu", "mã cổ phiếu"]
        self.balance_word = ["", "còn dư", "dư"]
        self.stock_prefix = ["", "mã", "số"]
        self.conjunction = ["", "và"]
        self.advice_prefix = ["có", "nên", "có nên"]

        self.cash_prefix = ["tài khoản"]
        self.cash_infix = ["đuôi"]

        self.check_stopword = DataCleaner()
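
Note: self.read_stock_data() is called above but its definition is not part of this example. A minimal sketch of what it might look like, assuming stockslist.txt holds one comma-separated "CODE,Company Name" pair per line (the file format is an assumption, not taken from the source):

    def read_stock_data(self, file_name):
        # Hypothetical helper, not shown in the original example:
        # assumes each line is "CODE,Company Name".
        with open(file_name, encoding="utf8") as f:
            for line in f:
                code, _, name = line.strip().partition(",")
                if code:
                    self.stock_code.append(code)
                    self.stock_name.append(name.strip())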
    def test_get_encoding(self):
        input_path = BASE_DIR + '/input/non_unicode.csv'

        dc = DataCleaner(input_path)
        encoding = dc._get_file_encoding(input_path)

        self.assertNotEqual(encoding, 'utf-8')
    def test_get_api_response(self):
        """Realiza un búsquedas sobre una entidad territorial."""
        entity = 'localidad'
        data_test = {
            'localidades': [{
                'nombre': 'laferrere',
                'aplanar': True,
                'provincia': 'buenos aires',
                'max': 1
            }]
        }
        res_test = [{
            'localidades': [{
                u'departamento_nombre': u'La Matanza',
                u'tipo': u'Entidad (E)',
                u'centroide_lon': -58.592533,
                u'municipio_nombre': u'La Matanza',
                u'provincia_id': u'06',
                u'departamento_id': u'06427',
                u'id': u'06427010004',
                u'centroide_lat': -34.746838,
                u'provincia_nombre': u'Buenos Aires',
                u'nombre': u'GREGORIO DE LAFERRERE',
                u'municipio_id': u'060427'
            }]
        }]

        input_path = get_input('normalize_unidad_territorial')
        dc = DataCleaner(input_path)
        res = dc._get_api_response(entity, data_test)
        self.assertEqual(res_test, res)
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path, encoding='latin1')
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    y = 2015
    dc.df.hasta = pd.to_datetime(dc.df.hasta, yearfirst=True)
    dc.df.desde = pd.to_datetime(dc.df.desde, yearfirst=True)
    # Masks for records whose validity window touches the year 2015:
    gii = dc.df.desde.dt.year == y                               # starts in 2015
    gif = dc.df.hasta.dt.year == y                               # ends in 2015
    gis = (dc.df.desde.dt.year < y) & (dc.df.hasta.dt.year > y)  # spans 2015
    givig = gii | gif | gis
    df1 = dc.df[givig]
    gin2016 = dc.df.desde.dt.year == 2016
    df2 = dc.df[~gin2016]
    df1.set_index(df1.columns[0]).to_csv(
        DEFAULT_OUTPUT_PATH_VIGENTE, encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,  # pandas expects `sep`, not `separator`
        quotechar=dc.OUTPUT_QUOTECHAR)
    df2.set_index(df2.columns[0]).to_csv(
        DEFAULT_OUTPUT_PATH1_HISTORICO, encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR)

    print("Cleanup finished successfully!")
Exemple #5
def word_notin_vocab():
    # read vocab
    print('start getting /word ')
    input_size = 16
    window_size = 2
    embedding_dim = 50
    batch_size_word2vec = 8
    file_to_save_word2vec_data = ('word2vec_ver6/ws-' + str(window_size) +
                                  '-embed-' + str(embedding_dim) +
                                  'batch_size-' + str(batch_size_word2vec) +
                                  '.pkl')
    vectors, word2int, int2word = read_trained_data(file_to_save_word2vec_data)
    # read all sentences in the unknown file
    texts = []
    # `now` was undefined in the original snippet; assuming the current time
    import datetime
    now = datetime.datetime.now()
    print("Current day: %d" % now.day)
    print("Current year: %d" % now.year)
    print("Current month: %d" % now.month)

    # unknown_file_name is assumed to be defined at module level
    with open(unknown_file_name, encoding="utf8") as file:
        for line in file:
            temp = line.split(",", 1)
            temp[1] = temp[1].lower()
            texts.append(temp[1])  # list of train_word
    words_notin_vocab = []
    for text in texts:
        data_cleaner = DataCleaner(text)
        all_words = data_cleaner.separate_sentence()
        for word in all_words:
            if word not in word2int:
                # record the unknown word together with its source sentence
                words_notin_vocab.append([word, text])
    return jsonify(results=words_notin_vocab)
    def test_string_peg_split(self):
        input_path = get_input("string_separable_complejo")
        output_path = get_output("string_separable_complejo")

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        parsed_df = dc.string_peg_split(
            "solicitante", """
            allowed_char = anything:x ?(x not in '1234567890() ')
            nombre = ~('DNI') <allowed_char+>:n ws -> n.strip()
            number = <digit+>:num -> int(num)

            nom_comp = <nombre+>:nc -> nc.strip()
            cargo = '(' <nombre+>:c ')' -> c.strip()
            dni = ','? ws 'DNI' ws number:num -> num

            values = nom_comp:n ws cargo?:c ws dni?:d ws anything* -> [n, c, d]
            """, ["nombre", "cargo", "dni"])
        res_1 = nan_safe_list(parsed_df["solicitante_nombre"])
        res_2 = nan_safe_list(parsed_df["solicitante_cargo"])
        res_3 = nan_safe_list(parsed_df["solicitante_dni"])

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp_1 = nan_safe_list(df["solicitante_nombre"])
        exp_2 = nan_safe_list(df["solicitante_cargo"])
        exp_3 = nan_safe_list(df["solicitante_dni"])

        self.assertEqual(res_1, exp_1)
        self.assertEqual(res_2, exp_2)
        self.assertEqual(res_3, exp_3)
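
The nan_safe_list() and nan_to_empty_string_list() helpers used by these tests are not shown anywhere in this listing. A plausible sketch, assuming their only job is to make NaN values comparable (NaN != NaN would otherwise break assertEqual):

import pandas as pd

def nan_safe_list(series):
    # Hypothetical helper: replace NaN with None so lists compare cleanly.
    return [None if pd.isnull(v) else v for v in series]

def nan_to_empty_string_list(series):
    # Hypothetical helper: replace NaN with "" for string comparisons.
    return ["" if pd.isnull(v) else v for v in series]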
Exemple #8
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path, encoding='latin1')
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    y = 2015
    dc.df.hasta = pd.to_datetime(dc.df.hasta, yearfirst=True)
    dc.df.desde = pd.to_datetime(dc.df.desde, yearfirst=True)
    gii = dc.df.desde.dt.year == y
    gif = dc.df.hasta.dt.year == y
    gis = (dc.df.desde.dt.year < y) & (dc.df.hasta.dt.year > y)
    givig = gii | gif | gis
    df1 = dc.df[givig].copy()
    print("The number of 2015 records is: ")
    print(givig.sum())
    gin2016 = dc.df.desde.dt.year == 2016
    df2 = dc.df[~gin2016].copy()
    print("The number of historical records is: ")
    print((~gin2016).sum())
    df1.to_csv(
        DEFAULT_OUTPUT_PATH_VIGENTE, encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,  # pandas expects `sep`, not `separator`
        quotechar=dc.OUTPUT_QUOTECHAR, index=False)
    df2.to_csv(
        DEFAULT_OUTPUT_PATH1_HISTORICO, encoding=dc.OUTPUT_ENCODING,
        sep=dc.OUTPUT_SEPARATOR,
        quotechar=dc.OUTPUT_QUOTECHAR, index=False)

    print("Cleanup finished successfully!")
    def tokenize(self):
        data_cleaner = DataCleaner(self.corpus)
        all_word, all_sentence_split = data_cleaner.clean_content()
        print('all_word')
        print(all_word)
        # print('all_sentence_split')
        # print(all_sentence_split)
        return all_word, all_sentence_split
    def test_shapefile_to_geojson(self):
        output_path = BASE_DIR + '/output/localidades.geojson'

        dc = DataCleaner(self.input_path)
        dc.save(output_path)

        geojson_df = gpd.read_file(output_path, driver='GeoJSON')
        self.assertEqual(set(geojson_df.columns), set(dc.df.columns))
    def test_shapefile_to_csv(self):
        output_path = BASE_DIR + '/output/localidades.csv'

        dc = DataCleaner(self.input_path)
        dc.save(output_path)

        csv_df = pd.read_csv(output_path)
        self.assertEqual(set(csv_df.columns), set(dc.df.columns))
    def test_remover_columnas(self):
        input_path = get_input("nombre_propio")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        df = dc.remover_columnas(field)

        self.assertNotIn(field, df.columns)
    def test_remover_filas_duplicadas(self):
        input_path = get_input("filas_duplicadas")
        output_path = get_output("filas_duplicadas")

        dc = DataCleaner(input_path)
        df = dc.remover_filas_duplicadas(all_fields=True)
        expected_df = DataCleaner(output_path).df

        self.assertTrue(df.equals(expected_df))
    def test_nombre_propio_keep_original(self):
        input_path = get_input("nombre_propio")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        dc.nombre_propio(field, keep_original=True, inplace=True)

        self.assertIn("dependencia_normalizado", dc.df.columns)
    def test_remover_filas_duplicadas_based_on_field(self):
        input_path = get_input("filas_duplicadas_con_id")
        output_path = get_output("filas_duplicadas_con_id")

        dc = DataCleaner(input_path)
        df = dc.remover_filas_duplicadas(all_fields=False, fields=["id"])
        expected_df = DataCleaner(output_path).df

        self.assertTrue(df.equals(expected_df))
    def test_float_integrity(self):
        output_path = BASE_DIR + '/output/clean_coordinates.csv'

        dc = DataCleaner(self.input_path)
        dc.clean_file([], output_path)

        raw_input = raw_csv(self.input_path)
        raw_output = raw_csv(output_path)
        self.assertEqual(raw_input, raw_output)
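
raw_csv() is also undefined in this example. A minimal sketch, under the assumption that it simply reads a CSV back as raw text so input and output can be compared byte-for-byte:

def raw_csv(path):
    # Hypothetical helper: return the raw text content of a CSV file.
    with open(path, encoding="utf8") as f:
        return f.read()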
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path)
    custom_cleaning_before_rules(dc)
    dc.clean(RULES)
    custom_cleaning_after_rules(dc)
    dc.save(output_path)
    print("Cleanup finished successfully!")
    def test_fecha_completa_keep_original(self):
        input_path = get_input("fecha_completa")
        field = "fecha_completa_audiencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        dc.fecha_completa(field, "DD-MM-YYYY HH:mm", keep_original=True, inplace=True)

        self.assertIn("isodatetime_fecha_completa_audiencia", dc.df.columns)
    def test_shapefile_to_kml(self):
        output_path = BASE_DIR + '/output/localidades.kml'

        dc = DataCleaner(self.input_path)
        dc.save(output_path)

        with open(output_path) as kml_file:
            kml = kml_file.read()
            assert kml.startswith('<?xml version="1.0" encoding="utf-8" ?>')
    def test_integration_case_1(self):
        dc = DataCleaner(get_input("integration"))
        dc.clean_file(rules, get_output("temp_integration"))

        df = pd.read_csv(get_output("temp_integration"))
        df_exp = pd.read_csv(get_output("integration"))

        self.assertEqual(set(df.columns), set(df_exp.columns))
        for col in df.columns:
            self.assertEqual(nan_safe_list(df[col]), nan_safe_list(df_exp[col]))
    def test_string_regex_substitute(self):
        input_path = get_input("regex_sub")
        output_path = get_output("regex_sub")
        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.string_regex_substitute("lugar_audiencia", r"\d+.*$", "")
        res = list(series)
        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df["lugar_audiencia"])
        self.assertEqual(res, exp)
def apply_rules_to_dataset(csv_input, csv_output, dataset_file_rules, parse_options):
    with warnings.catch_warnings(record=True) as catched_warnings:
        dc = DataCleaner(csv_input, **parse_options)
        dc.clean(dataset_file_rules['data-cleaner-rules'])
        dc.df.set_index(dc.df.columns[0]).to_csv(
            csv_output,
            encoding=dc.OUTPUT_ENCODING,
            sep=dc.OUTPUT_SEPARATOR,
            quotechar=dc.OUTPUT_QUOTECHAR
        )
        return catched_warnings
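
A hypothetical invocation of apply_rules_to_dataset(), assuming the rules live in a YAML file under a "data-cleaner-rules" key (the file name and parse options below are illustrative, not taken from the source):

import yaml

with open("dataset_rules.yaml") as f:  # hypothetical path
    dataset_file_rules = yaml.safe_load(f)

caught = apply_rules_to_dataset(
    "raw.csv", "clean.csv", dataset_file_rules,
    parse_options={"encoding": "utf-8"})  # kwargs forwarded to DataCleaner
for warning in caught:
    print(warning.message)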
    def setUp(self):
        """
        Creates a new database for the unit test to use
        """
        app.config.from_pyfile('test_config.py')
        db.init_app(app)
        db.create_all()

        self.dataCleaner = DataCleaner(test_config.SQLALCHEMY_DATABASE_URI)

        self.app = app.test_client()
        return self.app
    def test_fecha_separada(self):
        input_path = get_input("fecha_separada")
        output_path = get_output("fecha_separada")

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.fecha_separada(
            [["fecha_audiencia", "DD-MM-YYYY"], ["hora_audiencia", "HH:mm"]],
            "audiencia")
        res = nan_to_empty_string_list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = nan_to_empty_string_list(df["isodatetime_audiencia"])

        self.assertEqual(res, exp)
    def test_simplify_geometry(self):
        input_path = BASE_DIR + '/input/localidades/localidades.shp'
        original = BASE_DIR + '/output/localidades-original.csv'
        simplified = BASE_DIR + '/output/localidades-simplificado.csv'

        dc = DataCleaner(input_path)
        dc.save(original)  # CSV with the original geometry.
        dc = DataCleaner(input_path)
        dc.simplificar_geometria()
        dc.save(simplified)  # CSV with the simplified geometry.

        import filecmp
        files_are_equal = filecmp.cmp(original, simplified, shallow=False)
        self.assertFalse(files_are_equal)
    def test_nombre_propio(self):
        input_path = get_input("nombre_propio")
        output_path = get_output("nombre_propio")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.nombre_propio(field)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
    def test_fecha_simple_mes(self):
        input_path = get_input("fecha_mes")
        output_path = get_output("fecha_mes")
        field = "fecha_audiencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.fecha_simple(field, "MM-YYYY")
        res = nan_to_empty_string_list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = nan_to_empty_string_list(df["isodate_" + field])

        self.assertEqual(res, exp)
    def test_reemplazar_string(self):
        input_path = get_input("reemplazar_string")
        output_path = get_output("reemplazar_string")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.reemplazar_string(field, {"Jaguarete": ["ABBA", "ABBBA"]})
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
    def test_mail_format(self):
        input_path = get_input("mail_format")
        output_path = get_output("mail_format")
        field = "mail"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.mail_format(field)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
    def test_string_normal(self):
        input_path = get_input("string_normal")
        output_path = get_output("string_normal")
        field = "lugar_audiencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.string(field)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
    def test_nombre_propio_lower_words(self):
        input_path = get_input("nombre_propio")
        output_path = get_output("nombre_propio_lower_words")
        field = "dependencia"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.nombre_propio(field, lower_words=["nación", "de", "la"],
                                  keep_original=True, inplace=True)
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
    def test_reemplazar(self):
        input_path = get_input("reemplazar")
        output_path = get_output("reemplazar")
        field = "tipo"

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        series = dc.reemplazar(field, {"Servicios": ["Serv"], "Otros": ["Otro", "Loc"]})
        res = list(series)

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp = list(df[field])

        self.assertEqual(res, exp)
    def test_build_data(self):
        """Construye un diccionario con unidades territoriales."""
        entity = 'localidad'
        field = 'nombre'
        test_data = {
            'localidades': [{
                'nombre': 'laferrere',
                'aplanar': True,
                'max': 1
            }]
        }

        input_path = get_input('normalize_unidad_territorial')
        dc = DataCleaner(input_path)
        data = dc._build_data(field, entity, filters={})
        self.assertEqual(data, test_data)
    def test_string_simple_split(self):
        input_path = get_input("string_separable_simple")
        output_path = get_output("string_separable_simple")

        # get the result of cleaning the csv
        dc = DataCleaner(input_path)
        parsed_df = dc.string_simple_split(
            "sujeto_obligado", [", Cargo:", "Cargo:"], ["nombre", "cargo"])
        res_1 = nan_safe_list(parsed_df["sujeto_obligado_nombre"])
        res_2 = nan_safe_list(parsed_df["sujeto_obligado_cargo"])

        # load the clean csv to compare against
        df = pd.read_csv(output_path, encoding="utf-8")
        exp_1 = nan_safe_list(df["sujeto_obligado_nombre"])
        exp_2 = nan_safe_list(df["sujeto_obligado_cargo"])

        self.assertEqual(res_1, exp_1)
        self.assertEqual(res_2, exp_2)
    def test_parsed_entity_level(self):
        """Pluralizes a territorial unit level."""
        test_string = [("provincia", "provincias"),
                       ("departamento", "departamentos"),
                       ("municipio", "municipios"),
                       ("localidad", "localidades")]
        for (inp, outp) in test_string:
            self.assertEqual(DataCleaner._plural_entity_level(inp), outp)
    def test_invalidate_filters(self):
        """Should detect invalid filters."""
        entity = 'departamento'
        test_filters = {
            "provincia_field": "provincia",
            "departamento_field": "provincia"
        }
        self.assertFalse(DataCleaner._validate_filters(entity, test_filters))
sc.addPyFile("/home/ubuntu/data_utilities/data_cleaner.py")
from data_cleaner import DataCleaner

# Read input file and create DataCleaner object
dc = DataCleaner(sqlCtx)

df = dc.read_csv("/home/ubuntu/csv/so_bq_questions.csv")

# Remove records that lack a question_id, questioner_id, question_body_length,
# questioner_reputation, or questioner_up_votes
df = dc.drop_na_values(
    dataframe=df,
    field_names=["question_id", "questioner_id", "question_body_length",
                 "questioner_reputation", "questioner_up_votes"])

# Fix data types
df = dc.fix_data_type(
    dataframe=df,
    field_names=["question_body_length", "question_codeblock_count",
                 "answer_count", "question_comment_count", "questioner_id",
                 "questioner_up_votes", "questioner_down_votes",
                 "accepted_answer_id", "questioner_reputation",
                 "questioner_views", "max_answer_score"],
    data_type='int')

df = dc.fix_data_type(
    dataframe=df,
    field_names=["questioner_account_creation_date",
                 "min_answer_creation_date"],
    data_type="timestamp")

df = dc.set_tag_count(dataframe=df, base_field="question_tags",
                      count_field="question_tags_count")

df = dc.set_years_between_dates(
    dataframe=df, start_date="questioner_account_creation_date",
    end_date="question_creation_date",
    years_between_field="questioner_years_since_joining")

df = dc.fill_na(dataframe=df, field_name="question_favorite_count",
                fill_value=0)

# Create categorical feature question_view_quantile from question_view_count
df = dc.create_categorical_feature(
    dataframe=df, base_field="question_view_count",
    categorical_field="question_view_quantile", levels=10, increment=0)

df = dc.create_binary_feature(dataframe=df,
                              base_field="question_favorite_count",
                              binary_field="question_favorited")
df = dc.create_binary_feature(dataframe=df, base_field="answer_count",
                              binary_field="has_answer")

df.select("answer_count", "has_answer").show(20)
    # DOESN'T WORK PROPERLY, LEAVING THE FIELD AS IS
#     {"reemplazar": [
#     {
#      "field": "horario_de_atencion",
#      "replacements": {"LUN": ["lunes", "lun"], 
#                       "MAR": ["martes", "mar"],
#                       "MIE": ["miercoles", "mie", u"miércoles"],
#                       "JUE": ["jueves", "jue"],
#                       "VIE": ["viernes", "vie"],
#                       "SAB": ["sabado", "sab", "sábado","sáb"],
#                       "DOM": ["domingo", "dom"],
#                       "-": [" a "],
#                       "_": [" y ", ","],
#                       "": ["hs", "hs."],
#                       "00:00-23:59": ["24"]
#                      },
#      "keep_original": True
#     }
#    ]}

]

dc = DataCleaner(input_path)
# Not implemented as rules yet, so these go straight through pandas
coords = dc.df.recurso.str.split(r"\s+", n=1, expand=True)
dc.df['coordenadas_latitud'] = coords[0]
dc.df['coordenadas_longitud'] = coords[1]
dc.df['mail'] = dc.df['mail'].str.lower()
dc.df['sitio_web'] = dc.df.mail.str.findall(r'www[^ \s]+').str.join(",")
dc.df['mail'] = dc.df.mail.str.findall(r'[a-z_0-9\.]+@[a-z_0-9\.]+').str.join(",")
dc.clean_file(rules, output_path)
def clean_file(input_path, output_path):
    """Cleans the input data, creating a new clean file."""
    print("Starting cleanup...")
    dc = DataCleaner(input_path)
    dc.clean_file(RULES, output_path)
    print("Cleanup finished successfully!")