Example #1
0
def parse_write_manufactures(inpath, outpath, extension, financial_cols,
                             folders_years):
    """Parse the manufactures workbook and write it split in main/financial.

    Reads ``Manufactures.xlsx`` from ``inpath``, cleans its column names,
    appends the columns returned by ``compute_extra_cols``, reformats the
    'ES-X'/'ES-Y' coordinate columns and categorizes the columns. The
    ``main_cols`` subset (a name defined outside this function) is written
    under ``<outpath>/Main`` and every group of financial columns under
    ``<outpath>/Financial/<year>``.

    Parameters
    ----------
    inpath : str
        Folder that contains ``Manufactures.xlsx``.
    outpath : str
        Root output folder. No folder is created here; whether
        ``write_dataframe`` creates missing ones is not visible from this
        function.
    extension : str
        Passed through unchanged to ``write_dataframe``.
    financial_cols : list of list of str
        Column names to write for each year.
    folders_years : list of str
        Year folder names, aligned by position with ``financial_cols``.
        An ``IndexError`` is raised if it is shorter than ``financial_cols``.
    """
    # NOTE: print(...) with a single argument gives the same output as the
    # print statement on Python 2 and is also valid on Python 3.
    name = 'Manufactures.xlsx'

    ## Parse manufactures
    # Start tracking
    t0 = time.time()
    print("Start parsing manufacturas.")
    manufacturas = parse_xlsx_sheet(join(inpath, name))
    # Rename columns
    manufacturas.columns = clean_colnames_manu(manufacturas.columns)
    # Compute extra variables
    extra = compute_extra_cols(manufacturas)
    manufacturas = pd.concat([manufacturas, extra], axis=1)
    # Correct coordinates
    coords = ['ES-X', 'ES-Y']
    manufacturas[coords] = reformat_coordinates_manu(manufacturas, coords)
    # Categorize cols
    manufacturas = categorize_cols(manufacturas)

    ## Separate and save
    write_dataframe(manufacturas[main_cols], name,
                    join(outpath, 'Main'), extension)
    print("Manufacturas main lasted %f seconds." % (time.time() - t0))

    # The 'Financial' root does not depend on the year: build it once.
    financial_dir = join(outpath, 'Financial')
    for i, year_cols in enumerate(financial_cols):
        t0 = time.time()
        # Indexing (instead of zip) keeps the failure on mismatched lengths.
        y = folders_years[i]
        write_dataframe(manufacturas[year_cols], name,
                        join(financial_dir, y), extension)
        print("Manufacturas year %s lasted %f seconds."
              % (y, time.time() - t0))
Example #2
0
def parse_write_servicios(inpath, outpath, extension, financial_cols,
                          folders_years):
    """Parse every servicios workbook and write it split in main/financial.

    Each regular file in ``<inpath>/Servicios`` accepted by ``check_xlsx`` is
    parsed, its column names are cleaned, the 'apertura' column and the
    closing date are computed and the columns are categorized. The
    ``main_cols`` subset (a name defined outside this function) is written
    under ``<outpath>/Main/Servicios`` and every group of financial columns
    under ``<outpath>/Financial/<year>/Servicios``, keeping the input file
    name.

    Parameters
    ----------
    inpath : str
        Folder that contains the ``Servicios`` subfolder with the raw files.
    outpath : str
        Root output folder. No folder is created here; whether
        ``write_dataframe`` creates missing ones is not visible from this
        function.
    extension : str
        Passed through unchanged to ``write_dataframe``.
    financial_cols : list of list of str
        Column names to write for each year.
    folders_years : list of str
        Year folder names, aligned by position with ``financial_cols``.
        An ``IndexError`` is raised if it is shorter than ``financial_cols``.
    """
    # NOTE: print(...) with a single argument gives the same output as the
    # print statement on Python 2 and is also valid on Python 3.
    servicios_in = join(inpath, 'Servicios')
    main_out = join(join(outpath, 'Main'), 'Servicios')
    path_fin = join(outpath, 'Financial')

    ## 1. Parse servicios
    onlyfiles = [f for f in os.listdir(servicios_in)
                 if isfile(join(servicios_in, f)) and check_xlsx(f)]
    for f in onlyfiles:
        # Restart the clock for every file. It used to be started once before
        # the loop and then overwritten by the per-year loop below, so from
        # the second file on the "main" time was measured from the previous
        # file's last year write.
        t0 = time.time()
        # parse servicios
        servicios = parse_xlsx_sheet(join(servicios_in, f))
        # Rename columns
        servicios.columns = clean_colnames_servi(servicios.columns)
        # Compute extra variables
        servicios['apertura'] = obtain_open_aperture_date(servicios)
        servicios = compute_close_date_servicios(servicios)
        ## Categorize cols
        servicios = categorize_cols(servicios)
        # Separate and save
        write_dataframe(servicios[main_cols], f, main_out, extension)
        print("Compute %s." % f)
        print("Servicios main lasted %f seconds." % (time.time() - t0))
        # Write servicios
        for i, year_cols in enumerate(financial_cols):
            t0 = time.time()
            # Indexing (instead of zip) keeps the failure on mismatched
            # lengths.
            y = folders_years[i]
            write_dataframe(servicios[year_cols], f,
                            join(join(path_fin, y), 'Servicios'), extension)
            print("Servicios year %s lasted %f seconds."
                  % (y, time.time() - t0))
def parse_write_manufactures(inpath, outpath, extension, financial_cols,
                             folders_years):
    """Clean the manufactures workbook and store it as main + yearly files.

    ``Manufactures.xlsx`` is read from ``inpath``; its column names are
    cleaned, the columns returned by ``compute_extra_cols`` are appended, the
    'ES-X'/'ES-Y' coordinate columns are reformatted and the columns are
    categorized. The ``main_cols`` subset (a name defined outside this
    function) goes to ``<outpath>/Main`` and every group of financial columns
    goes to ``<outpath>/Financial/<year>``.

    Parameters
    ----------
    inpath : str
        Folder that contains ``Manufactures.xlsx``.
    outpath : str
        Root output folder. No folder is created here; whether
        ``write_dataframe`` creates missing ones is not visible from this
        function.
    extension : str
        Passed through unchanged to ``write_dataframe``.
    financial_cols : list of list of str
        Column names to write for each year.
    folders_years : list of str
        Year folder names, aligned by position with ``financial_cols``.
        An ``IndexError`` is raised if it is shorter than ``financial_cols``.
    """
    # NOTE: print(...) with a single argument gives the same output as the
    # print statement on Python 2 and is also valid on Python 3.
    name = 'Manufactures.xlsx'
    main_dir = join(outpath, 'Main')
    # The 'Financial' root does not depend on the year: build it once.
    financial_dir = join(outpath, 'Financial')

    ## Parse manufactures
    # Start tracking
    t0 = time.time()
    print("Start parsing manufacturas.")
    manufacturas = parse_xlsx_sheet(join(inpath, name))
    # Rename columns
    manufacturas.columns = clean_colnames_manu(manufacturas.columns)
    # Compute extra variables
    extra = compute_extra_cols(manufacturas)
    manufacturas = pd.concat([manufacturas, extra], axis=1)
    # Correct coordinates
    coords = ['ES-X', 'ES-Y']
    manufacturas[coords] = reformat_coordinates_manu(manufacturas, coords)
    # Categorize cols
    manufacturas = categorize_cols(manufacturas)

    ## Separate and save
    write_dataframe(manufacturas[main_cols], name, main_dir, extension)
    # Tracking task
    print("Manufacturas main lasted %f seconds." % (time.time() - t0))
    for i, year_cols in enumerate(financial_cols):
        t0 = time.time()
        # Indexing (instead of zip) keeps the failure on mismatched lengths.
        y = folders_years[i]
        write_dataframe(manufacturas[year_cols], name,
                        join(financial_dir, y), extension)
        print("Manufacturas year %s lasted %f seconds."
              % (y, time.time() - t0))
Example #4
0
def aux_transformation(df, lvl):
    """Auxiliary transformation for formatting aggregate data.

    Runs ``categorize_cols`` on ``df`` and then replaces the 'cnae' column of
    the result with ``transform_cnae_col(<that column>, lvl)``.

    Parameters
    ----------
    df : DataFrame-like
        Data with a 'cnae' column.
    lvl :
        Passed through to ``transform_cnae_col`` (presumably the CNAE
        aggregation level -- confirm against that helper).

    Returns
    -------
    The object returned by ``categorize_cols`` with its 'cnae' column
    replaced.
    """
    categorized = categorize_cols(df)
    # Imported at call time, after categorizing, exactly as before.
    from Mscthesis.Retrieve.cnae_utils import transform_cnae_col
    recoded_cnae = transform_cnae_col(categorized['cnae'], lvl)
    categorized['cnae'] = recoded_cnae
    return categorized
Example #5
0
def aux_transformation(df, lvl=2):
    """Auxiliary transformation for formatting aggregate data.

    Applies ``categorize_cols`` to ``df`` and overwrites the 'cnae' column of
    the result with the output of ``transform_cnae_col`` for that column.

    Parameters
    ----------
    df : DataFrame-like
        Data with a 'cnae' column.
    lvl : optional
        Passed through to ``transform_cnae_col``; defaults to 2 (presumably
        the CNAE aggregation level -- confirm against that helper).

    Returns
    -------
    The object returned by ``categorize_cols`` with its 'cnae' column
    replaced.
    """
    out = categorize_cols(df)
    # Function-scope import kept where it was: after the categorization.
    from Mscthesis.Retrieve.cnae_utils import transform_cnae_col
    cnae_source = out['cnae']
    out['cnae'] = transform_cnae_col(cnae_source, lvl)
    return out
def parse_write_servicios(inpath, outpath, extension, financial_cols,
                          folders_years):
    """Clean every servicios workbook and store it as main + yearly files.

    Every regular file of ``<inpath>/Servicios`` accepted by ``check_xlsx`` is
    parsed; its column names are cleaned, the 'apertura' column and the
    closing date are computed and the columns are categorized. The
    ``main_cols`` subset (a name defined outside this function) goes to
    ``<outpath>/Main/Servicios`` and every group of financial columns goes to
    ``<outpath>/Financial/<year>/Servicios``, keeping the input file name.

    Parameters
    ----------
    inpath : str
        Folder that contains the ``Servicios`` subfolder with the raw files.
    outpath : str
        Root output folder. No folder is created here; whether
        ``write_dataframe`` creates missing ones is not visible from this
        function.
    extension : str
        Passed through unchanged to ``write_dataframe``.
    financial_cols : list of list of str
        Column names to write for each year.
    folders_years : list of str
        Year folder names, aligned by position with ``financial_cols``.
        An ``IndexError`` is raised if it is shorter than ``financial_cols``.
    """
    # NOTE: print(...) with a single argument gives the same output as the
    # print statement on Python 2 and is also valid on Python 3.
    raw_dir = join(inpath, 'Servicios')
    main_dir = join(join(outpath, 'Main'), 'Servicios')
    path_fin = join(outpath, 'Financial')

    ## 1. Parse servicios
    onlyfiles = [
        f for f in os.listdir(raw_dir)
        if isfile(join(raw_dir, f)) and check_xlsx(f)
    ]
    for f in onlyfiles:
        # The clock is restarted per file: the per-year loop below reuses
        # ``t0``, so a single start before this loop reported a wrong "main"
        # time for every file but the first.
        t0 = time.time()
        # parse servicios
        servicios = parse_xlsx_sheet(join(raw_dir, f))
        # Rename columns
        servicios.columns = clean_colnames_servi(servicios.columns)
        # Compute extra variables
        servicios['apertura'] = obtain_open_aperture_date(servicios)
        servicios = compute_close_date_servicios(servicios)
        ## Categorize cols
        servicios = categorize_cols(servicios)
        # Separate and save
        write_dataframe(servicios[main_cols], f, main_dir, extension)
        print("Compute %s." % f)
        print("Servicios main lasted %f seconds." % (time.time() - t0))
        # Write servicios
        for i, year_cols in enumerate(financial_cols):
            t0 = time.time()
            # Indexing (instead of zip) keeps the failure on mismatched
            # lengths.
            y = folders_years[i]
            write_dataframe(servicios[year_cols], f,
                            join(join(path_fin, y), 'Servicios'), extension)
            print("Servicios year %s lasted %f seconds."
                  % (y, time.time() - t0))
Example #7
0
def clean(inpath, outpath, extension='csv'):
    """Clean the raw initial data and write it in a folder structure.

    The main information of each company is stored separately from its
    financial information, which is split by year, so that tasks that only
    need part of the data save memory and avoid reading unnecessary
    information.

    Resulting layout under ``outpath``::

        Main/Manufactures.xlsx            (name given to write_dataframe)
        Main/Servicios/<file>
        Financial/<year>/Manufactures.xlsx
        Financial/<year>/Servicios/<file>

    The years, the financial column names and the main column names come from
    ``years``, ``years_key``, ``types`` and ``main_cols``, which are defined
    outside this function.

    Parameters
    ----------
    inpath : str
        Folder with ``Manufactures.xlsx`` and a ``Servicios`` subfolder.
    outpath : str
        Root output folder; it and the subfolders above are created if
        missing.
    extension : str, optional
        Passed through unchanged to ``write_dataframe`` (default 'csv').
    """
    # NOTE: print(...) with a single argument gives the same output as the
    # print statement on Python 2 and is also valid on Python 3.

    ## 0. Ensure creation of needed folders
    main_dir = join(outpath, 'Main')
    main_servicios_dir = join(main_dir, 'Servicios')
    financial_dir = join(outpath, 'Financial')
    for path in (outpath, main_dir, main_servicios_dir, financial_dir):
        if not exists(path):
            os.mkdir(path)
    folders_years = [str(int(e)) for e in years]
    for y in folders_years:
        year_dir = join(financial_dir, y)
        # The 'Servicios' subfolder used to be created unconditionally, which
        # raised OSError on any run after the first one. Both levels are now
        # only created when missing.
        for path in (year_dir, join(year_dir, 'Servicios')):
            if not exists(path):
                os.mkdir(path)

    ## Creation of the Financial cols: one list of column names per year key
    Financial_cols = [[''.join(e) for e in product([key], types)]
                      for key in years_key]

    ## 1. Parse manufactures
    # Start tracking
    t0 = time.time()
    print("Start parsing manufacturas.")
    name = 'Manufactures.xlsx'
    manufacturas = parse_xlsx_sheet(join(inpath, name))
    # Rename columns
    manufacturas.columns = clean_colnames_manu(manufacturas.columns)
    # Compute extra variables
    extra = compute_extra_cols(manufacturas)
    manufacturas = pd.concat([manufacturas, extra], axis=1)
    # Correct coordinates
    coords = ['ES-X', 'ES-Y']
    manufacturas[coords] = reformat_coordinates_manu(manufacturas, coords)
    # Categorize cols
    manufacturas = categorize_cols(manufacturas)
    # Separate and save
    write_dataframe(manufacturas[main_cols], name, main_dir, extension)
    # Tracking task
    print("Manufacturas main lasted %f seconds." % (time.time() - t0))
    for i, year_cols in enumerate(Financial_cols):
        t0 = time.time()
        # Indexing (instead of zip) keeps the failure on mismatched lengths.
        y = folders_years[i]
        write_dataframe(manufacturas[year_cols], name,
                        join(financial_dir, y), extension)
        print("Manufacturas year %s lasted %f seconds."
              % (y, time.time() - t0))
    # Release the table before the servicios files are loaded.
    del manufacturas

    ## 2. Parse servicios
    servicios_in = join(inpath, 'Servicios')
    onlyfiles = [f for f in os.listdir(servicios_in)
                 if isfile(join(servicios_in, f)) and check_xlsx(f)]
    for f in onlyfiles:
        # Restart the clock for every file; the per-year loop below reuses
        # ``t0``, so a single start before this loop gave a wrong "main" time
        # for every file but the first.
        t0 = time.time()
        # parse servicios
        servicios = parse_xlsx_sheet(join(servicios_in, f))
        # Rename columns
        servicios.columns = clean_colnames_servi(servicios.columns)
        # Compute extra variables
        servicios['apertura'] = obtain_open_aperture_date(servicios)
        servicios = compute_close_date_servicios(servicios)
        ## Categorize cols
        servicios = categorize_cols(servicios)
        # Separate and save
        write_dataframe(servicios[main_cols], f, main_servicios_dir,
                        extension)
        print("Compute %s." % f)
        print("Servicios main lasted %f seconds." % (time.time() - t0))
        # Write servicios
        for i, year_cols in enumerate(Financial_cols):
            t0 = time.time()
            y = folders_years[i]
            write_dataframe(servicios[year_cols], f,
                            join(join(financial_dir, y), 'Servicios'),
                            extension)
            print("Servicios year %s lasted %f seconds."
                  % (y, time.time() - t0))
Example #8
0
def clean(inpath, outpath, extension='csv'):
    """Clean the raw initial data and write it in a folder structure.

    The main information of each company is stored separately from its
    financial information, which is split by year, so that tasks that only
    need part of the data save memory and avoid reading unnecessary
    information.

    Resulting layout under ``outpath``::

        Main/Manufactures.xlsx            (name given to write_dataframe)
        Main/Servicios/<file>
        Finantial/<year>/Manufactures.xlsx
        Finantial/<year>/Servicios/<file>

    The years, the financial column names and the main column names come from
    ``years``, ``years_key``, ``types`` and ``main_cols``, which are defined
    outside this function.

    Parameters
    ----------
    inpath : str
        Folder with ``Manufactures.xlsx`` and a ``Servicios`` subfolder.
    outpath : str
        Root output folder; it and the subfolders above are created if
        missing.
    extension : str, optional
        Passed through unchanged to ``write_dataframe`` (default 'csv').
    """
    # NOTE: print(...) with a single argument gives the same output as the
    # print statement on Python 2 and is also valid on Python 3.

    ## 0. Ensure creation of needed folders
    main_dir = join(outpath, 'Main')
    main_servicios_dir = join(main_dir, 'Servicios')
    # NOTE: the folder name 'Finantial' is misspelled, but it is the name
    # used on disk; it is kept as is so existing output stays readable.
    fin_dir = join(outpath, 'Finantial')
    for path in (outpath, main_dir, main_servicios_dir, fin_dir):
        if not exists(path):
            os.mkdir(path)
    folders_years = [str(int(e)) for e in years]
    for y in folders_years:
        year_dir = join(fin_dir, y)
        # The 'Servicios' subfolder used to be created unconditionally, which
        # raised OSError on any run after the first one. Both levels are now
        # only created when missing.
        for path in (year_dir, join(year_dir, 'Servicios')):
            if not exists(path):
                os.mkdir(path)

    ## Creation of the financial cols: one list of column names per year key
    finantial_cols = [[''.join(e) for e in product([key], types)]
                      for key in years_key]

    ## 1. Parse manufactures
    # Start tracking
    t0 = time.time()
    print("Start parsing manufacturas.")
    name = 'Manufactures.xlsx'
    manufacturas = parse_xlsx_sheet(join(inpath, name))
    # Rename columns
    manufacturas.columns = clean_colnames_manu(manufacturas.columns)
    # Compute extra variables
    extra = compute_extra_cols(manufacturas)
    manufacturas = pd.concat([manufacturas, extra], axis=1)
    # Correct coordinates
    coords = ['ES-X', 'ES-Y']
    manufacturas[coords] = reformat_coordinates_manu(manufacturas, coords)
    # Categorize cols
    manufacturas = categorize_cols(manufacturas)
    # Separate and save
    write_dataframe(manufacturas[main_cols], name, main_dir, extension)
    # Tracking task
    print("Manufacturas main lasted %f seconds." % (time.time() - t0))
    for i, year_cols in enumerate(finantial_cols):
        t0 = time.time()
        # Indexing (instead of zip) keeps the failure on mismatched lengths.
        y = folders_years[i]
        write_dataframe(manufacturas[year_cols], name,
                        join(fin_dir, y), extension)
        print("Manufacturas year %s lasted %f seconds."
              % (y, time.time() - t0))
    # Release the table before the servicios files are loaded.
    del manufacturas

    ## 2. Parse servicios
    servicios_in = join(inpath, 'Servicios')
    onlyfiles = [f for f in os.listdir(servicios_in)
                 if isfile(join(servicios_in, f)) and check_xlsx(f)]
    for f in onlyfiles:
        # Restart the clock for every file; the per-year loop below reuses
        # ``t0``, so a single start before this loop gave a wrong "main" time
        # for every file but the first.
        t0 = time.time()
        # parse servicios
        servicios = parse_xlsx_sheet(join(servicios_in, f))
        # Rename columns
        servicios.columns = clean_colnames_servi(servicios.columns)
        # Compute extra variables
        servicios['apertura'] = obtain_open_aperture_date(servicios)
        servicios = compute_close_date_servicios(servicios)
        ## Categorize cols
        servicios = categorize_cols(servicios)
        # Separate and save
        write_dataframe(servicios[main_cols], f, main_servicios_dir,
                        extension)
        print("Compute %s." % f)
        print("Servicios main lasted %f seconds." % (time.time() - t0))
        # Write servicios
        for i, year_cols in enumerate(finantial_cols):
            t0 = time.time()
            y = folders_years[i]
            write_dataframe(servicios[year_cols], f,
                            join(join(fin_dir, y), 'Servicios'), extension)
            print("Servicios year %s lasted %f seconds."
                  % (y, time.time() - t0))