def getStatusCDR(service):
    """Fetch the CDR (Constancia de Recepción) for a voucher, save the
    returned ZIP when present, and print a summary of the response."""

    cdr_name_file = "R-{}-{}-{}-{:08d}.zip".format(
        service.empresa.ruc,
        service.tipo_doc.codigo,
        service.serie,
        service.numero,
    )

    rpt = service.getStatusCdr()
    paths = getConfigData("paths")["paths"]
    full_path = ""

    if rpt.content:
        full_path = saveBinaryFile(
            rpt.content, paths["destino_CDR_file"],
            cdr_name_file)

    data = [
        ["Code", rpt.statusCode],
        ["Message", rpt.statusMessage],
    ]

    if rpt.content:
        data.append(["Ubicación", full_path])

    printSingleTable(data, " Respuesta CDR: ", False)
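
The archive name built above follows the "R-{RUC}-{doc type code}-{series}-{8-digit number}.zip"
convention. A one-line illustration with made-up sample values (the RUC, type
code, series, and number here are placeholders, not data from the source):

    # Illustrative only -- sample values:
    print("R-{}-{}-{}-{:08d}.zip".format("20123456789", "01", "F001", 123))
    # -> R-20123456789-01-F001-00000123.zip
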
def getListaTiposDocs():
    """Build the list of configured document types; exit with an error
    message when none are configured."""

    lista_tipos_docs = []
    data = getConfigData("tipos_docs")

    if len(data["tipos_docs"]) > 0:
        for tipo_doc in data["tipos_docs"]:
            tdoc = TipoDocumento(
                tipo_doc["codigo"],
                tipo_doc["descripcion"]
            )

            lista_tipos_docs.append(tdoc)
    else:
        printOnConsole("No existen Tipos de documentos configurados.", "e")
        exit()

    return lista_tipos_docs
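
For reference, getConfigData("tipos_docs") is expected to return a mapping
shaped like the sketch below; the keys are taken from the accesses above,
while the concrete codes and descriptions are illustrative only:

    # Assumed shape of the "tipos_docs" configuration (illustrative values):
    {
        "tipos_docs": [
            {"codigo": "01", "descripcion": "Factura"},
            {"codigo": "03", "descripcion": "Boleta de venta"},
        ]
    }
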
def getListaEmpresas():
    """Build the list of configured companies, each with its ClaveSol
    credentials; exit with an error message when none are configured."""

    lista_empresas = []
    data = getConfigData("empresas")
    
    if len(data["empresas"]) > 0:
        for empresa in data["empresas"]:
            emp = Empresa(
                empresa["razon_social"],
                empresa["ruc"],
                ClaveSol(
                    empresa["clave_sol"]["usuario"],
                    empresa["clave_sol"]["contrasenha"]
                )
            )

            lista_empresas.append(emp)
    else:
        printOnConsole("No existen empresas configuradas.", "e")
        exit()

    return lista_empresas
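
Likewise, getConfigData("empresas") is assumed to look like the following,
with the keys inferred from the accesses above and the values invented for
illustration:

    # Assumed shape of the "empresas" configuration (illustrative values):
    {
        "empresas": [
            {
                "razon_social": "ACME S.A.C.",
                "ruc": "20123456789",
                "clave_sol": {"usuario": "USUARIO1", "contrasenha": "secreto"},
            }
        ]
    }
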
Example #4
def cleanMaster():
    """Clean every raw dataset listed in the configuration: normalize the
    State and County values, drop duplicates, write the results to the
    cleaned directory, and record row counts in the clean checksum data."""

    config = hp.getConfigData()
    checksumClean = hp.getChecksumClean()

    # load path constants defined in config.yaml
    RAW_DIR = config['dirHeaders']['raw_dir']
    CLEAN_DIR = config['dirHeaders']['cleaned_dir']
    TR_DIR = config['dirHeaders']['transformed_dir']
    DATA_FTYPE = config['dataFileType']

    # define template for the absolute file path (defined in config.yaml)
    sf_template = Template('$base$type$fname$ftype')
    mf_template = Template('$base$type$fname$year$ftype')

    # load the state abbreviation map
    abbrevMap = hp.getAbbreviationMap()

    # f_checksum = open('checksum_CLEAN.txt', 'w')

    for s in config['datasets']:
        try:

            ds = s['set']
            print("Processing dataset: {:s}".format(ds['name']))

            if ds['single_file']:
                absPath = sf_template.substitute(base=ds['directory'],
                                                 type=RAW_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)

                outPath = sf_template.substitute(base=ds['directory'],
                                                 type=CLEAN_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)

                # SKIP IF FILE ALREADY PROCESSED
                if (not hp.goAheadForClean(absPath)):
                    continue

                print("Loading: {:s}".format(absPath))
                rawData = pd.read_csv(absPath)

                if ds['loc_single_column']:  # split the single column

                    # Add a 'State' column (filled per-row in the loop below)
                    rawData['State'] = ""
                    rawData = rawData.rename(columns={'Location': 'County'})
                    # Rearrange columns. [State, County, .... , rest]
                    cols = rawData.columns.tolist()
                    cols.pop(cols.index('State'))
                    cols.pop(cols.index('County'))
                    cols = ['State', 'County'] + cols
                    rawData = rawData[cols]
                    # county value must be non-null
                    rawData = rawData.dropna(thresh=1, subset=['County'])

                    # split the single column
                    for idx, row in rawData.iterrows():
                        tmp = row['County'].split(',')
                        row['County'] = tmp[0]
                        if (len(tmp) > 1):
                            row['State'] = tmp[1]
                        else:
                            # no comma in the value: a state-level or national
                            # aggregate (e.g. a bare state name, UNITED STATES,
                            # or District of Columbia)
                            row['State'] = "z_NA"
                        rawData.loc[idx] = row

                    # Convert county list to uppercase
                    for idx, row in rawData.iterrows():
                        row['County'] = row['County'].upper()
                        rawData.loc[idx] = row

                    # drop raw duplicates
                    rawData.drop_duplicates(subset=['State', 'County'],
                                            inplace=True)

                    # clean State and County values
                    for idx, row in rawData.iterrows():
                        if (row['State'] != 'z_NA'):
                            row['State'] = row['State'].upper().replace(
                                "COUNTY", "").replace("PARISH", "").replace(
                                    "'", "").replace("CITY", "").replace(
                                        ".", "").replace(",", "").strip()
                        row['County'] = row['County'].upper().replace(
                            "COUNTY",
                            "").replace("PARISH", "").replace("'", "").replace(
                                "CITY",
                                "").replace(".", "").replace(",", "").strip()
                        rawData.loc[idx] = row

                else:  # data is already stored in "State" and "County"

                    # county AND state value must be non-null
                    rawData = rawData.dropna(thresh=2,
                                             subset=['County', 'State'])
                    # drop raw duplicates
                    rawData.drop_duplicates(subset=['State', 'County'],
                                            inplace=True)
                    # Rearrange columns. [State, County, .... , rest]
                    cols = rawData.columns.tolist()
                    cols.pop(cols.index('State'))
                    cols.pop(cols.index('County'))
                    cols = ['State', 'County'] + cols
                    rawData = rawData[cols]

                    for idx, row in rawData.iterrows():
                        if (ds['fips_flag'] and row['FIPS'] == 0):
                            row['State'] = 'z_NA'
                        else:
                            row['State'] = row['State'].upper().strip()
                        row['County'] = row['County'].upper().replace(
                            "COUNTY",
                            "").replace("PARISH", "").replace("'", "").replace(
                                "CITY",
                                "").replace(".", "").replace(",", "").strip()
                        rawData.loc[idx] = row

                rawData['State'] = rawData['State'].map(abbrevMap,
                                                        na_action='ignore')
                rawData = rawData.sort_values(['State', 'County'], axis=0)
                print("{:s} cleaned. Outputting to: {:s}".format(
                    ds['name'], outPath))
                rawData.to_csv(outPath, index=False)

                # Record the cleaned row count for this file in the checksum data
                checksumClean['processedFiles'][absPath] = rawData.shape[0]

            else:
                # Handle multiple files
                for year in hp.yearList(ds['year_start'], ds['year_end'],
                                        ds['year_increment'],
                                        ds['years_absent']):

                    absPath = mf_template.substitute(base=ds['directory'],
                                                     type=RAW_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)

                    outPath = mf_template.substitute(base=ds['directory'],
                                                     type=CLEAN_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)

                    # SKIP IF FILE ALREADY PROCESSED
                    if (not hp.goAheadForClean(absPath)):
                        continue

                    print("Loading: {:s}".format(absPath))
                    rawData = pd.read_csv(absPath)

                    if ds['loc_single_column']:
                        # add a 'State' column (filled per-row in the loop below)
                        rawData['State'] = ""
                        rawData = rawData.rename(
                            columns={'Location': 'County'})
                        # Rearrange columns. [State, County, .... , rest]
                        cols = rawData.columns.tolist()
                        cols.pop(cols.index('State'))
                        cols.pop(cols.index('County'))
                        cols = ['State', 'County'] + cols
                        rawData = rawData[cols]
                        # county value must be non-null
                        rawData = rawData.dropna(thresh=1, subset=['County'])

                        # split the single column
                        for idx, row in rawData.iterrows():
                            tmp = row['County'].split(',')
                            row['County'] = tmp[0]
                            if (len(tmp) > 1):
                                row['State'] = tmp[1]
                            else:
                                # no comma in the value: a state-level or
                                # national aggregate (e.g. a bare state name,
                                # UNITED STATES, or District of Columbia)
                                row['State'] = "z_NA"
                            rawData.loc[idx] = row

                        # Convert county list to uppercase
                        for idx, row in rawData.iterrows():
                            row['County'] = row['County'].upper()
                            rawData.loc[idx] = row

                        # drop raw duplicates
                        rawData.drop_duplicates(subset=['State', 'County'],
                                                inplace=True)

                        # clean State and County values
                        for idx, row in rawData.iterrows():
                            if (row['State'] != 'z_NA'):
                                row['State'] = row['State'].upper().replace(
                                    "COUNTY",
                                    "").replace("PARISH", "").replace(
                                        "'", "").replace("CITY", "").replace(
                                            ".", "").replace(",", "").strip()
                            row['County'] = row['County'].upper().replace(
                                "COUNTY", "").replace("PARISH", "").replace(
                                    "'", "").replace("CITY", "").replace(
                                        ".", "").replace(",", "").strip()
                            rawData.loc[idx] = row

                    else:
                        # county AND state value must be non-null
                        rawData = rawData.dropna(thresh=2,
                                                 subset=['County', 'State'])
                        # drop raw duplicates
                        rawData.drop_duplicates(subset=['State', 'County'],
                                                inplace=True)
                        # Rearrange columns. [State, County, .... , rest]
                        cols = rawData.columns.tolist()
                        cols.pop(cols.index('State'))
                        cols.pop(cols.index('County'))
                        cols = ['State', 'County'] + cols
                        rawData = rawData[cols]

                        # data is already stored in "State" and "County"
                        for idx, row in rawData.iterrows():
                            if (ds['fips_flag'] and row['FIPS'] == 0):
                                row['State'] = 'z_NA'
                            elif (row['State'] == 'z_NA'):
                                continue
                            else:
                                row['State'] = row['State'].upper().strip()
                            row['County'] = row['County'].upper().replace(
                                "COUNTY", "").replace("PARISH", "").replace(
                                    "'", "").replace("CITY", "").replace(
                                        ".", "").replace(",", "").strip()
                            rawData.loc[idx] = row

                    rawData['State'] = rawData['State'].map(abbrevMap,
                                                            na_action='ignore')
                    rawData = rawData.sort_values(['State', 'County'], axis=0)
                    print("{:s} cleaned. Outputting to: {:s}".format(
                        ds['name'], outPath))
                    rawData.to_csv(outPath, index=False)
                    # Record the cleaned row count for this file in the checksum data
                    checksumClean['processedFiles'][absPath] = rawData.shape[0]

        except Exception as e:
            print(e)
            # f_checksum.write("ERROR cleaning dataset: {:s}\n".format(ds['name']))
            print("ERROR cleaning dataset: {:s}\n".format(ds['name']))

    # f_checksum.write('Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now()))
    # f_checksum.close()
    # Update the clean checksum file
    hp.setChecksumClean(checksumClean)
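
The State/County scrubbing chain above is repeated in four places. A compact
equivalent, sketched here as a hypothetical helper (it is not part of the
original module), performs the same uppercasing, token removal, and trimming
with a single regular expression:

    import re

    # Equivalent to the chained .replace() calls: after uppercasing, remove
    # COUNTY/PARISH/CITY and the characters ' . , then trim whitespace.
    _SCRUB_RE = re.compile(r"COUNTY|PARISH|CITY|['.,]")

    def scrub_place_name(name):
        return _SCRUB_RE.sub("", name.upper()).strip()

    # scrub_place_name("St. Mary's Parish, ")  ->  "ST MARYS"
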
Example #5
def transformMaster(cityMasterList):
    """Transform each cleaned dataset: keep only the configured columns,
    pad with the master county list, drop county/state pairs that are not
    in the master set, and write the results to the transformed directory."""

    config = hp.getConfigData()
    checksumTransform = hp.getChecksumTransform()

    # load path constants defined in config.yaml
    RAW_DIR = config['dirHeaders']['raw_dir']
    CLEAN_DIR = config['dirHeaders']['cleaned_dir']
    TR_DIR = config['dirHeaders']['transformed_dir']
    DATA_FTYPE = config['dataFileType']

    # define template for the absolute file path (defined in config.yaml)
    sf_template = Template('$base$type$fname$ftype')
    mf_template = Template('$base$type$fname$year$ftype')

    # load the set of counties with "CITY"
    masterList = pd.read_csv(cityMasterList)

    # Get the dictionary representation of the master county list
    MASTER_COUNTY_SET = hp.getDictionarySet(cityMasterList)

    # open the checksum file descriptor
    f_checksum = open('checksum_TRANSFORM.txt', 'w')

    for s in config['datasets']:
        try:
            ds = s['set']
            print("Processing dataset: {:s}".format(ds['name']))

            if ds['single_file']:
                print("Single file\n")
                # Set up input / output paths
                absPath = sf_template.substitute(base=ds['directory'],
                                                 type=CLEAN_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)
                outPath = sf_template.substitute(base=ds['directory'],
                                                 type=TR_DIR,
                                                 fname=ds['file_base'],
                                                 ftype=DATA_FTYPE)
                yRange = range(ds['year_start'],
                               ds['year_end'] + ds['year_increment'],
                               ds['year_increment'])
                yRangeStr = [str(y) for y in yRange]

                # SKIP IF FILE ALREADY PROCESSED
                if (not hp.goAheadForTransform(absPath, cityMasterList)):
                    continue

                print("Loading: {:s}".format(absPath))
                rd = pd.read_csv(absPath)

                # Only include "State", "County", and any "YEAR in range" columns
                labels = ["State", "County"] + yRangeStr
                # Only include columns defined in the configuration file
                for col in rd.columns:
                    if col not in labels:
                        rd.drop(col, axis=1, inplace=True)

                # pad the dataframe with the master county list
                rd = padDataframe(rd, masterList)

                # Marker to indicate if row should be dropped
                rd['DROP_ROW'] = np.zeros(rd.shape[0])
                # Drop all rows whose county/state pair is not in the master set
                for idx, row in rd.iterrows():
                    states = MASTER_COUNTY_SET.get(row['County'])
                    if states is None or row['State'] not in states:
                        rd.at[idx, 'DROP_ROW'] = 1

                # Only keep rows not slated to be dropped
                rd = rd[rd['DROP_ROW'] == 0]
                rd.drop('DROP_ROW', axis=1, inplace=True)

                # Output the transformed data to file
                print("{:s} transformed. Outputting to: {:s}".format(
                    ds['name'], outPath))
                rd.to_csv(outPath, index=False, lineterminator=",\n")

                # Record master county/state pairs that are missing from the output
                for county in MASTER_COUNTY_SET.keys():
                    for state in MASTER_COUNTY_SET[county]:
                        if (rd[(rd['State'] == state)
                               & (rd['County'] == county)].shape[0] == 0):
                            f_checksum.write("{:s} | {:s}\n".format(
                                county, state))

                # f_checksum.write("{:d} Counties\n\n".format(rd.shape[0]))
                checksumTransform['processedFiles'][absPath] = rd.shape[0]

            else:
                print("Multiple files\n")
                for year in hp.yearList(ds['year_start'], ds['year_end'],
                                        ds['year_increment'],
                                        ds['years_absent']):

                    absPath = mf_template.substitute(base=ds['directory'],
                                                     type=CLEAN_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)

                    outPath = mf_template.substitute(base=ds['directory'],
                                                     type=TR_DIR,
                                                     fname=ds['file_base'],
                                                     year=year,
                                                     ftype=DATA_FTYPE)

                    # SKIP IF FILE ALREADY PROCESSED
                    if (not hp.goAheadForTransform(absPath, cityMasterList)):
                        continue

                    print("Loading: {:s}".format(absPath))
                    rd = pd.read_csv(absPath)

                    # Only include columns defined in the configuration file
                    for col in rd.columns:
                        if col not in ds['data_labels']:
                            rd.drop(col, axis=1, inplace=True)

                    rd = padDataframe(rd, masterList)

                    # Marker to indicate if row should be dropped
                    rd['DROP_ROW'] = np.zeros(rd.shape[0])
                    # Drop all rows whose county/state pair is not in the master set
                    for idx, row in rd.iterrows():
                        states = MASTER_COUNTY_SET.get(row['County'])
                        if states is None or row['State'] not in states:
                            rd.at[idx, 'DROP_ROW'] = 1

                    # Only keep rows not slated to be dropped
                    rd = rd[rd['DROP_ROW'] == 0]
                    rd.drop('DROP_ROW', axis=1, inplace=True)

                    # Output the transformed data to file
                    print("{:s} transformed. Outputting to: {:s}".format(
                        ds['name'], outPath))
                    rd.to_csv(outPath, index=False, lineterminator=",\n")

                    # Record master county/state pairs that are missing from the output
                    for county in MASTER_COUNTY_SET.keys():
                        for state in MASTER_COUNTY_SET[county]:
                            if (rd[(rd['State'] == state)
                                   & (rd['County'] == county)].shape[0] == 0):
                                f_checksum.write("{:s} | {:s}\n".format(
                                    county, state))

                    checksumTransform['processedFiles'][absPath] = rd.shape[0]
            # f_checksum.write("Finished transforming dataset: {:s}\n\n".format(ds['name']))
            # f_checksum.write("---------------------------------------------------\n")

        except Exception as e:
            print(e)
            # f_checksum.write("ERROR transforming dataset: {:s}\n".format(ds['name']))

    # f_checksum.write('Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now()))
    f_checksum.close()
    hp.setChecksumTransform(checksumTransform)
    # Update the "previousRun.yaml" file
    prevRun = hp.getPreviousRunData()
    prevRun['masterCountyListFile'] = cityMasterList
    prevRun['numCountiesInMaster'] = masterList.shape[0]
    hp.setPreviousRunData(prevRun)
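
The DROP_ROW filter in both branches assumes hp.getDictionarySet() maps each
county name to the collection of states it appears under. A minimal sketch of
that check, assuming a {county: set-of-states} layout (the helper itself is
not shown in the source, and the values below are invented):

    # Assumed layout: county name -> set of state abbreviations.
    MASTER_COUNTY_SET = {"WASHINGTON": {"AR", "PA"}, "MARICOPA": {"AZ"}}

    def keep_row(state, county):
        # Mirrors the DROP_ROW logic: keep only known county/state pairs.
        states = MASTER_COUNTY_SET.get(county)
        return states is not None and state in states

    # keep_row("AZ", "MARICOPA") -> True
    # keep_row("AZ", "WASHINGTON") -> False
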
Example #6
import helpers as hp
from string import Template
import pandas as pd
import numpy as np
import datetime

config = hp.getConfigData()

# load path constants defined in config.yaml
RAW_DIR = config['dirHeaders']['raw_dir']
CLEAN_DIR = config['dirHeaders']['cleaned_dir']
TR_DIR = config['dirHeaders']['transformed_dir']
DATA_FTYPE = config['dataFileType']

# define template for the absolute file path (defined in config.yaml)
sf_template = Template('$base$type$fname$ftype')
mf_template = Template('$base$type$fname$year$ftype')

# load the master county lists (the city and base sets)
cityList = pd.read_csv('CountyLists/citySet.csv')
baseList = pd.read_csv('CountyLists/baseSet.csv')

# get the master set of counties to check against
MASTER_COUNTY_SET = hp.getDictionarySet()

# open the checksum file descriptor
f_checksum = open('checksum_TRANSFORM.txt', 'w')

for s in config['datasets']:
    try:
        ds = s['set']