Example #1
    def read_using_dask(self):
        t1 = timeit.default_timer()
        """ Prepare Column names and column data types from schema file"""
        ipcolumns = []
        ipdatetimecolumns = []
        ipcolumnwidths = []
        ipcolumn_types = {}
        # skip any blank lines in ipschemafile
        with open(self.ipschemafile, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                rec = line.strip().split()
                ipcolumns.append(rec[0])
                if self.sep == 'fixed width':
                    ipcolumnwidths.append(int(rec[1]))
                    if rec[2] == 'datetime':
                        ipdatetimecolumns.append(rec[0])
                        # keep as string: parsing dates at read time was slower
                        ipcolumn_types[rec[0]] = 'string'
                    else:
                        ipcolumn_types[rec[0]] = rec[2]
                else:
                    if rec[1] == 'datetime':
                        ipdatetimecolumns.append(rec[0])
                        # keep as string: parsing dates at read time was slower
                        ipcolumn_types[rec[0]] = 'string'
                    else:
                        ipcolumn_types[rec[0]] = rec[1]

        # Raise an exception if ipcolumns contain duplicates
        if len(ipcolumns) != len(set(ipcolumns)):
            raise ValueError(
                "Duplicate column names in schema file '{}'".format(self.ipschemafile))

        if self.sep == 'fixed width':
            """ Create input file dataframe using dask read fwf"""
            ipdf = dd.read_fwf(self.ipfile,
                               header=self.header,
                               names=ipcolumns,
                               dtype=ipcolumn_types,
                               widths=ipcolumnwidths)
            ipdf = ipdf.repartition(npartitions=self.parallel)
        else:
            """ Create input file dataframe using dask read csv"""
            ipdf = dd.read_csv(
                self.ipfile,
                sep=self.sep,
                skiprows=self.skiprows,
                header=self.header,
                names=ipcolumns,
                compression=self.compression,
                blocksize=self.blocksize,
                #parse_dates=ipdatetimecolumns,
                #infer_datetime_format=True, # not tested
                dtype=ipcolumn_types)
            ipdf = ipdf.repartition(npartitions=self.parallel)
        print("Time taken : {} seconds for reading file '{}'".format(
            timeit.default_timer() - t1, self.ipfile))
        return ipdf
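For reference, the fixed-width branch above boils down to a dd.read_fwf call built from the parsed schema. A minimal sketch of that call follows; the file name, column names, widths, dtypes, and partition count are all made-up placeholders, not values from the original code:

import dask.dataframe as dd

# Illustrative equivalent of the fixed-width branch of read_using_dask();
# every literal below is a placeholder.
ipdf = dd.read_fwf('input.dat',
                   header=None,
                   names=['trade_date', 'symbol', 'price'],
                   widths=[8, 12, 10],
                   dtype={'trade_date': 'string', 'symbol': 'string', 'price': 'float64'})
ipdf = ipdf.repartition(npartitions=8)

# Datetime columns are read as strings (see the comments above); they can
# presumably be converted lazily afterwards, e.g. with dd.to_datetime.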
Example #2
def gerar_arquivo_final(extraidos_path, base_path):
    layout = LayoutB3()

    for file_name in os.listdir(extraidos_path):
        file_path = os.path.join(extraidos_path, file_name)
        
        print('Importing file', file_path)
        
        df = dd.read_fwf(
            file_path,
            colspecs=layout.get_posicoes(),
            skiprows=1,
            skipfooter=1,
            names=layout.get_campos(),
            encoding='latin1',
            dtype={'PRAZOT': 'object'}
        )

        # Explicit casts; columns assigned to themselves keep the dtype inferred by read_fwf
        df['TIPREG'] = df['TIPREG']
        df['DATA'] = df['DATA']
        df['CODBDI'] = df['CODBDI'].astype(str)
        df['CODNEG'] = df['CODNEG'].astype(str)
        df['TPMERC'] = df['TPMERC']
        df['NOMRES'] = df['NOMRES'].astype(str)
        df['ESPECI'] = df['ESPECI'].astype(str)
        df['PRAZOT'] = df['PRAZOT'].astype(str)
        df['MODREF'] = df['MODREF'].astype(str)
        df['PREABE'] = df['PREABE'].astype(float)
        df['PREMAX'] = df['PREMAX'].astype(float)
        df['PREMIN'] = df['PREMIN'].astype(float)
        df['PREMED'] = df['PREMED'].astype(float)
        df['PREULT'] = df['PREULT'].astype(float)
        df['PREOFC'] = df['PREOFC'].astype(float)
        df['PREOFV'] = df['PREOFV'].astype(float)
        df['TOTNEG'] = df['TOTNEG']
        df['QUATOT'] = df['QUATOT']
        df['VOLTOT'] = df['VOLTOT'].astype(float)
        df['PREEXE'] = df['PREEXE'].astype(float)
        df['INDOPC'] = df['INDOPC']
        df['DATVEN'] = df['DATVEN']
        df['FATCOT'] = df['FATCOT']
        df['PTOEXE'] = df['PTOEXE'].astype(float)
        df['CODISI'] = df['CODISI'].astype(str)
        df['DISMES'] = df['DISMES']

        # Convert the date column
        df = df.compute()
        df['DATA'] = pd.to_datetime(
            df['DATA'], format='%Y%m%d', errors='coerce')

        print('Importing into the scraperwiki database')
        import_scraperwiki(df)
        time.sleep(120)
        print('ok')
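A possible refinement, not in the original code: the DATA conversion above runs only after .compute() has pulled everything into pandas. With dask's own to_datetime it could be expressed lazily and materialised once, along these lines:

# Hedged sketch: convert DATA lazily on the dask dataframe, then materialise once.
df['DATA'] = dd.to_datetime(df['DATA'], format='%Y%m%d', errors='coerce')
df = df.compute()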
Example #3
def load_data(projectname):
    """Fungsi untuk baca dan olah data"""

    start = 32
    kolom = ["V0", "Q0", "ALPHA", "BETA", "CL", "CD", "CM25", "CY", "CYAW", "CROLL"]

    df = dd.read_fwf(f"data/{projectname}/print*", header=None, skiprows=start, skipfooter=2, include_path_column=True,
                     names=kolom)
    df = df[["path", "ALPHA", "CL", "CD", "CM25"]]
    df["RUN"] = df["path"].apply(lambda x: x.split("/")[-1], meta=("path", "string"))
    df = df.drop(["path"], axis=1)

    return df
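A hedged usage sketch of the function above; "project01" is a made-up project name, and the function expects the print* files to live under data/<projectname>/ as the glob implies:

# Hypothetical call; the project name is a placeholder.
df = load_data("project01")
print(df["RUN"].unique().compute())  # one entry per print* file, via include_path_column
print(df.head())                     # head() computes from the first partition by default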
Example #4
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
Example #5
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    if file_type == 'csv':
        return dd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return dd.read_parquet(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'hdf':
        return dd.read_hdf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'json':
        return dd.read_json(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'sql_table':
        return dd.read_sql_table(**file_options)
    elif file_type == 'table':
        return dd.read_table(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'fwf':
        return dd.read_fwf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'orc':
        return dd.read_orc(path, **dict_without_keys(file_options, 'path'))
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type))
import dask.dataframe as dd
import pandas as pd


fix_width_lar = [4,10,1,1,1,1,5,1,4,2,3,7,1,1,1,1,4,1,1,1,1,1,7]
header_names_lar = ['year','id','agency','type_loan','purchase_loan','occ','amount','action',
                'property','state','county','census','race_app','race_coapp',
                'sex_app','sex_coapp','inc_app','purchaser','denial1','denial2',
                'denial3','edit_status','seq_num']
df = pd.read_fwf('HMS.U1994.LARS', widths = fix_width_lar, names = header_names_lar)

#test
%time dfpd = pd.read_fwf('HMS.U1994.LARS', widths = fix_width_lar, nrows= 10000, names = header_names_lar)

%time dfpd = pd.read_fwf('HMS.U1994.LARS', widths = fix_width_lar, nrows= 100000, names = header_names_lar, engine = 'c')
%time dfdd = dd.read_fwf('HMS.U1994.LARS', widths = fix_width_lar, names = header_names_lar)

%time dfpd = pd.read_fwf('HMS.U1994.LARS', widths = fix_width_lar, names = header_names_lar, chunksize = 1e6)

chunk_list = []  # append each chunk df here 

for chunk in dfpd:
    # per-chunk filtering would go here; each chunk is already a DataFrame
    chunk_filter = pd.DataFrame(chunk)

    # once the filtering is done, append the chunk to the list
    chunk_list.append(chunk_filter)
    
# concat the list into dataframe 
df_concat = pd.concat(chunk_list)
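
For comparison, the chunked filter-and-concat above can be expressed lazily against the dask dataframe read earlier; the amount > 0 condition is a made-up placeholder for whatever filtering is needed ('amount' comes from header_names_lar):

# Hedged sketch: filter lazily with dask and materialise only at the end.
dfdd_filtered = dfdd[dfdd['amount'] > 0]
df_concat_dd = dfdd_filtered.compute()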