import timeit

import dask.dataframe as dd


def read_using_dask(self):
    t1 = timeit.default_timer()

    # Prepare column names and column data types from the schema file.
    ipcolumns = []
    ipdatetimecolumns = []
    ipcolumnwidths = []
    ipcolumn_types = {}

    # Skip any empty lines in ipschemafile.
    with open(self.ipschemafile, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            rec = line.strip().split()
            ipcolumns.append(rec[0])
            if self.sep == 'fixed width':
                ipcolumnwidths.append(int(rec[1]))
                if rec[2] == 'datetime':
                    ipdatetimecolumns.append(rec[0])
                    # Parsing dates during the read is slow; load as string.
                    ipcolumn_types[rec[0]] = 'string'
                else:
                    ipcolumn_types[rec[0]] = rec[2]
            else:
                if rec[1] == 'datetime':
                    ipdatetimecolumns.append(rec[0])
                    # Parsing dates during the read is slow; load as string.
                    ipcolumn_types[rec[0]] = 'string'
                else:
                    ipcolumn_types[rec[0]] = rec[1]

    # TODO: raise an exception if ipcolumns contains duplicates.

    if self.sep == 'fixed width':
        # Create the input file dataframe using dask read_fwf.
        ipdf = dd.read_fwf(self.ipfile,
                           header=self.header,
                           names=ipcolumns,
                           dtype=ipcolumn_types,
                           widths=ipcolumnwidths)
    else:
        # Create the input file dataframe using dask read_csv.
        ipdf = dd.read_csv(self.ipfile,
                           sep=self.sep,
                           skiprows=self.skiprows,
                           header=self.header,
                           names=ipcolumns,
                           compression=self.compression,
                           blocksize=self.blocksize,
                           # parse_dates=ipdatetimecolumns,
                           # infer_datetime_format=True,  # not tested
                           dtype=ipcolumn_types)
    ipdf = ipdf.repartition(npartitions=self.parallel)

    print("Time taken : {} seconds for reading file '{}'".format(
        timeit.default_timer() - t1, self.ipfile))
    return ipdf
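# A sketch of the schema file read_using_dask expects, inferred from the
# parsing logic above: one record per line, "<name> <width> <type>" for
# fixed-width input or "<name> <type>" for delimited input, with datetime
# columns loaded as strings. The column names, widths, and types below are
# hypothetical illustrations, not part of the original code:
#
#   custid    10  int64
#   tradedt    8  datetime
#   amount    12  float64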
import os
import time

import dask.dataframe as dd
import pandas as pd


def gerar_arquivo_final(extraidos_path, base_path):
    # LayoutB3 and import_scraperwiki are project-specific helpers defined elsewhere.
    layout = LayoutB3()
    for file_name in os.listdir(extraidos_path):
        file_path = os.path.join(extraidos_path, file_name)
        print('Importing file', file_path)
        df = dd.read_fwf(
            file_path,
            colspecs=layout.get_posicoes(),
            skiprows=1,
            skipfooter=1,
            names=layout.get_campos(),
            encoding='latin1',
            dtype={'PRAZOT': 'object'}
        )
        # Cast text and price columns to their proper types; the remaining
        # columns (TIPREG, DATA, TPMERC, TOTNEG, QUATOT, INDOPC, DATVEN,
        # FATCOT, DISMES) keep the dtype inferred by read_fwf.
        df['CODBDI'] = df['CODBDI'].astype(str)
        df['CODNEG'] = df['CODNEG'].astype(str)
        df['NOMRES'] = df['NOMRES'].astype(str)
        df['ESPECI'] = df['ESPECI'].astype(str)
        df['PRAZOT'] = df['PRAZOT'].astype(str)
        df['MODREF'] = df['MODREF'].astype(str)
        df['PREABE'] = df['PREABE'].astype(float)
        df['PREMAX'] = df['PREMAX'].astype(float)
        df['PREMIN'] = df['PREMIN'].astype(float)
        df['PREMED'] = df['PREMED'].astype(float)
        df['PREULT'] = df['PREULT'].astype(float)
        df['PREOFC'] = df['PREOFC'].astype(float)
        df['PREOFV'] = df['PREOFV'].astype(float)
        df['VOLTOT'] = df['VOLTOT'].astype(float)
        df['PREEXE'] = df['PREEXE'].astype(float)
        df['PTOEXE'] = df['PTOEXE'].astype(float)
        df['CODISI'] = df['CODISI'].astype(str)

        # Convert the date column (materialize to pandas first).
        df = df.compute()
        df['DATA'] = pd.to_datetime(df['DATA'], format='%Y%m%d', errors='coerce')

        print('Importing into the scraperwiki base')
        import_scraperwiki(df)
        time.sleep(120)
    print('ok')
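# The column-by-column casts above can be expressed as one lazy astype call
# over a dtype mapping, which dask applies to every partition at once. A
# minimal self-contained sketch of the pattern (toy columns, not the full
# B3 layout):
import dask.dataframe as dd
import pandas as pd

toy = dd.from_pandas(
    pd.DataFrame({'CODBDI': [2, 2], 'PREULT': ['10.5', '11.0']}),
    npartitions=1)
toy = toy.astype({'CODBDI': str, 'PREULT': float})  # one lazy cast for all columns
print(toy.dtypes)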
import dask.dataframe as dd


def load_data(projectname):
    """Read and process the data files for a project."""
    start = 32  # number of header lines to skip
    kolom = ["V0", "Q0", "ALPHA", "BETA", "CL", "CD",
             "CM25", "CY", "CYAW", "CROLL"]
    df = dd.read_fwf(f"data/{projectname}/print*",
                     header=None,
                     skiprows=start,
                     skipfooter=2,
                     include_path_column=True,
                     names=kolom)
    df = df[["path", "ALPHA", "CL", "CD", "CM25"]]
    # Derive the run name from the source file name.
    df["RUN"] = df["path"].apply(lambda x: x.split("/")[-1],
                                 meta=("path", "string"))
    df = df.drop(["path"], axis=1)
    return df
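# A hedged usage sketch for load_data: the project name "demo" (and thus the
# glob data/demo/print*) is an assumption for illustration. dask reads
# lazily, so .compute() is what actually loads the files.
ddf = load_data("demo")   # lazy dask dataframe with ALPHA, CL, CD, CM25, RUN
polar = ddf.compute()     # concrete pandas dataframe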
import dask.dataframe as dd
from dagster import DagsterInvariantViolationError


def dataframe_loader(_context, config):
    # dict_without_keys (defined elsewhere) returns a copy of the mapping
    # without the given keys.
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")
    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
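# A minimal usage sketch for dataframe_loader with a fixed-width file. The
# config shape (a single file_type key mapping to its options) follows the
# function above, while the path, widths, and names are hypothetical.
config = {"fwf": {"path": "data/records.txt",
                  "widths": [4, 10, 6],
                  "names": ["year", "id", "amount"]}}
df = dataframe_loader(None, config)  # context is unused by the loader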
import dask.dataframe as dd
import pandas as pd

fix_width_lar = [4, 10, 1, 1, 1, 1, 5, 1, 4, 2, 3, 7, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 7]
header_names_lar = ['year', 'id', 'agency', 'type_loan', 'purchase_loan', 'occ',
                    'amount', 'action', 'property', 'state', 'county', 'census',
                    'race_app', 'race_coapp', 'sex_app', 'sex_coapp', 'inc_app',
                    'purchaser', 'denial1', 'denial2', 'denial3', 'edit_status',
                    'seq_num']

df = pd.read_fwf('HMS.U1994.LARS', widths=fix_width_lar, names=header_names_lar)

# Timing tests (IPython)
%time dfpd = pd.read_fwf('HMS.U1994.LARS', widths=fix_width_lar, nrows=10000, names=header_names_lar)
%time dfpd = pd.read_fwf('HMS.U1994.LARS', widths=fix_width_lar, nrows=100000, names=header_names_lar, engine='c')
%time dfdd = dd.read_fwf('HMS.U1994.LARS', widths=fix_width_lar, names=header_names_lar)
%time dfpd = pd.read_fwf('HMS.U1994.LARS', widths=fix_width_lar, names=header_names_lar, chunksize=1_000_000)

chunk_list = []  # append each chunk df here
for chunk in dfpd:
    # Perform data filtering on the chunk.
    chunk_filter = pd.DataFrame(chunk)
    # Once the data filtering is done, append the chunk to the list.
    chunk_list.append(chunk_filter)

# Concatenate the list into a single dataframe.
df_concat = pd.concat(chunk_list)
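# For comparison with the pandas chunksize loop above, the equivalent
# filter-then-concatenate pattern in dask is a single lazy expression. The
# filter condition (action == 1) is a hypothetical example; the file and
# column names come from the snippet above.
import dask.dataframe as dd

dfdd = dd.read_fwf('HMS.U1994.LARS', widths=fix_width_lar, names=header_names_lar)
df_filtered = dfdd[dfdd['action'] == 1].compute()  # materializes one pandas DataFrame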