def create_stop_time_data():
    print("*** creating stop time data ***")
    query = """select trip_id from trip;"""
    trip_list = [id[0] for id in run_query(query)]
    delayed_funcs = [delayed(get_stop_time_df)(t_id, get_conn) for t_id in trip_list]
    res = Parallel(n_jobs=-1)(delayed_funcs)
    stop_time_trip_df = vaex.from_pandas(pd.concat(res))
    print(f"concat stop_time data, time: {duration()}")

    # strong type casting
    stop_time_trip_df["stop_sequence"] = stop_time_trip_df["stop_sequence"].astype("int64")
    stop_time_trip_df["shape_dist_traveled"] = stop_time_trip_df["shape_dist_traveled"].astype("float64")
    stop_time_trip_df["direction"] = stop_time_trip_df["direction"].astype("int64")
    stop_time_trip_df["lat"] = stop_time_trip_df["lat"].astype("float64")
    stop_time_trip_df["lon"] = stop_time_trip_df["lon"].astype("float64")
    stop_time_trip_df["direction_angle"] = stop_time_trip_df["direction_angle"].astype("float64")
    stop_time_trip_df["shape_dist_between"] = stop_time_trip_df["shape_dist_between"].astype("float64")

    # export to hdf5
    stop_time_trip_df.export_hdf5(stop_time_data_path)
    return
def jets_from_raw(in_fname, num_evts, tag_mcpid=[6], stride=1000, num_procs=1, offset=None):
    """Returns a dataframe of jet constituent calorimeter info.

    Keyword arguments:
    in_fname: (str) path to file containing data
    num_evts: (int) number of events to read from the file
    tag_mcpid: (array like) list of ancestor particle ids forming jets
    stride: (int) number of events per processor read in at once
    num_procs: (int) number of processes to spawn
    offset: (int or None) event offset passed through to each chunk reader
    """
    num_chunks = int(np.ceil(num_evts / stride))
    starts = np.arange(0, num_evts, stride, dtype=int)
    ranges = np.array([0, stride - 1]).reshape(1, 2) + starts[:, np.newaxis]
    ranges[-1, 1] = num_evts
    with Pool(processes=num_procs) as pool:
        jet_df = partial(_jet_chunk, in_fname=in_fname, tag_mcpid=tag_mcpid, offset=offset)
        pd_data = pd.concat(pool.map(jet_df, list(ranges)))
    return vpd.from_pandas(pd_data, copy_index=True, index_name='event')
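# The snippet above relies on names it does not define; a minimal set of imports
# it appears to assume is sketched below. The alias `vpd` for vaex and the
# per-chunk worker `_jet_chunk` are guesses, not confirmed by the source.
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
import vaex as vpd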
def to_hdf5(self, out_folder, min_frame=None, max_frame=None, step=1000):
    """Dump the project to hdf5.

    Args:
        out_folder (str): Path to the folder where to store the outcome.
        min_frame (int): Minimal frame in selection for saving.
        max_frame (int): Maximal frame in selection for saving.
        step (int): Number of frames written per output file.
    """
    import vaex as vx

    min_frame = self.min_frame if min_frame is None else min_frame
    max_frame = self.max_frame if max_frame is None else max_frame
    out_folder = Path(out_folder)
    out_folder.mkdir(parents=True, exist_ok=True)
    types_remap = {'frame': 'uint16', 'scan': 'uint16', 'tof': 'uint32', 'i': 'uint32'}
    # TODO: types should be remapped earlier!
    for f, F in ranges(min_frame, max_frame + 1, step):
        pd_df = self[f:F]
        pd_df = pd_df.astype(types_remap)
        path = str(out_folder / f"{f}_{F}.hdf5")
        vx_df = vx.from_pandas(pd_df, copy_index=False)
        vx_df.export_hdf5(path=path)
        del vx_df, pd_df
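# The `ranges` helper used in the loop above is not part of the snippet; this is
# a hypothetical sketch of what it presumably does (yield consecutive
# (start, stop) frame windows of width `step`); the real helper may differ.
def ranges(start, stop, step):
    """Yield (lo, hi) windows covering [start, stop) in chunks of `step`."""
    for lo in range(start, stop, step):
        yield lo, min(lo + step, stop)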
def _from_csv_convert_and_read(filename_or_buffer, path_output, chunk_size, fs_options, fs=None,
                               copy_index=False, progress=None, **kwargs):
    # figure out the CSV file path
    csv_path = vaex.file.stringyfy(filename_or_buffer)
    path_output_bare, ext, _ = vaex.file.split_ext(path_output)

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    # we don't have indeterminate progress bars, so we cast it to truthy
    progress = bool(progress) if progress is not None else False
    if progress:
        print("Converting csv to chunk files")
    with vaex.file.open(filename_or_buffer, fs_options=fs_options, fs=fs, for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer, chunksize=chunk_size, **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            chunk_name = f'{path_output_bare}_chunk_{i}{ext}'
            df.export(chunk_name)
            converted_paths.append(chunk_name)
            log.info('saved chunk #%d to %s' % (i, chunk_name))
            if progress:
                print("Saved chunk #%d to %s" % (i, chunk_name))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], path_output)
    else:
        if progress:
            print('Converting %d chunks into single file %s' % (len(converted_paths), path_output))
        log.info('converting %d chunks into single file %s' % (len(converted_paths), path_output))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export(path_output, progress=progress)
        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error('Could not close or delete intermediate file %s used to convert %s to single file: %s',
                          df_path, csv_path, path_output)
def _write_prices(self):
    for index, chunk in enumerate(self.price):
        price_sub_df = vaex.from_pandas(chunk)
        price_sub_df.export_hdf5(self.folder_path + '/' + DMGenerator.PRICES_FOLDER +
                                 f'/price_{index}.hdf5')
    self._consolidate_folder(DMGenerator.PRICES_FOLDER)
def convert(self, force=False):
    import pandas as pd
    skips = ["store_and_fwd_flag"]
    for i, (input, output) in enumerate(zip(self.filenames, self.filenames_vaex)):
        date_names = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
        if not os.path.exists(output) or force:
            print("Converting %s to %s (%d out of %d)" % (input, output, i + 1, len(self.filenames)))
            df = pd.read_csv(input, parse_dates=date_names)
            for skip in skips:
                if skip in df:
                    del df["store_and_fwd_flag"]
            ds = vx.from_pandas(df)
            ds.add_virtual_column("pickup_hour", "hourofday(tpep_pickup_datetime)")
            ds.add_virtual_column("dropoff_hour", "hourofday(tpep_dropoff_datetime)")
            ds.add_virtual_column("pickup_dayofweek", "dayofweek(tpep_pickup_datetime)")
            ds.add_virtual_column("dropoff_dayofweek", "dayofweek(tpep_dropoff_datetime)")
            ds.select("(pickup_longitude != 0) & (pickup_latitude != 0) & "
                      "(dropoff_longitude != 0) & (dropoff_latitude != 0)")
            ds.export_hdf5(output, virtual=True, selection=True)
def fcs_to_feather(file: str, outDir: Path):
    """Convert fcs file to feather. Copied from the polus-fcs-to-csv-converter plugin.

    Args:
        file (str): Path to the directory containing the fcs file.
        outDir (Path): Path to save the output feather file.

    Returns:
        Converted feather file.
    """
    file_name = Path(file).stem
    feather_filename = file_name + ".feather"
    logger.info("fcs_to_feather : Begin parsing data out of .fcs file" + file_name)

    # Use fcsparser to parse data into python dataframe
    meta, data = fcsparser.parse(file, meta_data_only=False, reformat_meta=True)

    # Export the fcs data to vaex df
    logger.info("fcs_to_feather: converting data to vaex dataframe...")
    df = vaex.from_pandas(data)
    logger.info("fcs_to_feather: writing file...")
    os.chdir(outDir)
    logger.info("fcs_to_feather: Writing Vaex Dataframe to Feather File Format for:" + file_name)
    df.export_feather(feather_filename, outDir)
def _write_calendar(self):
    for index, chunk in enumerate(self.calendar):
        calendar_sub_df = vaex.from_pandas(chunk)
        calendar_sub_df.export_hdf5(self.folder_path + '/' + DMGenerator.CALENDAR_FOLDER +
                                    f'/calendar_{index}.hdf5')
    self._consolidate_folder(DMGenerator.CALENDAR_FOLDER)
def check_results(fns, tag):
    def get_field(ds, col):
        return ds.evaluate(ds[col.upper()])

    rr = []
    dd = []
    for fn in fns:
        df = pd.read_hdf(fn, key='data')
        ds = vaex.from_pandas(df)
        print(len(ds), 'rows')
        ra = get_field(ds, 'ra')
        dec = get_field(ds, 'dec')
        rr.append(ra)
        dd.append(dec)
    rr = np.hstack(rr)
    dd = np.hstack(dd)
    print('Total of', len(rr), 'stars')

    T = fits_table()
    T.ra = rr
    T.dec = dd
    T.writeto('all-rd-%s.fits' % tag)

    plothist(rr, dd, 500)
    plt.xlabel('RA (deg)')
    plt.ylabel('Dec (deg)')
    plt.savefig('all-radec-%s.png' % tag)

    I, J, d = match_radec(rr, dd, rr, dd, 0.2 / 3600, notself=True)
    plt.clf()
    plt.hist(d * 3600. * 1000., bins=50)
    plt.xlabel('Distance between stars (milli-arcsec)')
    plt.savefig('all-dists-%s.png' % tag)
def _from_csv_convert_and_read(filename_or_buffer, maybe_convert_path, chunk_size, fs_options, fs=None,
                               copy_index=False, **kwargs):
    # figure out the CSV file path
    if isinstance(maybe_convert_path, str):
        csv_path = re.sub(r'\.hdf5$', '', str(maybe_convert_path), flags=re.IGNORECASE)
    elif isinstance(filename_or_buffer, str):
        csv_path = filename_or_buffer
    else:
        raise ValueError('Cannot derive filename to use for converted HDF5 file, '
                         'please specify it using convert="my.csv.hdf5"')
    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    with vaex.file.open(filename_or_buffer, fs_options=fs_options, fs=fs, for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer, chunksize=chunk_size, **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            filename_hdf5 = _convert_name(csv_path, suffix='_chunk%d' % i)
            df.export_hdf5(filename_hdf5)
            converted_paths.append(filename_hdf5)
            log.info('saved chunk #%d to %s' % (i, filename_hdf5))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], combined_hdf5)
    else:
        log.info('converting %d chunks into single HDF5 file %s' % (len(converted_paths), combined_hdf5))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export_hdf5(combined_hdf5)
        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error('Could not close or delete intermediate hdf5 file %s used to convert %s to hdf5: %s' %
                          (df_path, csv_path, e))
def count_volume_premarket_price_m1():
    time_925 = np.timedelta64(9, 'h') + np.timedelta64(25, 'm')
    for file in os.listdir('Data'):
        if file.endswith('.csv'):
            pandas_df = pd.read_csv('Data/' + file)
            print(file)
            pandas_df['date'] = pd.to_datetime(pandas_df['date'])
            pandas_df['date_int'] = pd.to_datetime(pandas_df['date']).astype(np.int64)
            columns_list = pandas_df.columns.tolist()
            columns_list = columns_list[-1:] + columns_list[:-1]
            pandas_df = pandas_df[columns_list]
            vaex_df = vaex.from_pandas(pandas_df)
            start_date = np.datetime64(vaex_df.data.date[0], 'D')
            end_date = np.datetime64(vaex_df.data.date[-1], 'D')
            date = []
            volume = []
            open_924 = []
            while start_date < end_date:
                date.append(start_date)
                midnight = start_date.astype('datetime64[ms]').astype('int64') * 1_000_000
                premarket_stop = (start_date + time_925).astype('datetime64[ms]').astype('int64') * 1_000_000
                vaex_df.select((vaex_df.date_int > midnight) & (vaex_df.date_int < premarket_stop))
                volume.append(vaex_df.sum("volume", selection=True))
                if vaex_df.evaluate("open", selection=True).size > 0:
                    open_924.append(vaex_df.evaluate("open", selection=True)[-1])
                else:
                    open_924.append(0)
                start_date = np.busday_offset(start_date + np.timedelta64(1, 'D'), 0, roll='forward')
            pd.DataFrame({
                'Date': date,
                'Open_924': open_924,
                'Volume': volume
            }).to_csv('Data/New_Files/' + file)
def process_data():
    if not os.path.exists(stop_time_data_path):
        create_stop_time_data()
    print("*** processing data ***")
    df = vaex.open(gtfs_final_hdf5_path)

    # compute direction and day of week from realtime data
    df["direction"] = df["trip_id"].apply(lambda t: dir_f_trip(t))
    df["dow"] = df["start_date"].apply(lambda t: get_dt(t, "%Y%m%d").weekday())

    # store these columns in memory
    df.materialize("direction", inplace=True)
    df.materialize("dow", inplace=True)

    # 500 is set as an error value to remove, None isn't supported
    df = df[df["direction"] != 500]

    # drop trip_id to remove duplicates
    df.drop("trip_id", inplace=True)

    # important: we use these columns, and later service days, in order to
    # stop being dependent on trip_id.
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    df = vaex_mjoin(df.shallow_copy(), vaex.open(stop_time_data_path), cols, cols,
                    how="inner", allow_duplication=True)

    # filter to keep only trips that happened on that day of week
    df["keep_trip"] = df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "").split(",")[dow],
        ["service_days", "dow"])
    df = df[df.keep_trip == "True"]

    # drop redundant columns
    df.drop(["service_days", "dow", "keep_trip"], inplace=True)
    df = vaex.from_pandas(df.to_pandas_df().drop_duplicates(
        subset=[i for i in df.get_column_names() if i != "trip_id"]))
    # df = vx_dedupe(df, columns=[i for i in df.get_column_names() if i != "trip_id"])
    print(f"merged stop_time & gtfsr data, time: {duration()}")
    df = predict_traffic_from_scats(df)
    df.export_hdf5(gtfsr_processed_path)
    print(f"finished processing data, {duration()}")
def _write_sales_info(self):
    info_data = pd.DataFrame()
    for index, chunk in enumerate(self.sales):
        info_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
        info_data = info_data.append(chunk.loc[:, ['id'] + info_cols])

        sales_cols = [col for col in chunk.columns if 'd_' in col]
        sales = chunk.loc[:, ['id'] + sales_cols].melt(id_vars='id')
        sales = vaex.from_pandas(sales)
        sales.export_hdf5(self.folder_path + '/' + DMGenerator.SALES_FOLDER +
                          f'/sales_{index}.hdf5')

    info_data.drop_duplicates().to_parquet(self.folder_path + '/' + DMGenerator.INFO_FOLDER +
                                           '/info.parquet.gzip', index=False)
    self._consolidate_folder(DMGenerator.SALES_FOLDER)
def extract(ref, vcf, bam, outdir, prefix, skip_bam_readcount, labels, pkl, loglevel):
    logging.basicConfig(
        level=loglevel,
        format='%(asctime)s (%(relativeCreated)d ms) -> %(levelname)s: %(message)s',
        datefmt='%I:%M:%S %p')
    if not prefix:
        prefix = os.path.basename(bam.split('.')[0])

    # Generate matrix of true variants
    if labels or not os.path.exists(os.path.join(outdir, 'true_vars.pkl')):
        logger.info('Converting ground truth variants to 1-based coordinates')
        true_vars = pd.read_csv(labels, sep='\t', index_col=None, header=None, dtype={0: str})
        true_vars.columns = ['chr', 'start', 'end', 'ref', 'alt']
        true_vars = true_vars.progress_apply(convert_one_based, axis=1)
        true_vars.to_pickle(os.path.join(outdir, 'true_vars.pkl'))

    logger.info('Preparing data')
    prep_data = dp.PrepareData(prefix, bam, bed_file_path, ref, outdir)
    df = prep_data.training_data
    true_vars = pd.read_pickle(os.path.join(DATA, 'train_data', 'true_vars.pkl'))
    df['real'] = 0
    sample = df.index[0].split('~')[0]
    true_vars_set = set(df.index.str.replace(sample + '~', ''))
    for index, row in true_vars.iterrows():
        progress(index, true_vars.shape[0])
        var = "{0}:{1}-{2}{3}>{4}".format(row.chr, row.start, row.end, row.ref, row.alt)
        if var in true_vars_set:
            df.loc[sample + '~' + var, 'real'] = 1
    vaex_df = vaex.from_pandas(df)
    vaex_df.export(os.path.join(outdir, 'train.hdf5'))
def h5pandas_to_vaex(file_in: Union[None, str, PurePath], del_found_tmp_files=False):
    """
    Pandas hdf5 to vaex.hdf5 conversion: saves tmp files, then searches and combines them.
    :param file_in: pandas hdf5 file
    :param del_found_tmp_files: delete the temporary chunk files after combining
    :return:
    Uses these module functions:
        h5pandas_to_vaex_file_names()
        h5pandas_to_vaex_combine()
    """
    tmp_save_pattern, tmp_search_pattern, path_out_str = h5pandas_to_vaex_file_names(file_in)
    chunksize = 500000  # will get x00 MB files
    ichunk = 0
    for chunk in pd.read_hdf(file_in, 'csv', chunksize=chunksize):  # , where='a < someval'
        df = vaex.from_pandas(chunk)
        df.export_hdf5(tmp_save_pattern.format(ichunk))
        ichunk += 1
        print(ichunk, end=' ')
    h5pandas_to_vaex_combine(tmp_search_pattern, path_out_str, check_files_number=ichunk,
                             del_found_tmp_files=del_found_tmp_files)
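# `h5pandas_to_vaex_combine` is only referenced above; this is a hypothetical
# sketch of what such a combine step presumably does (open the temporary chunk
# files, concatenate them, export one HDF5 file, optionally delete the chunks),
# modelled on the chunk-merging pattern used elsewhere in this collection.
# The real module helper may differ.
import glob
import os

import vaex


def h5pandas_to_vaex_combine(tmp_search_pattern, path_out_str,
                             check_files_number=None, del_found_tmp_files=False):
    paths = sorted(glob.glob(tmp_search_pattern))
    if check_files_number is not None and len(paths) != check_files_number:
        raise RuntimeError(f'expected {check_files_number} chunk files, found {len(paths)}')
    dfs = [vaex.open(p) for p in paths]
    vaex.concat(dfs).export_hdf5(path_out_str)  # write one combined file
    if del_found_tmp_files:
        for df, p in zip(dfs, paths):
            df.close()
            os.remove(p)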
def test_from_pandas():
    dd_dict = {
        'boolean': [True, True, False, None, True],
        'text': ['This', 'is', 'some', 'text', 'so...'],
        'text_missing': pd.Series(['Some', 'parts', None, 'missing', None], dtype='string'),
        'float': [1, 30, -2, 1.5, 0.000],
        'float_missing': [1, None, -2, 1.5, 0.000],
        'float_missing_masked': pd.Series([1, None, -2, 1.5, 0.000], dtype=pd.Float64Dtype()),
        'int_missing': pd.Series([1, None, 5, 1, 10], dtype='Int64'),
        'datetime_1': [pd.NaT, datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1)],
        'datetime_2': [pd.NaT, None, pd.NaT, pd.NaT, pd.NaT],
        'datetime_3': [pd.Timedelta('1M'), pd.Timedelta('1D'), pd.Timedelta('100M'), pd.Timedelta('2D'), pd.Timedelta('1H')],
        'datetime_4': [pd.Timestamp('2001-1-1 2:2:11'), pd.Timestamp('2001-12'), pd.Timestamp('2001-10-1'), pd.Timestamp('2001-03-1 2:2:11'), pd.Timestamp('2001-1-1 2:2:11')],
        'datetime_5': [datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1)],
        'datetime_6': [datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1)],
    }

    # Get pandas dataframe
    pandas_df = pd.DataFrame(dd_dict)
    pandas_df['datetime_7'] = pd.to_timedelta(pandas_df['datetime_2'] - pandas_df['datetime_1'])

    vaex_df = vaex.from_pandas(pandas_df)
    repr_value = repr(vaex_df)
    str_value = str(vaex_df)
    assert 'NaT' in repr_value
    assert 'NaT' in str_value
    assert '--' in repr_value
    assert '--' in str_value
    # string columns are now arrow arrays
    # assert vaex_df.text_missing.is_masked == True
    assert vaex_df.int_missing.is_masked == True
    assert vaex_df.float_missing.is_masked == False
    assert vaex_df.float_missing_masked.is_masked == True
    assert vaex_df.int_missing.tolist() == [1, None, 5, 1, 10]
    assert vaex_df.text_missing.tolist() == ['Some', 'parts', None, 'missing', None]
    assert vaex_df.float_missing.values[[0, 2, 3, 4]].tolist() == [1.0, -2.0, 1.5, 0.0]
    assert np.isnan(vaex_df.float_missing.values[1])
    assert vaex_df.float_missing_masked.tolist() == [1.0, None, -2.0, 1.5, 0.0]
def main():
    print(f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    stream = pd.read_csv(
        CSV_FILE_PATH,
        chunksize=CHUNK_SIZE,
        low_memory=False,
        sep=',',
        encoding='latin-1',
    )

    TMP_PATH.mkdir(parents=True, exist_ok=True)
    for i, chunk in enumerate(stream):
        print(f'Processing {i + 1}-th chunk containing "{len(chunk)}" rows of data...')
        df_chunk = vaex.from_pandas(chunk, copy_index=False)
        export_path = TMP_PATH / f'part_{i}.hdf5'
        df_chunk.export_hdf5(str(export_path))

    df = vaex.open(str(TMP_PATH / 'part*'))
    df.export_hdf5(str(COLUMNAR_HDF_FILE_PATH))
    print(f'HDF5 Stored Size: {COLUMNAR_HDF_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    rmtree(TMP_PATH)
def apply_alignments():
    from astrom_common import Affine
    T = fits_table('affines.fits')
    affs = Affine.fromTable(T)
    print('Read affines:', affs)

    ibright = dict([(fn.strip(), i) for i, fn in enumerate(affs.filenames)])

    corners = {}
    for line in open('corners.txt').readlines():
        line = line.strip()
        words = line.split()
        ras = np.array([float(words[i]) for i in [1, 3, 5, 7]])
        decs = np.array([float(words[i]) for i in [2, 4, 6, 8]])
        corners[words[0]] = (ras, decs)

    from astrometry.util.miscutils import point_in_poly

    fns = (glob('data/M31-*ST/proc_default/M31-*ST.phot.hdf5') +
           glob('data/M31-*ST/M31-*ST.phot.hdf5'))
    fns.sort()
    print('Files:', fns)

    veto_polys = []

    for photfile in fns:
        basename = os.path.basename(photfile)
        basename = basename.replace('.phot.hdf5', '')
        print('Base name:', basename)

        brightfn = basename + '-bright.fits'
        ii = ibright[brightfn]
        aff = affs[ii]

        print('Reading', photfile)
        df = pd.read_hdf(photfile, key='data')
        ds = vaex.from_pandas(df)
        print(len(ds), 'rows')

        ra = ds.evaluate(ds['ra'])
        dec = ds.evaluate(ds['dec'])
        ra, dec = aff.apply(ra, dec)

        corner = corners[basename]

        Tleft = fits_table()
        Tleft.ra = ra
        Tleft.dec = dec
        Tleft.index = np.arange(len(Tleft))

        ras, decs = corner
        poly = np.vstack((ras, decs)).T
        inside = point_in_poly(Tleft.ra, Tleft.dec, poly)
        print(np.sum(inside), 'of', len(Tleft), 'inside corners of this half-brick')

        inside_veto = np.zeros(len(Tleft), bool)
        for vp in veto_polys:
            inveto = point_in_poly(Tleft.ra, Tleft.dec, vp)
            inside_veto[inveto] = True
        print(np.sum(inside_veto), 'stars are inside the corners of previous half-bricks')
        print('inside:', type(inside))
        inside[inside_veto] = False
        print(np.sum(inside), 'stars are uniquely in this half-brick')

        veto_polys.append(poly)

        outfn = 'out-%s.hdf5' % basename
        df[inside].to_hdf(outfn, key='data', mode='w', format='table',
                          complevel=9, complib='zlib')
        print('Wrote', outfn)
import os
from datetime import datetime

import pandas as pd
import vaex
from vaex.datasets import nyctaxi_yellow_2015_jan

# download and read Pandas CSV
nyctaxi_yellow_2015_jan.download()
csv_size = os.path.getsize(nyctaxi_yellow_2015_jan.filenames[0])
df = pd.read_csv(nyctaxi_yellow_2015_jan.filenames[0])

# convert to Vaex
start = datetime.now()
vdf = vaex.from_pandas(df)
duration = datetime.now() - start

print('it took {} to convert {:,} rows ({:.1f} Gb), which is {:,} rows per second'
      .format(duration, len(df), csv_size / 1024. / 1024 / 1024, int(len(df) / duration.total_seconds())))

# Last result when running on:
# 2.8 GHz Quad-Core Intel Core i7; 16 GB 1600 MHz DDR3:
# it took 0:00:07.244279 to convert 12,748,986 rows (1.8 Gb), which is 1,759,869 rows per second
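# Follow-up sketch (not part of the original benchmark): persist the converted
# frame so later sessions can memory-map the HDF5 file instead of re-reading the
# CSV; the output filename here is a hypothetical example.
vdf.export_hdf5('nyctaxi_yellow_2015_jan.hdf5')
df2 = vaex.open('nyctaxi_yellow_2015_jan.hdf5')  # opens lazily, without loading into RAM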
def csv_to_h5( read_csv_args, to_hdf_args, dates_formats: Mapping[str, str], correct_fun: Tuple[None, bool, Callable[[pd.DataFrame], None]] = None, processing: Optional[Mapping[Tuple[Tuple[str], Tuple[str]], Callable[[Any], Any]]] = None, out_cols: Optional[Sequence] = None, continue_row=False, vaex_format: Optional[bool]=None ): """ Read csv and write to hdf5 :param read_csv_args: dict, must have keys: filepath_or_buffer, chunksize :param to_hdf_args: path_or_buf: default = read_csv_args['filepath_or_buffer'].with_suffix('vaex.h5' if vaex_format else '.h5') mode: default = 'w' if not continue_row else 'a', key: hdf5 group name in hdf5 file where store data ... :param dates_formats: column: csv column name wich need to be convert from str to DateTime, date_format: date formats :param processing: dict with keys: ((_input cols_), (_output cols_)) and values: function(_input cols_) that will return _output cols_ :param out_cols: default is all excluding columns that in inputs but not in output of custom param:processing :param continue_row: csv row number (excluding header) to start with shifting index. If output file exist and continue_row = True then continue converting starting from row equal to last index in it, useful to continue after program interrupting or csv appending. If not exist then start from row 0 giving it index 0. If continue_row = integer then start from this row, giving starting index = continue_row :param correct_fun: function applied to each chunk returned by read_csv() which is a frame of column data of type str :param vaex_format: bool how to write chunks: - True: to many vaex hdf5 files. They at end will be converted to single vaex hdf5 file - False: appending to single pandas hdf5 table - None: evaluates to True if to_hdf_args['path_or_buf'] has next to last suffix ".vaex" else to False :return: """ if to_hdf_args.get('path_or_buf'): if vaex_format is None: vaex_format = Path(str(to_hdf_args['path_or_buf']).strip()).suffixes[:-1] == ['.vaex'] else: # give default name to output file to_hdf_args['path_or_buf'] = Path(read_csv_args['filepath_or_buffer']).with_suffix( f'{".vaex" if vaex_format else ""}.h5' ) # Deal with vaex/pandas storing difference if vaex_format: open_for_pandas_to_hdf = None tmp_save_pattern, tmp_search_pattern = h5pandas_to_vaex_file_names( path_out_str=str(to_hdf_args['path_or_buf']) ) ichunk = None else: def open_for_pandas_to_hdf(path_or_buf): return pd.HDFStore( to_hdf_args['path_or_buf'], to_hdf_args.get('mode', 'a' if continue_row else 'w') ) # Find csv row to start msg_start = f'Converting in chunks of {read_csv_args["chunksize"]} rows.' if continue_row is True: # isinstance(continue_same_csv, bool) try: if vaex_format: hdf5_list = glob.glob(tmp_search_pattern) if len(hdf5_list): # continue interrupted csv_to_h5() hdf5_list.sort() file_last = hdf5_list[-1] lf.info('Found {:d} temporary files, continue from index found in last file', len(hdf5_list)) "table/columns/index" else: # add next csv data file_last = to_hdf_args['path_or_buf'] with h5py.File(file_last, mode='r') as to_hdf_buf: continue_row = to_hdf_buf['table/columns/index/data'][-1] + 1 else: with pd.HDFStore(to_hdf_args['path_or_buf'], mode='r') as to_hdf_buf: continue_row = to_hdf_buf.select(to_hdf_args['key'], columns=[], start=-1).index[-1] + 1 except (OSError) as e: msg_start += ' No output file.' continue_row = None except KeyError as e: msg_start += ' No data in output file.' 
continue_row = None else: msg_start += ' Starting from next to last loaded csv row:' elif continue_row: msg_start += ' Starting from specified csv data row:' if continue_row: lf.info('{:s} {:d}...', msg_start, continue_row) read_csv_args['skiprows'] = read_csv_args.get('skiprows', 0) + continue_row else: lf.info('{:s} begining from csv row 0, giving it index 0...', msg_start) dtypes = read_csv_args['dtype'] # Set default output cols if not set if out_cols is None and processing: # we will out all we will have except processing inputs if they are not mentioned in processing outputs cols_in_used = set() cols_out_used = set() for (c_in, c_out) in processing.keys(): cols_in_used.update(c_in) cols_out_used.update(c_out) cols2del = cols_in_used.difference(cols_out_used) out_cols = dtypes.keys() for col in cols2del: del out_cols[col] cols_out_used = set(out_cols if out_cols is not None else dtypes.keys()) # Group cols for conversion by types specified str_cols = [] int_and_nans_cols = [] other_cols = [] for col, typ in dtypes.items(): if out_cols and col not in cols_out_used: continue kind = typ[0] (str_cols if kind == 'S' else int_and_nans_cols if kind == 'I' else other_cols).append(col) str_not_dates = list(set(str_cols).difference(dates_formats.keys())) min_itemsize = {col: int(dtypes[col][1:]) for col in str_not_dates} # Read csv, process, write hdf5 with open(read_csv_args['filepath_or_buffer'], 'r') as read_csv_buf, \ FakeContextIfOpen(open_for_pandas_to_hdf, to_hdf_args['path_or_buf']) as to_hdf_buf: read_csv_args.update({ 'filepath_or_buffer': read_csv_buf, 'memory_map': True, 'dtype': 'string' # switch off read_csv dtypes convertion (because if it fails it is hard to correct: }) # to read same csv place by pandas) to_hdf_args.update({ 'path_or_buf': to_hdf_buf, 'format': 'table', 'data_columns': True, 'append': True, 'min_itemsize': min_itemsize }) # rows_processed = 0 # rows_in_chunk = read_csv_args['chunksize'] for ichunk, chunk in enumerate(pd.read_csv(**read_csv_args)): if continue_row: if chunk.size == 0: ichunk = np.ceil(continue_row / read_csv_args['chunksize']).astype(int) - 1 break # continue_row is > data rows else: chunk.index += continue_row lf.extra['id'] = f'chunk start row {chunk.index[0]:d}' if ichunk % 10 == 0: print(f'{ichunk}', end=' ') else: print('.', end='') if correct_fun: correct_fun(chunk) # Convert to user specified types # 1. dates str to DateTime for col, f in dates_formats.items(): # the convertion of 'bytes' to 'strings' is needed for pd.to_datetime() try: chunk[col] = pd.to_datetime(chunk[col], format=f) except ValueError as e: lf.error( 'Conversion to datetime("{:s}" formatted as "{:s}") {:s} -> ' 'Replacing malformed strings by NaT...', col, f, standard_error_info(e)) chunk[col] = pd.to_datetime(chunk[col], format=f, exact=False, errors='coerce') # 2. str to numeric for other_cols and int_and_nans_cols (which is limited support pandas extension dtypes) # but we use numpy types instead replasing nans by -1 to able write to hdf5 chunk[other_cols] = chunk[other_cols].fillna('NaN') # <NA> to numpy recognized eq meaning string chunk[int_and_nans_cols] = chunk[int_and_nans_cols].fillna('-1') for col in (int_and_nans_cols + other_cols): # for col, typ in zip(nans.columns, chunk[nans.columns].dtypes): typ = dtypes[col] if col in int_and_nans_cols: is_integer = True typ = f'i{typ[1:]}' # typ.numpy_dtype else: is_integer = np.dtype(typ).kind == 'i' try: chunk[col] = chunk[col].astype(typ) continue except (ValueError, OverflowError) as e: # Cleaning. 
In case of OverflowError we do it here to prevent ValueError while handling of OverflowError below. pattern_match = r'^[\d]$' if is_integer else r'^-?[\d.]$' ibad = ~chunk[col].str.match(pattern_match) rep_val = '-1' if is_integer else 'NaN' # ibad = np.flatnonzero(chunk[col] == re.search(r'(?:")(.*)(?:")', e.args[0]).group(1), 'ascii') lf.error('Conversion {:s}("{:s}") {:s} -> replacing {:d} values not maching pattern "{:s}" with "{' ':s}" and again...', typ, col, standard_error_info(e), ibad.sum(), pattern_match, rep_val) chunk.loc[ibad, col] = rep_val # astype(str).replace(regex=True, to_replace=r'^.*[^\d.].*$', value= try: chunk[col] = chunk[col].astype(typ) except (OverflowError, ValueError) as e: # May be bad value from good symbols: r'^\d*\.\d*\.+\d*$' but instead checking it we do coerce_to_exact_dtype() on ValueError here too lf.error('Conversion {:s}("{:s}") {:s} -> Replacing malformed strings and big numbers' ' by NaN ...', typ, col, standard_error_info(e)) chunk[col] = coerce_to_exact_dtype(chunk[col], dtype=typ) # Limit big strings length and convert StringDtype to str to can save by to_hdf() for col, max_len in min_itemsize.items(): # for col, typ in zip(nans.columns, chunk[nans.columns].dtypes): chunk[col] = chunk[col].str.slice(stop=max_len) # apply(lambda x: x[:max_len]) not handles <NA> chunk[str_not_dates] = chunk[str_not_dates].astype(str) # Apply specified data processing if processing: for (cols_in, c_out), fun in processing.items(): cnv_result = fun(chunk[list(cols_in)]) chunk[list(c_out)] = cnv_result # # Bad rows check # is_different = chunk['wlaWID'].fillna('') != chunk['wlaAPIHartStandard'].fillna('') # if is_different.any(): # i_bad = np.flatnonzero(is_different.values) # lf.debug('have wlaWID != wlaAPIHartStandard in rows {:s}', chunk.index[i_bad]) # # chunk= chunk.drop(chunk.index[i_bad]) # - deleting # pass # Check unique index # if chunk['wlaWID'].duplicated() try: if vaex_format: df = vaex.from_pandas(chunk if out_cols is None else chunk[out_cols]) df.export_hdf5(tmp_save_pattern.format(ichunk)) else: # better to move this command upper and proc. by vaex instead of pandas (chunk if out_cols is None else chunk[out_cols]).to_hdf(**to_hdf_args) #rows_processed += rows_in_chunk # think we red always the same length exept last which length value will not be used except Exception as e: lf.exception('write error') pass try: del lf.extra['id'] except KeyError: lf.info('was no more data rows to read') # If vaex store was specified then we have chunk files that we combine now by export_hdf5(): if vaex_format: h5pandas_to_vaex_combine(tmp_search_pattern, str(to_hdf_args['path_or_buf']), check_files_number=ichunk+1)
import vaex
import numpy as np
import pandas as pd

for i, chunk in enumerate(vaex.read_csv(r'D:\python projects\quandomo\data_center\data\market\ASHAREEODPRICES.csv',
                                        chunksize=100_000)):
    df_chunk = vaex.from_pandas(chunk, copy_index=False)
    export_path = f'D:/python projects/quandomo/data_center/data/market/part_{i}.hdf5'
    df_chunk.export_hdf5(export_path)

df = vaex.open('D:/python projects/quandomo/data_center/data/market/part*')
df.export_hdf5('D:/python projects/quandomo/data_center/data/market/Final.hdf5')
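# Note (hedged): recent vaex releases can do this chunked CSV-to-HDF5 conversion
# in one call; a minimal sketch with a placeholder path, assuming vaex >= 4:
import vaex

df = vaex.from_csv('ASHAREEODPRICES.csv', convert=True, chunk_size=100_000)
# convert=True writes an HDF5 copy next to the CSV and reuses it on later runs.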
            y_binned, yerr=yerr, xerr=xerr, fmt=',', color='k', lw=1.5)
    fig.savefig('{}_{}.jpg'.format(blue_filter, red_filter), dpi=144)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filebase', action='store')
    args = parser.parse_args()
    photfile = args.filebase

    # try:
    #     # I have never gotten vaex to read an hdf5 file successfully
    #     ds = vaex.open(photfile)
    # except:
    import pandas as pd
    df = pd.read_hdf(photfile, key='data')
    ds = vaex.from_pandas(df)

    filter_sets = [('f336w', 'f275w', 'f336w'),
                   ('f475w', 'f336w', 'f475w'),
                   ('f814w', 'f475w', 'f814w'),
                   ('f160w', 'f475w', 'f160w'),
                   ('f160w', 'f814w', 'f160w'),
                   ('f160w', 'f110w', 'f160w')]
    for f in filter_sets:
        make_cmd(ds, *f)
def read_csv(paths: Sequence[Union[str, Path]], cfg_in: Mapping[str, Any]) -> Union[pd.DataFrame, vaex.dataframe.DataFrame]: """ Reads csv in dask DataFrame Calls cfg_in['fun_proc_loaded'] (if specified) Calls time_corr: corrects/checks Time (with arguments defined in cfg_in fields) Sets Time as index :param paths: list of file names :param cfg_in: contains fields for arguments of dask.read_csv correspondence: names=cfg_in['cols'][cfg_in['cols_load']] usecols=cfg_in['cols_load'] error_bad_lines=cfg_in['b_raise_on_err'] comment=cfg_in['comments'] Other arguments corresponds to fields with same name: dtype=cfg_in['dtype'] delimiter=cfg_in['delimiter'] converters=cfg_in['converters'] skiprows=cfg_in['skiprows'] blocksize=cfg_in['blocksize'] Also cfg_in has filds: dtype_out: numpy.dtype, which "names" field used to detrmine output columns fun_proc_loaded: None or Callable[ [Union[pd.DataFrame, np.array], Mapping[str, Any], Optional[Mapping[str, Any]]], Union[pd.DataFrame, pd.DatetimeIndex]] If it returns pd.DataFrame then it also must has attribute: meta_out: Callable[[np.dtype, Iterable[str], Mapping[str, dtype]], Dict[str, np.dtype]] See also time_corr() for used fields :return: tuple (a, b_ok) where a: dask dataframe with time index and only columns listed in cfg_in['dtype_out'].names b_ok: time correction reszult bulean array """ try: try: # raise ValueError('Temporary') # for ichunk, chunk in enumerate(pd.read_csv(paths, chunksize=1000, delimiter='\t')): df = pd.read_csv( paths, dtype=cfg_in['dtype_raw'], names=cfg_in['cols'], delimiter=cfg_in['delimiter'], skipinitialspace=True, usecols=cfg_in['dtype'].names, # cfg_in['cols_load'], converters=cfg_in['converters'], skiprows=cfg_in['skiprows'], error_bad_lines=cfg_in['b_raise_on_err'], comment=cfg_in['comments'], header=None, blocksize=cfg_in['blocksize']) # not infer # , engine='python' - may help load bad file # index_col=False # force pandas to _not_ use the first column as the index (row names) - no in dask # names=None, squeeze=False, prefix=None, mangle_dupe_cols=True, # engine=None, true_values=None, false_values=None, skipinitialspace=False, # nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, # skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, # date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', # thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, # escapechar=None, encoding=None, dialect=None, tupleize_cols=None, # warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, # delim_whitespace=False, as_recarray=None, compact_ints=None, use_unsigned=None, # low_memory=True, buffer_lines=None, memory_map=False, float_precision=None) except ValueError as e: l.exception('dask lib can not load data. Trying pandas lib...') for i, nf in enumerate(paths): df = pd.read_csv( nf, dtype=cfg_in['dtype_raw'], names=cfg_in['cols'], usecols=cfg_in['dtype'].names, # cfg_in['cols_load'], delimiter=cfg_in['delimiter'], skipinitialspace=True, index_col=False, converters=cfg_in['converters'], skiprows=cfg_in['skiprows'], error_bad_lines=cfg_in['b_raise_on_err'], comment=cfg_in['comments'], header=None) if i > 0: raise NotImplementedError('list of files => need concatenate data') ddf = vaex.from_pandas(df, chunksize=cfg_in['blocksize']) # except Exception as e: # for example NotImplementedError if bad file msg = '- Bad file. skip!' 
ddf = None if cfg_in['b_raise_on_err']: l.exception('%s\n Try set [in].b_raise_on_err= False\n', msg) raise else: l.exception(msg) if __debug__: l.debug('read_csv initialised') if ddf is None: return None, None meta_time = pd.Series([], name='Time', dtype='M8[ns]') # np.dtype('datetime64[ns]') meta_time_index = pd.DatetimeIndex([], dtype='datetime64[ns]', name='Time') meta_df_with_time_col = cfg_in['cols_load'] # Process ddf and get date in ISO string or numpy standard format cfg_in['file_stem'] = Path(paths[0]).stem # may be need in func below to extract date try: date_delayed = None try: if not getattr(cfg_in['fun_proc_loaded'], 'meta_out', None) is None: # fun_proc_loaded will return modified data. Go to catch it # todo: find better condition raise TypeError date = ddf.map_partitions(lambda *args, **kwargs: pd.Series( cfg_in['fun_proc_loaded'](*args, **kwargs)), cfg_in, meta=meta_time) # meta_time_index # date = date.to_series() except (TypeError, Exception) as e: # fun_proc_loaded retuns tuple (date, a) changing_size = False # ? True # ? if changing_size: @vaex.delayed def run_fun_proc_loaded(): """ delayed(, nout=2)(ddf, cfg_in) :return: """ return cfg_in['fun_proc_loaded']() date_delayed, a = run_fun_proc_loaded() ddf_len = len(ddf) counts_divisions = list(range(1, int(ddf_len / cfg_in.get('decimate_rate', 1)), cfg_in['blocksize'])) counts_divisions.append(ddf_len) ddf = vaex.from_delayed(a, divisions=(0, counts_divisions)) #date, meta = meta_time_index, divisions = counts_divisions); from_dask_array(date.values, index=ddf.index) date = date_delayed.get() else: # getting df with time col meta_out = cfg_in['fun_proc_loaded'].meta_out(cfg_in['dtype']) if callable( cfg_in['fun_proc_loaded'].meta_out) else None ddf = ddf.map_partitions(cfg_in['fun_proc_loaded'], cfg_in, meta=meta_out) date = ddf.Time except IndexError: print('no data?') return None, None # add time shift specified in configuration .ini n_overlap = 2 * int(np.ceil(cfg_in['fs'])) if cfg_in.get('fs') else 50 # reset_index().set_index('index'). meta2 = {'Time': 'M8[ns]', 'b_ok': np.bool8} # pd.DataFrame(columns=('Time', 'b_ok')) # meta2.time = meta2.time.astype('M8[ns]') # meta2.b_ok = meta2.b_ok.astype(np.bool8) def time_corr_df(t, cfg_in): """convert tuple returned by time_corr() to dataframe""" return pd.DataFrame.from_dict(OrderedDict(zip(meta2.keys(), utils_time_corr.time_corr(t, cfg_in)))) # return pd.DataFrame.from_items(zip(meta2.keys(), time_corr(t, cfg_in))) # pd.Series() # date.rename('time').to_series().reset_index().compute() # date.to_series().repartition(divisions=ddf.divisions[1]) ''' def time_corr_ar(t, cfg_in): """convert tuple returned by time_corr() to dataframe""" return np.array(time_corr(t, cfg_in)) #return pd.DataFrame.from_items(zip(meta2.keys(), time_corr(t, cfg_in))) # pd.Series() da.overlap.map_overlap(date.values, time_corr_ar, depth=n_overlap) ''' l.info('time correction in %s blocks...', date.npartitions) df_time_ok = date.map_overlap(time_corr_df, before=n_overlap, after=n_overlap, cfg_in=cfg_in, meta=meta2) # .to_series() # if __debug__: # c = df_time_ok.compute() # tim = date.compute().values() # tim, b_ok = time_corr(tim, cfg_in) # return None, None # if len(ddf) == 1: # size # ddf = ddf[np.newaxis] # npartitions = ddf.npartitions # ddf = ddf.reset_index().set_index('index') # col_temp = set(ddf.columns).difference(cfg_in['dtype_out'].names).pop() # ddf.index is not unique! 
# if col_temp: # # ddf[col_temp].compute().is_unique # Index.is_monotonic_increasing() # # ddf[col_temp] = ddf[col_temp].map_partitions(lambda s, t: t[s.index], tim, meta=meta) try: df_time_ok = df_time_ok.persist() except Exception as e: l.exception('Can not speed up by persist') # # something that can trigger error to help it identificate ??? # date = date.persist() # df_time_ok = df_time_ok.compute() df_time_ok = time_corr_df( (date_delayed if date_delayed is not None else date).compute(), cfg_in=cfg_in) nbad_time = len(df_time_ok['b_ok']) - df_time_ok['b_ok'].sum() l.info('Removing %d bad time values: %s%s', nbad_time, df_time_ok['b_ok'].fillna(0).ne(True).to_numpy().nonzero()[0][:20], ' (shows first 20)' if nbad_time > 20 else '') df_time_ok.loc[df_time_ok['b_ok'], 'Time'] = pd.NaT try: df_time_ok.Time = df_time_ok.Time.interpolate( inplace=False) # inplace=True - not works, method='linear', - default except ValueError: # if not interpolates (my condition) use simpler method: df_time_ok.Time = df_time_ok.Time.fillna(method='ffill', inplace=True) if nbad_time: # # dask get IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match): # ddf_out = ddf.loc[df_time_ok['b_ok'], list(cfg_in['dtype_out'].names)].set_index( # df_time_ok.loc[df_time_ok['b_ok'], 'Time'], sorted=True) # so we have done interpolate that helps this: ddf_out = ddf.loc[:, list(cfg_in['dtype_out'].names)].set_index(df_time_ok['Time']) # , sorted=True ddf_out = ddf_out.loc[df_time_ok['b_ok'], :] else: # print('data loaded shape: {}'.format(ddf.compute(scheduler='single-threaded').shape)) # debug only ddf_out = ddf.loc[:, list(cfg_in['dtype_out'].names)].set_index(df_time_ok['Time'], sorted=True) logger = logging.getLogger("dask") logger.addFilter(lambda s: s.getMessage() != "Partition indices have overlap.") return ddf_out
"ex-dividend", "split_ratio", "adj_open", "adj_high", "adj_low", "adj_close", "adj_volume" ], axis=1, inplace=True) # Remove rows with missing data, since they are present in minimal amount dataset.dropna(inplace=True) # Convert string-based dates to integers (days) since first date in chronological order (day 0) dataset['date'] = pd.to_datetime(dataset['date']) basedate = dataset.date.min() dataset['date'] = (dataset['date'] - basedate).dt.days # Make all numeric data to be float64 to ease computation dataset['date'] = dataset['date'].astype('float64') # ----------- # # DATA EXPORT # # ----------- # # Vaex-ify Pandas dataset and export to little-endian HDF5 vaex_dataset = vx.from_pandas(dataset) vaex_dataset.export_hdf5("../data/WIKI_PRICES_QUANDL.hdf5", column_names=None, byteorder='<', progress=False) # Greet user! print('Success!')
def __init__(self, df):
    super(DataFrameAccessorGraphQLPandas, self).__init__(vaex.from_pandas(df))
from traffic.traffic import Traffic
import vaex
import numpy as np   # needed for np.zeros() below
import pandas as pd  # needed for pd.DataFrame() below


def set_queues_callback(time, queues):
    global street_queues
    street_queues[time] = queues


in_setup_file = './hashcode.in'
in_submission_file = 'submission.hashcode.txt'
out_street_queues_hd5_file = 'street_queues.hashcode.h5'

# in_setup_file='./example.in'
# in_submission_file='submit.example.txt'
# out_street_queues_hd5_file='street_queues.example.h5'

m = Traffic(in_file=in_setup_file)
num_streets = len(m.streets)
callback = set_queues_callback
street_queues = np.zeros((m.end_time, num_streets), dtype=int)

m.read_submission_file(in_file_path=in_submission_file)
m.simulate(progress_bar=True, override_end_time=None, queue_callback=callback)
m_score = m.calculate_simulation_score()
print("Final score: {}".format(m_score))

street_queues_df = vaex.from_pandas(pd.DataFrame(street_queues, columns=list(m.streets.keys())))
street_queues_df.export_hdf5(out_street_queues_hd5_file, progress=True)
def to_fits(): fns = (glob('data/M31-*ST/proc_default/M31-*ST.phot.hdf5') + glob('data/M31-*ST/M31-*ST.phot.hdf5')) fns.sort() print('Files:', fns) plt.clf() for photfile in fns: #photfile = 'data/M31-B23-WEST/M31-B23-WEST.phot.hdf5' basename = os.path.basename(photfile) basename = basename.replace('.phot.hdf5', '') print('Base name:', basename) outfn = basename + '-bright.fits' if os.path.exists(outfn): print('Exists:', outfn) continue words = basename.split('-') assert (len(words) == 3) brick = words[1] assert (brick[0] == 'B') brick = int(brick[1:], 10) print('Brick number:', brick) ew = words[2] assert (ew in ['EAST', 'WEST']) east = (ew == 'EAST') df = pd.read_hdf(photfile, key='data') ds = vaex.from_pandas(df) print('Read', photfile) #print(ds) good = ds['f814w_gst'] print(len(ds), 'rows') ds = ds[good] print(len(ds), 'gst on F814W') # good = ds.evaluate(ds['f475w_gst']) # print(good) # print(len(good)) # print(type(good)) # print(good.dtype) # print('Of those,', np.sum(ds.evaluate(ds['f475w_gst'])), 'are F475W_GST') # print('Of those,', np.sum(ds.evaluate(ds['f336w_gst'])), 'are F336W_GST') # print('Of those,', np.sum(ds.evaluate(ds['f275w_gst'])), 'are F275W_GST') # print('Of those,', np.sum(ds.evaluate(ds['f110w_gst'])), 'are F110W_GST') # print('Of those,', np.sum(ds.evaluate(ds['f160w_gst'])), 'are F160W_GST') mag = ds.evaluate(ds['f814w_vega']) print('Of', len(mag), 'mags,', np.sum(np.isfinite(mag)), 'are finite') print('range:', np.nanmin(mag), np.nanmax(mag)) plt.hist(mag, range=(20, 28), bins=50, label=basename) ds = ds[ds['f814w_vega'] < 24] print(len(ds), 'with F814W < 24') mag = ds.evaluate(ds['f814w_vega']) xx = ds.evaluate(ds['x']) yy = ds.evaluate(ds['y']) xlo = xx.min() xhi = xx.max() ylo = yy.min() yhi = yy.max() nx = int(np.round((xhi - xlo) / 1000.)) + 1 xbins = np.linspace(xlo, xhi, nx) ny = int(np.round((yhi - ylo) / 1000.)) + 1 ybins = np.linspace(ylo, yhi, ny) print('x bins', xbins) print('y bins', ybins) xbin = np.digitize(xx, xbins) ybin = np.digitize(yy, ybins) xybin = ybin * nx + xbin nbins = nx * ny print('N bins:', nbins) nperbin = int(np.ceil(100000. / nbins)) II = [] for ibin in range(nbins): I = np.flatnonzero(xybin == ibin) if len(I) == 0: continue Ibright = np.argsort(mag[I])[:nperbin] II.append(I[Ibright]) II = np.hstack(II) #I = np.argsort(mag) #I = I[:100000] #print('100k-th star: mag', mag[I[-1]]) ds = ds.take(II) T = fits_table() for col in ['ra', 'dec', 'x', 'y', 'index']: T.set(col, ds.evaluate(ds[col])) for filt in [814, 475, 336, 275, 110, 160]: for col in ['f%iw_vega']: colname = col % filt T.set(colname, ds.evaluate(ds[colname])) T.brick = np.zeros(len(T), np.uint8) + brick T.east = np.zeros(len(T), bool) T.east[:] = east T.writeto(outfn) plt.legend() plt.xlabel('F814W mag') plt.savefig('mags.png')
def create_model():
    if not os.path.exists(gtfsr_model_df_path):
        df = vaex.open(gtfsr_processed_path)
        df = df.sample(frac=1)

        # remove outliers from dataset, all delays over 20 minutes
        outlier = 60 * 20
        df = df[(df["arrival"] >= -outlier) & (df["arrival"] <= outlier) &
                (df["departure"] >= -outlier) & (df["departure"] <= outlier)]

        df["arr_dow"] = df.apply(apply_dow, ["start_date", "start_time", "arrival_time"])
        df["arr_hour"] = df["arrival_time"].apply(lambda t: get_dt(t, "%H:%M:%S").hour)
        df["arrival"] = df["arrival"].apply(lambda t: 0 if t == 0 else t / 60)

        cols = ["route_id", "stop_id", "arr_dow", "arr_hour", "direction", "stop_sequence"]

        # if the arrival historical means dataset is not created we create it
        if not os.path.exists(gtfsr_historical_means_path):
            print("*** creating gtfsr historical means dataset ***")
            # creates a dataset of historical average means using the stop_id,
            # arrival_day_of_week and trip_id identifiers
            vaex.from_pandas(
                (df.to_pandas_df().groupby(cols).agg({
                    "arrival": "mean",
                    "p_avg_vol": "mean"
                }).rename(columns={
                    "arrival": "arrival_mean",
                    "p_avg_vol": "p_mean_vol"
                }).reset_index())).export_hdf5(gtfsr_historical_means_path)

        print("*** joining hist means ***")
        # join the arrival means to our dataset
        df = vaex_mjoin(df, vaex.open(gtfsr_historical_means_path), cols, cols, how="left")
        df = df[[
            "start_date", "start_time", "stop_sequence", "arrival", "timestamp",
            "stop_id", "arrival_time", "shape_dist_traveled", "direction", "route_id",
            "lat", "lon", "direction_angle", "shape_dist_between", "arr_dow",
            "arr_hour", "arrival_mean", "p_mean_vol",
        ]]
        df.export_hdf5(gtfsr_model_df_path)

    print("*** Start training ***")
    # open model-ready data
    df = vaex.open(gtfsr_model_df_path)
    # transform our data
    df = transform_data(df)
    # train our data
    train_gtfsr(df)
    return
import pandas as pd
import glob
import os
import time   # needed for time.time() below
import vaex   # needed for vaex.from_pandas() below

from findatapy.util.loggermanager import LoggerManager

start = time.time()

data_vendor = 'dukascopy'  # 'ncfx' or 'dukascopy'

source_folder = '/data/csv_dump/' + data_vendor + '/'
destination_folder = '/data/csv_dump/' + data_vendor + '_arrow/'

logger = LoggerManager().getLogger(__name__)

parquet_list = glob.glob(source_folder + '/*.parquet')

for p in parquet_list:
    df = pd.read_parquet(p)
    df = vaex.from_pandas(df, name='pandas', copy_index=True, index_name='Date')

    logger.info("Converting " + p + "...")

    filename = os.path.basename(p)
    df.export(destination_folder + "/" + filename.replace('parquet', 'arrow'))

finish = time.time()
print('Status: calculated ' + str(round(finish - start, 3)) + "s")
def vframe(self):
    self.pframe()
    self.df_vaex = vaex.from_pandas(self.df_pandas)
    return self.df_vaex