Example 1
def create_stop_time_data():
    print("*** creating stop time data ***")

    query = """select trip_id from trip;"""
    trip_list = [id[0] for id in run_query(query)]

    delayed_funcs = [
        delayed(get_stop_time_df)(t_id, get_conn) for t_id in trip_list
    ]
    res = Parallel(n_jobs=-1)(delayed_funcs)

    stop_time_trip_df = vaex.from_pandas(pd.concat(res))
    print(f"concat stop_time data, time: {duration()}")

    # strong type casting
    stop_time_trip_df["stop_sequence"] = stop_time_trip_df[
        "stop_sequence"].astype("int64")
    stop_time_trip_df["shape_dist_traveled"] = stop_time_trip_df[
        "shape_dist_traveled"].astype("float64")
    stop_time_trip_df["direction"] = stop_time_trip_df["direction"].astype(
        "int64")
    stop_time_trip_df["lat"] = stop_time_trip_df["lat"].astype("float64")
    stop_time_trip_df["lon"] = stop_time_trip_df["lon"].astype("float64")
    stop_time_trip_df["direction_angle"] = stop_time_trip_df[
        "direction_angle"].astype("float64")
    stop_time_trip_df["shape_dist_between"] = stop_time_trip_df[
        "shape_dist_between"].astype("float64")

    stop_time_trip_df.export_hdf5(stop_time_data_path)  # export to hdf5
    return
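
The function above fans the per-trip reads out with joblib before concatenating and converting to vaex. A minimal, self-contained sketch of that same delayed/Parallel pattern follows; fetch_one, the trip ids, and the output path are hypothetical stand-ins for get_stop_time_df, the trip query result, and stop_time_data_path.

import pandas as pd
import vaex
from joblib import Parallel, delayed

def fetch_one(trip_id):
    # hypothetical stand-in for get_stop_time_df(trip_id, get_conn)
    return pd.DataFrame({"trip_id": [trip_id], "stop_sequence": [1]})

trip_ids = ["t1", "t2", "t3"]  # stand-in for the `select trip_id from trip;` result
delayed_funcs = [delayed(fetch_one)(t) for t in trip_ids]
frames = Parallel(n_jobs=-1)(delayed_funcs)   # run the reads in parallel workers
df = vaex.from_pandas(pd.concat(frames))      # concatenate and convert to vaex
df.export_hdf5("stop_times.hdf5")             # hypothetical output path
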
Example 2
def jets_from_raw(in_fname,
                  num_evts,
                  tag_mcpid=[6],
                  stride=1000,
                  num_procs=1,
                  offset=None):
    """Returns a pandas dataframe of jet constituent calorimeter info.
    
    Keyword arguments:
    in_fname: (str) path to file containing data
    num_evts: (int) number of events to read from the file
    tag_mcpid: (array like) list of ancestor particle ids forming jets
    stride: (int) number of events per processor read in at once
    num_procs: (int) number of processes to spawn
    offset: (int, optional) offset passed through to _jet_chunk
    """
    num_chunks = int(np.ceil(num_evts / stride))
    starts = np.arange(0, num_evts, stride, dtype=int)
    ranges = np.array([0, stride - 1]).reshape(1, 2) + starts[:, np.newaxis]
    ranges[-1, 1] = num_evts

    with Pool(processes=num_procs) as pool:
        jet_df = partial(_jet_chunk,
                         in_fname=in_fname,
                         tag_mcpid=tag_mcpid,
                         offset=offset)
        pd_data = pd.concat(pool.map(jet_df, list(ranges)))
        return vpd.from_pandas(pd_data, copy_index=True, index_name='event')
Example 3
    def to_hdf5(self, out_folder, min_frame=None, max_frame=None, step=1000):
        """Dump the project to hdf5.

        Args:
            out_folder (str): Path to the folder where to store the outcome.
            min_frame (int): Minimal frame in selection for saving.
            max_frame (int): Maximal frame in selection for saving.
            step (int): Number of frames written per output file.
        """
        import vaex as vx
        min_frame = self.min_frame if min_frame is None else min_frame
        max_frame = self.max_frame if max_frame is None else max_frame
        out_folder = Path(out_folder)
        out_folder.mkdir(parents=True, exist_ok=True)
        types_remap = {
            'frame': 'uint16',
            'scan': 'uint16',
            'tof': 'uint32',
            'i': 'uint32'
        }
        #TODO: types should be remapped earlier!
        for f, F in ranges(min_frame, max_frame + 1, step):
            pd_df = self[f:F]
            pd_df = pd_df.astype(types_remap)
            path = str(out_folder / f"{f}_{F}.hdf5")
            vx_df = vx.from_pandas(pd_df, copy_index=False)
            vx_df.export_hdf5(path=path)
            del vx_df, pd_df
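
The loop above relies on a ranges(min_frame, max_frame + 1, step) helper that is not shown in this snippet. A plausible minimal sketch of such a helper, assuming it yields consecutive (start, stop) frame windows (an assumption, not the project's actual implementation):

def ranges(start, stop, step):
    # assumed behaviour: yield (f, F) windows covering [start, stop) in blocks of `step` frames
    for f in range(start, stop, step):
        yield f, min(f + step, stop)

With a definition like this, self[f:F] selects one step-sized slice of frames per iteration, each exported to its own {f}_{F}.hdf5 file.
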
Example 4
def _from_csv_convert_and_read(filename_or_buffer,
                               path_output,
                               chunk_size,
                               fs_options,
                               fs=None,
                               copy_index=False,
                               progress=None,
                               **kwargs):
    # figure out the CSV file path
    csv_path = vaex.file.stringyfy(filename_or_buffer)
    path_output_bare, ext, _ = vaex.file.split_ext(path_output)

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    # we don't have indeterminate progress bars, so we cast it to truthy
    progress = bool(progress) if progress is not None else False
    if progress:
        print("Converting csv to chunk files")
    with vaex.file.open(filename_or_buffer,
                        fs_options=fs_options,
                        fs=fs,
                        for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer,
                                 chunksize=chunk_size,
                                 **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            chunk_name = f'{path_output_bare}_chunk_{i}{ext}'
            df.export(chunk_name)
            converted_paths.append(chunk_name)
            log.info('saved chunk #%d to %s' % (i, chunk_name))
            if progress:
                print("Saved chunk #%d to %s" % (i, chunk_name))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], path_output)
    else:
        if progress:
            print('Converting %d chunks into single file %s' %
                  (len(converted_paths), path_output))
        log.info('converting %d chunks into single file %s' %
                 (len(converted_paths), path_output))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export(path_output, progress=progress)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error(
                    'Could not close or delete intermediate file %s used to convert %s to single file %s: %s',
                    df_path, csv_path, path_output, e)
Example 5
 def _write_prices(self):
     for index, chunk in enumerate(self.price):
         price_sub_df = vaex.from_pandas(chunk)
         price_sub_df.export_hdf5(self.folder_path + '/' +
                                  DMGenerator.PRICES_FOLDER +
                                  f'/price_{index}.hdf5')
     self._consolidate_folder(DMGenerator.PRICES_FOLDER)
Example 6
    def convert(self, force=False):
        import pandas as pd
        skips = ["store_and_fwd_flag"]
        for i, (input,
                output) in enumerate(zip(self.filenames, self.filenames_vaex)):
            date_names = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
            if not os.path.exists(output) or force:
                print("Converting %s to %s (%d out of %d)" %
                      (input, output, i + 1, len(self.filenames)))
                df = pd.read_csv(input, parse_dates=date_names)

                for skip in skips:
                    if skip in df:
                        del df[skip]
                ds = vx.from_pandas(df)
                ds.add_virtual_column("pickup_hour",
                                      "hourofday(tpep_pickup_datetime)")
                ds.add_virtual_column("dropoff_hour",
                                      "hourofday(tpep_dropoff_datetime)")
                ds.add_virtual_column("pickup_dayofweek",
                                      "dayofweek(tpep_pickup_datetime)")
                ds.add_virtual_column("dropoff_dayofweek",
                                      "dayofweek(tpep_dropoff_datetime)")
                ds.select(
                    "(pickup_longitude != 0) & (pickup_latitude != 0) & (dropoff_longitude != 0) & (dropoff_latitude != 0)"
                )
                ds.export_hdf5(output, virtual=True, selection=True)
Example 7
def fcs_to_feather(file: str, outDir: Path):
    """Convert an fcs file to feather. Adapted from the polus-fcs-to-csv-converter plugin.

    Args:
        file (str): Path to the fcs file.
        outDir (Path): Path to save the output feather file.

    Returns:
        Converted feather file.

    """
    file_name = Path(file).stem
    feather_filename = file_name + ".feather"
    logger.info("fcs_to_feather: Begin parsing data out of .fcs file: " +
                file_name)

    # Use fcsparser to parse data into python dataframe
    meta, data = fcsparser.parse(file,
                                 meta_data_only=False,
                                 reformat_meta=True)

    # Export the fcs data to vaex df
    logger.info("fcs_to_feather: converting data to vaex dataframe...")
    df = vaex.from_pandas(data)
    logger.info("fcs_to_feather: writing file...")
    os.chdir(outDir)
    logger.info(
        "fcs_to_feather: Writing Vaex Dataframe to Feather File Format for:" +
        file_name)
    df.export_feather(feather_filename, outDir)
Example 8
 def _write_calendar(self):
     for index, chunk in enumerate(self.calendar):
         calendar_sub_df = vaex.from_pandas(chunk)
         calendar_sub_df.export_hdf5(self.folder_path + '/' +
                                     DMGenerator.CALENDAR_FOLDER +
                                     f'/calendar_{index}.hdf5')
     self._consolidate_folder(DMGenerator.CALENDAR_FOLDER)
Example 9
def check_results(fns, tag):
    def get_field(ds, col):
        return ds.evaluate(ds[col.upper()])

    rr = []
    dd = []
    for fn in fns:
        df = pd.read_hdf(fn, key='data')
        ds = vaex.from_pandas(df)
        print(len(ds), 'rows')
        ra = get_field(ds, 'ra')
        dec = get_field(ds, 'dec')
        rr.append(ra)
        dd.append(dec)
    rr = np.hstack(rr)
    dd = np.hstack(dd)
    print('Total of', len(rr), 'stars')

    T = fits_table()
    T.ra = rr
    T.dec = dd
    T.writeto('all-rd-%s.fits' % tag)

    plothist(rr, dd, 500)
    plt.xlabel('RA (deg)')
    plt.ylabel('Dec (deg)')
    plt.savefig('all-radec-%s.png' % tag)

    I, J, d = match_radec(rr, dd, rr, dd, 0.2 / 3600, notself=True)
    plt.clf()
    plt.hist(d * 3600. * 1000., bins=50)
    plt.xlabel('Distance between stars (milli-arcsec)')
    plt.savefig('all-dists-%s.png' % tag)
Example 10
def _from_csv_convert_and_read(filename_or_buffer,
                               maybe_convert_path,
                               chunk_size,
                               fs_options,
                               fs=None,
                               copy_index=False,
                               **kwargs):
    # figure out the CSV file path
    if isinstance(maybe_convert_path, str):
        csv_path = re.sub(r'\.hdf5$',
                          '',
                          str(maybe_convert_path),
                          flags=re.IGNORECASE)
    elif isinstance(filename_or_buffer, str):
        csv_path = filename_or_buffer
    else:
        raise ValueError(
            'Cannot derive filename to use for converted HDF5 file, '
            'please specify it using convert="my.csv.hdf5"')

    combined_hdf5 = _convert_name(csv_path)

    # convert CSV chunks to separate HDF5 files
    import pandas as pd
    converted_paths = []
    with vaex.file.open(filename_or_buffer,
                        fs_options=fs_options,
                        fs=fs,
                        for_arrow=True) as f:
        csv_reader = pd.read_csv(filename_or_buffer,
                                 chunksize=chunk_size,
                                 **kwargs)
        for i, df_pandas in enumerate(csv_reader):
            df = vaex.from_pandas(df_pandas, copy_index=copy_index)
            filename_hdf5 = _convert_name(csv_path, suffix='_chunk%d' % i)
            df.export_hdf5(filename_hdf5)
            converted_paths.append(filename_hdf5)
            log.info('saved chunk #%d to %s' % (i, filename_hdf5))

    # combine chunks into one HDF5 file
    if len(converted_paths) == 1:
        # no need to merge several HDF5 files
        os.rename(converted_paths[0], combined_hdf5)
    else:
        log.info('converting %d chunks into single HDF5 file %s' %
                 (len(converted_paths), combined_hdf5))
        dfs = [vaex.open(p) for p in converted_paths]
        df_combined = vaex.concat(dfs)
        df_combined.export_hdf5(combined_hdf5)

        log.info('deleting %d chunk files' % len(converted_paths))
        for df, df_path in zip(dfs, converted_paths):
            try:
                df.close()
                os.remove(df_path)
            except Exception as e:
                log.error(
                    'Could not close or delete intermediate hdf5 file %s used to convert %s to hdf5: %s'
                    % (df_path, csv_path, e))
Example 11
def count_volume_premarket_price_m1():
    time_925 = np.timedelta64(9, 'h') + np.timedelta64(25, 'm')

    for file in os.listdir('Data'):
        if file.endswith('.csv'):
            pandas_df = pd.read_csv('Data/' + file)
            print(file)

            pandas_df['date'] = pd.to_datetime(pandas_df['date'])
            pandas_df['date_int'] = pd.to_datetime(pandas_df['date']).astype(
                np.int64)

            columns_list = pandas_df.columns.tolist()
            columns_list = columns_list[-1:] + columns_list[:-1]
            pandas_df = pandas_df[columns_list]

            vaex_df = vaex.from_pandas(pandas_df)

            start_date = np.datetime64(vaex_df.data.date[0], 'D')
            end_date = np.datetime64(vaex_df.data.date[-1], 'D')

            date = []
            volume = []
            open_924 = []
            while start_date < end_date:
                date.append(start_date)
                midnight = start_date.astype('datetime64[ms]').astype(
                    'int64') * 1_000_000
                premarket_stop = (start_date + time_925).astype(
                    'datetime64[ms]').astype('int64') * 1_000_000

                vaex_df.select((vaex_df.date_int > midnight)
                               & (vaex_df.date_int < premarket_stop))
                volume.append(vaex_df.sum("volume", selection=True))

                if vaex_df.evaluate("open", selection=True).size > 0:
                    open_924.append(
                        vaex_df.evaluate("open", selection=True)[-1])
                else:
                    open_924.append(0)

                start_date = np.busday_offset(start_date +
                                              np.timedelta64(1, 'D'),
                                              0,
                                              roll='forward')

            pd.DataFrame({
                'Date': date,
                'Open_924': open_924,
                'Volume': volume
            }).to_csv('Data/New_Files/' + file)
Example 12
def process_data():
    if not os.path.exists(stop_time_data_path):
        create_stop_time_data()

    print("*** processing data ***")
    df = vaex.open(gtfs_final_hdf5_path)

    # compute direction and day of week from realtime data
    df["direction"] = df["trip_id"].apply(lambda t: dir_f_trip(t))
    df["dow"] = df["start_date"].apply(lambda t: get_dt(t, "%Y%m%d").weekday())

    # store these columns in memory
    df.materialize("direction", inplace=True)
    df.materialize("dow", inplace=True)

    # 500 is used as an error value to be removed; None isn't supported
    df = df[df["direction"] != 500]

    # drop trip_id to remove duplicates
    df.drop("trip_id", inplace=True)

    # important: we use these columns, and later service_days, in order to
    # stop being dependent on trip_id.
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    df = vaex_mjoin(df.shallow_copy(),
                    vaex.open(stop_time_data_path),
                    cols,
                    cols,
                    how="inner",
                    allow_duplication=True)

    # filter to keep only trips that happened on that day of week
    df["keep_trip"] = df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "").
        split(",")[dow], ["service_days", "dow"])
    df = df[df.keep_trip == "True"]

    # drop redundant columns
    df.drop(["service_days", "dow", "keep_trip"], inplace=True)

    df = vaex.from_pandas(df.to_pandas_df().drop_duplicates(
        subset=[i for i in df.get_column_names() if i != "trip_id"]))
    # df = vx_dedupe(df, columns=[i for i in df.get_column_names() if i != "trip_id"])

    print(f"merged stop_time & gtfsr data, time: {duration()}")

    df = predict_traffic_from_scats(df)

    df.export_hdf5(gtfsr_processed_path)
    print(f"finished processing data, {duration()}")
Example 13
 def _write_sales_info(self):
     info_data = pd.DataFrame()
     for index, chunk in enumerate(self.sales):
         info_cols = [
             'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'
         ]
         info_data = info_data.append(chunk.loc[:, ['id'] + info_cols])
         sales_cols = [col for col in chunk.columns if 'd_' in col]
         sales = chunk.loc[:, ['id'] + sales_cols].melt(id_vars='id')
         sales = vaex.from_pandas(sales)
         sales.export_hdf5(self.folder_path + '/' +
                           DMGenerator.SALES_FOLDER +
                           f'/sales_{index}.hdf5')
     info_data.drop_duplicates().to_parquet(self.folder_path + '/' +
                                            DMGenerator.INFO_FOLDER +
                                            '/info.parquet.gzip',
                                            index=False)
     self._consolidate_folder(DMGenerator.SALES_FOLDER)
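
Examples 5, 8 and 13 all finish by calling self._consolidate_folder(...), which is not included in these snippets. A hedged sketch of what such a method might do, assuming it merges the per-chunk HDF5 parts into a single file (the folder layout and output name below are guesses, not the DMGenerator source):

import os
import vaex

def _consolidate_folder(self, folder_name):
    # assumed behaviour: lazily open every chunk file in the folder and export one combined HDF5
    folder = os.path.join(self.folder_path, folder_name)
    combined = vaex.open(os.path.join(folder, '*.hdf5'))
    combined.export_hdf5(os.path.join(self.folder_path, f'{folder_name}.hdf5'))
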
Example 14
def extract(ref, vcf, bam, outdir, prefix, skip_bam_readcount, labels, pkl,
            loglevel):
    logging.basicConfig(
        level=loglevel,
        format=
        '%(asctime)s (%(relativeCreated)d ms) -> %(levelname)s: %(message)s',
        datefmt='%I:%M:%S %p')

    if not prefix:
        prefix = os.path.basename(bam.split('.')[0])

    # Generate matrix of true variants
    if labels or not os.path.exists(os.path.join(outdir, 'true_vars.pkl')):
        logger.info('Converting ground truth variants to 1-based coordinates')
        true_vars = pd.read_csv(labels,
                                sep='\t',
                                index_col=None,
                                header=None,
                                dtype={0: str})
        true_vars.columns = ['chr', 'start', 'end', 'ref', 'alt']
        true_vars = true_vars.progress_apply(convert_one_based, axis=1)
        true_vars.to_pickle(os.path.join(outdir, 'true_vars.pkl'))

    logger.info('Preparing data')
    prep_data = dp.PrepareData(prefix, bam, bed_file_path, ref, outdir)

    df = prep_data.training_data

    true_vars = pd.read_pickle(
        os.path.join(DATA, 'train_data', 'true_vars.pkl'))
    df['real'] = 0

    sample = df.index[0].split('~')[0]
    true_vars_set = set(df.index.str.replace(sample + '~', ''))

    for index, row in true_vars.iterrows():
        progress(index, true_vars.shape[0])
        var = "{0}:{1}-{2}{3}>{4}".format(row.chr, row.start, row.end, row.ref,
                                          row.alt)
        if var in true_vars_set:
            df.loc[sample + '~' + var, 'real'] = 1

    vaex_df = vaex.from_pandas(df)
    vaex_df.export(os.path.join(outdir, 'train.hdf5'))
Example 15
    def convert(self, force=False):
        import pandas as pd
        skips = ["store_and_fwd_flag"]
        for i, (input, output) in enumerate(zip(self.filenames, self.filenames_vaex)):
            date_names = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
            if not os.path.exists(output) or force:
                print("Converting %s to %s (%d out of %d)" % (input, output, i + 1, len(self.filenames)))
                df = pd.read_csv(input, parse_dates=date_names)

                for skip in skips:
                    if skip in df:
                        del df[skip]
                ds = vx.from_pandas(df)
                ds.add_virtual_column("pickup_hour", "hourofday(tpep_pickup_datetime)")
                ds.add_virtual_column("dropoff_hour", "hourofday(tpep_dropoff_datetime)")
                ds.add_virtual_column("pickup_dayofweek", "dayofweek(tpep_pickup_datetime)")
                ds.add_virtual_column("dropoff_dayofweek", "dayofweek(tpep_dropoff_datetime)")
                ds.select("(pickup_longitude != 0) & (pickup_latitude != 0) & (dropoff_longitude != 0) & (dropoff_latitude != 0)")
                ds.export_hdf5(output, virtual=True, selection=True)
Example 16
def h5pandas_to_vaex(file_in: Union[None, str, PurePath], del_found_tmp_files=False):
    """
    Pandas hdf5 to vaex.hdf5 conversion: saves tmp files, then searches for and combines them.
    :param file_in: pandas hdf5 file
    :param del_found_tmp_files: passed through to h5pandas_to_vaex_combine()
    :return: None
    Uses these module functions:
        h5pandas_to_vaex_file_names()
        h5pandas_to_vaex_combine()
    """
    tmp_save_pattern, tmp_search_pattern, path_out_str = h5pandas_to_vaex_file_names(file_in)
    chunksize = 500000  # yields chunk files of a few hundred MB each

    ichunk = 0
    for chunk in pd.read_hdf(file_in, 'csv', chunksize=chunksize):  # , where='a < someval'
        df = vaex.from_pandas(chunk)
        df.export_hdf5(tmp_save_pattern.format(ichunk))
        ichunk += 1
        print(ichunk, end=' ')

    h5pandas_to_vaex_combine(tmp_search_pattern, path_out_str,
                             check_files_number=ichunk, del_found_tmp_files=del_found_tmp_files)
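
The helpers h5pandas_to_vaex_file_names() and h5pandas_to_vaex_combine() are referenced but not shown above. Stripped of them, the conversion reduces to the chunk-then-combine pattern sketched below; the file names and the 'csv' store key are hypothetical, and the pandas store must be in table format for chunked reading.

import pandas as pd
import vaex

chunksize = 500_000
for i, chunk in enumerate(pd.read_hdf('data_pandas.h5', 'csv', chunksize=chunksize)):
    vaex.from_pandas(chunk).export_hdf5(f'data_tmp_{i:05d}.hdf5')  # one vaex file per chunk

# combine the temporary chunk files into a single vaex hdf5 file
vaex.open('data_tmp_*.hdf5').export_hdf5('data_vaex.hdf5')
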
Example 17
def test_from_pandas():
    dd_dict = {
        'boolean': [True, True, False, None, True],
        'text': ['This', 'is', 'some', 'text', 'so...'],
        'text_missing': pd.Series(['Some', 'parts', None, 'missing', None], dtype='string'),
        'float': [1, 30, -2, 1.5, 0.000],
        'float_missing': [1, None, -2, 1.5, 0.000],
        'float_missing_masked': pd.Series([1, None, -2, 1.5, 0.000], dtype=pd.Float64Dtype()),
        'int_missing': pd.Series([1, None, 5, 1, 10], dtype='Int64'),
        'datetime_1': [pd.NaT, datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1)],
        'datetime_2': [pd.NaT, None, pd.NaT, pd.NaT, pd.NaT],
        'datetime_3': [pd.Timedelta('1M'), pd.Timedelta('1D'), pd.Timedelta('100M'), pd.Timedelta('2D'), pd.Timedelta('1H')],
        'datetime_4': [pd.Timestamp('2001-1-1 2:2:11'), pd.Timestamp('2001-12'), pd.Timestamp('2001-10-1'), pd.Timestamp('2001-03-1 2:2:11'), pd.Timestamp('2001-1-1 2:2:11')],
        'datetime_5': [datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1)],
        'datetime_6': [datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1)],
    }

    # Get pandas dataframe
    pandas_df = pd.DataFrame(dd_dict)
    pandas_df['datetime_7'] = pd.to_timedelta(pandas_df['datetime_2'] - pandas_df['datetime_1'])
    vaex_df = vaex.from_pandas(pandas_df)
    repr_value = repr(vaex_df)
    str_value = str(vaex_df)

    assert 'NaT' in repr_value
    assert 'NaT' in str_value
    assert '--' in repr_value
    assert '--' in str_value

    # string columns are now arrow arrays
    # assert vaex_df.text_missing.is_masked == True
    assert vaex_df.int_missing.is_masked == True
    assert vaex_df.float_missing.is_masked == False
    assert vaex_df.float_missing_masked.is_masked == True
    assert vaex_df.int_missing.tolist() == [1, None, 5, 1, 10]
    assert vaex_df.text_missing.tolist() == ['Some', 'parts', None, 'missing', None]
    assert vaex_df.float_missing.values[[0, 2, 3, 4]].tolist() == [1.0, -2.0, 1.5, 0.0]
    assert np.isnan(vaex_df.float_missing.values[1])
    assert vaex_df.float_missing_masked.tolist() == [1.0, None, -2.0, 1.5, 0.0]
Example 18
def main():
    print(f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    stream = pd.read_csv(
        CSV_FILE_PATH,
        chunksize=CHUNK_SIZE,
        low_memory=False,
        sep=',',
        encoding='latin-1',
    )
    TMP_PATH.mkdir(parents=True, exist_ok=True)
    for i, chunk in enumerate(stream):
        print(f'Processing {i + 1}-th chunk containing "{len(chunk)}" rows of data...')
        df_chunk = vaex.from_pandas(chunk, copy_index=False)
        export_path = TMP_PATH / f'part_{i}.hdf5'
        df_chunk.export_hdf5(str(export_path))

    df = vaex.open(str(TMP_PATH / 'part*'))

    df.export_hdf5(str(COLUMNAR_HDF_FILE_PATH))
    print(f'HDF5 Stored Size: {COLUMNAR_HDF_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    rmtree(TMP_PATH)
Example 19
def apply_alignments():
    from astrom_common import Affine
    T = fits_table('affines.fits')
    affs = Affine.fromTable(T)
    print('Read affines:', affs)

    ibright = dict([(fn.strip(), i) for i, fn in enumerate(affs.filenames)])

    corners = {}
    for line in open('corners.txt').readlines():
        line = line.strip()
        words = line.split()
        ras = np.array([float(words[i]) for i in [1, 3, 5, 7]])
        decs = np.array([float(words[i]) for i in [2, 4, 6, 8]])
        corners[words[0]] = (ras, decs)
    from astrometry.util.miscutils import point_in_poly

    fns = (glob('data/M31-*ST/proc_default/M31-*ST.phot.hdf5') +
           glob('data/M31-*ST/M31-*ST.phot.hdf5'))
    fns.sort()
    print('Files:', fns)

    veto_polys = []

    for photfile in fns:
        basename = os.path.basename(photfile)
        basename = basename.replace('.phot.hdf5', '')
        print('Base name:', basename)

        brightfn = basename + '-bright.fits'
        ii = ibright[brightfn]
        aff = affs[ii]

        print('Reading', photfile)
        df = pd.read_hdf(photfile, key='data')
        ds = vaex.from_pandas(df)
        print(len(ds), 'rows')
        ra = ds.evaluate(ds['ra'])
        dec = ds.evaluate(ds['dec'])
        ra, dec = aff.apply(ra, dec)

        corner = corners[basename]
        Tleft = fits_table()
        Tleft.ra = ra
        Tleft.dec = dec
        Tleft.index = np.arange(len(Tleft))
        ras, decs = corner
        poly = np.vstack((ras, decs)).T
        inside = point_in_poly(Tleft.ra, Tleft.dec, poly)
        print(np.sum(inside), 'of', len(Tleft),
              'inside corners of this half-brick')

        inside_veto = np.zeros(len(Tleft), bool)
        for vp in veto_polys:
            inveto = point_in_poly(Tleft.ra, Tleft.dec, vp)
            inside_veto[inveto] = True
        print(np.sum(inside_veto),
              'stars are inside the corners of previous half-bricks')
        print('inside:', type(inside))
        inside[inside_veto] = False
        print(np.sum(inside), 'stars are uniquely in this half-brick')

        veto_polys.append(poly)

        outfn = 'out-%s.hdf5' % basename
        df[inside].to_hdf(outfn,
                           key='data',
                           mode='w',
                           format='table',
                           complevel=9,
                           complib='zlib')
        print('Wrote', outfn)
Example 20
import os
from datetime import datetime

import pandas as pd
import vaex
from vaex.datasets import nyctaxi_yellow_2015_jan

# download and read Pandas CSV
nyctaxi_yellow_2015_jan.download()
csv_size = os.path.getsize(nyctaxi_yellow_2015_jan.filenames[0])
df = pd.read_csv(nyctaxi_yellow_2015_jan.filenames[0])

# convert to Vaex
start = datetime.now()
vdf = vaex.from_pandas(df)
duration = datetime.now() - start

print(
    'it took {} to convert {:,} rows ({:.1f} Gb), which is {:,} rows per second'
    .format(duration, len(df), csv_size / 1024. / 1024 / 1024,
            int(len(df) / duration.total_seconds())))

# Last result when running on:
# 2.8 GHz Quad-Core Intel Core i7; 16 GB 1600 MHz DDR3:
# it took 0:00:07.244279 to convert 12,748,986 rows (1.8 Gb), which is 1,759,869 rows per second
Example 21
def csv_to_h5(
        read_csv_args,
        to_hdf_args,
        dates_formats: Mapping[str, str],
        correct_fun: Tuple[None, bool, Callable[[pd.DataFrame], None]] = None,
        processing: Optional[Mapping[Tuple[Tuple[str], Tuple[str]], Callable[[Any], Any]]] = None,
        out_cols: Optional[Sequence] = None,
        continue_row=False,
        vaex_format: Optional[bool]=None
        ):
    """
    Read csv and write to hdf5
    :param read_csv_args: dict, must have keys:
        filepath_or_buffer, chunksize, dtype
    :param to_hdf_args:
        path_or_buf: default = read_csv_args['filepath_or_buffer'].with_suffix('.vaex.h5' if vaex_format else '.h5')
        mode: default = 'w' if not continue_row else 'a',
        key: hdf5 group name in the hdf5 file where the data is stored
        ...
    :param dates_formats:
        column: csv column name which needs to be converted from str to DateTime,
        date_format: date format for that column
    :param processing: dict with
        keys: ((_input cols_), (_output cols_)) and
        values: function(_input cols_) that will return _output cols_
    :param out_cols: default is all columns, excluding those that are inputs but not outputs of param processing
    :param continue_row: csv row number (excluding header) to start from, shifting the index.
    If the output file exists and continue_row = True, converting continues from the row equal to the last index in it,
    which is useful after a program interruption or csv appending. If it does not exist, start from row 0, giving it index 0.
    If continue_row = integer, start from that row, giving it starting index = continue_row
    :param correct_fun: function applied to each chunk returned by read_csv(), which is a frame of str column data
    :param vaex_format: bool, how to write chunks:
    - True: to many vaex hdf5 files, which will be combined into a single vaex hdf5 file at the end
    - False: appending to a single pandas hdf5 table
    - None: evaluates to True if to_hdf_args['path_or_buf'] has ".vaex" as its next-to-last suffix, else to False

    :return:
    """
    if to_hdf_args.get('path_or_buf'):
        if vaex_format is None:
            vaex_format = Path(str(to_hdf_args['path_or_buf']).strip()).suffixes[:-1] == ['.vaex']
    else:  # give default name to output file
        to_hdf_args['path_or_buf'] = Path(read_csv_args['filepath_or_buffer']).with_suffix(
            f'{".vaex" if vaex_format else ""}.h5'
            )

    # Deal with vaex/pandas storing difference
    if vaex_format:
        open_for_pandas_to_hdf = None
        tmp_save_pattern, tmp_search_pattern = h5pandas_to_vaex_file_names(
            path_out_str=str(to_hdf_args['path_or_buf'])
            )
        ichunk = None
    else:
        def open_for_pandas_to_hdf(path_or_buf):
            return pd.HDFStore(
                to_hdf_args['path_or_buf'],
                to_hdf_args.get('mode', 'a' if continue_row else 'w')
                )

    # Find csv row to start
    msg_start = f'Converting in chunks of {read_csv_args["chunksize"]} rows.'
    if continue_row is True:  # isinstance(continue_same_csv, bool)
        try:
            if vaex_format:

                hdf5_list = glob.glob(tmp_search_pattern)
                if len(hdf5_list):      # continue interrupted csv_to_h5()
                    hdf5_list.sort()
                    file_last = hdf5_list[-1]
                    lf.info('Found {:d} temporary files, continue from index found in last file', len(hdf5_list))
                    "table/columns/index"
                else:                   # add next csv data
                    file_last = to_hdf_args['path_or_buf']
                with h5py.File(file_last, mode='r') as to_hdf_buf:
                    continue_row = to_hdf_buf['table/columns/index/data'][-1] + 1
            else:
                with pd.HDFStore(to_hdf_args['path_or_buf'], mode='r') as to_hdf_buf:
                    continue_row = to_hdf_buf.select(to_hdf_args['key'], columns=[], start=-1).index[-1] + 1
        except (OSError) as e:
            msg_start += ' No output file.'
            continue_row = None
        except KeyError as e:
            msg_start += ' No data in output file.'
            continue_row = None
        else:
            msg_start += ' Starting from next to last loaded csv row:'
    elif continue_row:
        msg_start += ' Starting from specified csv data row:'
    if continue_row:
        lf.info('{:s} {:d}...', msg_start, continue_row)
        read_csv_args['skiprows'] = read_csv_args.get('skiprows', 0) + continue_row
    else:
        lf.info('{:s} beginning from csv row 0, giving it index 0...', msg_start)

    dtypes = read_csv_args['dtype']

    # Set default output cols if not set
    if out_cols is None and processing:
        # output all columns we have, except processing inputs that are not also mentioned in processing outputs
        cols_in_used = set()
        cols_out_used = set()
        for (c_in, c_out) in processing.keys():
            cols_in_used.update(c_in)
            cols_out_used.update(c_out)
        cols2del = cols_in_used.difference(cols_out_used)
        out_cols = [col for col in dtypes.keys() if col not in cols2del]
    cols_out_used = set(out_cols if out_cols is not None else dtypes.keys())

    # Group cols for conversion by types specified
    str_cols = []
    int_and_nans_cols = []
    other_cols = []
    for col, typ in dtypes.items():
        if out_cols and col not in cols_out_used:
            continue
        kind = typ[0]
        (str_cols if kind == 'S' else
         int_and_nans_cols if kind == 'I' else
         other_cols).append(col)

    str_not_dates = list(set(str_cols).difference(dates_formats.keys()))
    min_itemsize = {col: int(dtypes[col][1:]) for col in str_not_dates}

    # Read csv, process, write hdf5
    with open(read_csv_args['filepath_or_buffer'], 'r') as read_csv_buf, \
            FakeContextIfOpen(open_for_pandas_to_hdf, to_hdf_args['path_or_buf']) as to_hdf_buf:
        read_csv_args.update({
            'filepath_or_buffer': read_csv_buf,
            'memory_map': True,
            'dtype': 'string'  # switch off read_csv dtype conversion (because if it fails it is hard to correct:
            })                 # we would have to re-read the same csv position with pandas)
        to_hdf_args.update({
            'path_or_buf': to_hdf_buf,
            'format': 'table',
            'data_columns': True,
            'append': True,
            'min_itemsize': min_itemsize
            })
        # rows_processed = 0
        # rows_in_chunk = read_csv_args['chunksize']

        for ichunk, chunk in enumerate(pd.read_csv(**read_csv_args)):
            if continue_row:
                if chunk.size == 0:
                    ichunk = np.ceil(continue_row / read_csv_args['chunksize']).astype(int) - 1
                    break  # continue_row is > data rows
                else:
                    chunk.index += continue_row

            lf.extra['id'] = f'chunk start row {chunk.index[0]:d}'
            if ichunk % 10 == 0:
                print(f'{ichunk}', end=' ')
            else:
                print('.', end='')

            if correct_fun:
                correct_fun(chunk)

            # Convert to user specified types

            # 1. dates str to DateTime
            for col, f in dates_formats.items():
                # the conversion of 'bytes' to 'strings' is needed for pd.to_datetime()
                try:
                    chunk[col] = pd.to_datetime(chunk[col], format=f)
                except ValueError as e:
                    lf.error(
                        'Conversion to datetime("{:s}" formatted as "{:s}") {:s} -> '
                        'Replacing malformed strings by NaT...', col, f, standard_error_info(e))
                    chunk[col] = pd.to_datetime(chunk[col], format=f, exact=False, errors='coerce')

            # 2. str to numeric for other_cols and int_and_nans_cols (the latter only has limited support via pandas
            # extension dtypes, so we use numpy types instead, replacing NaNs by -1 to be able to write to hdf5)
            chunk[other_cols] = chunk[other_cols].fillna('NaN')  # replace <NA> with a string numpy recognizes as NaN
            chunk[int_and_nans_cols] = chunk[int_and_nans_cols].fillna('-1')
            for col in (int_and_nans_cols + other_cols):  # for col, typ in zip(nans.columns, chunk[nans.columns].dtypes):
                typ = dtypes[col]
                if col in int_and_nans_cols:
                    is_integer = True
                    typ = f'i{typ[1:]}'  # typ.numpy_dtype
                else:
                    is_integer = np.dtype(typ).kind == 'i'
                try:
                    chunk[col] = chunk[col].astype(typ)
                    continue
                except (ValueError, OverflowError) as e:
                    # Cleaning. In case of OverflowError we do it here to prevent a ValueError while handling the OverflowError below.
                    pattern_match = r'^\d+$' if is_integer else r'^-?[\d.]+$'
                    ibad = ~chunk[col].str.match(pattern_match)
                    rep_val = '-1' if is_integer else 'NaN'
                    # ibad = np.flatnonzero(chunk[col] == re.search(r'(?:")(.*)(?:")', e.args[0]).group(1), 'ascii')
                    lf.error('Conversion {:s}("{:s}") {:s} -> replacing {:d} values not matching pattern "{:s}" with "{'
                             ':s}" and again...', typ, col, standard_error_info(e), ibad.sum(), pattern_match, rep_val)
                    chunk.loc[ibad, col] = rep_val
                    # astype(str).replace(regex=True, to_replace=r'^.*[^\d.].*$', value=
                try:
                    chunk[col] = chunk[col].astype(typ)
                except (OverflowError,
                        ValueError) as e:  # may be a bad value made of good symbols, e.g. r'^\d*\.\d*\.+\d*$'; instead of checking for it we also do coerce_to_exact_dtype() on ValueError here
                    lf.error('Conversion {:s}("{:s}") {:s} -> Replacing malformed strings and big numbers'
                    ' by NaN ...', typ, col, standard_error_info(e))
                    chunk[col] = coerce_to_exact_dtype(chunk[col], dtype=typ)

            # Limit long string lengths and convert StringDtype to str so they can be saved by to_hdf()
            for col, max_len in min_itemsize.items():  # for col, typ in zip(nans.columns, chunk[nans.columns].dtypes):
                chunk[col] = chunk[col].str.slice(stop=max_len)  # apply(lambda x: x[:max_len]) does not handle <NA>
            chunk[str_not_dates] = chunk[str_not_dates].astype(str)

            # Apply specified data processing
            if processing:
                for (cols_in, c_out), fun in processing.items():
                    cnv_result = fun(chunk[list(cols_in)])
                    chunk[list(c_out)] = cnv_result

            # # Bad rows check
            # is_different = chunk['wlaWID'].fillna('') != chunk['wlaAPIHartStandard'].fillna('')
            # if is_different.any():
            #     i_bad = np.flatnonzero(is_different.values)
            #     lf.debug('have wlaWID != wlaAPIHartStandard in rows {:s}', chunk.index[i_bad])
            #     # chunk= chunk.drop(chunk.index[i_bad])   # - deleting
            #     pass

            # Check unique index
            # if chunk['wlaWID'].duplicated()

            try:
                if vaex_format:
                    df = vaex.from_pandas(chunk if out_cols is None else chunk[out_cols])
                    df.export_hdf5(tmp_save_pattern.format(ichunk))
                else:  # better to move this step up and process it with vaex instead of pandas
                    (chunk if out_cols is None else chunk[out_cols]).to_hdf(**to_hdf_args)
                # rows_processed += rows_in_chunk  # we always read chunks of the same length except the last, whose length is not used

            except Exception as e:
                lf.exception('write error')
                pass
        try:
            del lf.extra['id']
        except KeyError:
            lf.info('there were no more data rows to read')

    # If vaex store was specified then we have chunk files that we combine now by export_hdf5():
    if vaex_format:
        h5pandas_to_vaex_combine(tmp_search_pattern, str(to_hdf_args['path_or_buf']), check_files_number=ichunk+1)
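
The processing parameter maps ((input columns), (output columns)) keys to functions of the input columns, as the docstring describes. A hedged usage sketch follows; the file name, column names, dtype strings and the derived value_doubled column are all made up for illustration, with dtypes following the 'S<len>' / nullable-'Int<size>' convention the function inspects.

read_csv_args = {
    'filepath_or_buffer': 'measurements.csv',   # hypothetical input file
    'chunksize': 100_000,
    'dtype': {'station': 'S10', 'count': 'Int32', 'value': 'f8', 'time': 'S19'},
}
to_hdf_args = {'key': 'data'}                   # output path is derived from the csv name
dates_formats = {'time': '%Y-%m-%d %H:%M:%S'}   # parse the 'time' strings with this format
processing = {
    # derive one output column 'value_doubled' from the input column 'value'
    (('value',), ('value_doubled',)): lambda df: df[['value']].to_numpy() * 2.0,
}

csv_to_h5(read_csv_args, to_hdf_args, dates_formats,
          processing=processing,
          out_cols=['station', 'count', 'time', 'value', 'value_doubled'],
          vaex_format=True)
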
Example 22
import vaex
import numpy as np
import pandas as pd

for i, chunk in enumerate(vaex.read_csv(r'D:\python projects\quandomo\data_center\data\market\ASHAREEODPRICES.csv', chunksize=100_000)):
    df_chunk = vaex.from_pandas(chunk, copy_index=False)
    export_path = f'D:/python projects/quandomo/data_center/data/market/part_{i}.hdf5'
    df_chunk.export_hdf5(export_path)

df = vaex.open('D:/python projects/quandomo/data_center/data/market/part*')
df.export_hdf5('D:/python projects/quandomo/data_center/data/market/Final.hdf5')
Example 23
                y_binned,
                yerr=yerr,
                xerr=xerr,
                fmt=',',
                color='k',
                lw=1.5)
    fig.savefig('{}_{}.jpg'.format(blue_filter, red_filter), dpi=144)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filebase', action='store')
    args = parser.parse_args()

    photfile = args.filebase

    # try:
    #     # I have never gotten vaex to read an hdf5 file successfully
    #     ds = vaex.open(photfile)
    # except:
    import pandas as pd
    df = pd.read_hdf(photfile, key='data')
    ds = vaex.from_pandas(df)

    filter_sets = [('f336w', 'f275w', 'f336w'), ('f475w', 'f336w', 'f475w'),
                   ('f814w', 'f475w', 'f814w'), ('f160w', 'f475w', 'f160w'),
                   ('f160w', 'f814w', 'f160w'), ('f160w', 'f110w', 'f160w')]
    for f in filter_sets:
        make_cmd(ds, *f)
Example 24
def read_csv(paths: Sequence[Union[str, Path]], cfg_in: Mapping[str, Any]) -> Union[pd.DataFrame, vaex.dataframe.DataFrame]:
    """
    Reads csv in dask DataFrame
    Calls cfg_in['fun_proc_loaded'] (if specified)
    Calls time_corr: corrects/checks Time (with arguments defined in cfg_in fields)
    Sets Time as index
    :param paths: list of file names
    :param cfg_in: contains fields for arguments of dask.read_csv correspondence:

        names=cfg_in['cols'][cfg_in['cols_load']]
        usecols=cfg_in['cols_load']
        error_bad_lines=cfg_in['b_raise_on_err']
        comment=cfg_in['comments']

        Other arguments corresponds to fields with same name:
        dtype=cfg_in['dtype']
        delimiter=cfg_in['delimiter']
        converters=cfg_in['converters']
        skiprows=cfg_in['skiprows']
        blocksize=cfg_in['blocksize']

        Also cfg_in has fields:
            dtype_out: numpy.dtype, whose "names" field is used to determine output columns
            fun_proc_loaded: None or Callable[
            [Union[pd.DataFrame, np.array], Mapping[str, Any], Optional[Mapping[str, Any]]],
             Union[pd.DataFrame, pd.DatetimeIndex]]
            If it returns pd.DataFrame then it also must have the attribute:
                meta_out: Callable[[np.dtype, Iterable[str], Mapping[str, dtype]], Dict[str, np.dtype]]

            See also time_corr() for used fields



    :return: tuple (a, b_ok) where
        a:      dask dataframe with time index and only columns listed in cfg_in['dtype_out'].names
        b_ok:   time correction result boolean array
    """


    try:
        try:
            # raise ValueError('Temporary')
            # for ichunk, chunk in enumerate(pd.read_csv(paths, chunksize=1000, delimiter='\t')):
            df = pd.read_csv(
                paths,
                dtype=cfg_in['dtype_raw'],
                names=cfg_in['cols'],
                delimiter=cfg_in['delimiter'],
                skipinitialspace=True,
                usecols=cfg_in['dtype'].names,
                # cfg_in['cols_load'],
                converters=cfg_in['converters'],
                skiprows=cfg_in['skiprows'],
                error_bad_lines=cfg_in['b_raise_on_err'],
                comment=cfg_in['comments'],
                header=None,
                blocksize=cfg_in['blocksize'])  # not infer

            # , engine='python' - may help load bad file

            # index_col=False  # force pandas to _not_ use the first column as the index (row names) - no in dask
            # names=None, squeeze=False, prefix=None, mangle_dupe_cols=True,
            # engine=None, true_values=None, false_values=None, skipinitialspace=False,
            #     nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False,
            #     skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False,
            #     date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer',
            #     thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0,
            #     escapechar=None, encoding=None, dialect=None, tupleize_cols=None,
            #      warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True,
            #     delim_whitespace=False, as_recarray=None, compact_ints=None, use_unsigned=None,
            #     low_memory=True, buffer_lines=None, memory_map=False, float_precision=None)
        except ValueError as e:
            l.exception('dask lib can not load data. Trying pandas lib...')
            for i, nf in enumerate(paths):
                df = pd.read_csv(
                    nf, dtype=cfg_in['dtype_raw'], names=cfg_in['cols'], usecols=cfg_in['dtype'].names,
                    # cfg_in['cols_load'],
                    delimiter=cfg_in['delimiter'], skipinitialspace=True, index_col=False,
                    converters=cfg_in['converters'], skiprows=cfg_in['skiprows'],
                    error_bad_lines=cfg_in['b_raise_on_err'], comment=cfg_in['comments'],
                    header=None)
                if i > 0:
                    raise NotImplementedError('list of files => need concatenate data')
            ddf = vaex.from_pandas(df, chunksize=cfg_in['blocksize'])  #
    except Exception as e:  # for example NotImplementedError if bad file
        msg = '- Bad file. skip!'
        ddf = None
        if cfg_in['b_raise_on_err']:
            l.exception('%s\n Try set [in].b_raise_on_err= False\n', msg)
            raise
        else:
            l.exception(msg)
    if __debug__:
        l.debug('read_csv initialised')
    if ddf is None:
        return None, None

    meta_time = pd.Series([], name='Time', dtype='M8[ns]')  # np.dtype('datetime64[ns]')
    meta_time_index = pd.DatetimeIndex([], dtype='datetime64[ns]', name='Time')
    meta_df_with_time_col = cfg_in['cols_load']

    # Process ddf and get date in ISO string or numpy standard format
    cfg_in['file_stem'] = Path(paths[0]).stem  # may be need in func below to extract date
    try:
        date_delayed = None
        try:
            if not getattr(cfg_in['fun_proc_loaded'], 'meta_out', None) is None:
                # fun_proc_loaded will return modified data. Go to catch it
                # todo: find better condition
                raise TypeError

            date = ddf.map_partitions(lambda *args, **kwargs: pd.Series(
                cfg_in['fun_proc_loaded'](*args, **kwargs)), cfg_in, meta=meta_time)  # meta_time_index
            # date = date.to_series()
        except (TypeError, Exception) as e:
            # fun_proc_loaded returns tuple (date, a)
            changing_size = False  # ? True  # ?
            if changing_size:

                @vaex.delayed
                def run_fun_proc_loaded():
                    """
                    delayed(, nout=2)(ddf, cfg_in)
                    :return:
                    """
                    return cfg_in['fun_proc_loaded']()


                date_delayed, a = run_fun_proc_loaded()
                ddf_len = len(ddf)
                counts_divisions = list(range(1, int(ddf_len / cfg_in.get('decimate_rate', 1)), cfg_in['blocksize']))
                counts_divisions.append(ddf_len)
                ddf = vaex.from_delayed(a, divisions=(0, counts_divisions))

                #date, meta = meta_time_index, divisions = counts_divisions); from_dask_array(date.values, index=ddf.index)
                date = date_delayed.get()

            else:  # getting df with time col
                meta_out = cfg_in['fun_proc_loaded'].meta_out(cfg_in['dtype']) if callable(
                    cfg_in['fun_proc_loaded'].meta_out) else None
                ddf = ddf.map_partitions(cfg_in['fun_proc_loaded'], cfg_in, meta=meta_out)
                date = ddf.Time
    except IndexError:
        print('no data?')
        return None, None
        # add time shift specified in configuration .ini

    n_overlap = 2 * int(np.ceil(cfg_in['fs'])) if cfg_in.get('fs') else 50
    # reset_index().set_index('index').
    meta2 = {'Time': 'M8[ns]', 'b_ok': np.bool8}

    #     pd.DataFrame(columns=('Time', 'b_ok'))
    # meta2.time = meta2.time.astype('M8[ns]')
    # meta2.b_ok = meta2.b_ok.astype(np.bool8)

    def time_corr_df(t, cfg_in):
        """convert tuple returned by time_corr() to dataframe"""
        return pd.DataFrame.from_dict(OrderedDict(zip(meta2.keys(), utils_time_corr.time_corr(t, cfg_in))))
        # return pd.DataFrame.from_items(zip(meta2.keys(), time_corr(t, cfg_in)))
        # pd.Series()

    # date.rename('time').to_series().reset_index().compute()
    # date.to_series().repartition(divisions=ddf.divisions[1])

    '''
    def time_corr_ar(t, cfg_in):
        """convert tuple returned by time_corr() to dataframe"""
        return np.array(time_corr(t, cfg_in))
        #return pd.DataFrame.from_items(zip(meta2.keys(), time_corr(t, cfg_in)))
        # pd.Series()
    da.overlap.map_overlap(date.values, time_corr_ar, depth=n_overlap)
    '''

    l.info('time correction in %s blocks...', date.npartitions)
    df_time_ok = date.map_overlap(time_corr_df, before=n_overlap, after=n_overlap, cfg_in=cfg_in, meta=meta2)
    # .to_series()
    # if __debug__:
    #     c = df_time_ok.compute()
    # tim = date.compute().values()
    # tim, b_ok = time_corr(tim, cfg_in)

    # return None, None
    # if len(ddf) == 1:  # size
    #     ddf = ddf[np.newaxis]

    # npartitions = ddf.npartitions
    # ddf = ddf.reset_index().set_index('index')
    # col_temp = set(ddf.columns).difference(cfg_in['dtype_out'].names).pop()

    # ddf.index is not unique!
    # if col_temp:
    #      # ddf[col_temp].compute().is_unique # Index.is_monotonic_increasing()
    #     # ddf[col_temp] = ddf[col_temp].map_partitions(lambda s, t: t[s.index], tim, meta=meta)
    try:
        df_time_ok = df_time_ok.persist()

    except Exception as e:
        l.exception('Can not speed up by persist')
        # # something that can trigger error to help it identificate ???
        # date = date.persist()
        # df_time_ok = df_time_ok.compute()
        df_time_ok = time_corr_df(
            (date_delayed if date_delayed is not None else date).compute(), cfg_in=cfg_in)

    nbad_time = len(df_time_ok['b_ok']) - df_time_ok['b_ok'].sum()
    l.info('Removing %d bad time values: %s%s', nbad_time,
           df_time_ok['b_ok'].fillna(0).ne(True).to_numpy().nonzero()[0][:20],
           ' (shows first 20)' if nbad_time > 20 else '')

    df_time_ok.loc[~df_time_ok['b_ok'], 'Time'] = pd.NaT
    try:
        df_time_ok.Time = df_time_ok.Time.interpolate(
            inplace=False)  # inplace=True does not work; method='linear' is the default
    except ValueError:  # if it cannot interpolate (my condition), use a simpler method:
        df_time_ok.Time = df_time_ok.Time.fillna(method='ffill')

    if nbad_time:
        # # dask get IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match):
        # ddf_out = ddf.loc[df_time_ok['b_ok'], list(cfg_in['dtype_out'].names)].set_index(
        #    df_time_ok.loc[df_time_ok['b_ok'], 'Time'], sorted=True)

        # so we have done interpolate that helps this:
        ddf_out = ddf.loc[:, list(cfg_in['dtype_out'].names)].set_index(df_time_ok['Time'])  # , sorted=True
        ddf_out = ddf_out.loc[df_time_ok['b_ok'], :]
    else:
        # print('data loaded shape: {}'.format(ddf.compute(scheduler='single-threaded').shape))  # debug only
        ddf_out = ddf.loc[:, list(cfg_in['dtype_out'].names)].set_index(df_time_ok['Time'], sorted=True)

    logger = logging.getLogger("dask")
    logger.addFilter(lambda s: s.getMessage() != "Partition indices have overlap.")
    return ddf_out
Example 25
dataset.drop([
    "ex-dividend", "split_ratio", "adj_open", "adj_high", "adj_low",
    "adj_close", "adj_volume"
],
             axis=1,
             inplace=True)

# Remove rows with missing data, since they are present in minimal amount
dataset.dropna(inplace=True)

# Convert string-based dates to integers (days) since first date in chronological order (day 0)
dataset['date'] = pd.to_datetime(dataset['date'])
basedate = dataset.date.min()
dataset['date'] = (dataset['date'] - basedate).dt.days

# Make all numeric data to be float64 to ease computation
dataset['date'] = dataset['date'].astype('float64')

# ----------- #
# DATA EXPORT #
# ----------- #

# Vaex-ify Pandas dataset and export to little-endian HDF5
vaex_dataset = vx.from_pandas(dataset)
vaex_dataset.export_hdf5("../data/WIKI_PRICES_QUANDL.hdf5",
                         column_names=None,
                         byteorder='<',
                         progress=False)

# Greet user!
print('Success!')
Example 26
 def __init__(self, df):
     super(DataFrameAccessorGraphQLPandas,
           self).__init__(vaex.from_pandas(df))
Example 27
from traffic.traffic import Traffic

import numpy as np
import pandas as pd
import vaex


def set_queues_callback(time, queues):
    global street_queues

    street_queues[time] = queues


in_setup_file = './hashcode.in'
in_submission_file = 'submission.hashcode.txt'
out_street_queues_hd5_file = 'street_queues.hashcode.h5'

# in_setup_file='./example.in'
# in_submission_file='submit.example.txt'
# out_street_queues_hd5_file='street_queues.example.h5'

m = Traffic(in_file=in_setup_file)
num_streets = len(m.streets)
callback = set_queues_callback
street_queues = np.zeros((m.end_time, num_streets), dtype=int)
m.read_submission_file(in_file_path=in_submission_file)
m.simulate(progress_bar=True, override_end_time=None, queue_callback=callback)
m_score = m.calculate_simulation_score()
print("Final score: {}".format(m_score))

street_queues_df = vaex.from_pandas(
    pd.DataFrame(street_queues, columns=list(m.streets.keys())))
street_queues_df.export_hdf5(out_street_queues_hd5_file, progress=True)
Example 28
def to_fits():
    fns = (glob('data/M31-*ST/proc_default/M31-*ST.phot.hdf5') +
           glob('data/M31-*ST/M31-*ST.phot.hdf5'))
    fns.sort()
    print('Files:', fns)

    plt.clf()

    for photfile in fns:
        #photfile = 'data/M31-B23-WEST/M31-B23-WEST.phot.hdf5'

        basename = os.path.basename(photfile)
        basename = basename.replace('.phot.hdf5', '')
        print('Base name:', basename)

        outfn = basename + '-bright.fits'
        if os.path.exists(outfn):
            print('Exists:', outfn)
            continue

        words = basename.split('-')
        assert (len(words) == 3)
        brick = words[1]
        assert (brick[0] == 'B')
        brick = int(brick[1:], 10)
        print('Brick number:', brick)
        ew = words[2]
        assert (ew in ['EAST', 'WEST'])
        east = (ew == 'EAST')

        df = pd.read_hdf(photfile, key='data')
        ds = vaex.from_pandas(df)
        print('Read', photfile)
        #print(ds)

        good = ds['f814w_gst']
        print(len(ds), 'rows')
        ds = ds[good]
        print(len(ds), 'gst on F814W')

        # good = ds.evaluate(ds['f475w_gst'])
        # print(good)
        # print(len(good))
        # print(type(good))
        # print(good.dtype)
        # print('Of those,', np.sum(ds.evaluate(ds['f475w_gst'])), 'are F475W_GST')
        # print('Of those,', np.sum(ds.evaluate(ds['f336w_gst'])), 'are F336W_GST')
        # print('Of those,', np.sum(ds.evaluate(ds['f275w_gst'])), 'are F275W_GST')
        # print('Of those,', np.sum(ds.evaluate(ds['f110w_gst'])), 'are F110W_GST')
        # print('Of those,', np.sum(ds.evaluate(ds['f160w_gst'])), 'are F160W_GST')

        mag = ds.evaluate(ds['f814w_vega'])
        print('Of', len(mag), 'mags,', np.sum(np.isfinite(mag)), 'are finite')
        print('range:', np.nanmin(mag), np.nanmax(mag))

        plt.hist(mag, range=(20, 28), bins=50, label=basename)

        ds = ds[ds['f814w_vega'] < 24]
        print(len(ds), 'with F814W < 24')

        mag = ds.evaluate(ds['f814w_vega'])
        xx = ds.evaluate(ds['x'])
        yy = ds.evaluate(ds['y'])

        xlo = xx.min()
        xhi = xx.max()
        ylo = yy.min()
        yhi = yy.max()
        nx = int(np.round((xhi - xlo) / 1000.)) + 1
        xbins = np.linspace(xlo, xhi, nx)
        ny = int(np.round((yhi - ylo) / 1000.)) + 1
        ybins = np.linspace(ylo, yhi, ny)
        print('x bins', xbins)
        print('y bins', ybins)
        xbin = np.digitize(xx, xbins)
        ybin = np.digitize(yy, ybins)
        xybin = ybin * nx + xbin
        nbins = nx * ny
        print('N bins:', nbins)
        nperbin = int(np.ceil(100000. / nbins))
        II = []
        for ibin in range(nbins):
            I = np.flatnonzero(xybin == ibin)
            if len(I) == 0:
                continue
            Ibright = np.argsort(mag[I])[:nperbin]
            II.append(I[Ibright])
        II = np.hstack(II)

        #I = np.argsort(mag)
        #I = I[:100000]
        #print('100k-th star: mag', mag[I[-1]])
        ds = ds.take(II)

        T = fits_table()
        for col in ['ra', 'dec', 'x', 'y', 'index']:
            T.set(col, ds.evaluate(ds[col]))
        for filt in [814, 475, 336, 275, 110, 160]:
            for col in ['f%iw_vega']:
                colname = col % filt
                T.set(colname, ds.evaluate(ds[colname]))
        T.brick = np.zeros(len(T), np.uint8) + brick
        T.east = np.zeros(len(T), bool)
        T.east[:] = east
        T.writeto(outfn)

    plt.legend()
    plt.xlabel('F814W mag')
    plt.savefig('mags.png')
Example 29
def create_model():
    if not os.path.exists(gtfsr_model_df_path):
        df = vaex.open(gtfsr_processed_path)
        df = df.sample(frac=1)

        # remove outliers from the dataset: all delays over 20 minutes
        outlier = 60 * 20
        df = df[(df["arrival"] >= -outlier)
                & (df["arrival"] <= outlier)
                & (df["departure"] >= -outlier)
                & (df["departure"] <= outlier)]

        df["arr_dow"] = df.apply(apply_dow,
                                 ["start_date", "start_time", "arrival_time"])
        df["arr_hour"] = df["arrival_time"].apply(
            lambda t: get_dt(t, "%H:%M:%S").hour)
        df["arrival"] = df["arrival"].apply(lambda t: 0 if t == 0 else t / 60)

        cols = [
            "route_id", "stop_id", "arr_dow", "arr_hour", "direction",
            "stop_sequence"
        ]

        # if the arrival historical means dataset is not created we create it
        if not os.path.exists(gtfsr_historical_means_path):
            print("*** creating gtfsr historical means dataset ***")
            # creates a dataset of historical average arrival delays grouped by the identifier columns in cols

            vaex.from_pandas(
                (df.to_pandas_df().groupby(cols).agg({
                    "arrival": "mean",
                    "p_avg_vol": "mean"
                }).rename(columns={
                    "arrival": "arrival_mean",
                    "p_avg_vol": "p_mean_vol"
                }).reset_index())).export_hdf5(gtfsr_historical_means_path)

        print("*** joining hist means ***")

        # join the arrival means to our dataset
        df = vaex_mjoin(df,
                        vaex.open(gtfsr_historical_means_path),
                        cols,
                        cols,
                        how="left")

        df = df[[
            "start_date",
            "start_time",
            "stop_sequence",
            "arrival",
            "timestamp",
            "stop_id",
            "arrival_time",
            "shape_dist_traveled",
            "direction",
            "route_id",
            "lat",
            "lon",
            "direction_angle",
            "shape_dist_between",
            "arr_dow",
            "arr_hour",
            "arrival_mean",
            "p_mean_vol",
        ]]

        df.export_hdf5(gtfsr_model_df_path)

    print("*** Start training ***")
    # open model ready
    df = vaex.open(gtfsr_model_df_path)

    # transform our data
    df = transform_data(df)

    # train our data
    train_gtfsr(df)

    return
Example 30
    import pandas as pd
    import vaex

    import glob
    import os
    import time

    from findatapy.util.loggermanager import LoggerManager

    start = time.time()

    data_vendor = 'dukascopy' # 'ncfx' or 'dukascopy'

    source_folder = '/data/csv_dump/' + data_vendor + '/'
    destination_folder = '/data/csv_dump/' + data_vendor + '_arrow/'

    logger = LoggerManager().getLogger(__name__)

    parquet_list = glob.glob(source_folder + '/*.parquet')

    for p in parquet_list:
        df = pd.read_parquet(p)

        df = vaex.from_pandas(df, name='pandas', copy_index=True, index_name='Date')

        logger.info("Converting " + p + "...")
        filename = os.path.basename(p)

        df.export(destination_folder + "/" + filename.replace('parquet', 'arrow'))

    finish = time.time()
    print('Status: calculated in ' + str(round(finish - start, 3)) + "s")
Example 31
 def vframe(self):
     self.pframe()
     self.df_vaex = vaex.from_pandas(self.df_pandas)
     return self.df_vaex