Example #1
def combine_csv():
    # read from the gtfs records
    with zipfile.ZipFile(gtfs_csv_zip, "r") as zf:  # avoid shadowing the zip builtin
        dirs = zf.namelist()

        # merge all the CSVs in the zip file
        combined_csv = pd.concat(
            [pd.read_csv(zf.open(f), header=None) for f in dirs])
        combined_csv.columns = entity_cols

        # dropping duplicates
        combined_csv = combined_csv.drop_duplicates(subset=entity_cols[:-1])

        # convert to csv
        combined_csv.to_csv(gtfs_final_csv_path, index=False, header=True)
        print(f"finished combining the zip files, time: {duration()}")

        if os.path.exists(gtfs_final_hdf5_path):
            os.remove(gtfs_final_hdf5_path)

        vaex.from_csv(gtfs_final_csv_path,
                      convert=True,
                      copy_index=False,
                      chunk_size=1000000)
        print(f"finished converting to hdf5, time: {duration()}")
    return
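A note on the convert-then-open pattern above: `convert=True` writes an `.hdf5` file next to the CSV, and reopening that file is memory-mapped and fast. Below is a minimal self-contained sketch of the same pattern; the tiny CSV and column names are illustrative stand-ins for the combined GTFS data.

import os
import tempfile

import vaex

tmp_dir = tempfile.mkdtemp()
csv_path = os.path.join(tmp_dir, "combined.csv")
with open(csv_path, "w") as f:
    f.write("stop_id,stop_name\n1,Central\n2,Museum\n")

# convert=True writes combined.csv.hdf5 next to the source file
vaex.from_csv(csv_path, convert=True, chunk_size=1_000_000)
df = vaex.open(csv_path + ".hdf5")  # memory-mapped on reopen
print(df)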
Example #2
def read_states(statesf):
    """Exomol IO for a state file.

    Args:
        statesf: state file

    Returns:
        states data in pandas DataFrame

    Note:
        i=state counting number
        E=state energy
        g=state degeneracy
        J=total angular momentum
        See Table 11 in https://arxiv.org/pdf/1603.05890.pdf
    """
    try:
        dat = vaex.from_csv(statesf,
                            compression='bz2',
                            sep=r'\s+',
                            usecols=range(4),
                            names=('i', 'E', 'g', 'J'),
                            convert=True)
    except Exception:
        dat = vaex.read_csv(statesf,
                            sep=r'\s+',
                            usecols=range(4),
                            names=('i', 'E', 'g', 'J'),
                            convert=True)

    return dat
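A hedged usage sketch for read_states above: the try branch expects a bz2-compressed, whitespace-separated file and falls back to plain text. The file contents below are made up for illustration; real ExoMol .states files follow Table 11 of the paper cited in the docstring.

import bz2

with bz2.open("demo.states.bz2", "wt") as f:
    f.write("1 0.0 4 1.5\n")
    f.write("2 113.1 8 2.5\n")

dat = read_states("demo.states.bz2")
print(dat)  # columns: i, E, g, J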
Example #3
def csv_to_df(file: str):
    """Convert csv into datafram or hdf5 file.

    Args:
        file (str): Path to input file.
        filePattern (str): extension of file to convert.

    Returns:
        Vaex dataframe

    """
    logger.info("csv_to_df: Copy csv file into outDir for processing...")
    file_name = Path(file).stem
    output = file_name + ".csv"
    outputfile = os.path.join(outDir, output)
    shutil.copyfile(file, outputfile)

    logger.info("csv_to_df: Checking size of csv file...")
    # Open the csv file and count the columns in the header row
    with open(outputfile, "r", encoding="utf-8") as fr:
        ncols = len(fr.readline().split(","))

    chunk_size = max([2**24 // ncols, 1])
    logger.info("csv_to_df: # of columns are: " + str(ncols))

    # Convert the csv file to hdf5, reading in chunks sized to the column count
    logger.info("csv_to_df: converting file into hdf5 format")
    df = vaex.from_csv(outputfile, convert=True, chunk_size=chunk_size)
    return df
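The chunk-size heuristic above targets roughly 2**24 (about 16.8 million) cells per chunk, so wider files get proportionally fewer rows per chunk, and the max(..., 1) guard keeps the value positive for very wide files. A quick check of the arithmetic:

for ncols in (4, 16, 64):
    print(ncols, max(2 ** 24 // ncols, 1))
# 4 4194304
# 16 1048576
# 64 262144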
Example #4
    def extract_file_data(self, file_path, file_name):
        """

        :param file_path: This is the data location,ex: csv file location
        :param file_name: This is the file name
        :return:
        """
        print('extract_file_data')
        size = round(get_file_size(file_path, SIZE_UNIT.MB), 2)
        print('Size of file is : ', size, 'MB')
        file_type = str(file_name).split('.')[-1]

        if file_type == 'csv':
            df = vaex.from_csv(file_path, copy_index=False)

        elif file_type == 'hdf5':
            df = vaex.open(file_path)

        elif file_type == 'parquet':
            df = vaex.open(file_path)

        elif file_type == 's3':
            df = vaex.open(file_path)

        else:
            # avoid df being unbound for unsupported extensions
            raise ValueError('Unsupported file type: %s' % file_type)

        self.column_datatype_list = []
        for dtype in df.dtypes:
            self.column_datatype_list.append(str(dtype))
        raw_detected_column_datatype = self.column_datatype_list
        return df, raw_detected_column_datatype
Example #5
def read_trans(transf):
    """Exomol IO for a transition file.

    Args:
        transf: transition file

    Returns:
        transition data in vaex DataFrame

    Note:
        i_upper=Upper state counting number
        i_lower=Lower state counting number
        A=Einstein coefficient in s-1
        nu_lines=transition wavenumber in cm-1
        See Table 12 in https://arxiv.org/pdf/1603.05890.pdf
    """
    try:
        dat = vaex.from_csv(transf,
                            compression='bz2',
                            sep=r'\s+',
                            names=('i_upper', 'i_lower', 'A', 'nu_lines'),
                            convert=True)
    except Exception:
        dat = vaex.read_csv(transf,
                            sep=r'\s+',
                            names=('i_upper', 'i_lower', 'A', 'nu_lines'),
                            convert=True)

    return dat
Example #6
    def _load_csvs_into_dfs(self, start_time, end_time):

        if settings.PRINT_EVENTS:
            print("读取CSV文件...")

        # get the converted hdf5 file names
        hdf5_files = self._obtain_stock_hdf5_files()

        csv_dfs = {}

        if len(hdf5_files) == 0:
            # convert the csv files to hdf5 files
            csv_files = self._obtain_stock_csv_files()

            for csv_file in csv_files:
                stock_symbol = self._obtain_stock_symbol_from_filename(
                    csv_file)
                csv_df = vaex.from_csv(csv_file,
                                       convert=True,
                                       chunk_size=5000000)
                # not sure yet how to rename columns, so alias the index column
                csv_df['date_time'] = csv_df['Unnamed: 0']

                if stock_symbol not in csv_dfs.keys():
                    csv_dfs[stock_symbol] = {}

                if 'Tick' in csv_file:
                    csv_dfs[stock_symbol]['snapshot'] = csv_df
                    if settings.PRINT_EVENTS:
                        print("加载 '%s' %s snapshot数据..." %
                              (self.code, stock_symbol))
                elif 'Transaction' in csv_file:
                    csv_dfs[stock_symbol]['tick'] = csv_df
                    if settings.PRINT_EVENTS:
                        print("加载 '%s' %s tick数据..." %
                              (self.code, stock_symbol))
        else:
            for csv_file in hdf5_files:
                stock_symbol = self._obtain_stock_symbol_from_filename(
                    csv_file)

                csv_df = self._load_hdf5_into_df(csv_file, start_time,
                                                 end_time)

                if stock_symbol not in csv_dfs.keys():
                    csv_dfs[stock_symbol] = {}

                if 'Tick' in csv_file:
                    csv_dfs[stock_symbol]['snapshot'] = csv_df
                    if settings.PRINT_EVENTS:
                        print("加载 '%s' %s snapshot数据..." %
                              (self.code, stock_symbol))
                elif 'Transaction' in csv_file:
                    csv_dfs[stock_symbol]['tick'] = csv_df
                    if settings.PRINT_EVENTS:
                        print("加载 '%s' %s tick数据..." %
                              (self.code, stock_symbol))

        return csv_dfs
Example #7
def test_from_csv():
    # can read with default options
    df = vaex.from_csv(csv_path, copy_index=True)
    _assert_csv_content(df, with_index=True)

    # can read an empty CSV
    df = vaex.from_csv(os.path.join(path, 'data', 'empty.csv'))
    assert len(df) == 0

    # can read as chunks iterator
    df_iterator = vaex.from_csv(csv_path, chunk_size=1)
    df1 = next(df_iterator)
    assert len(df1) == 1
    df2, df3 = next(df_iterator), next(df_iterator)
    with pytest.raises(StopIteration):
        next(df_iterator)
    _assert_csv_content(vaex.dataframe.DataFrameConcatenated([df1, df2, df3]))
Example #8
def test_from_csv():
    # can read with default options
    df = vaex.from_csv(csv_path, copy_index=True)
    _assert_csv_content(df, with_index=True)

    # can read an empty CSV
    df = vaex.from_csv(os.path.join(path, 'data', 'empty.csv'))
    assert len(df) == 0

    # can read csv with no header
    df = vaex.from_csv(os.path.join(path, 'data', 'noheader.csv'), header=None)
    assert len(df) == 5
    assert df.get_column_names() == ['0', '1', '2']

    # can read as chunks iterator
    df_iterator = vaex.from_csv(csv_path, chunk_size=1)
    df1 = next(df_iterator)
    assert len(df1) == 1
    df2, df3 = next(df_iterator), next(df_iterator)
    with pytest.raises(StopIteration):
        next(df_iterator)
    _assert_csv_content(vaex.concat([df1, df2, df3]))
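The chunked-iterator behaviour the two tests above exercise is easy to demonstrate standalone: with chunk_size set and no convert, vaex.from_csv yields one DataFrame per chunk, and vaex.concat stitches them back together. A minimal sketch with an illustrative file name:

import vaex

with open("demo.csv", "w") as f:
    f.write("x,y\n")
    for i in range(10):
        f.write(f"{i},{i * i}\n")

chunks = list(vaex.from_csv("demo.csv", chunk_size=4))  # 4 + 4 + 2 rows
df = vaex.concat(chunks)
assert len(df) == 10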
Example #9
def test_from_big_csv_convert():
    # csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_0.csv'
    csv = '/Users/byaminov/fun/datasets/yellow_tripdata_2019-01.csv'
    # csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv'
    # remove any stale conversion so the timing below measures a fresh convert
    if os.path.exists(csv + '.hdf5'):
        os.remove(csv + '.hdf5')

    start = datetime.now()
    df = vaex.from_csv(csv, convert=True)
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.
          format(duration, df.length(),
                 int(df.length() / duration.total_seconds())))
    assert df.length() == 7_667_792
Example #10
def create_scats_ml_model():
    print("starting scats ml modeling")

    # load existing csv into vaex dataframe
    if not os.path.exists(finalScatsPath + ".hdf5"):
        vaex.from_csv(finalScatsPath, convert=True, copy_index=False, chunk_size=1_000_000)

    df = vaex.open(finalScatsPath + ".hdf5", shuffle=True)

    # transform the features into more machine learning friendly vars
    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    print("dataWrangling done, ready to create model, time: {}s".format(duration()))

    # create a RandomForestRegressor model
    vaex_model = Predictor(
        features=df.get_column_names(regex="pca[\d]") + df.get_column_names(regex=".*_[xy]"),
        target="avg_vol",
        model=RandomForestRegressor(random_state=42, n_estimators=7 * 24),
        prediction_name="p_avg_vol",
    )

    # here we fit and train the model
    with parallel_backend("threading", n_jobs=8):
        vaex_model.fit(df)
        print("\n\nmodel created, time: {}s".format(duration()))

        dump(value=vaex_model, filename=model_out, compress=3)

    print("model written to output, time: {}s".format(duration()))
    return
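CycleTransformer maps a cyclic feature onto the unit circle, emitting <feature>_x and <feature>_y columns, which is what the ".*_[xy]" regex above selects as model features. A minimal check, assuming vaex-ml is installed:

import numpy as np
import vaex
import vaex.ml

df = vaex.from_arrays(hour=np.array([0, 6, 12, 18]))
df = vaex.ml.CycleTransformer(features=["hour"], n=24).fit_transform(df)
print(df.get_column_names())  # expected: ['hour', 'hour_x', 'hour_y']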
Example #11
def test_from_big_csv_read():
    # csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-01_0.csv'
    csv = '/Users/byaminov/fun/datasets/yellow_tripdata_2019-01.csv'
    # csv = '/Users/byaminov/fun/datasets/test_yellow_tripdata/yellow_tripdata_2019-h1.csv'

    start = datetime.now()
    read_length = 0
    # for df in vaex.from_csv(csv, chunk_size=2_000_000):
    #     read_length += len(df)
    read_length += len(vaex.from_csv(csv))
    duration = datetime.now() - start
    print('it took {} to convert {:,} rows, which is {:,} rows per second'.
          format(duration, read_length,
                 int(read_length / duration.total_seconds())))
    assert read_length == 7_667_792
Example #12
def test_from_csv_converting_in_chunks():
    # can read several chunks with converting, intermediate files are deleted
    df = vaex.from_csv(csv_path, chunk_size=1, convert=True)
    _assert_csv_content(df)
    for filename in [
            'small3.csv_chunk0.hdf5', 'small3.csv_chunk1.hdf5',
            'small3.csv_chunk2.hdf5'
    ]:
        assert not os.path.exists(os.path.join(path, 'data', filename))
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    _cleanup_generated_files(df)

    # fails to convert if filename cannot be derived
    with pytest.raises(ValueError, match='Cannot convert.*'):
        with io.StringIO() as f:
            vaex.from_csv(f, convert=True)
    # f.name reveals the path
    with open(csv_path) as f:
        vaex.from_csv(f, convert=True)
    with open(csv_path) as f:
        converted_path = os.path.join(path, 'data', 'small3.my.csv.hdf5')
        df = vaex.from_csv(f, convert=converted_path)
    _assert_csv_content(df)
    assert os.path.exists(converted_path)
    _cleanup_generated_files(df)

    # reuses converted HDF5 file
    vaex.from_csv(csv_path, convert=True)
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    try:
        with pytest.raises(FileNotFoundError):
            os.rename(csv_path, csv_path + '_')
            df = vaex.from_csv(csv_path, convert=True)
            _assert_csv_content(df)
            _cleanup_generated_files(df)
    finally:
        os.rename(csv_path + '_', csv_path)
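As the test shows, convert also accepts an explicit target path instead of True, which sidesteps the filename-derivation error for file-like inputs. A minimal sketch with illustrative paths:

import os

import vaex

with open("small.csv", "w") as f:
    f.write("a,b\n1,2\n3,4\n")

df = vaex.from_csv("small.csv", convert="small.custom.hdf5")
assert os.path.exists("small.custom.hdf5")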
Example #13
def ascii_to_vaex(asciifile, df_col=None, names=None, usecols=None, **kwargs):
    """Read raw DOLPHOT output photometry into Vaex.

    Inputs
    ------
    asciifile : str, path object or file-like object
        Path to DOLPHOT photometry file; see `pandas.read_csv`
    df_col : pandas.DataFrame, optional
        DataFrame read in from DOLPHOT columns file; see `read_colfile`
    names : list-like, optional
        Sequence of column names; see `pandas.read_csv`
        Either `names` or `df_col` must be specified.
    usecols : list-like or callable, optional
        Subset of columns to be read in; see `pandas.read_csv`
        If None, assumes `names` corresponds to first N columns
        Default: None
    **kwargs
        Other keyword arguments to be passed to `vaex.from_csv`

    Returns
    -------
    ds : vaex.dataframe.DataFrame
        Photometry table.
    """
    if df_col is not None:
        names = df_col['names'].tolist()
        usecols = (df_col.index - 1).tolist()
    else:
        usecols = list(range(len(names))) if usecols is None else usecols
    # null mag values are 99.999, null mag err values are 9.999
    na_values = {
        n: '99.999'
        for n in names if (n.endswith('VEGA') | n.endswith('TRANS'))
    }
    na_values.update({n: '9.999' for n in names if n.endswith('ERR')})
    ds = vaex.from_csv(asciifile,
                       names=names,
                       usecols=usecols,
                       header=None,
                       delim_whitespace=True,
                       na_values=na_values,
                       float_precision='round_trip',
                       **kwargs)
    return ds
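A hedged usage sketch for ascii_to_vaex above: the column names below are invented, but follow the DOLPHOT naming convention (*_VEGA, *_ERR) that the na_values logic keys on, so the sentinel values 99.999 and 9.999 come back as NaN.

names = ['F475W_VEGA', 'F475W_ERR']
with open("phot.dat", "w") as f:
    f.write("21.532 0.012\n")
    f.write("99.999 9.999\n")  # null magnitude and error sentinels

ds = ascii_to_vaex("phot.dat", names=names)
print(ds)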
Example #14
def process_exp_files(expnum, exp_files):
    # Import vaex
    import vaex

    # Unpack exp_files
    exp_file, xtr_file = exp_files

    # Read in the exp_file
    exp_data = vaex.from_csv(exp_file, skipinitialspace=True, header=None,
                             names=EXP_HEADER, squeeze=True, dtype=EXP_HEADER,
                             copy_index=False)

    # Read in the xtr_file
    xtr_data = pd.read_csv(xtr_file, skipinitialspace=True, header=None,
                           names=XTR_HEADER, squeeze=True, dtype=XTR_HEADER,
                           usecols=range(len(XTR_HEADER)-1))
    xtr_data = xtr_data.to_numpy()[0]

    # Check if the 'expnum' column contains solely expnum
    if not (exp_data['expnum'] == expnum).evaluate().all():
        # If not, raise error and exit
        raise_error(f"Exposure file {exp_file!r} contains multiple exposures!")

    # Export vaex DataFrame to HDF5
    exp_file_hdf5 = path.join(ARGS.mld, TEMP_EXP_FILE.format(expnum))
    exp_data.export_hdf5(exp_file_hdf5)

    # Open master file
    with h5py.File(ARGS.master_file, 'r+') as m_file:
        # Check if this exposure has been processed before
        expnums = m_file['expnums']['expnum']
        expnums = expnums if expnums.size else expnums['expnum']
        index = np.nonzero(expnums == expnum)[0]

        # Save that this exposure has been processed
        if index.size:
            m_file['expnums'][index[0]] = (*xtr_data, path.getmtime(exp_file))
        else:
            m_file['expnums'].resize(m_file.attrs['n_expnums']+1, axis=0)
            m_file['expnums'][-1] = (*xtr_data, path.getmtime(exp_file))
            m_file.attrs['n_expnums'] += 1

    # Return exp_file_hdf5
    return exp_file_hdf5
Example #15
def feats2csv(feat_scp, out_dir, utt2spk, spk_dict):
    os.makedirs(out_dir, exist_ok=True)
    # str.rstrip strips a set of characters, not a suffix, so slice instead
    feat_name = os.path.basename(feat_scp)
    if feat_name.endswith(".scp"):
        feat_name = feat_name[:-len(".scp")]
    feats = open(feat_scp, "r")
    out_csv = os.path.join(out_dir, feat_name + "." + args.data_type + ".csv")
    with open(out_csv, "w") as f:
        f.write("utt_id ark_path class_label\n")
        for feat in feats:
            utt_id, ark_path = feat.rstrip().split()
            class_label = spk_dict[utt2spk[utt_id]]
            f.write("%s %s %d\n" % (utt_id, ark_path, class_label))
    feats.close()

    # convert csv to h5 file
    df = vaex.from_csv(out_csv, sep=" ")
    df = df.sort(by="class_label")
    h5_name = os.path.join(out_dir, feat_name + "." + args.data_type + ".hdf5")
    df.export(h5_name)
    df.export(out_csv)
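df.export dispatches on the file extension, which is why the same call can write both the HDF5 and the CSV above. A minimal standalone sketch:

import numpy as np
import vaex

df = vaex.from_arrays(utt_id=np.array(["u1", "u2"]),
                      class_label=np.array([1, 0]))
df = df.sort(by="class_label")
df.export("sorted.hdf5")
df.export("sorted.csv")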
Example #16
    def wrapper():

        file_size = os.environ.get("FILE_SIZE", "m")
        file_path = f"/data/{file_size}.csv"

        print("Reading dataset")
        df = vaex.from_csv(file_path,
                           convert=True,
                           chunk_size=5_000_000,
                           copy_index=False)
        size = round(Path(file_path).stat().st_size / 1024 / 1024 / 1024, 3)
        print("Transferring")
        startTime = time.time()

        try:
            func(df)
        except Exception:
            print(traceback.format_exc())
            print(f"====> Transfer failed for the file whose size is {size}Gb")
        else:
            endTime = time.time()

            print(f"====> Transfered {size}Gb in"
                  f"{round(endTime - startTime, 3)} seconds")
Example #17
def test_from_csv_converting_in_chunks():
    # can read several chunks with converting, intermediate files are deleted
    df = vaex.from_csv(csv_path, chunk_size=1, convert=True)
    _assert_csv_content(df)
    for filename in [
            'small3.csv_chunk0.hdf5', 'small3.csv_chunk1.hdf5',
            'small3.csv_chunk2.hdf5'
    ]:
        assert not os.path.exists(os.path.join(path, 'data', filename))
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    _cleanup_generated_files(df)

    # fails to convert if filename cannot be derived
    with pytest.raises(
            ValueError,
            match='Cannot derive filename to use for converted HDF5 file, '
            'please specify it using convert="my.csv.hdf5"'):
        with open(csv_path) as f:
            vaex.from_csv(f, convert=True)
    with open(csv_path) as f:
        converted_path = os.path.join(path, 'data', 'small3.my.csv.hdf5')
        df = vaex.from_csv(f, convert=converted_path)
    _assert_csv_content(df)
    assert os.path.exists(converted_path)
    _cleanup_generated_files(df)

    # reuses converted HDF5 file
    vaex.from_csv(csv_path, convert=True)
    assert os.path.exists(os.path.join(path, 'data', 'small3.csv.hdf5'))
    try:
        os.rename(csv_path, csv_path + '_')
        df = vaex.from_csv(csv_path, convert=True)
        _assert_csv_content(df)
        _cleanup_generated_files(df)
    except FileNotFoundError as e:
        assert False, "vaex.from_csv tried to read from CSV file while a converted HDF5 file existed: %s" % e
    finally:
        os.rename(csv_path + '_', csv_path)
Example #18
def get_raw_dataframe_as_vaex(name):
    df = vaex.from_csv(
        "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="
        + name + "&apikey=WCXVE7BAD668SJHL&datatype=csv")
    return df
Example #19
import vaex
import matplotlib.pyplot as plt

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--divisor_sums_csv_path',
                        type=str,
                        help='The csv file containing divisor sum data')
    parser.add_argument('--divisor_sums_hdf5_path',
                        type=str,
                        help='The hdf5 file containing divisor sum data')
    args = parser.parse_args()

    if args.divisor_sums_csv_path:
        df = vaex.from_csv(args.divisor_sums_csv_path)
    else:
        # can convert to an hdf5 once using
        # vaex.from_csv('divisor_sums.csv', convert=True, chunk_size=5_000_000)
        df = vaex.open(args.divisor_sums_hdf5_path)

    df.select(df.log_n > 12.3, name='log_n_min')
    df.select(df.witness_value > 1.68, name='witness_value_min')

    df.viz.heatmap(df.log_n,
                   df.witness_value,
                   limits=['minmax', [1.68, 1.782]],
                   selection=['witness_value_min', 'log_n_min'],
                   colormap='coolwarm',
                   ylabel='witness_value',
                   xlabel=r'$\log(n)$')
Example #20
import time
from config import *
import vaex

# from_csv with convert=True writes the converted file back to the same folder
# takes 346s (circa 5mins) to write to
# 4835469885 Oct 23 17:46 pp-complete-202009.csv.hdf5 (4.8GB file)
#RAW_DATA = '/home/ian/data/land_registry/pp-complete-202009.short.csv'
print(f"Processing from {RAW_DATA}")
t1 = time.time()
df = vaex.from_csv(RAW_DATA,
                   copy_index=False,
                   chunk_size=None,
                   convert=True,
                   parse_dates=['date'],
                   names=COLUMNS)
print(f"Took {time.time()-t1:0.1}f")
Example #21
def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=None, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: Uses `dataframe.export` when convert is a path. If True, ``convert=path+'.hdf5'``
                    The conversion is skipped if the input file or conversion argument did not change.
    :param progress: (_Only applies when convert is not False_) {progress}
    :param bool shuffle: shuffle converted DataFrame or not
    :param dict fs_options: Extra arguments passed to an optional file system if needed:
        * Amazon AWS S3
            * `anonymous` - access file without authentication (public files)
            * `access_key` - AWS access key, if not provided will use the standard env vars, or the `~/.aws/credentials` file
            * `secret_key` - AWS secret key, similar to `access_key`
            * `profile` - If multiple profiles are present in `~/.aws/credentials`, pick this one instead of 'default', see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html
            * `region` - AWS Region, e.g. 'us-east-1', will be determined automatically if not provided.
            * `endpoint_override` - URL/ip to connect to, instead of AWS, e.g. 'localhost:9000' for minio
        * Google Cloud Storage
            * :py:class:`gcsfs.core.GCSFileSystem`
        In addition you can pass the boolean "cache" option.
    :param group: (optional) Specify the group to be read from an HDF5 file. By default this is set to "/table".
    :param fs: Apache Arrow FileSystem object, or FSSpec FileSystem object, if specified, fs_options should be empty.
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    Cloud storage support:

    Vaex supports streaming of HDF5 files from Amazon AWS S3 and Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/(s3|gs) such that successive access
    is as fast as native disk access.

    The following common fs_options are used for S3 access:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)

    All fs_options can also be encoded in the file path as a query string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', fs_options={{'anonymous': True}})
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5', fs_options={{'access_key': my_key, 'secret_key': my_secret_key}})
    >>> df = vaex.open(f's3://mybucket/path/to/file.hdf5?access_key={{my_key}}&secret_key={{my_secret_key}}')
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myproject')

    Google Cloud Storage support:

    The following fs_options are used for GCP access:

     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5', fs_options={{'token': None}})
    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        if not isinstance(path, (list, tuple)):
            # remote and clusters only support single path, not a list
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            if path.startswith("http://") or path.startswith("ws://") or \
                path.startswith("vaex+wss://") or path.startswith("wss://") or \
               path.startswith("vaex+http://") or path.startswith("vaex+ws://"):
                server, name = path.rsplit("/", 1)
                url = urlparse(path)
                if '?' in name:
                    name = name[:name.index('?')]
                extra_args = {key: values[0] for key, values in parse_qs(url.query).items()}
                if 'token' in extra_args:
                    kwargs['token'] = extra_args['token']
                if 'token_trusted' in extra_args:
                    kwargs['token_trusted'] = extra_args['token_trusted']
                client = vaex.connect(server, **kwargs)
                return client[name]
            if path.startswith("cluster"):
                import vaex.enterprise.distributed
                return vaex.enterprise.distributed.open(path, *args, **kwargs)

        import vaex.file
        import glob
        if isinstance(path, str):
            paths = [path]
        else:
            paths = path
        filenames = []
        for path in paths:
            path = vaex.file.stringyfy(path)
            if path in aliases:
                path = aliases[path]
            path = vaex.file.stringyfy(path)
            naked_path, options = vaex.file.split_options(path)
            if glob.has_magic(naked_path):
                filenames.extend(list(sorted(vaex.file.glob(path, fs_options=fs_options, fs=fs))))
            else:
                filenames.append(path)
        df = None
        if len(filenames) == 0:
            raise IOError(f'File pattern did not match anything {path}')
        filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
        filename_hdf5_noshuffle = vaex.convert._convert_name(filenames, shuffle=False)
        if len(filenames) == 1:
            path = filenames[0]
            # # naked_path, _ = vaex.file.split_options(path, fs_options)
            _, ext, _ = vaex.file.split_ext(path)
            if ext == '.csv':  # special case for csv
                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
            if convert:
                path_output = convert if isinstance(convert, str) else filename_hdf5
                vaex.convert.convert(
                    path_input=path, fs_options_input=fs_options, fs_input=fs,
                    path_output=path_output, fs_options_output=fs_options, fs_output=fs,
                    progress=progress,
                    *args, **kwargs
                )
                ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs, **kwargs)
            else:
                ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
            df = vaex.from_dataset(ds)
            if df is None:
                if os.path.exists(path):
                    raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        elif len(filenames) > 1:
            if convert not in [True, False]:
                filename_hdf5 = convert
            else:
                filename_hdf5 = vaex.convert._convert_name(filenames, shuffle=shuffle)
            if os.path.exists(filename_hdf5) and convert:  # also check mtime
                df = vaex.open(filename_hdf5)
            else:
                dfs = []
                for filename in filenames:
                    dfs.append(vaex.open(filename, fs_options=fs_options, fs=fs, convert=bool(convert), shuffle=shuffle, **kwargs))
                df = vaex.concat(dfs)
                if convert:
                    if shuffle:
                        df = df.shuffle()
                    df.export_hdf5(filename_hdf5, progress=progress)
                    df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logger.exception("error opening %r" % path)
        raise
Example #22
def open(path, convert=False, shuffle=False, fs_options={}, *args, **kwargs):
    """Open a DataFrame from file given by path.

    Example:

    >>> df = vaex.open('sometable.hdf5')
    >>> df = vaex.open('somedata*.csv', convert='bigdata.hdf5')

    :param str or list path: local or absolute path to file, or glob string, or list of paths
    :param convert: convert files to an hdf5 file for optimization, can also be a path
    :param bool shuffle: shuffle converted DataFrame or not
    :param args: extra arguments for file readers that need it
    :param kwargs: extra keyword arguments
    :return: return a DataFrame on success, otherwise None
    :rtype: DataFrame

    S3 support:

    Vaex supports streaming of hdf5 files from Amazon AWS object storage S3.
    Files are by default cached in $HOME/.vaex/file-cache/s3 such that successive access
    is as fast as native disk access. The following url parameters control S3 options:

     * anon: Use anonymous access or not (false by default). (Allowed values are: true,True,1,false,False,0)
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0)
     * profile and other arguments are passed to :py:class:`s3fs.core.S3FileSystem`

    All arguments can also be passed as kwargs, but then arguments such as `anon` can only be a boolean, not a string.

    Examples:

    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true')
    >>> df = vaex.open('s3://vaex/taxi/yellow_taxi_2015_f32s.hdf5', anon=True)  # Note that anon is a boolean, not the string 'true'
    >>> df = vaex.open('s3://mybucket/path/to/file.hdf5?profile=myprofile')

    GCS support:
    Vaex supports streaming of hdf5 files from Google Cloud Storage.
    Files are by default cached in $HOME/.vaex/file-cache/gs such that successive access
    is as fast as native disk access. The following url parameters control GCS options:
     * token: Authentication method for GCP. Use 'anon' for anonymous access. See https://gcsfs.readthedocs.io/en/latest/index.html#credentials for more details.
     * use_cache: Use the disk cache or not, only set to false if the data should be accessed once. (Allowed values are: true,True,1,false,False,0).
     * project and other arguments are passed to :py:class:`gcsfs.core.GCSFileSystem`

    Examples:

    >>> df = vaex.open('gs://vaex-data/airlines/us_airline_data_1988_2019.hdf5?token=anon')
    >>> df = vaex.open('gs://vaex-data/testing/xys.hdf5?token=anon&cache=False')
    """
    import vaex
    import vaex.convert
    try:
        path = vaex.file.stringyfy(path)
        if path in aliases:
            path = aliases[path]
        path = vaex.file.stringyfy(path)
        if path.startswith("http://") or path.startswith("ws://") or \
           path.startswith("vaex+http://") or path.startswith("vaex+ws://"):  # TODO: think about https and wss
            server, name = path.rsplit("/", 1)
            url = urlparse(path)
            if '?' in name:
                name = name[:name.index('?')]
            extra_args = {
                key: values[0]
                for key, values in parse_qs(url.query).items()
            }
            if 'token' in extra_args:
                kwargs['token'] = extra_args['token']
            if 'token_trusted' in extra_args:
                kwargs['token_trusted'] = extra_args['token_trusted']
            client = vaex.connect(server, **kwargs)
            return client[name]
        if path.startswith("cluster"):
            import vaex.enterprise.distributed
            return vaex.enterprise.distributed.open(path, *args, **kwargs)
        else:
            import vaex.file
            import glob
            if isinstance(path, str):
                paths = [path]
            else:
                paths = path
            filenames = []
            for path in paths:
                naked_path, options = vaex.file.split_options(path)
                if glob.has_magic(naked_path):
                    filenames.extend(
                        list(sorted(vaex.file.glob(path, **kwargs))))
                else:
                    filenames.append(path)
            df = None
            if len(filenames) == 0:
                raise IOError(f'File pattern did not match anything {path}')
            filename_hdf5 = vaex.convert._convert_name(filenames,
                                                       shuffle=shuffle)
            filename_hdf5_noshuffle = vaex.convert._convert_name(filenames,
                                                                 shuffle=False)
            if len(filenames) == 1:
                path = filenames[0]
                # # naked_path, _ = vaex.file.split_options(path, fs_options)
                _, ext, _ = vaex.file.split_ext(path)
                if ext == '.csv':  # special case for csv
                    return vaex.from_csv(path,
                                         fs_options=fs_options,
                                         convert=convert,
                                         **kwargs)
                if convert:
                    path_output = convert if isinstance(convert,
                                                        str) else filename_hdf5
                    vaex.convert.convert(path_input=path,
                                         fs_options_input=fs_options,
                                         path_output=path_output,
                                         fs_options_output=fs_options,
                                         *args,
                                         **kwargs)
                    ds = vaex.dataset.open(path_output, fs_options=fs_options)
                else:
                    ds = vaex.dataset.open(path, fs_options=fs_options)
                df = vaex.from_dataset(ds)
                if df is None:
                    if os.path.exists(path):
                        raise IOError(
                            'Could not open file: {}, did you install vaex-hdf5? Is the format supported?'
                            .format(path))
            elif len(filenames) > 1:
                if convert not in [True, False]:
                    filename_hdf5 = convert
                else:
                    filename_hdf5 = vaex.convert._convert_name(filenames,
                                                               shuffle=shuffle)
                if os.path.exists(
                        filename_hdf5) and convert:  # also check mtime
                    df = vaex.open(filename_hdf5)
                else:
                    dfs = []
                    for filename in filenames:
                        dfs.append(
                            vaex.open(filename,
                                      convert=bool(convert),
                                      shuffle=shuffle,
                                      **kwargs))
                    df = vaex.concat(dfs)
                    if convert:
                        if shuffle:
                            df = df.shuffle()
                        df.export_hdf5(filename_hdf5)
                        df = vaex.open(filename_hdf5)

        if df is None:
            raise IOError('Unknown error opening: {}'.format(path))
        return df
    except:
        logging.getLogger("vaex").error("error opening %r" % path)
        raise
Example #23
#%%
import pandas as pd
import os
import vaex

os.chdir(r'\\NAS\WDbackup\备份\Yaf')
path = '//Nas/市场营销部/X销售数据留档/2020/2020-销售数据留档/'
data = []

for file in os.listdir(path):
    df = pd.read_csv(path + file, encoding="gbk", thousands=",")
    df['date'] = file.split(".")[0].split("-")[1]
    df['date'] = df['date'].astype('datetime64[ns]')
    df['Ordered Product Sales'] = df['Ordered Product Sales'].str[1:]
    data.append(df)
df = pd.concat(data)

df.to_csv('FBA.csv')

dv = vaex.from_csv('FBA.csv', convert=True, chunk_size=5_000_000)

dv = vaex.open('FBA.csv.hdf5')

print(dv.shape)
print('------------------------------')

# %%
Example #24
def convert_file_to_hdf5(file_name):
    import vaex
    vaex.from_csv(file_name, convert=True, chunk_size=500_000)
Example #25
def convert(filename):
    colnames = [f'col{i+1}' for i in range(136)]
    df = vaex.from_csv(f'MSLR-WEB10K/Fold1/{filename}.txt', sep=' ', usecols=range(138), names=['relevance', 'qid'] + colnames)
    df.export(f'MSLR-WEB10K/Fold1/{filename}.arrow')
Example #26
def test_different_extension():
    df = vaex.from_csv(data_path / 'small2.nocsv')
    assert df.x.tolist() == [1, 3]
    df = vaex.from_csv(data_path / 'small2.nocsv', convert=True)
    assert df.x.tolist() == [1, 3]