Example #1
 def open(self, temp_path):
     self._file_handle = super().open(temp_path)
     if ARROW_MAJOR_VERSION < 4:
         return pq.ParquetWriter(
             self._file_handle,
             self._schema,
             compression=self._codec,
             use_deprecated_int96_timestamps=self._use_deprecated_int96_timestamps)
     return pq.ParquetWriter(
         self._file_handle,
         self._schema,
         compression=self._codec,
         use_deprecated_int96_timestamps=self._use_deprecated_int96_timestamps,
         use_compliant_nested_type=self._use_compliant_nested_type)
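The version gate above relies on an ARROW_MAJOR_VERSION constant defined elsewhere in the module. A minimal sketch of one way such a constant can be derived (an assumption, not necessarily how the original module defines it):

import pyarrow as pa

# Installed pyarrow major version, e.g. "4.0.1" -> 4
ARROW_MAJOR_VERSION = int(pa.__version__.split('.')[0])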
Example #2
def test_it_handles_files_with_multiple_row_groups_and_pandas_indexes(
    mock_load_parquet, ):
    # Arrange
    data = [
        {
            "customer_id": "12345"
        },
        {
            "customer_id": "34567"
        },
    ]
    columns = [{
        "Column": "customer_id",
        "MatchIds": ["12345"],
        "Type": "Simple"
    }]
    df = pd.DataFrame(data, list("ab"))
    table = pa.Table.from_pandas(df)
    buf = BytesIO()
    # Create parquet with multiple row groups
    with pq.ParquetWriter(buf, table.schema) as writer:
        for i in range(3):
            writer.write_table(table)
    br = pa.BufferReader(buf.getvalue())
    f = pq.ParquetFile(br, memory_map=False)
    mock_load_parquet.return_value = f
    # Act
    out, stats = delete_matches_from_parquet_file("input_file.parquet",
                                                  columns)
    # Assert
    assert {"ProcessedRows": 6, "DeletedRows": 3} == stats
    res = pa.BufferReader(out.getvalue())
    newf = pq.ParquetFile(res, memory_map=False)
    assert 3 == newf.num_row_groups
    assert 3 == newf.read().num_rows
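A standalone sketch, separate from the test above, of the behaviour it relies on: each ParquetWriter.write_table call produces at least one row group, so writing a two-row table three times yields three row groups and six rows.

from io import BytesIO

import pyarrow as pa
import pyarrow.parquet as pq

buf = BytesIO()
table = pa.table({"customer_id": ["12345", "34567"]})
with pq.ParquetWriter(buf, table.schema) as writer:
    for _ in range(3):
        writer.write_table(table)

f = pq.ParquetFile(pa.BufferReader(buf.getvalue()))
assert f.num_row_groups == 3
assert f.read().num_rows == 6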
Example #3
    def _write_buffer(self):
        buffer_table = []
        if not isinstance(self.schema, ParquetSchema):
            self.schema = ParquetSchema.convert(self.schema)
        if not self.pq_writer:
            self.pq_writer = pq.ParquetWriter(
                self.options.outfile, self.schema.to_arrow()
            )

        for col_name, col_data in self.column_buffer.items():
            col_type = self.schema.columns[col_name].type_py
            pa_type = self.schema.columns[col_name].type_pa
            col = pa.column(
                col_name,
                pa.array(
                    self.coerce_column(col_name, col_data, col_type),
                    type=pa_type,
                ),
            )
            buffer_table.append(col)

        self.pq_writer.write_table(
            pa.Table.from_arrays(buffer_table, schema=self.schema.to_arrow())
        )

        for col in self.column_buffer.keys():
            self.column_buffer[col] = []
        self.buffer_line = 0
Example #4
def _chunk_readwrite(archive_url, dest_path, chunksize, header, encoding,
                     dtype, dataset):
    """stream read and write archives

    pandas reads and parquet writes

    notes
    -----
    * dest_path can be either a file.parquet, or in hte case of partitioned parquet
      it will be only the destination folder of the parquet partition files
    """
    pqwriter = None
    for i, df in enumerate(
            pd.read_csv(archive_url,
                        chunksize=chunksize,
                        names=header,
                        encoding=encoding,
                        dtype=dtype)):
        table = pa.Table.from_pandas(df)
        if i == 0:
            if dataset:
                header = np.copy(table.schema)
            else:
                pqwriter = pq.ParquetWriter(dest_path, table.schema)
        if dataset:
            # partition_cols is assumed to be defined in the enclosing module
            pq.write_to_dataset(table,
                                root_path=dest_path,
                                partition_cols=partition_cols)
        else:
            pqwriter.write_table(table)
    if pqwriter:
        pqwriter.close()

    return header
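A minimal standalone sketch of the non-partitioned path described in the docstring above; the CSV and Parquet file names are illustrative placeholders, not values from the original code.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

pqwriter = None
for i, df in enumerate(pd.read_csv("archive.csv", chunksize=100_000)):
    table = pa.Table.from_pandas(df)
    if i == 0:
        # open the writer once, using the schema inferred from the first chunk
        pqwriter = pq.ParquetWriter("archive.parquet", table.schema)
    pqwriter.write_table(table)
if pqwriter is not None:
    pqwriter.close()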
Example #5
def parquet():

    query = '''
        select * from evcard_tmp.order_info where strleft(created_time,4)='2015' limit 10
    '''

    with get_impala_connection() as impala_conn:
        with impala_conn.cursor() as cursor:
            cursor.arraysize = 500

            cursor.execute(query)
            _columns = [metadata[0] for metadata in cursor.description]

            writer = None  # created lazily once the first page's schema is known

            _page = 1
            total_count = 0
            df = as_pandas(cursor, _columns)
            while len(df) > 0:
                total_count += len(df)
                logger.debug('process page %s(%s)...', _page, total_count)

                table = pa.Table.from_pandas(df, preserve_index=False)
                if writer is None:
                    writer = pq.ParquetWriter('city.1.parquet', table.schema)
                writer.write_table(table)

                df = as_pandas(cursor, _columns)
                _page += 1

            if writer is not None:
                writer.close()
Example #6
    def write(self, gdf, shuffle=True):

        # Shuffle the dataframe
        gdf_size = len(gdf)
        if shuffle:
            sort_key = "__sort_index__"
            arr = cp.arange(gdf_size)
            cp.random.shuffle(arr)
            gdf[sort_key] = cudf.Series(arr)
            gdf = gdf.sort_values(sort_key).drop(columns=[sort_key])

        # Write the shuffled dataframe to the output files in roughly equal chunks
        chunk_size = int(gdf_size / self.nfiles)
        for i, fn in enumerate(FileIterator(self.path, self.nfiles)):
            s1 = i * chunk_size
            s2 = (i + 1) * chunk_size
            if i == (self.nfiles - 1):
                s2 = gdf_size
            chunk = gdf[s1:s2]
            pa_table = chunk.to_arrow()
            if self.writers[fn] is None:
                self.writers[fn] = pq.ParquetWriter(
                    fn,
                    pa_table.schema,
                    metadata_collector=self.new_metadata[fn],
                )
            self.writers[fn].write_table(pa_table)
Example #7
def parquet_converter(file):
    chunksize = 500000
    i = 0
    pqwriter = None
    data = pd.DataFrame()  # creates a new dataframe that's empty
    for chunk in pd.read_csv(file,
                             chunksize=chunksize,
                             usecols=["End_Lon", "End_Lat"],
                             dtype={
                                 "End_Lon": np.float32,
                                 "End_Lat": np.float32
                             },
                             delimiter=' *, *',
                             engine="python"):
        # chunk = chunk.rename(columns={"dropoff_latitude": "End_Lat", "dropoff_longitude": "End_Lon"})
        table = pa.Table.from_pandas(chunk)
        # for the first chunk of records
        if i == 0:
            # create a parquet writer object, giving it an output file
            # (the `target` output path is assumed to be defined in the enclosing scope)
            pqwriter = pq.ParquetWriter(target,
                                        table.schema,
                                        compression='snappy')
            pqwriter.write_table(table)
        # subsequent chunks can be written to the same file
        else:
            pqwriter.write_table(table)
        i += 1
    # close the parquet writer
    if pqwriter:
        pqwriter.close()
Example #8
def test_parquet_incremental_file_build(tmpdir):
    import pyarrow.parquet as pq

    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    writer = pq.ParquetWriter(out, arrow_table.schema, version='2.0')

    frames = []
    for i in range(10):
        df['unique_id'] = i
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        writer.write_table(arrow_table)

        frames.append(df.copy())

    writer.close()

    buf = out.get_result()
    result = _read_table(pa.BufferReader(buf))

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
Example #9
def collect_results(filepointer_item, outbase, compression, mutation_mode, kmer_list=None):
    if filepointer_item is not None:
        s1 = timeit.default_timer()

        if kmer_list:
            file_to_collect = filepointer_item['path'].values()
        else:
            file_to_collect = [filepointer_item['path']]
        for file_path in file_to_collect:
            file_name = os.path.basename(file_path)
            tmp_file_list = glob.glob(os.path.join(outbase, 'tmp_out_{}_[0-9]*'.format(mutation_mode), file_name))
            tot_shape = 0
            for tmp_file in tmp_file_list:
                try:
                    table = pq.read_table(tmp_file)
                    if tot_shape == 0:
                        pqwriter = pq.ParquetWriter(file_path, table.schema, compression=compression)
                    pqwriter.write_table(table)
                    tot_shape += table.shape[0]
                except Exception:
                    logging.info("ERROR: file {} could not be read".format(tmp_file))
                    sys.exit(1)
            if tmp_file_list:
                pqwriter.close()
                logging.info('Collecting {} with {} lines. Took {} seconds'.format(file_name, tot_shape, timeit.default_timer()-s1))
Example #10
    def write_df_to_parquet(self,
                            df,
                            preserve_index=False,
                            close_writer=True,
                            schema=None):
        """Writes Pandas Dataframe to a new Parquet file

        Closes the writer after writing all of dataframe and before returning

        Args:
            df (Pandas Datafram): Data to be written to parquet file.
            preserve_index (bool): Set this to True if you want to
                preserve the index of pandas dataframe, default is False
                and indexes are dropped.
            close_writer (bool): Set to true closes the writer at the end
                Default: True
            schema (pyarrow.Schema, optional): The expected schema of the
                Arrow Table. This can be used to indicate the type of columns
                if we cannot infer it automatically.
        """
        table = pa.Table.from_pandas(df,
                                     preserve_index=preserve_index,
                                     schema=schema)
        self._pqwriter = pq.ParquetWriter(self._output_file, table.schema)
        self._pqwriter.write_table(table)
        if close_writer:
            self.close()
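A minimal sketch of the explicit-schema case mentioned in the docstring; the column names, types, and output path are illustrative assumptions.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
with pq.ParquetWriter("out.parquet", table.schema) as pqwriter:
    pqwriter.write_table(table)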
Example #11
 def parquet_write(write_path, block):
     block = BlockAccessor.for_block(block)
     logger.debug(
         f"Writing {block.num_rows()} records to {write_path}.")
     table = block.to_arrow_table()
     with pq.ParquetWriter(write_path, table.schema) as writer:
         writer.write_table(table)
Example #12
def from_ticker_to_parquet():

    names = ('timestamp', 'exchange_name', 'pair', 'rate')
    files = listdir(TICKERS_DIR)
    files.sort(reverse=True)
    count = 0

    for file_name in files:
        if file_name[:7] != 'ticker_':
            continue
        count += 1
        if count <= 3:
            continue  # skip the newest 3 files (they may still be open for writing)

        with open(TICKERS_DIR + '/' + file_name, 'r') as f:
            batch = defaultdict(list)
            data_string = f.read().replace('\n', ',')
            tickers = loads('[' + data_string[:len(data_string) - 1] + ']')
            for t in tickers:
                for n in names:
                    if n == 'rate':
                        value = float(t[n])
                    else:
                        value = t[n]
                    batch[n].append(value)

            tables = pa.Table.from_arrays([pa.array(batch[n]) for n in names],
                                          names)
            with pq.ParquetWriter(TICKERS_DIR + '/ticker.parquet',
                                  tables.schema,
                                  use_dictionary=False,
                                  flavor={'spark'}) as writer:
                writer.write_table(tables)
Example #13
def test_write_compliant_nested_type_enable(tempdir, use_legacy_dataset,
                                            test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write pandas df with new flag
    _roundtrip_pandas_dataframe(
        df,
        write_kwargs={'use_compliant_nested_type': True},
        use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file with compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path,
                          table.schema,
                          use_compliant_nested_type=True,
                          version='2.0') as writer:
        writer.write_table(table)
    # Read back as a table
    new_table = _read_table(path)
    # Validate that "items" columns compliant to Parquet nested format
    # Should be like this: list<element: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'element'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=True)
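A minimal sketch of the naming difference the comments above describe, assuming pyarrow >= 4 (where use_compliant_nested_type is available) and illustrative file names.

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"items": [[{"name": "a", "value": "1"}]]})

# Compliant nested type: list<element: struct<name: string, value: string>>
pq.write_table(table, "compliant.parquet", use_compliant_nested_type=True)
print(pq.read_table("compliant.parquet").schema)

# Legacy behaviour keeps the Arrow default: list<item: struct<name: string, value: string>>
pq.write_table(table, "legacy.parquet", use_compliant_nested_type=False)
print(pq.read_table("legacy.parquet").schema)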
Example #14
 def execute(self, df):
     df = df.copy()
     bucket = 'iotcs-as-bucket'
     path = 's3test-alberto-%s.parquet' % (dt.datetime.now().isoformat())
     bucket_uri = '{bucket}/{path}'.format(**{
         'bucket': bucket,
         'path': path
     })
     client_kwargs = {"endpoint_url": self.cos_credentials["endpoint_url"]}
     fs = s3fs.S3FileSystem(
         key=self.cos_credentials["cos_hmac_keys"]["access_key_id"],
         secret=self.cos_credentials["cos_hmac_keys"]["secret_access_key"],
         client_kwargs=client_kwargs)
     sink = fs.open(bucket_uri, 'wb')
     df2 = []
     for sens in df[self.sens_pos].drop_duplicates():
         df1 = pd.concat([
             pd.Series([
                 int(element) for list_ in df[df[self.sens_pos] == sens]
                 [axis].str.split(',').values for element in list_
             ]).rename(axis) for axis in [self.X, self.Y, self.Z]
         ],
                         axis=1)
         df1[self.sens_pos] = sens
         df2.append(df1)
     df2 = pd.concat(df2, ignore_index=True)
     ta = pa.Table.from_pandas(df2)
     pw = pq.ParquetWriter(sink, schema=ta.schema)
     pw.write_table(ta)
     pw.close()
     sink.close()
     df[self.output_status] = bucket_uri
     return df
Example #15
    def to_parquet_align(self, output_dir=None, output_prefix='d6tstack-', write_params={}):
        """
        Same as `to_csv_align` but outputs parquet files

        """
        # write_params for pyarrow.parquet.write_table

        # stream all chunks to multiple files
        self._combine_preview_available()

        import pyarrow as pa
        import pyarrow.parquet as pq

        fnamesout = []
        pqschema = pa.Table.from_pandas(self.df_combine_preview).schema
        for fname in self.fname_list:
            filename = self._get_filepath_out(fname, output_dir, output_prefix, '.pq')
            if self.logger:
                self.logger.send_log('writing ' + filename, 'ok')
            pqwriter = pq.ParquetWriter(filename, pqschema)
            for dfc in self._read_csv_yield(fname, self.read_csv_params):
                pqwriter.write_table(pa.Table.from_pandas(dfc.astype(self.df_combine_preview.dtypes), schema=pqschema),**write_params)
            pqwriter.close()
            fnamesout.append(filename)

        return fnamesout
Example #16
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
                                              true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='zstd',
                                          version="2.0", use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file, read_options=read_options, parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            table = pa.Table.from_batches([batch])
            parquet_writer.write_table(table)
        parquet_writer.close()
Example #17
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
    # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
    schema = pa.schema([
        pa.field('int', pa.int16()),
        pa.field('float', pa.float32()),
        pa.field('string', pa.string())
    ])
    df1 = pd.DataFrame({
        'int': np.arange(3, dtype=np.uint8),
        'float': np.arange(3, dtype=np.float32),
        'string': ['ABBA', 'EDDA', 'ACDC']
    })
    df2 = pd.DataFrame({
        'int': [4, 5],
        'float': [1.1, None],
        'string': [None, None]
    })
    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    assert not table1.schema.equals(table2.schema, check_metadata=True)
    assert table1.schema.equals(table2.schema)

    writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
    writer.write_table(table1)
    writer.write_table(table2)
Example #18
def csv_to_parquet():
    logger.info(f'Starting...')

    stream = pd.read_csv(
        CSV_FILE_PATH,
        chunksize=CHUNK_SIZE,
        low_memory=False,
        sep=',',
        encoding='latin-1',
    )

    logger.info(
        f'CSV Stored Size: {CSV_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB')

    chunk = next(stream)
    logger.debug(f'Processing 1-th chunk...')
    parquet_schema = pa.Table.from_pandas(chunk).schema
    parquet_writer = pq.ParquetWriter(PARQUET_FILE_PATH,
                                      parquet_schema,
                                      compression='snappy')

    for i, chunk in enumerate(stream, 2):
        logger.debug(f'Processing {i}-th chunk...')
        table = pa.Table.from_pandas(chunk, parquet_schema)
        parquet_writer.write_table(table)

    parquet_writer.close()

    logger.info(
        f'Parquet Stored Size: {PARQUET_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB'
    )

    logger.info(f'Finished!')
Example #19
def test_parquet_writer_context_obj_with_exception(tempdir,
                                                   use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    try:
        with pq.ParquetWriter(out, arrow_table.schema,
                              version='2.0') as writer:

            frames = []
            for i in range(10):
                df['unique_id'] = i
                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
                writer.write_table(arrow_table)
                frames.append(df.copy())
                if i == 5:
                    raise ValueError(error_text)
    except Exception as e:
        assert str(e) == error_text

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf),
                         use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
Example #20
 def open(self, temp_path):
   self._file_handle = super(_ParquetSink, self).open(temp_path)
   return pq.ParquetWriter(
       self._file_handle,
       self._schema,
       compression=self._codec,
       use_deprecated_int96_timestamps=self._use_deprecated_int96_timestamps)
Example #21
def stream_json(fn, parquet_fn, schema=None, chunk_size=10000000):
    if isinstance(fn, str):
        fn = [fn]

    if schema is None:
        schema = read_json(fn[0]).schema

    writer = pq.ParquetWriter(parquet_fn, schema)

    for _f in fn:
        check_gz = _f.endswith('.gz')

        if check_gz:
            # open gzip files in text mode so the joined chunks are str, not bytes
            f = gzip.open(_f, 'rt')
        else:
            f = open(_f, 'r')

        while True:
            chunk = f.readlines(chunk_size)
            if not chunk:
                break

            tbl = read_json(io.BytesIO(''.join(chunk).encode()))
            assert tbl.schema == schema  # make sure the read table schema is the same as the parsed schema
            writer.write_table(tbl)

        f.close()

    writer.close()
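A standalone sketch of a single chunk going through the loop above, assuming newline-delimited JSON input and an illustrative output name.

import io

import pyarrow.parquet as pq
from pyarrow.json import read_json

lines = b'{"a": 1, "b": "x"}\n{"a": 2, "b": "y"}\n'
tbl = read_json(io.BytesIO(lines))
with pq.ParquetWriter("stream.parquet", tbl.schema) as writer:
    writer.write_table(tbl)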
Example #22
def compress_descriptions(encoding='utf-8',
                          batch_size=1000,
                          compression='BROTLI'):
    """Convert tarfile to parquet"""

    names = ('symbol', 'html')

    def read_incremental():
        """Incremental generator of batches"""
        with tarfile.open(YAHOO_ARCH) as archive:
            batch = defaultdict(list)
            for member in tqdm(archive):
                if member.isfile() and member.name.endswith('.html'):
                    batch['symbol'].append(Path(member.name).stem)
                    batch['html'].append(
                        archive.extractfile(member).read().decode(encoding))
                    if len(batch['symbol']) >= batch_size:
                        yield pa.Table.from_arrays(
                            [pa.array(batch[n]) for n in names], names)
                        batch = defaultdict(list)
            if batch:
                yield pa.Table.from_arrays([pa.array(batch[n]) for n in names],
                                           names)  # last partial batch

    writer = None
    for batch in read_incremental():
        if writer is None:
            writer = pq.ParquetWriter(YAHOO_PARQUET,
                                      batch.schema,
                                      use_dictionary=False,
                                      compression=compression,
                                      flavor={'spark'})
        writer.write_table(batch)
    if writer is not None:
        writer.close()
Example #23
def convert_csv_file_to_typed_parquet_file(csv_file, parquet_file, compression, schema, ntimes=1):
    table = read_parquet_from_csv_file(csv_file)
    table = table.cast(schema)
    pqwriter = pq.ParquetWriter(parquet_file, schema=schema, compression=compression)
    for i in range(0, ntimes):
        pqwriter.write_table(table)
    pqwriter.close()
Example #24
def concatenate_files(files, output_path, variant_metadata_path, frequency):
    vmf = pq.ParquetFile(variant_metadata_path) if variant_metadata_path else None

    columns = []
    for i, file_path in enumerate(files):
        logging.info(file_path)
        p = pq.ParquetFile(file_path)

        if vmf is not None and frequency is not None:
            cleared = get_cleared_columns(vmf, frequency, i)
            logging.info("%d cleared snps", len(cleared))
            t = p.read(columns=cleared)
        else:
            t = p.read()

        from_ = 0 if i == 0 else 1
        for c_ in range(from_, t.num_columns):
            columns.append(t.column(c_))

    logging.info("Creating table")
    table = pa.Table.from_arrays(columns)
    logging.info("saving...")
    o = pq.ParquetWriter(output_path, table.schema, flavor="spark")
    o.write_table(table)
    o.close()
    logging.info("finished.")
Example #25
def test_write_compliant_nested_type_disable(tempdir, use_legacy_dataset,
                                             test_data):
    # prepare dataframe for testing
    df = pd.DataFrame(data=test_data)
    # verify that we can read/write with new flag disabled (default behaviour)
    _roundtrip_pandas_dataframe(df,
                                write_kwargs={},
                                use_legacy_dataset=use_legacy_dataset)

    # Write to a parquet file while disabling compliant nested type
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')
    with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
        writer.write_table(table)
    new_table = _read_table(path)

    # Validate that "items" columns is not compliant to Parquet nested format
    # Should be like this: list<item: struct<name: string, value: string>>
    assert isinstance(new_table.schema.types[0], pa.ListType)
    assert new_table.schema.types[0].value_field.name == 'item'

    # Verify that the new table can be read/written correctly
    _check_roundtrip(new_table,
                     use_legacy_dataset=use_legacy_dataset,
                     use_compliant_nested_type=False)
Example #26
    def generate_parquet_file(
        cls, name: str, columns: Mapping[str, str], num_rows: int, custom_rows: Mapping[int, List[str]] = None
    ) -> str:
        """Generates  a random data and save it to a tmp file"""
        filename = os.path.join(TMP_FOLDER, name + "." + cls.filetype)

        pq_writer = None
        types = list(columns.values()) if num_rows else []
        custom_rows = custom_rows or {}
        column_names = list(columns.keys())
        buffer = []
        for i in range(num_rows):
            buffer.append(custom_rows.get(i) or cls._generate_row(types))
            if i != (num_rows - 1) and len(buffer) < 100:
                continue
            data = {col_values[0]: list(col_values[1:]) for col_values in zip(column_names, *buffer)}
            buffer = []
            df = pd.DataFrame(data)
            table = pa.Table.from_pandas(df)
            if not pq_writer:
                pq_writer = pq.ParquetWriter(filename, table.schema)
            pq_writer.write_table(table, row_group_size=100)

        if not pq_writer:
            pq.write_table(pa.Table.from_arrays([]), filename)
        return filename
Example #27
def append_to_parquet(df: pd.DataFrame, writer: pq.ParquetWriter,
                      filepath: str) -> pq.ParquetWriter:
    table = pa.Table.from_pandas(df)
    if writer is None:
        writer = pq.ParquetWriter(filepath, table.schema)
    writer.write_table(table=table)
    return writer
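A hypothetical usage of append_to_parquet above: the writer returned by the first call is reused for later frames and closed by the caller once everything is written (the DataFrames and output path are illustrative).

import pandas as pd

frames = [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [3, 4]})]
writer = None
for frame in frames:
    writer = append_to_parquet(frame, writer, "out.parquet")
if writer is not None:
    writer.close()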
Example #28
def create_parquet():
    myschema = define_schema()
    wikifiles = findwikifiles()
    for wikifile in wikifiles:
        outfilename = get_outfilename_parquet(wikifile)
        if not os.path.exists(outfilename):
            writer = pq.ParquetWriter(outfilename, schema=myschema)
            titles = []
            pageids = []
            sentences = []
            linked_pages = []
            descriptions = []

            for i, (title, pageid, text) in enumerate(process_file(wikifile)):
                if i % 100 == 99:
                    logging.info(f"processing {title}")
                records = get_single_record(title, pageid, text)
                for record in records:
                    titles.append(record[0])
                    pageids.append(record[1])
                    sentences.append(record[2])
                    linked_pages.append(record[3])
                    descriptions.append(record[4])

            t = pa.Table.from_arrays(
                [titles, pageids, sentences, linked_pages, descriptions],
                schema=myschema)
            writer.write_table(t)
            writer.close()
Example #29
def df_to_parquet(df, filename, workdir=None, chunksize=100000, debug=False):
    if workdir:
        full_filename = os.path.join(workdir, filename)
    else:
        full_filename = filename
    writer = None

    # check if we are overwriting an existing file
    if os.path.exists(full_filename):
        os.remove(full_filename)

    i = 0

    # write in chunksizes
    while len(df) >= chunksize:
        # select data
        if debug:
            print('Writing ' + str(i) + '-' + str(i + chunksize))
        i += chunksize
        data_table = pa.Table.from_pandas(df[0:chunksize],
                                          preserve_index=False)
        df = df[chunksize:]
        # create writer if we did not have one yet
        if writer is None:
            writer = pq.ParquetWriter(full_filename,
                                      data_table.schema,
                                      compression='ZSTD')
        # save result
        writer.write_table(data_table)

    # save dangling results
    if not df.empty:
        if debug:
            print('Writing ' + str(i) + '-' + str(i + len(df)))
        data_table = pa.Table.from_pandas(df, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(full_filename,
                                      data_table.schema,
                                      compression='ZSTD')
        writer.write_table(data_table)

    # close the writer if we made one
    if writer is not None:
        writer.close()

    # cleanup
    del df
Example #30
def update_files(tables, writers, prefix):
    schemas = get_schemas()
    for table_name in ["variants", "annotations", "gts"]:
        df = pd.DataFrame(tables[table_name])
        table = pa.Table.from_pandas(df, schema=schemas[table_name], preserve_index=False)
        if not writers[table_name]:
            writers[table_name] = pq.ParquetWriter(f"{prefix}_{table_name}.parquet", schema=schemas[table_name], compression='snappy')
        writers[table_name].write_table(table=table)