Example #1
0
 def get_pq_writer(self, prefix, s, mem):
     """Return a cached parquet writer for ``prefix``, creating one on first use.

     When ``mem`` is truthy the writer targets an in-memory ``BytesIO``
     buffer; otherwise it targets a new, uniquely named on-disk file
     ``"<prefix>.<guid>.parquet"``.  The writer and its sink are cached in
     ``self.pq_writer_cache`` keyed by ``prefix`` so later calls reuse them.
     """
     cached_writer, _sink = self.pq_writer_cache.get(prefix, (None, None))
     if cached_writer is not None:
         return cached_writer
     if mem:
         sink = BytesIO()
     else:
         # Unique on-disk target joined with "." (not a path separator),
         # yielding "<prefix>.<guid>.parquet".
         sink = ".".join([prefix, guid() + ".parquet"])
     writer = pwriter(sink, compression=None)
     self.pq_writer_cache[prefix] = (writer, sink)
     return writer
Example #2
0
    def _get_or_create_writer(self, idx):
        """Return the parquet writer for output partition ``idx``, creating it lazily.

        Writers are appended in index order, so after this call every writer
        with an index <= ``idx`` exists.  All list growth happens under
        ``self._lock`` so concurrent callers cannot register duplicates.
        When ``self.bytes_io`` is set, writers target in-memory buffers
        (tracked in ``self.data_bios``) instead of the filesystem paths.
        """
        with self._lock:
            while len(self.data_writers) <= idx:
                path = self._get_filename(len(self.data_writers))
                self.data_paths.append(path)
                if self.bytes_io:
                    bio = BytesIO()
                    self.data_bios.append(bio)
                    # Passing index=False when creating ParquetWriter
                    # to avoid bug: https://github.com/rapidsai/cudf/issues/7011
                    self.data_writers.append(pwriter(bio, compression=None, index=False))
                else:
                    self.data_writers.append(pwriter(path, compression=None, index=False))

            return self.data_writers[idx]
Example #3
0
    def _get_or_create_writer(self, idx):
        """Lazily grow the writer list through slot ``idx`` and return that writer.

        Creation happens under ``self._lock`` so racing callers never create
        the same partition twice.  With ``self.bytes_io`` set, writers target
        in-memory buffers (kept in ``self.data_bios``) rather than the paths.
        """
        with self._lock:
            # Extend until the requested slot exists.
            while len(self.data_writers) <= idx:
                next_slot = len(self.data_writers)
                path = self._get_filename(next_slot)
                self.data_paths.append(path)
                if self.bytes_io:
                    target = BytesIO()
                    self.data_bios.append(target)
                else:
                    target = path
                # index=False works around a ParquetWriter bug:
                # https://github.com/rapidsai/cudf/issues/7011
                self.data_writers.append(pwriter(target, compression=None, index=False))

            return self.data_writers[idx]
Example #4
0
    def __init__(self, out_dir, **kwargs):
        """Eagerly open one parquet writer per output file under ``out_dir``.

        Files are named ``"<i>.parquet"``, or ``"<i>.<guid>.parquet"`` when
        ``self.use_guid`` is set so reruns never collide.  With
        ``self.bytes_io`` the writers target in-memory buffers (kept in
        ``self.data_bios``) instead of the on-disk paths.
        """
        super().__init__(out_dir, **kwargs)
        self.data_paths = []
        self.data_writers = []
        self.data_bios = []
        for file_idx in range(self.num_out_files):
            suffix = f"{guid()}.parquet" if self.use_guid else "parquet"
            path = os.path.join(out_dir, f"{file_idx}.{suffix}")
            self.data_paths.append(path)
            if self.bytes_io:
                buffer = BytesIO()
                self.data_bios.append(buffer)
                sink = buffer
            else:
                sink = path
            self.data_writers.append(pwriter(sink, compression=None))
Example #5
0
 def __init__(self, out_dir, use_guid=False, **kwargs):
     """Eagerly open ``self.num_out_files`` parquet writers inside ``out_dir``.

     Each output file is named ``"<i>.parquet"``; with ``use_guid=True`` a
     unique id is inserted (``"<i>.<guid>.parquet"``) so repeated runs do
     not collide with existing files.
     """
     super().__init__(out_dir, **kwargs)
     self.data_paths = []
     self.data_writers = []
     for index in range(self.num_out_files):
         name_parts = [str(index), "parquet"]
         if use_guid:
             name_parts.insert(1, guid())
         path = os.path.join(out_dir, ".".join(name_parts))
         self.data_paths.append(path)
         self.data_writers.append(pwriter(path, compression=None))
Example #6
0
 def __init__(
     self,
     out_dir,
     num_out_files=30,
     num_threads=4,
     cats=None,
     conts=None,
     labels=None,
 ):
     """Open ``num_out_files`` parquet writers, one per shard file in ``out_dir``.

     Shards are named ``"<i>.parquet"``.  The ``num_threads``, ``cats``,
     ``conts`` and ``labels`` arguments are forwarded unchanged to the
     base-class initializer.
     """
     super().__init__(out_dir, num_out_files, num_threads, cats, conts,
                      labels)
     self.data_files = []
     self.data_writers = []
     for shard in range(num_out_files):
         shard_path = os.path.join(out_dir, f"{shard}.parquet")
         self.data_files.append(shard_path)
         self.data_writers.append(pwriter(shard_path, compression=None))