def split_by_key(self, key_fn: str, outfile: str, num_partitions: int, auto_mkdir: bool = False, verbose: bool = True): """Spit the partition into multiple smaller partitions by key Args: key_fn (str): function outfile (str): outfile num_partitions (int): [description] auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. """ outfile = create_filepath_template(outfile, False) key_fn = get_func_by_name(key_fn) with contextlib.ExitStack() as stack, self._open() as f: writers = [ stack.enter_context( PartitionWriter(outfile.format(auto=i, stem=self.stem), auto_mkdir=auto_mkdir)) for i in range(num_partitions) ] for line in tqdm(f, total=self.n_records) if verbose else f: bucket_no = key_fn(self.deser_fn(line)) partno = bucket_no % num_partitions writers[partno].write(line)
def filter(self, fn: str, outfile: str, delete_on_empty: bool = False, auto_mkdir: bool = False, verbose: bool = True): """Filter function Args: fn (str): [description] outfile (str): [description] delete_on_empty (bool, optional): delete the partition if there is no record. Defaults to False auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. """ fn = get_func_by_name(fn) outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) with self._open() as f, PartitionWriter( outfile, on_close_delete_if_empty=delete_on_empty, auto_mkdir=auto_mkdir) as g: for line in tqdm(f, total=self.n_records) if verbose else f: if fn(self.deser_fn(line)): g.write(line)
def flat_map(self, fn: str, outfile: str, auto_mkdir: bool = False, verbose: bool = True): """Apply a flat map function on every record of this partition Args: fn (str): an import path of the map function, which should has this signature: (record: Any) -> Any outfile (str): output file of the new partition, `*` or `{stem}` in the path are placeholders, which will be replaced by the stem (i.e., file name without extension) of the current partition auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): show the execution progress bar. Defaults to True. """ fn = get_func_by_name(fn) outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) with self._open() as f, PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g: for record in tqdm(f, total=self.n_records) if verbose else f: record = self.deser_fn(record) for sub_record in fn(record): sub_record = self.ser_fn(sub_record) g.write(sub_record) g.write_new_line()
def coalesce(self, outfile: str, records_per_partition: Optional[int] = None, num_partitions: Optional[int] = None, auto_mkdir: bool = False, verbose: bool = True): """The current partitions are coalesce into `num_partitions` partitions. If `num_partitions` is not specified, it will be figure out automatically from `records_per_partition`. Args: outfile (str): path template of output partitions records_per_partition (Optional[int], optional): number of records per partition. Defaults to None. num_partitions (Optional[int], optional): number of partitions. Defaults to None. auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): log execution progress. Defaults to True. Raises: ValueError: if the output directory does not exist """ outfile = create_filepath_template(outfile, False) if records_per_partition is None: assert num_partitions is not None assert self.size is not None, "Cannot determine the records per partition based on number of partitions because of unknown size of partitions. Consider running partition.count or provide the `records_per_partition` parameter" records_per_partition = ceil(self.size / num_partitions) writer = None part_counter = 0 with (tqdm(total=self.size) if verbose else fake_tqdm()) as pbar: try: writer = PartitionWriter(outfile.format(auto=part_counter, stem=""), on_close_delete_if_empty=True, auto_mkdir=auto_mkdir).open() for inpart in self.partitions: with inpart._open() as f: for i, line in enumerate(f): writer.write(line) pbar.update(1) if (i + 1) % records_per_partition == 0: writer.close() part_counter += 1 writer = PartitionWriter( outfile.format(auto=part_counter, stem=""), on_close_delete_if_empty=True).open() finally: if writer is not None: writer.close()
def reduce(self, fn: str, outfile: str, init_val: Any = None, auto_mkdir: bool = False, verbose: bool = True): """Reduce Args: fn (str): [description] outfile (str): [description] init_val (Any): [description] auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. """ fn = get_func_by_name(fn) if init_val is not None: accum = init_val else: accum = None outfile = create_filepath_template(outfile, False).format(auto=0, stem="") with PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g, (tqdm( total=self.size) if verbose else fake_tqdm()) as pbar: for inpart in self.partitions: with inpart._open() as f: if accum is None: try: record = inpart.deser_fn(next(f)) accum = fn(record) if verbose: pbar.update(1) except StopIteration: pass for line in f: record = inpart.deser_fn(line) accum = fn(record, accum) pbar.update(1) g.write(self.ser_fn(accum)) g.write_new_line()
def reduce_by_key(self, key_fn: str, fn: str, outfile: str, init_val: Any = None, auto_mkdir: bool = False, verbose: bool = True): """Reduce Args: key_fn (str): [description] fn (str): [description] outfile (str): [description] init_val (Any): [description] auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. """ key_fn = get_func_by_name(key_fn) fn = get_func_by_name(fn) outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) with self._open() as f, PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g: groups = {} for line in tqdm(f, total=self.n_records if self.n_records is not None else None) if verbose else f: record = self.deser_fn(line) rid = key_fn(record) if rid not in groups: if init_val is None: groups[rid] = fn(record) else: groups[rid] = fn(record, init_val) else: groups[rid] = fn(record, groups[rid]) for value in tqdm( groups.values(), total=len(groups)) if verbose else groups.values(): g.write(self.ser_fn(value)) g.write_new_line()
def count(self, outfile: Optional[str] = None, auto_mkdir: bool = False, verbose: bool = True) -> int: """Count Args: outfile (Optional[str], optional): output file to write to the value to if it is not None. if outfile is stdout we will print to stdout auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. Returns: the number of records """ if self.n_records is None: n_records = 0 with self._open() as f: for _ in tqdm(f) if verbose else f: n_records += 1 PartitionMetadata(self.path).write({"n_records": n_records}) self.n_records = n_records if outfile is not None: if outfile == "stdout": print(self.n_records) else: outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) if not Path(outfile).parent.exists(): if auto_mkdir: Path(outfile).parent.mkdir(parents=True) else: raise ValueError( f"Output directory does not exist: {Path(outfile).parent}" ) with open(outfile, "w") as f: f.write(str(self.n_records)) return self.n_records
def reduce(self, fn: str, outfile: str, init_val: Any = None, auto_mkdir: bool = False, verbose: bool = True): """Reduce Args: fn (str): [description] outfile (str): [description] init_val (Any): [description] auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. """ fn = get_func_by_name(fn) outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) if init_val is not None: accum = init_val else: accum = None with self._open() as f, PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g: if accum is None: try: record = self.deser_fn(next(f)) accum = fn(record) except StopIteration: pass for line in tqdm(f, total=self.n_records - 1 if self.n_records is not None else None) if verbose else f: record = self.deser_fn(line) accum = fn(record, accum) g.write(self.ser_fn(accum)) g.write_new_line()
def concat(self, outfile: str, auto_mkdir: bool = False, verbose: bool = True): """Concatenate partitions to one file Args: outfile (str): path to output partition auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool): log execution progress. Defaults to True Returns: ValueError: if the output directory does not exist """ outfile = create_filepath_template(outfile, False).format(auto=0, stem="") with PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g, (tqdm( total=self.size) if verbose else fake_tqdm()) as pbar: for inpart in self.partitions: with inpart._open() as f: for line in f: g.write(line) pbar.update(1)
def distinct(self, key_fn: str, outfile: str, auto_mkdir: bool = False, verbose: bool = True): """Create a new partition containing the distinct elements in this partition. Args: key_fn (str): key function outfile (str): output file auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): [description]. Defaults to True. """ key_fn = get_func_by_name(key_fn) outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) with self._open() as f, PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g: keys = set() for line in tqdm(f, total=self.n_records) if verbose else f: key = key_fn(self.deser_fn(line)) if key not in keys: keys.add(key) g.write(line)
def join(self, key_fn: str, outfile: str, partition: str, partition_key_fn: str, partition_skip_nrows: int = 0, partition_deser_fn: str = None, partition_ser_fn: str = None, output_ser_fn: str = None, auto_mkdir: bool = False, verbose: bool = True): """Join two partitions together based on its key. Args: key_fn (str): function that extracts key of the current partition outfile (str): the output file partition (str): path to the other partition partition_key_fn (str): function that extracts key of the other partition partition_skip_nrows (int, optional): number of rows to skip, default to be 0 partition_deser_fn (str, optional): the deserialization function of other partition, default to be the same as current partition partition_ser_fn (str, optional): the serialization function of other partition, default to be the same as current partition output_ser_fn (str, optional): the serialization function of the output partition, default to be the same as current partition auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False verbose (bool, optional): whether printing the process bar, default to be true """ if partition_deser_fn is None: partition_deser_fn = self.deser_fn else: partition_deser_fn = get_func_by_name(partition_deser_fn) if partition_ser_fn is None: partition_ser_fn = self.ser_fn else: partition_ser_fn = get_func_by_name(partition_ser_fn) if output_ser_fn is None: output_ser_fn = self.ser_fn else: output_ser_fn = get_func_by_name(output_ser_fn) key_fn = get_func_by_name(key_fn) partition_key_fn = get_func_by_name(partition_key_fn) outfile = create_filepath_template(outfile, True).format(auto=0, stem=self.stem) other = Partition(partition, partition_deser_fn, partition_ser_fn, partition_skip_nrows) with self._open() as f, other._open() as g: join = {} used_keys = set() for line in tqdm(f, total=self.n_records, desc="process part 0") if verbose else f: record = self.deser_fn(line) key = key_fn(record) join[key] = [record] used_keys.add(key) for line in tqdm(g, total=other.n_records, desc="process part 1") if verbose else g: record = partition_deser_fn(line) key = partition_key_fn(record) if key in join: join[key].append(record) used_keys.pop(key) for key in used_keys: join.pop(key) with PartitionWriter(outfile, auto_mkdir=auto_mkdir) as k: for r in tqdm( join.values(), total=len(join), desc="writing result") if verbose else join.values(): k.write(output_ser_fn(r)) k.write_new_line()