コード例 #1
0
    def split_by_key(self,
                     key_fn: str,
                     outfile: str,
                     num_partitions: int,
                     auto_mkdir: bool = False,
                     verbose: bool = True):
        """Split this partition into `num_partitions` smaller partitions by key.

        Every line is deserialized with `self.deser_fn` and handed to the key
        function; the original (still serialized) line is then written to the
        output partition at index `key % num_partitions`.

        Args:
            key_fn (str): name of the key function, resolved through `get_func_by_name`. Its result is taken modulo `num_partitions` and used as a list index.
            outfile (str): path template of the output partitions; it is formatted with `auto` (index of the output partition) and `stem` (`self.stem`)
            num_partitions (int): number of output partitions to create
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show a `tqdm` progress bar over the input. Defaults to True.
        """
        path_template = create_filepath_template(outfile, False)
        get_key = get_func_by_name(key_fn)

        with contextlib.ExitStack() as stack, self._open() as infile:
            # all writers are registered on the stack so they are closed together
            writers = []
            for part_idx in range(num_partitions):
                part_path = path_template.format(auto=part_idx, stem=self.stem)
                writers.append(
                    stack.enter_context(
                        PartitionWriter(part_path, auto_mkdir=auto_mkdir)))

            if verbose:
                lines = tqdm(infile, total=self.n_records)
            else:
                lines = infile

            for line in lines:
                key = get_key(self.deser_fn(line))
                writers[key % num_partitions].write(line)
コード例 #2
0
    def filter(self,
               fn: str,
               outfile: str,
               delete_on_empty: bool = False,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Keep only the records of this partition for which the predicate is truthy.

        Each line is deserialized with `self.deser_fn` before being passed to
        the predicate; accepted lines are written to the output unchanged.

        Args:
            fn (str): name of the predicate function, resolved through `get_func_by_name`
            outfile (str): path template of the output partition; it is formatted with `auto=0` and `stem=self.stem`
            delete_on_empty (bool, optional): delete the partition if there is no record. Defaults to False
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show a `tqdm` progress bar over the input. Defaults to True.
        """
        predicate = get_func_by_name(fn)
        path_template = create_filepath_template(outfile, True)
        out_path = path_template.format(auto=0, stem=self.stem)

        with self._open() as infile:
            with PartitionWriter(out_path,
                                 on_close_delete_if_empty=delete_on_empty,
                                 auto_mkdir=auto_mkdir) as writer:
                if verbose:
                    lines = tqdm(infile, total=self.n_records)
                else:
                    lines = infile

                for line in lines:
                    if not predicate(self.deser_fn(line)):
                        continue
                    writer.write(line)
コード例 #3
0
    def flat_map(self,
                 fn: str,
                 outfile: str,
                 auto_mkdir: bool = False,
                 verbose: bool = True):
        """Apply a flat map function to every record of this partition.

        Each input record is deserialized with `self.deser_fn`; every item
        produced by the map function is serialized with `self.ser_fn` and
        written on its own line of the output partition.

        Args:
            fn (str): an import path of the map function. It receives one record and its result is iterated, so it should return an iterable of output records.
            outfile (str): output file of the new partition, `*` or `{stem}` in the path are placeholders, which will be replaced by the stem (i.e., file name without extension) of the current partition
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show the execution progress bar. Defaults to True.
        """
        mapper = get_func_by_name(fn)
        path_template = create_filepath_template(outfile, True)
        out_path = path_template.format(auto=0, stem=self.stem)

        with self._open() as infile:
            with PartitionWriter(out_path, auto_mkdir=auto_mkdir) as writer:
                if verbose:
                    lines = tqdm(infile, total=self.n_records)
                else:
                    lines = infile

                for line in lines:
                    for item in mapper(self.deser_fn(line)):
                        writer.write(self.ser_fn(item))
                        writer.write_new_line()
コード例 #4
0
    def coalesce(self,
                 outfile: str,
                 records_per_partition: Optional[int] = None,
                 num_partitions: Optional[int] = None,
                 auto_mkdir: bool = False,
                 verbose: bool = True):
        """Coalesce the current partitions into `num_partitions` partitions.

        Records are streamed from `self.partitions` in order and written to
        consecutive output partitions; a new output partition is started after
        every `records_per_partition` records, counted across ALL input
        partitions. If `records_per_partition` is not specified, it is derived
        from `num_partitions` and `self.size`.

        Args:
            outfile (str): path template of output partitions; it is formatted with `auto` (index of the output partition) and an empty `stem`
            records_per_partition (Optional[int], optional): number of records per partition. Defaults to None.
            num_partitions (Optional[int], optional): number of partitions; only used when `records_per_partition` is None. Defaults to None.
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): log execution progress. Defaults to True.

        Raises:
            AssertionError: if `records_per_partition` is None and either `num_partitions` is None or `self.size` is unknown
            ValueError: if the output directory does not exist (presumably raised by `PartitionWriter`; not visible in this block)
        """
        outfile = create_filepath_template(outfile, False)
        if records_per_partition is None:
            assert num_partitions is not None
            assert self.size is not None, "Cannot determine the records per partition based on number of partitions because of unknown size of partitions. Consider running partition.count or provide the `records_per_partition` parameter"
            records_per_partition = ceil(self.size / num_partitions)

        def open_writer(part_no):
            # Every output partition is opened with the same options, including
            # `auto_mkdir`, in case the template places partitions in different
            # directories.
            return PartitionWriter(outfile.format(auto=part_no, stem=""),
                                   on_close_delete_if_empty=True,
                                   auto_mkdir=auto_mkdir).open()

        writer = None
        part_counter = 0
        # Number of records written so far over all input partitions. The
        # counter must not restart per input partition, otherwise small input
        # partitions would never trigger a rotation.
        n_written = 0

        with (tqdm(total=self.size) if verbose else fake_tqdm()) as pbar:
            try:
                writer = open_writer(part_counter)
                for inpart in self.partitions:
                    with inpart._open() as f:
                        for line in f:
                            writer.write(line)
                            pbar.update(1)
                            n_written += 1

                            if n_written % records_per_partition == 0:
                                writer.close()
                                # drop the reference first so that a failure
                                # while opening the next writer does not make
                                # `finally` close this writer a second time
                                writer = None
                                part_counter += 1
                                writer = open_writer(part_counter)

            finally:
                # The last writer may have received no record; it is opened
                # with `on_close_delete_if_empty=True` for that case.
                if writer is not None:
                    writer.close()
コード例 #5
0
    def reduce(self,
               fn: str,
               outfile: str,
               init_val: Any = None,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Reduce all records of all partitions into a single value.

        The records of `self.partitions` are folded in order with
        `accum = fn(record, accum)`. While the accumulator is still None
        (no `init_val` given and nothing consumed yet), the first available
        record seeds it through the one-argument call `fn(record)`. The final
        accumulator is serialized with `self.ser_fn` and written as one line.

        Args:
            fn (str): name of the reduce function, resolved through `get_func_by_name`; called as `fn(record)` for the seed and `fn(record, accum)` afterwards
            outfile (str): path template of the output file; it is formatted with `auto=0` and an empty `stem`
            init_val (Any): initial value of the accumulator. Defaults to None, meaning the first record seeds it
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show a `tqdm` progress bar. Defaults to True.
        """
        reducer = get_func_by_name(fn)
        accum = init_val

        path_template = create_filepath_template(outfile, False)
        out_path = path_template.format(auto=0, stem="")

        with PartitionWriter(out_path, auto_mkdir=auto_mkdir) as writer:
            with (tqdm(total=self.size) if verbose else fake_tqdm()) as progress:
                for inpart in self.partitions:
                    with inpart._open() as lines:
                        if accum is None:
                            # seed the accumulator from the first record of
                            # this partition; an empty partition is skipped
                            try:
                                seed = inpart.deser_fn(next(lines))
                                accum = reducer(seed)
                                if verbose:
                                    progress.update(1)
                            except StopIteration:
                                pass

                        for line in lines:
                            accum = reducer(inpart.deser_fn(line), accum)
                            progress.update(1)

                writer.write(self.ser_fn(accum))
                writer.write_new_line()
コード例 #6
0
    def reduce_by_key(self,
                      key_fn: str,
                      fn: str,
                      outfile: str,
                      init_val: Any = None,
                      auto_mkdir: bool = False,
                      verbose: bool = True):
        """Reduce the records of this partition separately for each key.

        Records are grouped in memory by the value of the key function. The
        first record of a key seeds its accumulator with `fn(record)` (or
        `fn(record, init_val)` when `init_val` is given); later records are
        folded with `fn(record, accum)`. One line per key is written, holding
        the accumulator serialized with `self.ser_fn`; the keys themselves are
        not written.

        Args:
            key_fn (str): name of the key function, resolved through `get_func_by_name`; its result is used as a dictionary key
            fn (str): name of the reduce function, resolved through `get_func_by_name`
            outfile (str): path template of the output partition; it is formatted with `auto=0` and `stem=self.stem`
            init_val (Any): initial accumulator shared by every key. Defaults to None, meaning the first record of each key seeds it
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show `tqdm` progress bars. Defaults to True.
        """
        get_key = get_func_by_name(key_fn)
        reducer = get_func_by_name(fn)
        path_template = create_filepath_template(outfile, True)
        out_path = path_template.format(auto=0, stem=self.stem)

        with self._open() as infile:
            with PartitionWriter(out_path, auto_mkdir=auto_mkdir) as writer:
                accums = {}

                if verbose:
                    lines = tqdm(infile, total=self.n_records)
                else:
                    lines = infile

                for line in lines:
                    record = self.deser_fn(line)
                    key = get_key(record)
                    if key in accums:
                        accums[key] = reducer(record, accums[key])
                    elif init_val is None:
                        accums[key] = reducer(record)
                    else:
                        accums[key] = reducer(record, init_val)

                if verbose:
                    results = tqdm(accums.values(), total=len(accums))
                else:
                    results = accums.values()

                for result in results:
                    writer.write(self.ser_fn(result))
                    writer.write_new_line()
コード例 #7
0
    def count(self,
              outfile: Optional[str] = None,
              auto_mkdir: bool = False,
              verbose: bool = True) -> int:
        """Count the number of records in this partition.

        If `self.n_records` is unknown, the partition is scanned once, the
        result is persisted through `PartitionMetadata(self.path)` and cached
        in `self.n_records`; otherwise the cached value is reused.

        Args:
            outfile (Optional[str], optional): output file to write to the value to if it is not None. if outfile is stdout we will print to stdout
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show a `tqdm` progress bar while scanning. Defaults to True.

        Returns:
            the number of records

        Raises:
            ValueError: if the directory of `outfile` does not exist and `auto_mkdir` is False
        """
        if self.n_records is None:
            n_records = 0
            with self._open() as f:
                for _ in tqdm(f) if verbose else f:
                    n_records += 1
            PartitionMetadata(self.path).write({"n_records": n_records})
            self.n_records = n_records

        if outfile is not None:
            if outfile == "stdout":
                print(self.n_records)
            else:
                outfile = create_filepath_template(outfile,
                                                   True).format(auto=0,
                                                                stem=self.stem)
                outdir = Path(outfile).parent
                if not outdir.exists():
                    if not auto_mkdir:
                        raise ValueError(
                            f"Output directory does not exist: {outdir}"
                        )
                    # exist_ok avoids a FileExistsError when another worker
                    # creates the same directory between the check and mkdir
                    outdir.mkdir(parents=True, exist_ok=True)

                with open(outfile, "w") as f:
                    f.write(str(self.n_records))

        return self.n_records
コード例 #8
0
    def reduce(self,
               fn: str,
               outfile: str,
               init_val: Any = None,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Reduce all records of this partition into a single value.

        Records are folded in order with `accum = fn(record, accum)`. When
        `init_val` is None, the first record seeds the accumulator through the
        one-argument call `fn(record)`. The final accumulator is serialized
        with `self.ser_fn` and written as one line of the output partition.

        Args:
            fn (str): name of the reduce function, resolved through `get_func_by_name`; called as `fn(record)` for the seed and `fn(record, accum)` afterwards
            outfile (str): path template of the output partition; it is formatted with `auto=0` and `stem=self.stem`
            init_val (Any): initial value of the accumulator. Defaults to None, meaning the first record seeds it
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show a `tqdm` progress bar. Defaults to True.
        """
        fn = get_func_by_name(fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)
        accum = init_val
        # number of records taken from the file before the main loop starts
        n_consumed = 0

        with self._open() as f, PartitionWriter(outfile,
                                                auto_mkdir=auto_mkdir) as g:
            if accum is None:
                try:
                    first_line = next(f)
                    n_consumed = 1
                    accum = fn(self.deser_fn(first_line))
                except StopIteration:
                    # empty partition: nothing to seed the accumulator with
                    pass

            # The progress bar covers only what is left to iterate: nothing is
            # pre-consumed when `init_val` is given, and the total must not go
            # negative for an empty partition.
            if self.n_records is None:
                remaining = None
            else:
                remaining = max(self.n_records - n_consumed, 0)

            for line in tqdm(f, total=remaining) if verbose else f:
                record = self.deser_fn(line)
                accum = fn(record, accum)

            g.write(self.ser_fn(accum))
            g.write_new_line()
コード例 #9
0
    def concat(self,
               outfile: str,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Concatenate all partitions into one file.

        The lines of every partition in `self.partitions` are copied, in
        order and without being deserialized, into a single output partition.

        Args:
            outfile (str): path to output partition; it is formatted with `auto=0` and an empty `stem`
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool): log execution progress. Defaults to True

        Raises:
            ValueError: if the output directory does not exist (presumably raised by `PartitionWriter`; not visible in this block)
        """
        path_template = create_filepath_template(outfile, False)
        out_path = path_template.format(auto=0, stem="")

        with PartitionWriter(out_path, auto_mkdir=auto_mkdir) as writer:
            with (tqdm(total=self.size) if verbose else fake_tqdm()) as progress:
                for inpart in self.partitions:
                    with inpart._open() as lines:
                        for line in lines:
                            writer.write(line)
                            progress.update(1)
コード例 #10
0
    def distinct(self,
                 key_fn: str,
                 outfile: str,
                 auto_mkdir: bool = False,
                 verbose: bool = True):
        """Create a new partition containing the distinct elements in this partition.

        Two records are considered equal when the key function returns the
        same key for them; only the first record of each key is kept, and it
        is written unchanged. All seen keys are held in memory.

        Args:
            key_fn (str): name of the key function, resolved through `get_func_by_name`; its result is stored in a set
            outfile (str): path template of the output partition; it is formatted with `auto=0` and `stem=self.stem`
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show a `tqdm` progress bar over the input. Defaults to True.
        """
        get_key = get_func_by_name(key_fn)
        path_template = create_filepath_template(outfile, True)
        out_path = path_template.format(auto=0, stem=self.stem)

        with self._open() as infile:
            with PartitionWriter(out_path, auto_mkdir=auto_mkdir) as writer:
                seen = set()

                if verbose:
                    lines = tqdm(infile, total=self.n_records)
                else:
                    lines = infile

                for line in lines:
                    key = get_key(self.deser_fn(line))
                    if key in seen:
                        continue
                    seen.add(key)
                    writer.write(line)
コード例 #11
0
    def join(self,
             key_fn: str,
             outfile: str,
             partition: str,
             partition_key_fn: str,
             partition_skip_nrows: int = 0,
             partition_deser_fn: str = None,
             partition_ser_fn: str = None,
             output_ser_fn: str = None,
             auto_mkdir: bool = False,
             verbose: bool = True):
        """Join two partitions together based on its key.

        The current partition is loaded into memory indexed by key (when a key
        occurs more than once in the current partition, only its last record
        is kept). Records of the other partition are then appended to the
        entry with the same key. Only keys that found at least one match in
        the other partition are written; each output line is the list
        `[record, other_record, ...]` serialized with `output_ser_fn`.

        Args:
            key_fn (str): function that extracts key of the current partition
            outfile (str): the output file
            partition (str): path to the other partition
            partition_key_fn (str): function that extracts key of the other partition
            partition_skip_nrows (int, optional): number of rows to skip, default to be 0
            partition_deser_fn (str, optional): the deserialization function of other partition, default to be the same as current partition
            partition_ser_fn (str, optional): the serialization function of other partition, default to be the same as current partition
            output_ser_fn (str, optional): the serialization function of the output partition, default to be the same as current partition
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): whether printing the process bar, default to be true
        """
        if partition_deser_fn is None:
            partition_deser_fn = self.deser_fn
        else:
            partition_deser_fn = get_func_by_name(partition_deser_fn)

        if partition_ser_fn is None:
            partition_ser_fn = self.ser_fn
        else:
            partition_ser_fn = get_func_by_name(partition_ser_fn)

        if output_ser_fn is None:
            output_ser_fn = self.ser_fn
        else:
            output_ser_fn = get_func_by_name(output_ser_fn)

        key_fn = get_func_by_name(key_fn)
        partition_key_fn = get_func_by_name(partition_key_fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)

        other = Partition(partition, partition_deser_fn, partition_ser_fn,
                          partition_skip_nrows)
        with self._open() as f, other._open() as g:
            joined = {}
            # keys of the current partition that have not been matched yet
            unmatched_keys = set()

            for line in tqdm(f, total=self.n_records,
                             desc="process part 0") if verbose else f:
                record = self.deser_fn(line)
                key = key_fn(record)
                joined[key] = [record]
                unmatched_keys.add(key)

            for line in tqdm(g, total=other.n_records,
                             desc="process part 1") if verbose else g:
                record = partition_deser_fn(line)
                key = partition_key_fn(record)
                if key in joined:
                    joined[key].append(record)
                    # `set.pop()` takes no argument; `discard` removes the
                    # given key and stays silent when the key was already
                    # matched by an earlier record
                    unmatched_keys.discard(key)

            # drop the entries that never found a match in the other partition
            for key in unmatched_keys:
                del joined[key]

            with PartitionWriter(outfile, auto_mkdir=auto_mkdir) as k:
                for r in tqdm(
                        joined.values(), total=len(joined),
                        desc="writing result") if verbose else joined.values():
                    k.write(output_ser_fn(r))
                    k.write_new_line()