Exemple #1
0
    def split_by_key(self,
                     key_fn: str,
                     outfile: str,
                     num_partitions: int,
                     auto_mkdir: bool = False,
                     verbose: bool = True):
        """Spit the partition into multiple smaller partitions by key
        
        Args:
            key_fn (str): function
            outfile (str): outfile
            num_partitions (int): [description]
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): [description]. Defaults to True.
        """
        outfile = create_filepath_template(outfile, False)
        key_fn = get_func_by_name(key_fn)

        with contextlib.ExitStack() as stack, self._open() as f:
            writers = [
                stack.enter_context(
                    PartitionWriter(outfile.format(auto=i, stem=self.stem),
                                    auto_mkdir=auto_mkdir))
                for i in range(num_partitions)
            ]
            for line in tqdm(f, total=self.n_records) if verbose else f:
                bucket_no = key_fn(self.deser_fn(line))
                partno = bucket_no % num_partitions
                writers[partno].write(line)
Exemple #2
0
    def filter(self,
               fn: str,
               outfile: str,
               delete_on_empty: bool = False,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Filter function
        
        Args:
            fn (str): [description]
            outfile (str): [description]
            delete_on_empty (bool, optional): delete the partition if there is no record. Defaults to False
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): [description]. Defaults to True.
        """
        fn = get_func_by_name(fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)

        with self._open() as f, PartitionWriter(
                outfile,
                on_close_delete_if_empty=delete_on_empty,
                auto_mkdir=auto_mkdir) as g:
            for line in tqdm(f, total=self.n_records) if verbose else f:
                if fn(self.deser_fn(line)):
                    g.write(line)
Exemple #3
0
    def flat_map(self,
                 fn: str,
                 outfile: str,
                 auto_mkdir: bool = False,
                 verbose: bool = True):
        """Apply a flat map function on every record of this partition

        Args:
            fn (str): an import path of the map function, which should has this signature: (record: Any) -> Any
            outfile (str): output file of the new partition, `*` or `{stem}` in the path are placeholders, which will be replaced by the stem (i.e., file name without extension) of the current partition
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): show the execution progress bar. Defaults to True.
        """
        fn = get_func_by_name(fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)

        with self._open() as f, PartitionWriter(outfile,
                                                auto_mkdir=auto_mkdir) as g:
            for record in tqdm(f, total=self.n_records) if verbose else f:
                record = self.deser_fn(record)
                for sub_record in fn(record):
                    sub_record = self.ser_fn(sub_record)
                    g.write(sub_record)
                    g.write_new_line()
Exemple #4
0
    def reduce_by_key(self,
                      key_fn: str,
                      fn: str,
                      outfile: str,
                      init_val: Any = None,
                      auto_mkdir: bool = False,
                      verbose: bool = True):
        """Reduce

        Args:
            key_fn (str): [description]
            fn (str): [description]
            outfile (str): [description]
            init_val (Any): [description]
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): [description]. Defaults to True.
        """
        key_fn = get_func_by_name(key_fn)
        fn = get_func_by_name(fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)

        with self._open() as f, PartitionWriter(outfile,
                                                auto_mkdir=auto_mkdir) as g:
            groups = {}
            for line in tqdm(f,
                             total=self.n_records if self.n_records is not None
                             else None) if verbose else f:
                record = self.deser_fn(line)
                rid = key_fn(record)
                if rid not in groups:
                    if init_val is None:
                        groups[rid] = fn(record)
                    else:
                        groups[rid] = fn(record, init_val)
                else:
                    groups[rid] = fn(record, groups[rid])

            for value in tqdm(
                    groups.values(),
                    total=len(groups)) if verbose else groups.values():
                g.write(self.ser_fn(value))
                g.write_new_line()
Exemple #5
0
    def apply(self, fn: str, verbose: bool = True):
        """Apply

        Args:
            fn (str): function
            verbose (bool, optional): show execution progress. Defaults to True.
        """
        fn = get_func_by_name(fn)
        with self._open() as f:
            for line in tqdm(f, total=self.n_records) if verbose else f:
                fn(self.deser_fn(line))
Exemple #6
0
def main(args):
    # build parser and parse arguments
    start = time()
    parser = build_parser()
    args = parser.parse_args(args)
    end = time()

    args = vars(args)
    command = args.pop("command")
    verbose = args.pop("verbose")
    infile = args.pop("infile")
    skip_nrows = args.pop("skip_nrows")
    deser_fn = get_func_by_name(args.pop("deser_fn"))
    ser_fn = get_func_by_name(args.pop("ser_fn"))

    if verbose > 1:
        print(f">>> run sh map-reduce version {__version__}")
        print(
            f">>> build parser and parsing arguments takes {end - start:.03f}s"
        )

    classname, method_name = command.rsplit(".", 1)
    instance = CLASSES[classname](infile, deser_fn, ser_fn, skip_nrows)

    fn = getattr(instance, method_name)

    if 'verbose' in FUNCTIONS[command]:
        args['verbose'] = verbose > 0

    if verbose > 1:
        print(f">>> execute functions with arguments:\n%s" %
              orjson.dumps(args))

    fn(**args)

    if verbose > 1:
        print(f">>> finish! execution time: {time() - start:.03f}s")
Exemple #7
0
    def reduce(self,
               fn: str,
               outfile: str,
               init_val: Any = None,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Reduce

        Args:
            fn (str): [description]
            outfile (str): [description]
            init_val (Any): [description]
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): [description]. Defaults to True.
        """
        fn = get_func_by_name(fn)
        if init_val is not None:
            accum = init_val
        else:
            accum = None

        outfile = create_filepath_template(outfile, False).format(auto=0,
                                                                  stem="")
        with PartitionWriter(outfile, auto_mkdir=auto_mkdir) as g, (tqdm(
                total=self.size) if verbose else fake_tqdm()) as pbar:
            for inpart in self.partitions:
                with inpart._open() as f:
                    if accum is None:
                        try:
                            record = inpart.deser_fn(next(f))
                            accum = fn(record)
                            if verbose:
                                pbar.update(1)
                        except StopIteration:
                            pass

                    for line in f:
                        record = inpart.deser_fn(line)
                        accum = fn(record, accum)
                        pbar.update(1)

            g.write(self.ser_fn(accum))
            g.write_new_line()
Exemple #8
0
    def reduce(self,
               fn: str,
               outfile: str,
               init_val: Any = None,
               auto_mkdir: bool = False,
               verbose: bool = True):
        """Reduce

        Args:
            fn (str): [description]
            outfile (str): [description]
            init_val (Any): [description]
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): [description]. Defaults to True.
        """
        fn = get_func_by_name(fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)
        if init_val is not None:
            accum = init_val
        else:
            accum = None

        with self._open() as f, PartitionWriter(outfile,
                                                auto_mkdir=auto_mkdir) as g:
            if accum is None:
                try:
                    record = self.deser_fn(next(f))
                    accum = fn(record)
                except StopIteration:
                    pass

            for line in tqdm(f,
                             total=self.n_records - 1 if self.n_records
                             is not None else None) if verbose else f:
                record = self.deser_fn(line)
                accum = fn(record, accum)

            g.write(self.ser_fn(accum))
            g.write_new_line()
Exemple #9
0
    def distinct(self,
                 key_fn: str,
                 outfile: str,
                 auto_mkdir: bool = False,
                 verbose: bool = True):
        """Create a new partition containing the distinct elements in this partition.

        Args:
            key_fn (str): key function
            outfile (str): output file
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): [description]. Defaults to True.
        """
        key_fn = get_func_by_name(key_fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)
        with self._open() as f, PartitionWriter(outfile,
                                                auto_mkdir=auto_mkdir) as g:
            keys = set()
            for line in tqdm(f, total=self.n_records) if verbose else f:
                key = key_fn(self.deser_fn(line))
                if key not in keys:
                    keys.add(key)
                    g.write(line)
Exemple #10
0
    def join(self,
             key_fn: str,
             outfile: str,
             partition: str,
             partition_key_fn: str,
             partition_skip_nrows: int = 0,
             partition_deser_fn: str = None,
             partition_ser_fn: str = None,
             output_ser_fn: str = None,
             auto_mkdir: bool = False,
             verbose: bool = True):
        """Join two partitions together based on its key.

        Args:
            key_fn (str): function that extracts key of the current partition
            outfile (str): the output file
            partition (str): path to the other partition
            partition_key_fn (str): function that extracts key of the other partition
            partition_skip_nrows (int, optional): number of rows to skip, default to be 0
            partition_deser_fn (str, optional): the deserialization function of other partition, default to be the same as current partition
            partition_ser_fn (str, optional): the serialization function of other partition, default to be the same as current partition
            output_ser_fn (str, optional): the serialization function of the output partition, default to be the same as current partition
            auto_mkdir (bool, optional): automatically create directory if the directory of the output file does not exist. Defaults to False
            verbose (bool, optional): whether printing the process bar, default to be true
        """
        if partition_deser_fn is None:
            partition_deser_fn = self.deser_fn
        else:
            partition_deser_fn = get_func_by_name(partition_deser_fn)

        if partition_ser_fn is None:
            partition_ser_fn = self.ser_fn
        else:
            partition_ser_fn = get_func_by_name(partition_ser_fn)

        if output_ser_fn is None:
            output_ser_fn = self.ser_fn
        else:
            output_ser_fn = get_func_by_name(output_ser_fn)

        key_fn = get_func_by_name(key_fn)
        partition_key_fn = get_func_by_name(partition_key_fn)
        outfile = create_filepath_template(outfile,
                                           True).format(auto=0, stem=self.stem)

        other = Partition(partition, partition_deser_fn, partition_ser_fn,
                          partition_skip_nrows)
        with self._open() as f, other._open() as g:
            join = {}
            used_keys = set()

            for line in tqdm(f, total=self.n_records,
                             desc="process part 0") if verbose else f:
                record = self.deser_fn(line)
                key = key_fn(record)
                join[key] = [record]
                used_keys.add(key)

            for line in tqdm(g, total=other.n_records,
                             desc="process part 1") if verbose else g:
                record = partition_deser_fn(line)
                key = partition_key_fn(record)
                if key in join:
                    join[key].append(record)
                    used_keys.pop(key)

            for key in used_keys:
                join.pop(key)

            with PartitionWriter(outfile, auto_mkdir=auto_mkdir) as k:
                for r in tqdm(
                        join.values(), total=len(join),
                        desc="writing result") if verbose else join.values():
                    k.write(output_ser_fn(r))
                    k.write_new_line()