def stack(cls, *iter_streams, how: How = 'vertical', name=AUTO, context=None, **kwargs):
    """Combine several streams into a single stream.

    :param iter_streams: streams to combine; all must share the same stream type
    :param how: 'vertical' appends items via add_stream(); any other value
        joins streams via join(how=...)
    :param name: if defined, assigned to the result via set_name()
    :param context: if defined, assigned to the result via set_context()
    :param kwargs: forwarded to join() for non-vertical stacking
    :return: the combined stream (based on a copy of the first stream when it
        supports copy())
    """
    iter_streams = arg.update(iter_streams)  # normalize *args into a flat list
    assert cls.is_same_stream_type(
        iter_streams), 'concat(): streams must have same type: {}'.format(
        iter_streams)
    result = None
    for cur_stream in iter_streams:
        assert isinstance(cur_stream, StreamInterface)
        if result is None:
            # the first stream becomes the accumulator; copy it when possible
            # so the caller's stream is not mutated by set_name/set_context
            if hasattr(cur_stream, 'copy'):
                result = cur_stream.copy()
            else:
                result = cur_stream  # NOTE(review): mutates the caller's stream in place
            if arg.is_defined(name):
                result.set_name(name)
            if arg.is_defined(context):
                result.set_context(context)
        elif how == 'vertical':
            result = result.add_stream(cur_stream)
        else:
            result = result.join(cur_stream, how=how, **kwargs)
    gc.collect()  # streams may hold large buffers; reclaim them eagerly
    return result
def group_by(
        self,
        *keys,
        values: Columns = None,
        step: AutoCount = AUTO,
        as_pairs: bool = False,
        take_hash: bool = True,
        verbose: bool = True,
) -> Stream:
    """Sort the stream by the given keys, then fold adjacent equal-key items into groups.

    :param keys: fields to group by (a single FieldGroup-like object is expanded)
    :param values: fields to fold into per-group lists
    :param step: chunk size for the sort; AUTO uses self.max_items_in_memory
    :param as_pairs: emit (key, group) pairs instead of folded rows
    :param take_hash: hash the sort key (ignored when as_pairs is set)
    :param verbose: forwarded to sort()
    :return: the grouped stream
    """
    keys = arg.get_names(arg.update(keys))
    values = arg.get_names(values)
    if hasattr(keys[0], 'get_field_names'):  # FieldGroup-like: expand to its field names
        keys = keys[0].get_field_names()
    step = arg.acquire(step, self.max_items_in_memory)
    sort_key = keys if as_pairs else get_key_function(keys, take_hash=take_hash)
    sorted_stream = self.sort(
        sort_key,
        step=step,
        verbose=verbose,
    )
    return sorted_stream.sorted_group_by(
        keys,
        values=values,
        as_pairs=as_pairs,
    )
def map_side_join(
        self,
        right: Native,
        key: UniKey,
        how: How = JoinType.Left,
        right_is_uniq: bool = True,
        inplace: bool = False,
) -> Optional[Native]:
    """Join this stream with `right` using an in-memory lookup dict built from `right`.

    :param right: the stream providing the lookup side of the join
    :param key: join key field(s); names are resolved via arg.get_names()
    :param how: join mode; plain values are coerced to JoinType
    :param right_is_uniq: whether right-side keys are unique (affects dict building)
    :param inplace: replace this stream's items when True, else return a new stream
    :return: the joined stream, or None when inplace
    """
    keys = arg.update([arg.get_names(key)])
    if not isinstance(how, JoinType):
        how = JoinType(how)
    joined = algo.map_side_join(
        iter_left=self.get_items(),
        iter_right=right.get_items(),
        key_function=fs.composite_key(keys),
        merge_function=fs.merge_two_items(),
        dict_function=fs.items_to_dict(),
        how=how,
        uniq_right=right_is_uniq,
    )
    if self.is_in_memory():
        joined = list(joined)  # materialize so the result stays reusable
    if inplace:
        self.set_items(joined, count=self.get_count(), inplace=True)
    else:
        result = self.stream(joined).set_meta(**self.get_compatible_static_meta())
        return self._assume_native(result)
def sorted_group_by(
        self,
        *keys,
        values: Columns = None,
        as_pairs: bool = False,
        skip_missing: bool = False,
) -> Stream:
    """Group an already key-sorted stream by the given keys.

    Assumes items with equal keys are adjacent (the stream must be sorted by
    these keys beforehand).

    :param keys: fields to group by (names or callables)
    :param values: fields to fold into per-group lists; when set, the result
        is mapped to a RecordStream via fs.fold_lists()
    :param as_pairs: emit (key, group) pairs as a KeyValueStream instead of rows
    :param skip_missing: forwarded to fs.fold_lists() for absent value fields
    :return: the grouped stream (materialized when the source is in memory)
    """
    keys = arg.update(keys)
    keys = arg.get_names(keys, or_callable=True)
    values = arg.get_names(values)
    key_function = self._get_key_function(keys)
    iter_groups = self._get_groups(key_function, as_pairs=as_pairs)
    if as_pairs:
        stream_groups = sm.KeyValueStream(
            iter_groups,
            value_stream_type=StreamType.RowStream)
    else:
        stream_groups = sm.RowStream(iter_groups, check=False)
    if values:
        item_type = self.get_item_type()  # ItemType.Record
        fold_mapper = fs.fold_lists(keys=keys, values=values, skip_missing=skip_missing, item_type=item_type)
        stream_groups = stream_groups.map_to_type(
            fold_mapper,
            stream_type=StreamType.RecordStream)
    if self.is_in_memory():
        return stream_groups.to_memory()
    else:
        # lazy source: propagate the known or estimated item count
        stream_groups.set_estimated_count(self.get_count() or self.get_estimated_count())
        return stream_groups
def sorted_group_by(
        self,
        *keys,
        values: Columns = None,
        as_pairs: bool = False,
        skip_missing: bool = False,
) -> Stream:
    """Group an already key-sorted stream by the given keys.

    Assumes items with equal keys are adjacent (the stream must be sorted by
    these keys beforehand).

    :param keys: fields to group by
    :param values: fields to fold into per-group lists; when set, the result
        is mapped to a RecordStream via ms.fold_lists()
    :param as_pairs: emit (key, group) pairs as a KeyValueStream instead of rows
    :param skip_missing: forwarded to ms.fold_lists() for absent value fields
    :return: the grouped stream (materialized when the source is in memory)
    """
    keys = arg.update(keys)
    keys = arg.get_names(keys)
    values = arg.get_names(values)
    key_function = get_key_function(keys)
    groups = self._get_groups(key_function, as_pairs=as_pairs)
    if as_pairs:
        sm_groups = sm.KeyValueStream(
            groups,
            value_stream_type=StreamType.RowStream)
    else:
        sm_groups = sm.RowStream(groups, check=False)
    if values:
        sm_groups = sm_groups.map_to_type(
            lambda r: ms.fold_lists(
                r, keys, values, skip_missing=skip_missing),
            stream_type=StreamType.RecordStream,
        )
    if self.is_in_memory():
        return sm_groups.to_memory()
    else:
        # lazy source: propagate the known or estimated item count
        sm_groups.set_estimated_count(self.get_count() or self.get_estimated_count())
        return sm_groups
def to_row_stream(self, *args, **kwargs) -> RowStream:
    """Convert this stream into a RowStream.

    Accepted argument shapes (resolved in order):
      * function=... kwarg, or a callable first positional arg — mapped over items
      * first positional arg as a CSV delimiter (line-like streams)
      * positional column names (record streams), rows built via get_rows()
      * delimiter=... kwarg (line-like streams, when no positional args)

    :return: a RowStream wrapping the converted items
    """
    function, delimiter = None, None
    if 'function' in kwargs:
        function = kwargs.pop('function')
    elif args:
        if callable(args[0]):
            function, args = args[0], args[1:]
        elif self.get_stream_type() in (StreamType.LineStream, StreamType.AnyStream):
            # for line-like streams the first positional arg is the CSV delimiter
            delimiter, args = args[0], args[1:]
        elif self.get_stream_type() == StreamType.RecordStream:
            add_title_row = kwargs.pop('add_title_row', None)
            columns = arg.update(args, kwargs.pop('columns', None))
            assert isinstance(self, RecordStream)
            if not columns:
                columns = self.get_columns()
            # NOTE(review): get_rows() presumably returns an iterable of rows,
            # yet it is assigned to `function` and later invoked as
            # function(i, *args, **kwargs) below — this looks like a latent
            # TypeError; confirm whether get_rows() actually returns a callable.
            function = self.get_rows(columns=columns, add_title_row=add_title_row)
    elif 'delimiter' in kwargs and self.get_stream_type() in (
            StreamType.LineStream, StreamType.AnyStream):
        delimiter = kwargs.pop('delimiter')
    elif args:
        # NOTE(review): appears unreachable — the `elif args:` branch above
        # already consumes any call that provides positional args; confirm
        # the intended branch nesting.
        assert not kwargs
        return self.to_any_stream().select(*args)
    if function:
        items = self._get_mapped_items(
            lambda i: function(i, *args, **kwargs))
    elif delimiter:
        csv_reader = fs.csv_reader(delimiter=delimiter, *args, **kwargs)
        items = csv_reader(self.get_items())
    else:
        items = self.get_items()
    stream = self.stream(items, stream_type=StreamType.RowStream)
    return self._assume_native(stream)
def not_in(*list_values) -> Callable:
    """Build a predicate that is True when its argument is absent from the given values.

    :param list_values: the excluded values (flattened via arg.update)
    :return: a one-argument predicate function
    """
    excluded = arg.update(list_values)

    def check(value: Any) -> bool:
        return value not in excluded
    return check
def composite_key(*functions) -> Callable:
    """Build a function mapping an item to a tuple key from the given key descriptions.

    :param functions: key descriptions (callables or field descriptions),
        flattened via arg.update
    :return: a function returning a tuple of key values for an item
    """
    descriptions = arg.update(functions)

    def get_key(item) -> tuple:
        return sf.get_composite_key(item=item, keys_descriptions=descriptions)
    return get_key
def sorted_group_by(
        self,
        *keys,
        values: Optional[Iterable] = None,
        as_pairs: bool = False,
        output_struct: Optional[StructInterface] = None,
        skip_missing: bool = True,  # tmp
) -> Stream:
    """Group an already key-sorted stream by the given keys.

    Assumes items with equal keys are adjacent (the stream must be sorted by
    these keys beforehand).

    :param keys: fields to group by
    :param values: fields to fold into per-group lists via fs.fold_lists()
    :param as_pairs: emit (key, group) pairs using the KeyValueStream class
    :param output_struct: when given, the folded stream is built as a
        RowStream and structured with this struct
    :param skip_missing: forwarded to fs.fold_lists() for absent value fields
    :return: the grouped stream (materialized when the source is in memory)
    """
    keys = arg.update(keys)
    key_function = self._get_key_function(keys, take_hash=False)  # sorted input: hashing not needed
    output_keys = [self._get_field_getter(f) for f in keys]
    groups = self._get_groups(key_function, as_pairs=as_pairs)
    if as_pairs:
        stream_builder = StreamType.KeyValueStream.get_class()
        stream_groups = stream_builder(groups, value_stream_type=self.get_stream_type())
    else:
        stream_builder = StreamType.RowStream.get_class()
        stream_groups = stream_builder(groups, check=False)
    if values:
        item_type = self.get_item_type()
        values = [self._get_field_getter(f, item_type=item_type) for f in values]
        fold_func = fs.fold_lists(keys=output_keys, values=values, skip_missing=skip_missing, item_type=item_type)
        stream_type = StreamType.RowStream if output_struct else self.get_stream_type()
        stream_groups = stream_groups.map_to_type(fold_func, stream_type=stream_type)
        if output_struct:
            stream_groups = stream_groups.structure(output_struct)
    if self.is_in_memory():
        return stream_groups.to_memory()
    else:
        # lazy source: propagate the known or estimated item count in place
        stream_groups.set_estimated_count(self.get_count() or self.get_estimated_count(), inplace=True)
        return stream_groups
def group_by(self, *keys, values: Optional[Iterable] = None, as_pairs: bool = False) -> Stream:
    """Sort the stream by the given keys, then group adjacent equal-key items.

    :param keys: fields to group by
    :param values: fields to fold into per-group lists
    :param as_pairs: emit (key, group) pairs instead of folded items
    :return: the grouped stream
    """
    # Flatten *keys FIRST, then resolve names — matching sorted_group_by()
    # and the other group_by() implementations in this codebase; resolving
    # names before flattening would miss fields nested inside list/tuple
    # arguments.
    keys = arg.update(keys)
    keys = arg.get_names(keys)
    values = arg.get_names(values)
    return self.sort(*keys).sorted_group_by(*keys, values=values, as_pairs=as_pairs)
def _get_uniq_records(self, *keys) -> Iterable:
    """Yield records whose key differs from the preceding record's key.

    Expects the records to be sorted by the given keys, so duplicates are
    adjacent and a single-pass comparison suffices.

    :param keys: fields forming the uniqueness key
    :return: generator of de-duplicated records
    """
    key_function = get_key_function(arg.get_names(arg.update(keys)))
    last_key = AUTO  # sentinel that never equals a real key value
    for record in self.get_records():
        current_key = key_function(record)
        if current_key != last_key:
            yield record
        last_key = current_key
def format_message(
        self,
        *messages,
        max_len: Union[int, arg.Auto] = arg.AUTO,
        truncate: bool = True,
) -> str:
    """Join message parts with spaces, optionally truncating to max_len.

    :param messages: message parts; each is converted with str()
    :param max_len: maximum length of the result; AUTO uses self.max_line_len
    :param truncate: when True, overlong messages are cut and suffixed with
        TRUNCATED_SUFFIX
    :return: the (possibly truncated) message string
    """
    messages = arg.update(messages)
    max_len = arg.acquire(max_len, self.max_line_len)
    message = SPACE.join([str(m) for m in messages])
    if truncate and len(message) > max_len:
        # Cut so that message + suffix fits exactly within max_len.
        # The previous hard-coded `max_len - 2` overflowed the limit
        # whenever TRUNCATED_SUFFIX was longer than two characters.
        message = message[:max_len - len(TRUNCATED_SUFFIX)] + TRUNCATED_SUFFIX
    return message
def remove_fields(self, *fields, multiple: bool = False, inplace: bool = True):
    """Remove the given fields from this struct.

    :param fields: fields (or field names) to remove
    :param multiple: when True, remove every matching field; otherwise stop
        after the first removal
    :param inplace: mutate this struct when True, else build a new struct
    :return: None when inplace, otherwise a new struct without the fields
    """
    removing_fields = arg.update(fields)
    removing_field_names = arg.get_names(removing_fields)
    existing_fields = self.get_fields()
    if inplace:
        # Iterate over a snapshot: calling remove() on the list being
        # iterated silently skips the element following each removal,
        # so with multiple=True some matching fields were left behind.
        for e in list(existing_fields):
            if arg.get_name(e) in removing_field_names:
                existing_fields.remove(e)
                if not multiple:
                    break
    else:
        new_fields = [f for f in existing_fields if arg.get_name(f) not in removing_field_names]
        return self.make_new(new_fields)
def remove_fields(self, *fields, inplace: bool = True):
    """Drop the listed fields from this description (in-place only).

    :param fields: fields or field names to remove
    :param inplace: must be True; building a new object is not implemented
    :raises NotImplementedError: when inplace is False
    """
    targets = arg.update(fields)
    if not inplace:
        raise NotImplementedError
    current = self.get_fields_descriptions()
    for field in current.copy():  # snapshot: we mutate `current` while scanning
        if isinstance(field, ARRAY_TYPES):
            field_name = field[0]  # array-style description: name is the first element
        elif hasattr(field, 'get_name'):
            field_name = field.get_name()
        else:
            field_name = field  # plain name
        if field_name in targets:
            current.remove(field)
def maybe(*conditions) -> Callable:
    """Build a predicate that is True when at least one condition holds.

    With conditions given, the result tests a single value against each
    condition function. Without conditions, the result tests whether any of
    its own arguments is truthy.

    :param conditions: condition functions (flattened via arg.update)
    :return: a predicate function
    """
    conditions = arg.update(conditions)
    if conditions:
        def check_any(value) -> bool:
            return any(c(value) for c in conditions)
        return check_any

    def any_truthy(*values) -> bool:
        return max(map(bool, values))
    return any_truthy
def never(*conditions) -> Callable:
    """Build a predicate that is True only when none of the conditions holds.

    With conditions given, the result tests a single value against each
    condition function. Without conditions, the result simply negates its
    argument's truthiness.

    :param conditions: condition functions (flattened via arg.update)
    :return: a predicate function
    """
    conditions = arg.update(conditions)
    if conditions:
        def check_none(value) -> bool:
            return not any(c(value) for c in conditions)
        return check_none

    def is_falsy(value) -> bool:
        return not value
    return is_falsy
def always(*conditions) -> Callable:
    """Build a predicate that is True when every condition holds.

    With conditions given, the result tests a single value against each
    condition function. Without conditions, the result tests that all of its
    own arguments are truthy.

    :param conditions: condition functions (flattened via arg.update)
    :return: a predicate function
    """
    conditions = arg.update(conditions)
    if conditions:
        def check_all(value) -> bool:
            return all(c(value) for c in conditions)
        return check_all

    def all_truthy(*values) -> bool:
        return min(map(bool, arg.update(values)))
    return all_truthy
def add_fields(
        self,
        *fields,
        default_type: Optional[Type] = None,
        exclude_duplicates: bool = False,
        name: StructName = None,
        reassign_struct_name: bool = False,
        inplace: bool = False,
) -> Optional[Native]:
    """Append fields to this struct, either mutating it or building a new one.

    :param fields: fields to add (flattened via arg.update)
    :param default_type: forwarded to append() for untyped fields
    :param exclude_duplicates: forwarded to append()
    :param name: name of the new struct (non-inplace mode only)
    :param reassign_struct_name: forwarded to append()
    :param inplace: mutate this struct when True, else return a new struct
    :return: a new struct when not inplace, otherwise None
    """
    fields = arg.update(fields)
    if not inplace:
        combined = self.get_fields_descriptions() + list(fields)
        return self.make_new(fields=combined, name=name)
    for field in fields:
        self.append(
            field,
            default_type=default_type,
            exclude_duplicates=exclude_duplicates,
            reassign_struct_name=reassign_struct_name,
            inplace=True,
        )
def sort(self, *keys, reverse: bool = False, step: AutoCount = AUTO, verbose: AutoBool = True) -> Native:
    """Sort the stream: in memory when it fits, otherwise via disk-based chunked sort.

    :param keys: sort-key fields; empty means sort by the items themselves
    :param reverse: descending order when True
    :param step: chunk size for the disk sort; AUTO uses self.max_items_in_memory
    :param verbose: forwarded to the underlying sort implementation
    :return: the sorted stream
    """
    keys = arg.update(keys)
    step = arg.acquire(step, self.max_items_in_memory)
    key_function = fs.composite_key(keys) if keys else fs.same()
    if self.can_be_in_memory(step=step) or step is None:
        result = self.memory_sort(key_function, reverse=reverse, verbose=verbose)
    else:
        result = self.disk_sort(key_function, reverse=reverse, step=step, verbose=verbose)
    return self._assume_native(result)
def map_side_join(self, right: Native, key: UniKey, how: How = JoinType.Left, right_is_uniq: bool = True) -> Native:
    """Join this stream with `right` via an in-memory lookup dict built from `right`.

    :param right: the stream providing the lookup side of the join
    :param key: join key field(s); names are resolved via arg.get_names()
    :param how: join mode; plain values are coerced to JoinType
    :param right_is_uniq: whether right-side keys are unique
    :return: the joined stream carrying this stream's static meta
    """
    keys = arg.update([arg.get_names(key)])
    if not isinstance(how, JoinType):
        how = JoinType(how)
    joined = algo.map_side_join(
        iter_left=self.get_items(),
        iter_right=right.get_items(),
        key_function=fs.composite_key(keys),
        merge_function=fs.merge_two_items(),
        dict_function=fs.items_to_dict(),
        how=how,
        uniq_right=right_is_uniq,
    )
    if self.is_in_memory():
        joined = list(joined)  # materialize so the result stays reusable
    result = self.stream(joined).set_meta(**self.get_static_meta())
    return self._assume_native(result)
def get_composite_key(item, keys_descriptions: list, item_type=arg.AUTO, logger=None, skip_errors=True) -> tuple:
    """Build a tuple key for an item from a list of key descriptions.

    Each description may be a callable (applied to the item), an object
    exposing get_field_names() (expanded to its field names first), or a
    plain field description resolved through value_from_item().

    :param item: the item to extract key values from
    :param keys_descriptions: key descriptions (callables, field groups or fields)
    :param item_type: forwarded to value_from_item()
    :param logger: forwarded to value_from_item()
    :param skip_errors: forwarded to value_from_item()
    :return: tuple of extracted key values, one per description
    """
    keys_descriptions = arg.update(keys_descriptions)
    keys_descriptions = [
        d.get_field_names() if hasattr(d, 'get_field_names') else d
        for d in keys_descriptions
    ]
    result = list()
    for d in keys_descriptions:
        # idiomatic builtin check; isinstance(d, Callable) is equivalent but slower
        if callable(d):
            value = d(item)
        else:
            value = value_from_item(item, d, item_type=item_type, logger=logger, skip_errors=skip_errors)
        result.append(value)
    return tuple(result)
def sorted_join(
        self,
        right: Native,
        key: UniKey,
        how: How = JoinType.Left,
        sorting_is_reversed: bool = False,
) -> Native:
    """Merge-join two streams that are already sorted by the join key.

    :param right: the right-hand stream (sorted by the same key)
    :param key: join key field(s)
    :param how: join mode; plain values are coerced to JoinType
    :param sorting_is_reversed: set when both streams are sorted descending
    :return: the joined stream carrying this stream's static meta
    """
    keys = arg.update([key])
    if not isinstance(how, JoinType):
        how = JoinType(how)
    joined = algo.sorted_join(
        iter_left=self.get_iter(),
        iter_right=right.get_iter(),
        key_function=fs.composite_key(keys),
        merge_function=fs.merge_two_items(),
        order_function=bf.is_ordered(reverse=sorting_is_reversed, including=True),
        how=how,
    )
    items = list(joined) if self.is_in_memory() else joined
    return self.stream(items, **self.get_static_meta())
def func_simple(*values) -> bool:
    """Return True only when every given value is truthy (requires at least one value)."""
    flags = map(bool, arg.update(values))
    return min(flags)
def is_same_stream_type(*iter_streams) -> bool:
    """Check whether every given stream reports the same stream type.

    :param iter_streams: streams to compare (flattened via arg.update)
    :return: True when exactly one distinct stream type is present
    """
    streams = arg.update(iter_streams)
    distinct_types = {stream.get_stream_type() for stream in streams}
    return len(distinct_types) == 1