Beispiel #1
0
 def __init__(
         self,
         data: Iterable,
         name: AutoName = AUTO,
         check: bool = False,
         count: AutoCount = None,
         less_than: AutoCount = None,
         source: Connector = None,
         context: Context = None,
         max_items_in_memory: AutoCount = AUTO,
         tmp_files: TmpMask = AUTO,
 ):
     """Set up the stream together with its memory limit and temp-file mask.

     :param data: items of the stream.
     :param name: stream name, forwarded to the parent initialiser.
     :param check: forwarded to the parent initialiser.
     :param count: item count; resolved through ``arg.get_optional_len(data, count)``.
     :param less_than: upper bound for the count; a falsy value is replaced by the resolved count.
     :param source: forwarded to the parent initialiser.
     :param context: forwarded to the parent initialiser.
     :param max_items_in_memory: memory limit, resolved against ``sm.MAX_ITEMS_IN_MEMORY``.
     :param tmp_files: temp-file mask; resolved via ``sm.get_tmp_mask`` with the stream name.
     """
     resolved_count = arg.get_optional_len(data, count)
     upper_bound = less_than if less_than else resolved_count
     # The memory limit is assigned before the parent initialiser runs.
     self.max_items_in_memory = arg.acquire(max_items_in_memory, sm.MAX_ITEMS_IN_MEMORY)
     parent_kwargs = dict(
         data=data,
         name=name,
         check=check,
         source=source,
         context=context,
         count=resolved_count,
         less_than=upper_bound,
     )
     super().__init__(**parent_kwargs)
     # The name is only available once the parent initialiser has run.
     stream_name = self.get_name()
     self._tmp_files = arg.delayed_acquire(tmp_files, sm.get_tmp_mask, stream_name)
Beispiel #2
0
 def _get_filtered_items(
         self,
         *args,
         item_type: ItemType = ItemType.Auto,
         skip_errors: bool = False,
         logger: Union[LoggerInterface, Auto] = AUTO,
         **kwargs
 ) -> Iterable:
     """Return a lazy ``filter`` object over the items of this stream.

     The predicate is built by ``sf.filter_items`` from the positional and
     keyword conditions.

     :param args: positional filter conditions, passed to ``sf.filter_items``.
     :param item_type: item type; resolved via ``self.get_item_type`` when not set.
     :param skip_errors: passed to ``sf.filter_items``.
     :param logger: logger; resolved via ``self.get_logger`` when not set.
     :param kwargs: keyword filter conditions, passed to ``sf.filter_items``.
     :return: builtin ``filter`` iterator over ``self.get_items()``.
     """
     resolved_logger = arg.delayed_acquire(logger, self.get_logger)
     resolved_item_type = arg.delayed_acquire(item_type, self.get_item_type)
     predicate = sf.filter_items(
         *args,
         item_type=resolved_item_type,
         skip_errors=skip_errors,
         logger=resolved_logger,
         **kwargs
     )
     items = self.get_items()
     return filter(predicate, items)
Beispiel #3
0
def set_to_item_inplace(
        field: FieldID, value: Value,
        item: SelectableItem, item_type: ItemType = ItemType.Auto,
) -> None:
    """Write ``value`` into ``item`` at ``field``, mutating the item.

    :param field: key (for records) or column number (for rows).
    :param value: value to store.
    :param item: record, row or struct-row to modify.
    :param item_type: type of the item; detected via ``ItemType.detect`` when not set.
    :raises TypeError: for unsupported item types, or for a StructRow item type
        whose item is neither a ``StructRowInterface`` nor one of ``ROW_SUBCLASSES``.
    """
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if not isinstance(item_type, ItemType):
        # Accept enum-like objects (through their .value) as well as raw values.
        raw_type = item_type.value if hasattr(item_type, 'value') else item_type
        item_type = ItemType(raw_type)

    if item_type == ItemType.Record:
        item[field] = value
        return

    if item_type == ItemType.Row:
        width = len(item)
        if field >= width:
            # Pad the row with None so that position `field` exists.
            item += [None] * (field - width + 1)
        item[field] = value
        return

    if item_type == ItemType.StructRow:
        if isinstance(item, StructRowInterface):
            item.set_value(field, value, update_struct=True)
            return
        if not isinstance(item, ROW_SUBCLASSES):
            raise TypeError('Expected Row or StructRow, got {}'.format(item))
        assert isinstance(field, int), 'Expected column number as int, got {}'.format(field)
        width = len(item)
        if field >= width:
            if isinstance(item, tuple):
                # NOTE(review): this rebinds the local name to a new list, so
                # the caller's tuple is left unchanged on this path.
                item = list(item)
            item += [None] * (field + 1 - width)
        item[field] = value
        return

    # item_type == 'any' or not item_type
    raise TypeError('type {} not supported'.format(item_type))
Beispiel #4
0
 def get_parsed_line(
         self,
         line: str,
         item_type: Union[ItemType, Auto] = AUTO,
         struct: Union[Array, StructInterface, Auto] = AUTO,
 ) -> Item:
     """Parse one delimited text line into an item of the requested type.

     :param line: raw text line.
     :param item_type: target item type; resolved via ``self.get_default_item_type`` when not set.
     :param struct: column description; a ``StructInterface`` additionally supplies
         field converters, an undefined value is replaced by column numbers.
     :return: the line itself, a row, a record dict or a struct-row.
     :raises ValueError: when the item type is not one of the supported ones.
     """
     item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
     if item_type == ItemType.Line:
         return line

     parse = fs.csv_loads(delimiter=self.get_delimiter())
     row = parse(line)
     if isinstance(struct, StructInterface):
         converters = struct.get_converters()
         convert_row = self._get_row_converter(converters=converters)
         row = convert_row(row)

     if item_type in (ItemType.Row, ItemType.Any, ItemType.Auto):
         return row

     if not arg.is_defined(struct, check_name=False):
         # Without a struct the columns are identified by their position.
         struct = list(range(len(row)))

     if item_type == ItemType.Record:
         return {arg.get_name(column): cell for column, cell in zip(struct, row)}
     if item_type == ItemType.StructRow:
         return ItemType.StructRow.build(data=row, struct=struct)

     template = 'item_type {} is not supported for {}.parse_lines()'
     raise ValueError(template.format(item_type, self.__class__.__name__))
Beispiel #5
0
 def to_stream(
         self,
         data: Data = AUTO,
         stream_type: AutoStreamType = AUTO,
         ex: OptionalFields = None,
         **kwargs
 ) -> Stream:
     """Build a stream of the requested type from the given data or from this object.

     :param data: items for the new stream; taken from this object when not defined.
     :param stream_type: stream type as a name, a class, or an object with ``get_class()``;
         resolved via ``self.get_stream_type`` when not set.
     :param ex: passed to ``self.get_compatible_meta``.
     :param kwargs: extra meta values; they override the compatible meta.
     :return: the new stream, passed through ``self._assume_stream``.
     :raises TypeError: when the stream type cannot be turned into a stream class.
     """
     stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
     if isinstance(stream_type, str):
         stream_class = StreamType(stream_type).get_class()
     elif isclass(stream_type):
         stream_class = stream_type
     elif isinstance(stream_type, StreamType) or hasattr(stream_type, 'get_class'):
         stream_class = stream_type.get_class()
     else:
         message = 'AnyStream.to_stream(data, stream_type): expected StreamType, got {}'
         raise TypeError(message.format(stream_type))

     if not arg.is_defined(data):
         if hasattr(self, 'get_items_of_type'):
             target_item_type = stream_class.get_item_type()
             data = self.get_items_of_type(target_item_type)
         else:
             data = self.get_data()

     meta = self.get_compatible_meta(stream_class, ex=ex)
     meta.update(kwargs)
     # count and source are computed only when the meta does not provide them.
     if 'count' not in meta:
         meta['count'] = self.get_count()
     if 'source' not in meta:
         meta['source'] = self.get_source()
     return self._assume_stream(stream_class(data, **meta))
Beispiel #6
0
 def to_column_file(
     self,
     filename: str,
     columns: Union[Iterable, Auto] = AUTO,
     add_title_row=True,
     gzip=False,
     delimiter='\t',
     encoding=AUTO,
     check=True,
     verbose=True,
     return_stream=True,
 ) -> Optional[Native]:
     """Write the stream to a delimited text file and optionally read it back.

     :param filename: name of the file to write.
     :param columns: columns to write, passed to ``to_row_stream`` and ``to_record_stream``.
     :param add_title_row: when true, a title row is written and skipped on re-reading.
     :param gzip: passed to ``to_text_file``; when false, 'count' is dropped from the meta.
     :param delimiter: column delimiter.
     :param encoding: file encoding; resolved via ``self.get_encoding`` when not set.
     :param check: passed to ``to_text_file``.
     :param verbose: passed to ``to_text_file``.
     :param return_stream: when true, return a record stream built from the written file.
     :return: record stream carrying this stream's meta, or None when ``return_stream`` is false.
     """
     encoding = arg.delayed_acquire(encoding, self.get_encoding)
     meta = self.get_meta()
     if not gzip:
         # A meta without 'count' must not abort the export with KeyError.
         meta.pop('count', None)
     row_stream = self.to_row_stream(columns=columns, add_title_row=add_title_row)
     line_stream = row_stream.to_line_stream(delimiter=delimiter)
     sm_csv_file = line_stream.to_text_file(
         filename,
         encoding=encoding,
         gzip=gzip,
         check=check,
         verbose=verbose,
         return_stream=return_stream,
     )
     if not return_stream:
         return None
     # The title row (if any) is not data and is skipped on re-reading.
     title_rows = 1 if add_title_row else 0
     rows_from_file = sm_csv_file.skip(title_rows).to_row_stream(delimiter=delimiter)
     return rows_from_file.to_record_stream(columns=columns).update_meta(**meta)
Beispiel #7
0
 def to_stream(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         stream_type: Union[StreamType, Auto] = AUTO,
         ex: OptionalFields = None,
         step: AutoCount = AUTO,
         **kwargs
 ) -> Stream:
     """Build a stream of the requested type, using this object as its source.

     :param data: items for the new stream; read via ``self._get_items_of_type`` when not defined.
     :param name: stream name; generated via ``self._get_generated_stream_name`` when not set.
     :param stream_type: requested stream type, resolved by ``self._get_stream_type``.
     :param ex: passed to ``self.get_compatible_meta``.
     :param step: passed to ``self._get_items_of_type``.
     :param kwargs: extra meta values, passed to ``self.get_compatible_meta``.
     :return: the new stream, passed through ``self._assume_stream``.
     """
     name = arg.delayed_acquire(name, self._get_generated_stream_name)
     stream_type = self._get_stream_type(stream_type)
     stream_class = self._get_stream_class(stream_type)
     knows_item_type = hasattr(stream_class, 'get_item_type')
     item_type = stream_class.get_item_type() if knows_item_type else AUTO

     if not arg.is_defined(data):
         verbose = kwargs.get('verbose', AUTO)
         data = self._get_items_of_type(item_type, verbose=verbose, step=step)

     meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
     # count and source are filled in only when the meta does not provide them.
     if 'count' not in meta:
         meta['count'] = self._get_fast_count()
     if 'source' not in meta:
         meta['source'] = self
     return self._assume_stream(stream_class(data, **meta))
Beispiel #8
0
 def group_by(
         self,
         *keys,
         values: Columns = None,
         as_pairs: bool = False,
         take_hash: bool = True,
         step: AutoCount = AUTO,
         verbose: bool = True,
 ) -> Stream:
     """Sort the stream by the given keys and group the sorted items.

     :param keys: grouping keys; at least one is required, because the first key is inspected.
     :param values: passed to ``sorted_group_by`` after name resolution.
     :param as_pairs: when true, the keys themselves are used as the sort key.
     :param take_hash: passed to ``self._get_key_function`` when ``as_pairs`` is false.
     :param step: sorting step; resolved via ``self.get_limit_items_in_memory`` when not set.
     :param verbose: passed to ``self.sort``.
     :return: stream produced by ``sorted_group_by``.
     """
     keys = arg.update(keys)
     keys = arg.get_names(keys)
     values = arg.get_names(values)
     first_key = keys[0]
     if hasattr(first_key, 'get_field_names'):  # if isinstance(keys[0], FieldGroup)
         keys = first_key.get_field_names()
     step = arg.delayed_acquire(step, self.get_limit_items_in_memory)
     key_for_sort = keys if as_pairs else self._get_key_function(keys, take_hash=take_hash)
     sorted_stream = self.sort(key_for_sort, step=step, verbose=verbose)
     return sorted_stream.sorted_group_by(keys, values=values, as_pairs=as_pairs)
Beispiel #9
0
 def filter(
         self,
         *args,
         item_type: ItemType = ItemType.Auto,
         skip_errors: bool = False,
         **kwargs
 ) -> Native:
     """Return a stream of the same stream type holding only the items that pass the filter.

     :param args: positional filter conditions, passed to ``self._get_filtered_items``.
     :param item_type: item type; resolved via ``self.get_item_type`` when not set.
     :param skip_errors: passed to ``self._get_filtered_items``.
     :param kwargs: keyword filter conditions, passed to ``self._get_filtered_items``.
     :return: filtered stream, passed through ``self._assume_native``.
     """
     resolved_item_type = arg.delayed_acquire(item_type, self.get_item_type)
     stream_type = self.get_stream_type()
     assert isinstance(stream_type, StreamType), 'Expected StreamType, got {}'.format(stream_type)
     kept_items = self._get_filtered_items(
         *args,
         item_type=resolved_item_type,
         skip_errors=skip_errors,
         **kwargs
     )
     filtered_stream = self.to_stream(data=kept_items, stream_type=stream_type)
     return self._assume_native(filtered_stream)
Beispiel #10
0
def set_to_item_inplace(
    field: FieldID,
    value: Value,
    item: SelectableItem,
    item_type: ItemType = ItemType.Auto,
) -> None:
    """Write ``value`` into ``item`` at ``field``, mutating the item.

    The function returns normally (with None), so it is annotated ``None``
    rather than ``NoReturn``.

    :param field: key (for records) or column number (for rows).
    :param value: value to store.
    :param item: record, row or struct-row to modify.
    :param item_type: type of the item; detected via ``ItemType.detect`` when not set.
    :raises TypeError: when the item type is not Record, Row or StructRow.
    """
    item_type = arg.delayed_acquire(item_type,
                                    ItemType.detect,
                                    item,
                                    default=ItemType.Any)
    if not isinstance(item_type, ItemType):
        # Accept enum-like objects (through their .value) as well as raw values.
        if hasattr(item_type, 'value'):
            item_type = ItemType(item_type.value)
        else:
            item_type = ItemType(item_type)
    if item_type == ItemType.Record:
        item[field] = value
    elif item_type == ItemType.Row:
        cols_count = len(item)
        if field >= cols_count:
            # Pad the row with None so that position `field` exists.
            item += [None] * (field - cols_count + 1)
        item[field] = value
    elif item_type == ItemType.StructRow:
        item.set_value(field, value)
    else:  # item_type == 'any' or not item_type:
        raise TypeError('type {} not supported'.format(item_type))
Beispiel #11
0
 def _get_field_getter(self, field: UniKey, item_type: Union[ItemType, Auto] = AUTO, default=None):
     """Return a one-argument function that extracts ``field`` from an item.

     :param field: field to extract.
     :param item_type: item type; resolved via ``self.get_item_type`` when this object provides it.
     :param default: passed to ``fs.it.get_field_value_from_item`` as ``default``.
     :return: callable taking an item and returning the field value.
     """
     if isinstance(self, RegularStreamInterface) or hasattr(self, 'get_item_type'):
         item_type = arg.delayed_acquire(item_type, self.get_item_type)

     def get_field(i):
         # The selection logger is requested on every call, not once up front.
         return fs.it.get_field_value_from_item(
             field=field,
             item=i,
             item_type=item_type,
             default=default,
             logger=self.get_selection_logger(),
         )

     return get_field
Beispiel #12
0
 def get_rows(
         self,
         columns: Union[Columns, Auto] = AUTO,
         add_title_row=False,
 ) -> Iterable:
     """Yield the items of the stream as lists of column values.

     :param columns: columns to output; resolved via ``self.get_columns`` when not set.
     :param add_title_row: when true, the column names are yielded first.
     :return: generator of rows; each row is built with ``item.get(column)`` per column.
     """
     resolved_columns = arg.delayed_acquire(columns, self.get_columns)
     column_names = arg.get_names(resolved_columns)
     if add_title_row:
         yield column_names
     for record in self.get_items():
         row = [record.get(column) for column in column_names]
         yield row
Beispiel #13
0
def get_fields_names_from_item(item: SelectableItem, item_type: ItemType = ItemType.Auto) -> Row:
    """Return the field identifiers of an item.

    :param item: row, record or struct-row.
    :param item_type: type of the item; detected via ``ItemType.detect`` when not set.
    :return: column numbers for a row, ``item.keys()`` for a record,
        ``item.get_columns()`` for a struct-row.
    :raises TypeError: for any other item type.
    """
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if item_type == ItemType.Row:
        width = len(item)
        return list(range(width))
    if item_type == ItemType.Record:
        return item.keys()
    if item_type == ItemType.StructRow:
        return item.get_columns()
    raise TypeError('type {} not supported'.format(item_type))
Beispiel #14
0
def simple_select_fields(fields: Array, item: SelectableItem, item_type: ItemType = ItemType.Auto) -> SelectableItem:
    """Build a new item holding only the requested fields of ``item``.

    :param fields: fields (keys or column numbers) to keep.
    :param item: record, row or struct-row.
    :param item_type: type of the item; detected via ``ItemType.detect`` when not set,
        a string is converted with ``ItemType(...)``.
    :return: dict for a record, list for a row, result of ``item.simple_select_fields``
        for a struct-row; None for any other item type.
    """
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if isinstance(item_type, str):
        item_type = ItemType(item_type)
    if item_type == ItemType.Record:
        # Missing keys yield None because dict.get is used.
        return {name: item.get(name) for name in fields}
    if item_type == ItemType.Row:
        return [item[position] for position in fields]
    if item_type == ItemType.StructRow:
        return item.simple_select_fields(fields)
    # Other item types are not rejected: the function falls through to None.
    return None
Beispiel #15
0
 def to_stream(
         self,
         data: Data = AUTO,
         stream_type: AutoStreamType = AUTO,
         ex: OptionalFields = None,
         **kwargs
 ) -> Stream:
     """Build a stream of the requested type from the given data or from this object's data.

     :param data: items for the new stream; resolved via ``self.get_data`` when not set.
     :param stream_type: stream type as a name, a class, or an object with ``get_class()``;
         resolved via ``self.get_stream_type`` when not set.
     :param ex: passed to ``self.get_compatible_meta``.
     :param kwargs: extra meta values; they override the compatible meta.
     :return: instance of the resolved stream class.
     """
     stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
     if isclass(stream_type) and not isinstance(stream_type, str):
         stream_class = stream_type
     else:
         if isinstance(stream_type, str):
             stream_type = StreamType(stream_type)
         stream_class = stream_type.get_class()

     data = arg.delayed_acquire(data, self.get_data)
     meta = self.get_compatible_meta(stream_class, ex=ex)
     meta.update(kwargs)
     # count and source are computed only when the meta does not provide them.
     if 'count' not in meta:
         meta['count'] = self.get_count()
     if 'source' not in meta:
         meta['source'] = self.get_source()
     new_stream = stream_class(data, **meta)
     return new_stream
Beispiel #16
0
def get_date_from_day_abs(
        day_abs: int,
        min_date: Union[Date, arg.Auto] = arg.AUTO,
        as_iso_date: bool = True,
) -> Date:
    """Return the date lying ``day_abs`` days after ``min_date``.

    :param day_abs: number of days to shift from ``min_date``.
    :param min_date: base date; when not set it is resolved via
        ``get_year_start_monday(get_min_year(), as_iso_date=as_iso_date)``.
    :param as_iso_date: passed to ``get_year_start_monday`` when the base date is resolved.
    :return: result of ``get_shifted_date(min_date, days=day_abs)``.
    """
    # get_min_year() is evaluated up front, whether or not min_date was given.
    min_year = get_min_year()
    base_date = arg.delayed_acquire(min_date, get_year_start_monday, min_year, as_iso_date=as_iso_date)
    return get_shifted_date(base_date, days=day_abs)
Beispiel #17
0
 def set_verbose(
         self,
         verbose: AutoBool = AUTO,
         parent: AutoConnector = AUTO,
 ) -> Native:
     """Store the verbosity flag, inheriting it from the parent when not given.

     :param verbose: flag to store; when not defined it is taken from the parent's
         ``is_verbose()`` or ``verbose`` attribute, else from ``DEFAULT_VERBOSE``.
     :param parent: object to inherit from; resolved via ``self.get_parent`` when not set.
     :return: this object, to allow call chaining.
     """
     if arg.is_defined(verbose):
         self._verbose = verbose
         return self

     parent = arg.delayed_acquire(parent, self.get_parent)
     if hasattr(parent, 'is_verbose'):
         inherited = parent.is_verbose()
     elif hasattr(parent, 'verbose'):
         inherited = parent.verbose
     else:
         inherited = DEFAULT_VERBOSE
     self._verbose = inherited
     return self
Beispiel #18
0
 def filter(
         self,
         *args,
         item_type: ItemType = ItemType.Auto,
         skip_errors: bool = False,
         **kwargs
 ) -> Native:
     """Return a stream holding only the items that pass the filter.

     The stream type of the result is derived from the item type with
     ``StreamType.detect``.

     :param args: positional filter conditions, passed to ``self._get_filtered_items``.
     :param item_type: item type; resolved via ``self.get_item_type`` when not set.
     :param skip_errors: passed to ``self._get_filtered_items``.
     :param kwargs: keyword filter conditions, passed to ``self._get_filtered_items``.
     :return: filtered stream, passed through ``self._assume_native``.
     """
     resolved_item_type = arg.delayed_acquire(item_type, self.get_item_type)
     target_stream_type = StreamType.detect(resolved_item_type)
     kept_items = self._get_filtered_items(
         *args,
         item_type=resolved_item_type,
         skip_errors=skip_errors,
         **kwargs
     )
     filtered_stream = self.to_stream(data=kept_items, stream_type=target_stream_type)
     return self._assume_native(filtered_stream)
Beispiel #19
0
 def bucket(
         self,
         name: Name,
         access_key=AUTO,
         secret_key=AUTO,
 ) -> ConnectorInterface:
     """Return the bucket with the given name, creating a new object when none is found.

     :param name: bucket name, looked up in ``self.get_buckets()``.
     :param access_key: when defined, it is set on a found bucket that has ``set_access_key``;
         for a new bucket it is resolved via ``self.get_access_key`` when not set.
     :param secret_key: handled like ``access_key``, with ``set_secret_key`` / ``self.get_secret_key``.
     :return: the found or newly constructed bucket object.
     """
     known_bucket = self.get_buckets().get(name)
     if not known_bucket:
         bucket_class = self.get_default_child_obj_class()
         return bucket_class(
             name=name,
             storage=self,
             access_key=arg.delayed_acquire(access_key, self.get_access_key),
             secret_key=arg.delayed_acquire(secret_key, self.get_secret_key),
         )

     # A found bucket only receives the credentials that were passed explicitly.
     if arg.is_defined(access_key) and hasattr(known_bucket, 'set_access_key'):
         known_bucket.set_access_key(access_key)
     if arg.is_defined(secret_key) and hasattr(known_bucket, 'set_secret_key'):
         known_bucket.set_secret_key(secret_key)
     return known_bucket
Beispiel #20
0
 def sort(
         self,
         *keys,
         reverse: bool = False,
         step: AutoCount = AUTO,
         verbose: bool = True,
 ) -> Native:
     """Sort the stream by the given keys, in memory or on disk.

     :param keys: sort keys, turned into a key function by ``self._get_key_function``.
     :param reverse: passed to the chosen sorting method.
     :param step: step; resolved via ``self.get_limit_items_in_memory`` when not set.
     :param verbose: passed to the chosen sorting method.
     :return: sorted stream, passed through ``self._assume_native``.
     """
     key_function = self._get_key_function(keys)
     step = arg.delayed_acquire(step, self.get_limit_items_in_memory)
     # self.can_be_in_memory(step=step) decides between the two sorting methods.
     fits_in_memory = self.can_be_in_memory(step=step)
     if fits_in_memory:
         sorted_stream = self.memory_sort(key_function, reverse, verbose=verbose)
     else:
         sorted_stream = self.disk_sort(key_function, reverse, step=step, verbose=verbose)
     return self._assume_native(sorted_stream)
Beispiel #21
0
 def to_stream(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         stream_type: Union[StreamType, Auto] = AUTO,
         ex: OptionalFields = None,
         step: AutoCount = AUTO,
         **kwargs
 ) -> Stream:
     """Build a stream of the requested type by delegating to ``self.to_stream_type``.

     :param data: items for the new stream; forwarded as the ``data`` keyword only when defined.
     :param name: accepted for interface compatibility.
         NOTE(review): this body never forwards it - confirm that is intended.
     :param stream_type: stream type; resolved via ``self.get_stream_type`` when not set.
     :param ex: not supported; must be falsy.
     :param step: passed to ``self.to_stream_type``.
     :param kwargs: extra keyword arguments for ``self.to_stream_type``.
     :return: result of ``self.to_stream_type``.
     """
     if arg.is_defined(data):
         kwargs['data'] = data
     resolved_stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
     assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
     return self.to_stream_type(stream_type=resolved_stream_type, step=step, **kwargs)
Beispiel #22
0
 def get_parsed_line(
         self,
         line: str,
         item_type: Union[ItemType, arg.Auto] = AUTO,
 ) -> Item:
     """Wrap a raw text line into an item of the requested type.

     :param line: raw text line.
     :param item_type: target item type; resolved via ``self.get_default_item_type`` when not set.
     :return: the line itself, a one-element row, a ``{'line': ...}`` record,
         or a struct-row with the single column 'line'.
     :raises ValueError: when the item type is not one of the supported ones.
     """
     item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
     if item_type in (ItemType.Line, ItemType.Any, ItemType.Auto):
         return line
     if item_type == ItemType.Row:
         return [line]
     if item_type == ItemType.Record:
         return {'line': line}
     if item_type == ItemType.StructRow:
         return ItemType.StructRow.build(data=[line], struct=['line'])
     template = 'item_type {} is not supported for {}.parse_lines()'
     raise ValueError(template.format(item_type, self.__class__.__name__))
Beispiel #23
0
 def to_stream_type(
         self,
         stream_type: StreamType,
         step: AutoCount = AUTO,
         verbose: AutoBool = AUTO,
         **kwargs
 ) -> Stream:
     """Build a stream of the given type from this object.

     :param stream_type: stream type; resolved via ``self._get_stream_type`` when not set.
     :param step: passed to ``self._get_items_of_type`` and ``self.get_stream_kwargs``.
     :param verbose: passed to ``self._get_items_of_type`` and ``self.get_stream_kwargs``.
     :param kwargs: extra stream arguments; a ``data`` entry, when defined, is used as the items.
     :return: result of ``stream_type.stream(...)``.
     """
     stream_type = arg.delayed_acquire(stream_type, self._get_stream_type)
     item_type = self._get_item_type(stream_type)
     # 'data' is removed from kwargs so that it is not passed twice below.
     items = kwargs.pop('data', None)
     if not arg.is_defined(items):
         items = self._get_items_of_type(item_type, step=step, verbose=verbose)
     stream_kwargs = self.get_stream_kwargs(data=items, step=step, verbose=verbose, **kwargs)
     return stream_type.stream(**stream_kwargs)
Beispiel #24
0
 def detect(cls, obj, default=arg.AUTO) -> ClassType:
     """Determine the stream type that corresponds to ``obj``.

     :param obj: a name, a class, an ``ItemType`` member, or any other object
         (whose class name is then used).
     :param default: fallback for an ``ItemType`` without a matching stream type;
         resolved via ``cls.get_default`` when not set.
     :return: the matching stream type.
     """
     if isinstance(obj, str):
         return StreamType(obj)
     if inspect.isclass(obj):
         return StreamType(obj.__name__)

     class_name = obj.__class__.__name__
     if class_name != 'ItemType':
         return StreamType(class_name)

     # An item type maps to '<ItemTypeName>Stream'; StructRow maps to StructStream.
     item_type_name = obj.get_name()
     if item_type_name == 'StructRow':
         found = StreamType.StructStream
     else:
         found = cls.find_instance('{}Stream'.format(item_type_name))
     if found is None:
         found = arg.delayed_acquire(default, cls.get_default)
     return found
Beispiel #25
0
 def log(
         self,
         msg: Union[str, list, tuple],
         level: Level = arg.AUTO,
         logger: Union[BaseLogger, arg.Auto] = arg.AUTO,
         end: Union[str, arg.Auto] = arg.AUTO,
         verbose: bool = True,
         truncate: bool = True,
         category: Optional[Type] = None,
         stacklevel: Optional[int] = None,
 ) -> None:
     """Send a message to the logger and/or show it, depending on the level.

     :param msg: message as a string, an exception, or an iterable of parts.
     :param level: logging level; defaults to Info when ``verbose`` is true, else Debug.
     :param logger: logger; resolved via ``self.get_base_logger`` when not set.
     :param end: passed to ``self.show``.
     :param verbose: when true, messages of a non-suitable level are shown via ``self.show``.
     :param truncate: passed to ``self.show``.
     :param category: when truthy, its name is prepended to the message parts.
     :param stacklevel: when truthy, 'file:line:' of the frame at that depth is prepended.
     :raises TypeError: when ``msg`` is neither a string, an exception nor an iterable.
     """
     level = arg.acquire(level, LoggingLevel.Info if verbose else LoggingLevel.Debug)
     logger = arg.delayed_acquire(logger, self.get_base_logger)

     if isinstance(msg, BaseException):
         msg = str(msg)
     if isinstance(msg, str):
         parts = [msg]
     elif isinstance(msg, Iterable):
         parts = list(msg)
     else:
         raise TypeError('Expected msg as str or list[str], got {}'.format(msg))

     if category:
         parts = [arg.get_name(category)] + parts
     if stacklevel:
         # Kept inline: the frame index is relative to this method's own frame.
         caller = getframeinfo(stack()[stacklevel + 1][0])
         short_file_name = caller.filename.split('\\')[-1].split('/')[-1]
         parts = ['{}:{}:'.format(short_file_name, caller.lineno)] + parts

     # parts is always a list at this point, so it is always formatted.
     text = self.format_message(*parts)

     if not isinstance(level, LoggingLevel):
         level = LoggingLevel(level)
     if logger and self.is_suitable_level(level):
         emit = getattr(logger, level.get_method_name())
         emit(text)
     if verbose and not self.is_suitable_level(level):
         self.show(text, end=end, truncate=truncate)
Beispiel #26
0
 def convert(
         cls,
         obj: Union[EnumItem, Name],
         default: Union[EnumItem, arg.Auto, None] = arg.AUTO,
         skip_missing: bool = False,
 ):
     """Convert ``obj`` into a member of this enum.

     :param obj: a member of the enum, or something whose name/value identifies one.
     :param default: fallback when nothing matches; resolved via ``cls.get_default`` when not set.
     :param skip_missing: when true, return None instead of raising for an unknown item.
     :return: the matching member, the converted default, or None.
     :raises ValueError: when nothing matches, the default is falsy and ``skip_missing`` is false.
     """
     assert cls.is_prepared(), 'DynamicEnum must be prepared before usage'
     if isinstance(obj, cls):
         return obj

     for candidate in cls._get_name_and_value(obj):
         found = cls.find_instance(candidate)
         if found:
             return found

     fallback = arg.delayed_acquire(default, cls.get_default)
     if fallback:
         # The fallback goes through convert() as well.
         return cls.convert(fallback)
     if skip_missing:
         return None
     message = 'item {} is not an instance of DynamicEnum {}'
     raise ValueError(message.format(obj, cls.get_enum_name()))
Beispiel #27
0
 def log(
         self,
         msg: Union[str, list, tuple],
         level: Level = arg.AUTO,
         logger: Union[BaseLogger, arg.Auto] = arg.AUTO,
         end: Union[str, arg.Auto] = arg.AUTO,
         verbose: bool = True, truncate: bool = True,
 ) -> None:
     """Send a message to the logger and/or show it, depending on the level.

     The method returns normally (with None), so it is annotated ``None``
     rather than ``NoReturn``.

     :param msg: message as a string, or a list/tuple of parts joined by ``self.format_message``.
     :param level: logging level; defaults to Info when ``verbose`` is true, else Debug.
     :param logger: logger; resolved via ``self.get_base_logger`` when not set.
     :param end: passed to ``self.show``.
     :param verbose: when true, messages of a non-suitable level are shown via ``self.show``.
     :param truncate: passed to ``self.show``.
     """
     level = arg.acquire(level, LoggingLevel.Info if verbose else LoggingLevel.Debug)
     logger = arg.delayed_acquire(logger, self.get_base_logger)
     if isinstance(msg, (list, tuple)):
         msg = self.format_message(*msg)
     if not isinstance(level, LoggingLevel):
         level = LoggingLevel(level)
     if logger:
         if self.is_suitable_level(level):
             # The logger method is chosen by the level's method name.
             logging_method = getattr(logger, level.get_method_name())
             logging_method(msg)
     if verbose and not self.is_suitable_level(level):
         self.show(msg, end=end, truncate=truncate)
Beispiel #28
0
 def get_stream_kwargs(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         verbose: AutoBool = AUTO,
         step: AutoCount = AUTO,
         **kwargs
 ) -> dict:
     """Collect the keyword arguments for building a stream from this object.

     :param data: items for the stream; read via ``self._get_items_of_type`` when not defined.
     :param name: stream name; generated via ``self._get_generated_stream_name`` when not set.
     :param verbose: passed to ``self._get_items_of_type``.
     :param step: passed to ``self._get_items_of_type``.
     :param kwargs: extra entries; they override the collected defaults.
     :return: dict with data, name, source, count and context, updated with ``kwargs``.
     """
     name = arg.delayed_acquire(name, self._get_generated_stream_name)
     if not arg.is_defined(data):
         own_item_type = self._get_item_type()
         data = self._get_items_of_type(own_item_type, verbose=verbose, step=step)
     stream_kwargs = {
         'data': data,
         'name': name,
         'source': self,
         'count': self._get_fast_count(),
         'context': self.get_context(),
     }
     stream_kwargs.update(kwargs)
     return stream_kwargs
Beispiel #29
0
 def get_parsed_line(
         self,
         line: str,
         item_type: Union[ItemType, arg.Auto] = AUTO,
         default_value=None,
 ) -> Item:
     """Parse one JSON text line into an item of the requested type.

     :param line: raw text line.
     :param item_type: target item type; resolved via ``self.get_default_item_type`` when not set.
     :param default_value: passed to ``self._parse_json_line``.
     :return: the parsed value (wrapped as ``{'item': ...}`` when an array is parsed for
         a Record, or as a one-element list when a dict is parsed for a Row), the raw
         line, or a struct-row with the single column 'line'.
     :raises ValueError: when the item type is not one of the supported ones.
     """
     item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
     if item_type in (ItemType.Record, ItemType.Row, ItemType.Any, ItemType.Auto):
         parsed = self._parse_json_line(line, default_value=default_value)
         # Wrap the parsed value when its shape does not match the requested type.
         if isinstance(parsed, ARRAY_TYPES) and item_type == ItemType.Record:
             return {'item': parsed}
         if isinstance(parsed, dict) and item_type == ItemType.Row:
             return [parsed]
         return parsed
     if item_type == ItemType.Line:
         return line
     if item_type == ItemType.StructRow:
         return ItemType.StructRow.build(data=[line], struct=['line'])
     template = 'item_type {} is not supported for {}.parse_lines()'
     raise ValueError(template.format(item_type, self.__class__.__name__))
Beispiel #30
0
 def get_items_from_lines(
         self,
         lines: Iterable,
         item_type: Union[ItemType, Auto] = AUTO,
         struct: Union[Array, StructInterface, Auto] = AUTO,
 ) -> Generator:
     """Yield items of the requested type parsed from delimited text lines.

     :param lines: iterable of raw text lines.
     :param item_type: target item type; resolved via ``self.get_default_item_type`` when not set.
     :param struct: column description; a ``StructInterface`` supplies column names and
         converters, an array supplies column names only.
     :return: generator of rows, records or struct-rows; for any other item type each line
         is passed through ``self.get_parsed_line``.
     """
     item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
     tabular_types = (
         ItemType.Record, ItemType.Row, ItemType.StructRow,
         ItemType.Any, ItemType.Auto,
     )
     if item_type not in tabular_types:  # item_type == ItemType.Line
         for line in lines:
             yield self.get_parsed_line(line, item_type=item_type, struct=struct)
         return

     read_rows = fs.csv_reader(delimiter=self.get_delimiter())
     rows = read_rows(lines)
     if isinstance(struct, StructInterface):
         column_names = struct.get_columns()
         converters = struct.get_converters()
         rows = map(self._get_row_converter(converters=converters), rows)
     elif isinstance(struct, ARRAY_TYPES):
         column_names = struct
     else:
         column_names = None

     if item_type in (ItemType.Row, ItemType.Any, ItemType.Auto):
         yield from rows
     elif item_type == ItemType.Record:
         for row in rows:
             if column_names:
                 yield dict(zip(column_names, row))
             else:
                 # Without column names the records are keyed by column position.
                 yield dict(enumerate(row))
     elif item_type == ItemType.StructRow:
         assert arg.is_defined(struct, check_name=False)
         for row in rows:
             yield ItemType.StructRow.build(data=row, struct=struct)