def file(
        self,
        name: str,
        content_format: Union[ContentFormatInterface, ContentType, Auto] = AUTO,
        filetype: Union[ContentType, Auto] = AUTO,  # deprecated argument
        **kwargs
) -> ConnectorInterface:
    """Get a child file connector by name, creating it when missing or when kwargs are given.

    :param name: file name (also used as the default filename)
    :param content_format: expected content format of the file
    :param filetype: deprecated alias for content_format; must not be set together with it
    :raises ValueError: if both content_format and filetype are provided
    :raises TypeError: if the detected file class rejects the given kwargs
    """
    file = self.get_children().get(name)
    if kwargs or not file:
        filename = kwargs.pop('filename', name)
        file_class = self.get_default_child_obj_class()  # LocalFile
        assert file_class, "connector class or type name aren't detected"
        if arg.is_defined(filetype):
            if arg.is_defined(content_format):
                msg = 'Only one of arguments allowed: filetype (got {}) or content_format (got {})'
                raise ValueError(msg.format(filetype, content_format))
            else:
                msg = 'LocalFolder.file(): filetype-argument is deprecated, use content_format instead'
                self.log(level=LoggingLevel.Warning, msg=msg, stacklevel=1)
            if isinstance(filetype, (ContentType, Auto, str)):
                content_format = filetype
            else:  # temporary workaround for deprecated FileType class
                content_format = filetype.get_value()
        try:
            file = file_class(filename, content_format=content_format, folder=self, **kwargs)
        except TypeError as e:
            # chain the original exception so its traceback is not lost
            raise TypeError('{}.{}'.format(file_class.__name__, e)) from e
        self.add_child(file)
    return file
def stream(self, data, source=arg.AUTO, context=arg.AUTO, *args, **kwargs):
    """Build a stream of this class over *data*, forwarding source/context only when defined."""
    for key, value in (('source', source), ('context', context)):
        if arg.is_defined(value):
            kwargs[key] = value
    stream_class = self.get_class()
    return stream_class(data, *args, **kwargs)
def update(self, position, step=None, message=None):
    """Advance the progress indicator to *position*, renaming it first when *message* is given."""
    if arg.is_defined(message):
        self.set_name(message, inplace=True)
    use_step = arg.is_defined(step) and step != 1
    if use_step:
        self.update_with_step(position, step)
    else:
        self.update_now(position)
def stack(cls, *iter_streams, how: How = 'vertical', name=AUTO, context=None, **kwargs):
    """Combine several streams of the same type into one.

    :param how: 'vertical' appends streams one after another; any other value is passed to join()
    :param name: optional name for the resulting stream
    :param context: optional context for the resulting stream
    """
    iter_streams = arg.update(iter_streams)
    # message said 'concat()' before, which is not this method's name
    assert cls.is_same_stream_type(iter_streams), \
        'stack(): streams must have same type: {}'.format(iter_streams)
    result = None
    for cur_stream in iter_streams:
        assert isinstance(cur_stream, StreamInterface)
        if result is None:
            # first stream becomes the accumulator; copy when possible so the source stays intact
            if hasattr(cur_stream, 'copy'):
                result = cur_stream.copy()
            else:
                result = cur_stream
            if arg.is_defined(name):
                result.set_name(name)
            if arg.is_defined(context):
                result.set_context(context)
        elif how == 'vertical':
            result = result.add_stream(cur_stream)
        else:
            result = result.join(cur_stream, how=how, **kwargs)
        gc.collect()
    return result
def set_context(self, context: AutoContext, reset: bool = False, inplace: bool = True) -> Optional[Native]:
    """Attach a context to this object; falls back to the default context when none is set yet.

    Returns self when inplace is False, otherwise None.
    """
    if not arg.is_defined(context):
        # only substitute the default when no context is attached yet
        if not self.get_context(skip_missing=True):
            context = self.get_default_context()
    if arg.is_defined(context):
        self.set_parent(context, reset=False, inplace=True)
    if not inplace:
        return self
def get_kwargs(self, ex: Optional[Iterable] = None, upd: Options = None) -> dict:
    """Collect constructor kwargs from this object's connectors and options.

    :param ex: keys to exclude from the result (keys that are absent are ignored)
    :param upd: extra key/value overrides applied last
    :return: merged dict of kwargs
    """
    kwargs = dict()
    kwargs.update(self.get_connectors())
    kwargs.update(self.get_options())
    if arg.is_defined(ex):
        for k in ex:
            # tolerate excluded keys that are not present instead of raising KeyError
            kwargs.pop(k, None)
    if arg.is_defined(upd):
        kwargs.update(upd)
    return kwargs
def get_group_header(self, name: Comment = AUTO, caption: Comment = AUTO, comment: Comment = None) -> Iterable[str]:
    """Yield header lines for the group: name, caption, field count (for the title row), comment."""
    is_title_row = name == arg.AUTO
    name = arg.acquire(name, self.get_name())
    caption = arg.acquire(caption, self.get_caption())
    for line in (name, caption):
        if arg.is_defined(line):
            yield line
    if is_title_row:
        yield self.get_str_fields_count()
    if arg.is_defined(comment):
        yield comment
def get_count(self, allow_reopen: bool = True, allow_slow_gzip: bool = True, force: bool = False) -> Optional[int]:
    """Return the number of lines in the file, recounting when forced, changed, or never counted.

    Returns None when no count could be determined.
    """
    must_recount = (
        force
        or self.is_changed_by_another()
        or not arg.is_defined(self.get_prev_lines_count())
    )
    if self.is_existing() and must_recount:
        count = self.get_actual_lines_count(allow_reopen=allow_reopen, allow_slow_gzip=allow_slow_gzip)
        self.set_count(count)  # cache the fresh count
    else:
        count = self.get_prev_lines_count()
    if arg.is_defined(count):
        return count
def set_context(self, context: AutoContext, reset: bool = False, inplace: bool = True) -> Optional[Native]:
    # Attach the given context: delegate to the parent when one is set,
    # otherwise adopt the context itself as this object's parent.
    if arg.is_defined(context):
        parent = self.get_parent()
        if arg.is_defined(parent):
            parent.set_context(context, reset=False, inplace=True)
        else:
            self.set_parent(context, reset=False, inplace=True)
        # NOTE(review): both branches below return self, so the return is
        # unconditional here and `inplace` is effectively ignored when a
        # context is given — confirm intent against the sibling set_context
        # implementation that returns self only when not inplace.
        if not inplace:
            return self
        else:
            return self
def __init__(
        self,
        mask: str,
        parent: HierarchicConnector,
        context: AutoContext = None,
        verbose: AutoBool = AUTO,
):
    """Create a file-mask connector; derives its parent from the context when not given."""
    if not arg.is_defined(parent) and arg.is_defined(context):
        parent = context.get_local_storage()
    assert parent.is_folder() or parent.is_storage()
    super().__init__(path=mask, parent=parent, context=context, verbose=verbose)
def __init__(
        self,
        name: str = arg.DEFAULT,
        source: Source = None,
        context: Context = None,
        check: bool = True,
):
    """Initialize with a (possibly generated) name and wire source/context together.

    When a context is given: attach it to the source if one exists,
    otherwise use the context itself as the source.
    """
    name = arg.undefault(name, arg.get_generated_name(self._get_default_name_prefix()))
    if arg.is_defined(context):
        if arg.is_defined(source):
            source.set_context(context)
        else:
            source = context
    super().__init__(name=name, source=source, check=check)
    if arg.is_defined(self.get_context()):
        # register this object in its context
        self.put_into_context(check=check)
def to_stream(
        self,
        data: Data = AUTO,
        stream_type: AutoStreamType = AUTO,
        ex: OptionalFields = None,
        **kwargs
) -> Stream:
    """Convert this object into a stream of the requested type.

    :param data: items for the new stream; taken from self when not defined
    :param stream_type: StreamType member, its name, or a stream class
    :param ex: meta fields to exclude when copying compatible meta
    :raises TypeError: when stream_type cannot be resolved to a class
    """
    stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
    # resolve a stream class from whichever form stream_type takes
    if isinstance(stream_type, str):
        stream_class = StreamType(stream_type).get_class()
    elif isclass(stream_type):
        stream_class = stream_type
    elif isinstance(stream_type, StreamType) or hasattr(stream_type, 'get_class'):
        stream_class = stream_type.get_class()
    else:
        raise TypeError('AnyStream.to_stream(data, stream_type): expected StreamType, got {}'.format(stream_type))
    if not arg.is_defined(data):
        if hasattr(self, 'get_items_of_type'):
            data = self.get_items_of_type(stream_class.get_item_type())
        else:
            data = self.get_data()
    meta = self.get_compatible_meta(stream_class, ex=ex)
    meta.update(kwargs)
    if 'count' not in meta:
        meta['count'] = self.get_count()
    if 'source' not in meta:
        meta['source'] = self.get_source()
    return self._assume_stream(stream_class(data, **meta))
def map(self, function: Callable, to: Union[StreamType, Auto] = AUTO) -> Native:
    """Apply *function* to every item; *to* is a deprecated way to change stream type."""
    if not arg.is_defined(to):
        stream = super().map(function)
    else:
        self.log('to-argument for map() is deprecated, use map_to_type() method instead', level=30)
        stream = super().map_to_type(function, stream_type=to)
    return self._assume_native(stream)
def __init__(
        self,
        name: OptName = arg.AUTO,
        level: Level = arg.AUTO,
        formatter: Union[Formatter, arg.Auto] = arg.AUTO,
        loggers: SubLoggers = arg.AUTO,
        context: Context = None,
        file: Optional[FileOrName] = None,
):
    """Set up the logger hierarchy with defaults, sub-loggers and an optional log file.

    :param loggers: mapping of name -> logger, or a list of loggers
    :param file: optional file to attach as a log destination
    """
    name = arg.acquire(name, DEFAULT_LOGGER_NAME)
    level = arg.acquire(level, DEFAULT_LOGGING_LEVEL)
    formatter = arg.acquire(formatter, DEFAULT_FORMATTER)
    if not isinstance(level, LoggingLevel):
        level = LoggingLevel(level)
    if isinstance(loggers, list):
        # map names to loggers; the previous comprehension was inverted
        # ({logger: name}), which broke the "name not in loggers" check below
        loggers = {i.get_name(): i for i in loggers}
    elif not arg.is_defined(loggers):
        loggers = dict()
    if name not in loggers:
        level_value = arg.get_value(level)
        loggers[name] = self.build_base_logger(name, level_value, formatter)
    self._level = level
    super().__init__(name=name, children=loggers, context=context)
    if file:
        self.set_file(file)
def __init__(
        self,
        name: Name,
        src: Connector,
        dst: Connector,
        procedure: Optional[Callable],
        options: Options = None,
        apply_to_stream: bool = True,
        stream_type: OptStreamType = arg.DEFAULT,
        context: Context = arg.DEFAULT,
):
    """Operation that moves data from the src connector to the dst connector."""
    if not arg.is_defined(options):
        options = dict()
    connectors = {SRC_ID: src, DST_ID: dst}
    super().__init__(
        name=name,
        connectors=connectors,
        procedure=procedure,
        options=options,
        apply_to_stream=apply_to_stream,
        stream_type=stream_type,
        context=context,
    )
def csv_reader(delimiter: Union[str, arg.Auto, None] = arg.AUTO, *args, **kwargs) -> Callable:
    """Return a csv-reader factory; plain csv.reader when no delimiter is set."""
    if not arg.is_defined(delimiter):
        return csv.reader

    def reader(lines):
        return csv.reader(lines, delimiter=delimiter, *args, **kwargs)

    return reader
def get_parsed_line(
        self,
        line: str,
        item_type: Union[ItemType, Auto] = AUTO,
        struct: Union[Array, StructInterface, Auto] = AUTO,
) -> Item:
    """Parse one csv line into an item of the requested type.

    :raises ValueError: for item types this format cannot produce
    """
    item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
    if item_type == ItemType.Line:
        return line
    parse = fs.csv_loads(delimiter=self.get_delimiter())
    row = parse(line)
    if isinstance(struct, StructInterface):
        # apply per-field type converters declared by the struct
        convert_row = self._get_row_converter(converters=struct.get_converters())
        row = convert_row(row)
    if item_type in (ItemType.Row, ItemType.Any, ItemType.Auto):
        return row
    if not arg.is_defined(struct, check_name=False):
        # fall back to positional field names
        struct = list(range(len(row)))
    if item_type == ItemType.Record:
        return {arg.get_name(k): v for k, v in zip(struct, row)}
    elif item_type == ItemType.StructRow:
        return ItemType.StructRow.build(data=row, struct=struct)
    else:
        msg = 'item_type {} is not supported for {}.parse_lines()'
        raise ValueError(msg.format(item_type, self.__class__.__name__))
def get_field_value_from_item(
        field: FieldID, item: ConcreteItem, item_type: ItemType = ItemType.Auto,
        skip_errors: bool = False, logger=None, default: Value = None,
):
    """Extract one field's value from an item of any supported type.

    :param field: field id, or STAR to return the whole item
    :param item_type: detected from the item when Auto/undefined
    :param skip_errors: when True, log the failure and return default instead of raising
    :raises IndexError: when the field lookup fails and skip_errors is False
    """
    if field == STAR:
        return item
    if item_type == ItemType.Auto or not arg.is_defined(item_type):
        item_type = ItemType.detect(item, default='any')
    if isinstance(item_type, str):
        item_type = ItemType(item_type)
    else:
        item_type = ItemType(item_type.value)
    try:
        return item_type.get_value_from_item(
            item=item, field=field, default=default, skip_unsupported_types=skip_errors,
        )
    except (IndexError, TypeError) as e:
        # bind the error before the except-block ends: Python 3 deletes the
        # "as e" name afterwards, so the original code raised NameError when
        # formatting the message below
        error = e
    msg = 'Field {} does not exist in current item ({})'.format(field, error)
    if skip_errors:
        if logger:
            logger.log(msg)
        return default
    else:
        raise IndexError(msg)
def get_child_class_by_name_and_type(self, name: str, filetype: Union[FileType, Auto] = AUTO) -> Type:
    """Pick a connector class for a child: from the explicit filetype, else guessed from the name.

    Returns None when neither yields a type.
    """
    if arg.is_defined(filetype):
        return FileType(filetype).get_class()
    supposed_type = self.get_type_by_name(name)
    if supposed_type:
        return supposed_type.get_class()
def get_lines(
        self,
        count: Optional[int] = None,
        skip_first: bool = False,
        allow_reopen: bool = True,
        check: bool = True,
        verbose: AutoBool = AUTO,
        message: Union[str, Auto] = AUTO,
        step: AutoCount = AUTO,
) -> Iterable:
    """Iterate over the file's lines, optionally wrapped in a progress logger.

    :param count: max number of lines to read (all when None)
    :param skip_first: drop the first line (e.g. a title row)
    :param check: assert the file is non-empty (skipped for gzip, where emptiness is not cheap to test)
    """
    if check and not self.is_gzip():
        # assert self.get_count(allow_reopen=True) > 0
        assert not self.is_empty(), 'for get_lines() file must be non-empty: {}'.format(self)
    self.open(allow_reopen=allow_reopen)
    lines = self.get_next_lines(count=count, skip_first=skip_first, close=True)
    verbose = arg.acquire(verbose, self.is_verbose())
    if verbose or arg.is_defined(message):
        message = arg.acquire(message, 'Reading {}')
        if '{}' in message:
            message = message.format(self.get_name())
        logger = self.get_logger()
        assert hasattr(logger, 'progress'), '{} has no progress in {}'.format(self, logger)
        if not count:
            count = self.get_count(allow_slow_gzip=False)
        lines = logger.progress(lines, name=message, count=count, step=step)
    return lines
def get_dataframe(self, columns: Columns = None) -> DataFrame:
    """Materialize the items as a pandas DataFrame.

    Returns None when pandas is unavailable or object output is disabled.
    """
    if not (pd and get_use_objects_for_output()):
        return None
    dataframe = DataFrame(self.get_items())
    if arg.is_defined(columns):
        dataframe = dataframe[arg.get_names(columns)]
    return dataframe
def describe(
        self,
        example: Optional[dict] = None,
        as_dataframe: bool = False,
        separate_by_tabs: bool = False,
        show_header: bool = True,
        comment: Comment = None,
        select_fields: Optional[Array] = None,
        logger: Union[ExtLogger, Auto] = AUTO,
) -> Optional[DataFrame]:
    """Print (or log) a human-readable description of the struct.

    :param example: optional record whose values are shown next to each field
    :param as_dataframe: delegate to self.show() and return its result instead of printing rows
    :param separate_by_tabs: join columns with tabs instead of the formatting template
    :param show_header: emit the group header block first
    :param select_fields: fields to highlight in the validity column
    :param logger: destination for output lines; falls back to print()
    """
    # use logger.log when a logger is given, plain print otherwise
    log = logger.log if arg.is_defined(logger) else print
    if show_header:
        for line in self.get_group_header(comment=comment):
            log(line)
        log('')
    if as_dataframe:
        return self.show()
    else:
        columns, template = self._get_describe_template(example)
        # header row of the table
        log('\t'.join(columns) if separate_by_tabs else template.format(*columns))
        for (n, type_name, name, caption, is_valid) in self.get_struct_description(include_header=False):
            if type_name == GROUP_TYPE_STR:
                # a group marker starts a new sub-header block
                log('')
                for line in self.get_group_header(name, caption=caption):
                    log(line)
            else:
                if name in (select_fields or []):
                    # highlight selected fields: '.' becomes '>', other marks are upper-cased
                    is_valid = '>' if is_valid == '.' else str(is_valid).upper()
                if example:
                    value = str(example.get(name))
                    row = (is_valid, n, type_name, name, value, caption)
                else:
                    row = (is_valid, n, type_name, name, caption)
                log('\t'.join(row) if separate_by_tabs else template.format(*row))
def __init__(
        self,
        name: str,
        content_format: Union[ContentFormatInterface, Auto] = AUTO,
        struct: Union[Struct, Auto, None] = AUTO,
        folder: Connector = None,
        context: Context = AUTO,
        expected_count: AutoCount = AUTO,
        verbose: AutoBool = AUTO,
):
    """Create a file connector; resolves its folder from the context or a default when not given."""
    if folder:
        message = 'only LocalFolder supported for *File instances (got {})'.format(type(folder))
        # NOTE(review): `or` accepts any ConnectorInterface even when it is not
        # a folder — presumably `and` was intended here; confirm before changing.
        assert isinstance(folder, ConnectorInterface) or folder.is_folder(), message
    elif arg.is_defined(context):
        folder = context.get_job_folder()
    else:
        folder = self.get_default_folder()
    # holds the open file handle once the file is opened
    self._fileholder = None
    super().__init__(
        name=name,
        content_format=content_format,
        struct=struct,
        expected_count=expected_count,
        parent=folder,
        context=context,
        verbose=verbose,
    )
def map(self, function: Callable, to: AutoStreamType = AUTO) -> Native:
    """Apply *function* to every item; *to* is a deprecated way to switch stream type."""
    if not arg.is_defined(to):
        return self._assume_native(super().map(function))
    self.get_logger().warning('to-argument for map() is deprecated, use map_to() instead')
    return self._assume_native(super().map_to(function, stream_type=to))
def append_field(
        self,
        field: Field,
        default_type: FieldType = FieldType.Any,
        before: bool = False,
        exclude_duplicates: bool = True,
        reassign_struct_name: bool = False,
        inplace: bool = True,
) -> Optional[Native]:
    """Add one field description to the struct.

    :param field: a field object, a name, an args-array or a kwargs-dict for AdvancedField
    :param before: prepend instead of append
    :param exclude_duplicates: silently skip fields whose name already exists
    :raises TypeError: when field is none of the supported forms
    """
    if self._is_field(field):
        field_desc = field
    elif isinstance(field, str):
        field_desc = AdvancedField(field, default_type)
    elif isinstance(field, ARRAY_TYPES):
        field_desc = AdvancedField(*field)
    elif isinstance(field, dict):
        field_desc = AdvancedField(**field)
    else:
        raise TypeError('Expected field, str or dict, got {} as {}'.format(field, type(field)))
    if exclude_duplicates and field_desc.get_name() in self.get_field_names():
        return self
    if isinstance(field_desc, AdvancedField):
        # stamp this struct's name/caption onto the field unless it already has a group
        if reassign_struct_name or not arg.is_defined(field_desc.get_group_name()):
            field_desc.set_group_name(self.get_name(), inplace=True)
            field_desc.set_group_caption(self.get_caption(), inplace=True)
    if before:
        fields = [field_desc] + self.get_fields()
    else:
        fields = self.get_fields() + [field_desc]
    return self.set_fields(fields, inplace=inplace)
def get_context(self) -> Parent:
    """Walk upward to the context: the parent itself when it is one, else ask the parent.

    Returns None when no parent or context can be found.
    """
    parent = self.get_parent()
    if not arg.is_defined(parent):
        return None
    if parent.is_context():
        return parent
    if hasattr(parent, 'get_context'):
        return parent.get_context()
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Wrap this connector's items into a stream of the requested type."""
    name = arg.delayed_acquire(name, self._get_generated_stream_name)
    stream_type = self._get_stream_type(stream_type)
    stream_class = self._get_stream_class(stream_type)
    item_type = stream_class.get_item_type() if hasattr(stream_class, 'get_item_type') else AUTO
    if not arg.is_defined(data):
        data = self._get_items_of_type(item_type, verbose=kwargs.get('verbose', AUTO), step=step)
    meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
    if 'count' not in meta:
        meta['count'] = self._get_fast_count()
    if 'source' not in meta:
        meta['source'] = self
    return self._assume_stream(stream_class(data, **meta))
def stream(
        self,
        data: Iterable,
        stream_type: AutoStreamType = AUTO,
        ex: OptionalArguments = None,
        save_name: bool = True,
        save_count: bool = True,
        **kwargs
) -> Stream:
    """Build a new stream over *data*, reusing this stream's meta (optionally dropping name/count)."""
    if arg.is_defined(stream_type):
        if isinstance(stream_type, str):
            stream_class = StreamType(stream_type).get_class()
        else:
            stream_class = stream_type.get_class()
        meta = self.get_compatible_meta(stream_class, ex=ex)
    else:
        stream_class = self.__class__
        meta = self.get_meta()
    if not save_name:
        meta.pop('name')
    if not save_count:
        meta.pop('count')
    meta.update(kwargs)
    if 'context' not in meta:
        meta['context'] = self.get_context()
    return stream_class(data, **meta)
def __init__(
        self,
        path: str,
        path_is_relative: AutoBool = AUTO,
        parent: AutoConnector = AUTO,
        context: AutoContext = None,
        verbose: AutoBool = AUTO,
):
    """Create a path-based connector, deriving its parent storage from the context when needed."""
    if not arg.is_defined(parent):
        parent = context.get_local_storage() if arg.is_defined(context) else self.get_default_storage()
    parent = self._assume_native(parent)
    # absolute paths are stored as-is; others are treated as relative unless told otherwise
    self._path_is_relative = arg.acquire(path_is_relative, not arg.is_absolute_path(path))
    super().__init__(name=path, parent=parent, verbose=verbose)
def get_lines(self, items: Iterable, item_type: ItemType, add_title_row: AutoBool = AUTO) -> Generator:
    """Yield each item formatted as one output line; a title row is not supported here."""
    if arg.is_defined(add_title_row):
        assert not add_title_row, 'title_row available in FlatStructFormat only'
    for item in items:
        yield self.get_formatted_item(item, item_type=item_type)