def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Wrap items of this object into a stream of the requested type.

    :param data: items for the stream; when not defined, they are fetched
        with ``_get_items_of_type()``.
    :param name: stream name; when not defined, it is taken from
        ``_get_generated_stream_name()``.
    :param stream_type: passed through ``_get_stream_type()`` to pick the stream class.
    :param ex: forwarded to ``get_compatible_meta()``.
    :param step: forwarded to ``_get_items_of_type()`` when items are fetched here.
    :param kwargs: extra meta forwarded to ``get_compatible_meta()``;
        ``verbose`` is also read from here when items are fetched.
    :return: the new stream passed through ``_assume_stream()``.
    """
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    resolved_type = self._get_stream_type(stream_type)
    stream_class = self._get_stream_class(resolved_type)

    # Ask the class for its item type first; otherwise probe an empty instance.
    if hasattr(stream_class, 'get_item_type'):
        item_type = stream_class.get_item_type()
    else:
        probe = stream_class([])
        item_type = probe.get_item_type() if hasattr(probe, 'get_item_type') else AUTO

    if not Auto.is_defined(data):
        data = self._get_items_of_type(
            item_type,
            verbose=kwargs.get('verbose', AUTO),
            step=step,
        )

    meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
    # Fill count only when neither the prepared meta nor the caller provides it.
    if not ('count' in meta or 'count' in kwargs):
        meta['count'] = self._get_fast_count()
    if 'source' not in meta:
        meta['source'] = self
    return self._assume_stream(stream_class(data, **meta))
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream from this object, handling ``StreamType.SqlStream`` locally.

    ``stream_type`` is resolved with ``Auto.acquire(stream_type, StreamType.SqlStream)``.
    For an SQL stream no explicit ``data`` is accepted (asserted) and this
    object is always set as the stream ``source``. Any other stream type is
    delegated to the parent implementation with all arguments unchanged.
    """
    stream_type = Auto.acquire(stream_type, StreamType.SqlStream)
    is_sql_stream = stream_type == StreamType.SqlStream
    if not is_sql_stream:
        return super().to_stream(
            data=data,
            name=name,
            stream_type=stream_type,
            ex=ex,
            step=step,
            **kwargs,
        )

    assert not Auto.is_defined(data)
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    sql_stream_class = stream_type.get_class()
    meta = self.get_compatible_meta(sql_stream_class, name=name, ex=ex, **kwargs)
    meta['source'] = self
    return sql_stream_class(data, **meta)
def disk_sort(
        self,
        key: UniKey = fs.same(),
        reverse: bool = False,
        step: AutoCount = AUTO,
        verbose: AutoBool = False,
) -> Native:
    """Sort the stream through parts written by ``split_to_disk_by_step()``.

    Each part is sorted by the composite key while being split; the parts
    are then merged with ``algo.merge_iter()``. The ``remove_all`` method of
    the tmp-files object is handed to the merge as ``post_action``.

    :param key: key description passed to ``fs.composite_key()``.
    :param reverse: forwarded to both the per-part sort and the merge.
    :param step: part size; when not defined, ``get_limit_items_in_memory()`` is used.
    :param verbose: forwarded to the split and to the log call.
    :return: the merged stream passed through ``_assume_native()``.
    :raises AssertionError: when the split produces no parts.
    """
    step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
    key_function = fs.composite_key(key)
    sorted_parts = self.split_to_disk_by_step(
        step=step,
        sort_each_by=key_function,
        reverse=reverse,
        verbose=verbose,
    )
    assert sorted_parts, 'streams must be non-empty'

    # Iterators are opened for every part first, counts are collected afterwards.
    iterables = [part.get_iter() for part in sorted_parts]
    total_count = sum([part.get_count() or 0 for part in sorted_parts])

    self.log('Merging {} parts... '.format(len(iterables)), verbose=verbose)
    merged_items = algo.merge_iter(
        iterables,
        key_function=key_function,
        reverse=reverse,
        post_action=self.get_tmp_files().remove_all,
    )
    return self._assume_native(self.stream(merged_items, count=total_count))
def __init__(
        self,
        data: Iterable,
        name: AutoName = AUTO,
        check: bool = False,
        count: AutoCount = None,
        less_than: AutoCount = None,
        source: Connector = None,
        context: Context = None,
        max_items_in_memory: AutoCount = AUTO,
        tmp_files: TmpMask = AUTO,
):
    """Initialise the stream and its tmp-files mask.

    ``count`` is first passed through ``get_optional_len(data, count)``.
    A truthy, defined count also becomes ``less_than`` when the caller did not
    define one. ``_tmp_files`` is set to ``None`` before the parent
    constructor runs and is resolved afterwards, because resolving it
    uses ``self.get_name()``.
    """
    count = get_optional_len(data, count)
    count_is_upper_bound = count and Auto.is_defined(count) and not Auto.is_defined(less_than)
    if count_is_upper_bound:
        less_than = count

    self._tmp_files = None
    super().__init__(
        data=data,
        name=name,
        check=check,
        source=source,
        context=context,
        count=count,
        less_than=less_than,
        max_items_in_memory=max_items_in_memory,
    )
    self._tmp_files = Auto.delayed_acquire(tmp_files, sm.get_tmp_mask, self.get_name())
def can_be_in_memory(self, step: AutoCount = AUTO) -> bool:
    """Tell whether the stream is allowed to be held in memory.

    :param step: item limit; when not defined, ``get_limit_items_in_memory()`` is used.
    :return: ``True`` when the stream is already in memory or the limit is
        ``None``; ``False`` when the estimated count is unknown; otherwise
        whether the estimated count does not exceed the limit.
    """
    step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
    if self.is_in_memory() or step is None:
        return True
    estimated_count = self.get_estimated_count()
    if estimated_count is None:
        return False
    return estimated_count <= step
def write_items(
        self,
        items: Iterable,
        item_type: Union[ItemType, Auto] = AUTO,
        add_title_row: AutoBool = AUTO,
        verbose: AutoBool = AUTO,
) -> Native:
    """Convert items to lines with the content format and write them.

    :param items: items to be written.
    :param item_type: type of the items; when not defined,
        ``get_default_item_type()`` is used.
    :param add_title_row: forwarded to the content format's ``get_lines()``.
    :param verbose: forwarded to ``write_lines()``.
    :return: whatever ``write_lines()`` returns.
    :raises AssertionError: when the content format is not a ``ParsedFormat``.
    """
    item_type = Auto.delayed_acquire(item_type, self.get_default_item_type)
    parsed_format = self.get_content_format()
    assert isinstance(parsed_format, ParsedFormat)
    return self.write_lines(
        parsed_format.get_lines(items, item_type=item_type, add_title_row=add_title_row),
        verbose=verbose,
    )
def _get_detected_struct(
        self,
        set_struct: bool = False,
        use_declared_types: AutoBool = AUTO,  # accepted but not used by this implementation
        verbose: AutoBool = AUTO,
) -> Optional[StructInterface]:
    """Return the struct obtained from the database.

    A deprecation warning is logged when the database returns a struct
    object that is not a ``StructInterface``. A missing struct (``None``) is a
    valid result of this method and is returned without a warning, because
    reporting "Struct as NoneType is deprecated" would be misleading.

    :param set_struct: forwarded to ``get_struct_from_database()``.
    :param use_declared_types: not used here.
    :param verbose: enables the warning; when not defined, ``is_verbose()`` is used.
    :return: the struct from the database, or ``None``.
    """
    struct = self.get_struct_from_database(set_struct=set_struct)
    has_deprecated_type = struct is not None and not isinstance(struct, StructInterface)
    if has_deprecated_type and Auto.delayed_acquire(verbose, self.is_verbose):
        message = 'Struct as {} is deprecated. Use items.FlatStruct instead.'.format(type(struct))
        self.log(msg=message, level=LoggingLevel.Warning)
    return struct
def set_verbose(self, verbose: AutoBool = AUTO, parent: AutoConnector = AUTO) -> Native:
    """Store the verbose flag, inheriting it from the parent when not defined.

    :param verbose: explicit flag; when not defined, it is read from the
        parent's ``is_verbose()`` method, else from its ``verbose`` attribute,
        else ``DEFAULT_VERBOSE`` is used.
    :param parent: object to inherit from; when not defined, ``get_parent()`` is used.
    :return: ``self``.
    """
    if not Auto.is_defined(verbose):
        parent = Auto.delayed_acquire(parent, self.get_parent)
        if hasattr(parent, 'is_verbose'):
            verbose = parent.is_verbose()
        else:
            verbose = parent.verbose if hasattr(parent, 'verbose') else DEFAULT_VERBOSE
    self._verbose = verbose
    return self
def get_count(self, in_memory: AutoBool = AUTO, final: bool = False) -> Count:
    """Return the number of items in the stream.

    :param in_memory: when true, the items are taken with ``get_list()``,
        stored back into ``_data``, and their length is stored in ``_count``
        and returned; when not defined, ``is_in_memory()`` decides.
    :param final: for the not-in-memory case, selects ``final_count()``
        instead of ``get_expected_count()``.
    """
    in_memory = Auto.delayed_acquire(in_memory, self.is_in_memory)
    if not in_memory:
        return self.final_count() if final else self.get_expected_count()
    items = self.get_list()
    self._count = len(items)
    self._data = items
    return self._count
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream over this file by delegating to ``to_stream_type()``.

    :param data: explicit items; forwarded as ``data`` only when defined.
    :param name: stream name; forwarded as ``name`` only when defined, so an
        undefined name leaves the choice to the downstream code.
    :param stream_type: when not defined, ``get_stream_type()`` is used.
    :param ex: not supported here; must be empty.
    :param step: forwarded to ``to_stream_type()``.
    :param kwargs: forwarded to ``to_stream_type()``.
    :raises AssertionError: when ``ex`` is provided.
    """
    if Auto.is_defined(data):
        kwargs['data'] = data
    # Forward the requested name: it used to be accepted and silently dropped.
    if Auto.is_defined(name):
        kwargs['name'] = name
    stream_type = Auto.delayed_acquire(stream_type, self.get_stream_type)
    assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
    return self.to_stream_type(stream_type=stream_type, step=step, **kwargs)
def __init__(
        self,
        name: Name,
        content_format: Union[ContentFormatInterface, ContentType, Auto] = AUTO,
        struct: Union[StructInterface, Auto, None] = AUTO,
        first_line_is_title: AutoBool = AUTO,
        parent: Parent = None,
        context: AutoContext = AUTO,
        streams: Links = None,
        expected_count: AutoCount = AUTO,
        caption: Optional[str] = None,
        verbose: AutoBool = AUTO,
        **kwargs
):
    """Initialise the connector, its content format and (optionally) its struct.

    :param name: connector name; also used to detect the content format
        when ``content_format`` is not defined.
    :param content_format: a ``ContentFormatInterface`` object, a ``ContentType``
        or a string accepted by ``ContentType()``. Other objects exposing
        ``get_value()`` are treated as deprecated and unwrapped with a logged message.
    :param struct: ``None`` skips struct setup; ``AUTO`` requests detection
        through ``_get_detected_struct()``.
    :param first_line_is_title: passed to ``set_first_line_title()``.
    :param parent: forwarded to the parent constructor.
    :param context: forwarded to the parent constructor.
    :param streams: forwarded to the parent constructor as ``children``.
    :param expected_count: stored as ``_count``.
    :param caption: stored as ``_caption``.
    :param verbose: forwarded to the parent constructor.
    :param kwargs: content-format settings; passed to format detection and
        then to the format class constructor or to ``set_inplace()``.
    :raises ValueError: when kwargs are given for an unsupported content format.
    :raises AssertionError: when no ``ContentFormatInterface`` could be obtained.
    """
    self._declared_format = None
    self._detected_format = None
    self._modification_ts = None
    self._count = expected_count
    self._caption = caption
    super().__init__(name=name, parent=parent, context=context, children=streams, verbose=verbose)

    content_format = Auto.delayed_acquire(content_format, self._get_detected_format_by_name, name, **kwargs)
    supported_classes = ContentType, ContentFormatInterface, str
    if hasattr(content_format, 'get_value') and not isinstance(content_format, supported_classes):
        # Deprecated wrapper object: report it and continue with its value.
        template = 'LeafConnector({}, {}): content_format as {} is deprecated, use ContentType or ContentFormat instead'
        self.log(template.format(name, content_format, content_format.__class__.__name__), level=30)
        content_format = content_format.get_value()
    if isinstance(content_format, str):
        content_format = ContentType(content_format)  # ContentType.detect(content_format) ?

    if isinstance(content_format, ContentType):  # tmp fix
        format_class = content_format.get_class()
        content_format = format_class(**kwargs)
    elif isinstance(content_format, ContentFormatInterface):
        content_format.set_inplace(**kwargs)
    elif kwargs:
        template = 'LeafConnector: kwargs allowed for ContentType only, not for {}, got kwargs={}'
        raise ValueError(template.format(content_format, kwargs))
    assert isinstance(content_format, ContentFormatInterface), 'Expect ContentFormat, got {}'.format(content_format)

    self.set_content_format(content_format, inplace=True)
    self.set_first_line_title(first_line_is_title)

    if struct is None:
        return
    if struct == AUTO:
        struct = self._get_detected_struct(use_declared_types=False)
    if Auto.is_defined(struct, check_name=False):
        self.set_struct(struct, inplace=True)
def _get_linked_fields_descriptions(
        self,
        fields: Union[Iterable, Auto] = AUTO,
        group_name: str = 'used',
        prefix: str = ' - ',
        max_len: int = JUPYTER_LINE_LEN,
) -> Generator:
    """Yield a caption line and then one description line per linked field.

    :param fields: fields to describe; when not defined, ``get_linked_fields()`` is used.
    :param group_name: word inserted into the caption line.
    :param prefix: text put before every field description.
    :param max_len: longest allowed line; longer lines are cut and end with ``CROP_SUFFIX``.
    """
    linked_fields = list(Auto.delayed_acquire(fields, self.get_linked_fields))
    yield '{count} {name} fields:'.format(count=len(linked_fields), name=group_name)
    for field in linked_fields:
        has_own_repr = isinstance(field, DescribeMixin) or hasattr(field, 'get_one_line_repr')
        description = field.get_one_line_repr(max_len=120) if has_own_repr else repr(field)
        line = prefix + description
        if len(line) > max_len:
            line = line[:max_len - len(CROP_SUFFIX)] + CROP_SUFFIX
        yield line
def get_detected_struct_by_title_row(
        self,
        set_struct: bool = False,  # deprecated argument
        types: Union[dict, Auto, None] = AUTO,
        verbose: AutoBool = AUTO,  # deprecated argument
) -> Struct:
    """Detect the struct from the title row and log the result.

    :param set_struct: when true, the detected struct is also stored
        with ``set_struct(..., inplace=True)``.
    :param types: forwarded to ``_get_struct_detected_by_title_row()``.
    :param verbose: forwarded to the log call; when not defined, ``is_verbose()`` is used.
    :return: the detected struct.
    :raises AssertionError: when the first line is not a title row or the
        file/object does not exist.
    """
    assert self.is_first_line_title(), 'Can detect struct by title row only if first line is a title row'
    assert self.is_existing(), 'For detect struct file/object must be existing: {}'.format(self.get_path())
    verbose = Auto.delayed_acquire(verbose, self.is_verbose)

    detected_struct = self._get_struct_detected_by_title_row(
        self.get_title_row(close=True),
        types=types,
    )
    self.log(
        'Struct for {} detected by title row: {}'.format(self.get_name(), detected_struct.get_struct_str(None)),
        end='\n',
        verbose=verbose,
    )
    if set_struct:
        self.set_struct(detected_struct, inplace=True)
    return detected_struct
def get_items_of_type(
        self,
        item_type: Union[ItemType, Auto],
        verbose: AutoBool = AUTO,
        message: AutoName = AUTO,
        step: AutoCount = AUTO,
) -> Iterable:
    """Return rows of this table converted to the requested item type.

    :param item_type: ``ItemType.Row``, ``StructRow``, ``Record`` or ``Line``;
        when not defined, ``get_item_type()`` is used.
    :param verbose: forwarded to ``get_rows()``.
    :param message: progress caption; may contain positional ``{}`` or named
        ``{count}``/``{name}`` placeholders. A default caption is used when not defined.
    :param step: when truthy and the logger is an ``ExtendedLoggerInterface``,
        the items are wrapped into the logger's progress iterator.
    :return: an iterable of items (lazy ``map`` for converted types).
    :raises ValueError: for an unsupported item type.
    """
    item_type = Auto.delayed_acquire(item_type, self.get_item_type)
    rows = self.get_rows(verbose=verbose)
    if item_type == ItemType.Row:
        items = rows
    elif item_type == ItemType.StructRow:
        row_class = ItemType.StructRow.get_class()
        items = map(lambda i: row_class(i, self.get_struct()), rows)
    elif item_type == ItemType.Record:
        # Columns go first so that record keys are column names and values are
        # row values (the pair order used to be swapped, giving {value: column}).
        items = map(lambda r: dict(zip(self.get_columns(), r)), rows)
    elif item_type == ItemType.Line:
        items = map(lambda r: '\t'.join([str(v) for v in r]), rows)
    else:
        raise ValueError('Table.get_items_of_type(): cannot convert Rows to {}'.format(item_type))
    if step:
        logger = self.get_logger()
        if isinstance(logger, ExtendedLoggerInterface):
            count = self._get_fast_count()
            if not Auto.is_defined(message):
                message = 'Downloading {count} lines from {name}'
            # Positional placeholders are filled first, named ones afterwards.
            if '{}' in message:
                message = message.format(count, self.get_name())
            if '{' in message:
                message = message.format(count=count, name=self.get_name())
            items = logger.progress(items, name=message, count=count, step=step, context=self.get_context())
    return items
def describe(
        self,
        *filters,
        take_struct_from_source: bool = False,
        count: Count = DEFAULT_SHOW_COUNT,
        columns: Columns = None,
        allow_collect: bool = True,
        show_header: bool = True,
        struct_as_dataframe: bool = False,
        delimiter: str = ' ',
        output=AUTO,
        **filter_kwargs
):
    """Print a description of the stream: headers, struct and example items.

    The expected struct is taken from ``get_struct()`` when this object has
    such a method, else from the source (if ``take_struct_from_source``),
    else it is detected from the data. A struct detected from the example
    items is validated against the expected one, and the validation message
    is passed as a comment to the struct description.

    :param filters: positional filters forwarded to ``example()``.
    :param take_struct_from_source: use ``get_source_struct()`` when there is no own struct.
    :param count: number of example items; also passed to struct detection.
    :param columns: not used by this implementation.
    :param allow_collect: not used by this implementation.
    :param show_header: print the lines from ``get_str_headers()`` first.
    :param struct_as_dataframe: return the struct description instead of the
        example (effective only when ``get_use_objects_for_output()`` is true).
    :param delimiter: forwarded to the struct's ``describe()``.
    :param output: output target; when not defined, ``get_logger()`` is used.
    :param filter_kwargs: keyword filters forwarded to ``example()``.
    :return: the struct description or the demo example, see ``struct_as_dataframe``.
    :raises AssertionError: when the expected struct cannot be described or
        the detected struct cannot be validated.
    """
    output = Auto.delayed_acquire(output, self.get_logger)
    if show_header:
        for line in self.get_str_headers():
            self.output_line(line, output=output)
    example = self.example(*filters, **filter_kwargs, count=count)
    if hasattr(self, 'get_struct'):
        expected_struct = self.get_struct()
        source_str = 'native'
    elif take_struct_from_source:
        expected_struct = self.get_source_struct()
        source_str = 'from source {}'.format(repr(self.get_source()))
    else:
        expected_struct = self.get_detected_struct()
        source_str = 'detected from example items'
    expected_struct = fc.FlatStruct.convert_to_native(expected_struct)
    detected_struct = example.get_detected_struct(count)
    assert isinstance(expected_struct, fc.FlatStruct) or hasattr(expected_struct, 'describe'), expected_struct
    # Check the detected struct itself (the check used to repeat the one for
    # expected_struct): validate_about() is the method called on it right below.
    assert isinstance(detected_struct, fc.FlatStruct) or hasattr(detected_struct, 'validate_about'), detected_struct
    detected_struct.validate_about(expected_struct)
    validation_message = '{} {}'.format(source_str, expected_struct.get_validation_message())
    struct_as_dataframe = struct_as_dataframe and get_use_objects_for_output()
    struct_dataframe = expected_struct.describe(
        as_dataframe=struct_as_dataframe,
        show_header=False,
        output=output,
        delimiter=delimiter,
        example=example.get_one_item(),
        comment=validation_message,
    )
    if struct_as_dataframe:
        return struct_dataframe
    return example.get_demo_example(as_dataframe=get_use_objects_for_output())
def sort(self, *keys, reverse: bool = False, step: AutoCount = AUTO, verbose: AutoBool = True) -> Native:
    """Sort the stream in memory when allowed, otherwise through the disk.

    :param keys: sort keys, normalised with ``update()``; with no keys the
        items themselves are compared (``fs.same()``).
    :param reverse: forwarded to the chosen sort method.
    :param step: memory limit in items; when not defined,
        ``get_limit_items_in_memory()`` is used. ``None`` selects the in-memory sort.
    :param verbose: forwarded to the chosen sort method.
    :return: the sorted stream passed through ``_assume_native()``.
    """
    keys = update(keys)
    step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
    key_function = fs.composite_key(keys) if len(keys) > 0 else fs.same()
    fits_in_memory = self.can_be_in_memory(step=step) or step is None
    if fits_in_memory:
        sorted_stream = self.memory_sort(key_function, reverse=reverse, verbose=verbose)
    else:
        sorted_stream = self.disk_sort(key_function, reverse=reverse, step=step, verbose=verbose)
    return self._assume_native(sorted_stream)
def get_stream_kwargs(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        verbose: AutoBool = AUTO,
        step: AutoCount = AUTO,
        message: AutoName = AUTO,
        **kwargs
) -> dict:
    """Collect keyword arguments for building a stream over this object.

    :param data: items for the stream; when not defined, they are fetched
        with ``_get_items_of_type()`` for the type from ``_get_item_type()``.
    :param name: stream name; when not defined, ``_get_generated_stream_name()`` is used.
    :param verbose: used only when items are fetched here.
    :param step: used only when items are fetched here.
    :param message: used only when items are fetched here.
    :param kwargs: extra entries; they override the collected ones.
    :return: dict with ``data``, ``name``, ``source``, ``count``, ``context`` plus ``kwargs``.
    """
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    if not Auto.is_defined(data):
        data = self._get_items_of_type(
            self._get_item_type(),
            verbose=verbose,
            step=step,
            message=message,
        )
    stream_kwargs = {
        'data': data,
        'name': name,
        'source': self,
        'count': self._get_fast_count(),
        'context': self.get_context(),
    }
    stream_kwargs.update(kwargs)
    return stream_kwargs
def to_stream_type(
        self,
        stream_type: StreamType,
        step: AutoCount = AUTO,
        verbose: AutoBool = AUTO,
        message: Union[str, Auto, None] = AUTO,
        **kwargs,
) -> Stream:
    """Build a stream of the given type over this object's items.

    :param stream_type: target stream type; when not defined, ``_get_stream_type()`` is used.
    :param step: forwarded to item fetching and to ``get_stream_kwargs()``.
    :param verbose: forwarded to item fetching and to ``get_stream_kwargs()``.
    :param message: forwarded to item fetching.
    :param kwargs: extra stream arguments; ``data`` is taken out of them and,
        when not defined, replaced by items from ``_get_items_of_type()``.
        For struct rows ``struct`` is added from ``get_struct()`` if absent.
    :return: result of ``stream_type.stream()`` called with the collected arguments.
    """
    stream_type = Auto.delayed_acquire(stream_type, self._get_stream_type)
    item_type = self._get_item_type(stream_type)

    struct_is_missing = (
        item_type == ItemType.StructRow
        and hasattr(self, 'get_struct')
        and 'struct' not in kwargs
    )
    if struct_is_missing:
        kwargs['struct'] = self.get_struct()

    data = kwargs.pop('data', None)
    if not Auto.is_defined(data):
        data = self._get_items_of_type(item_type, step=step, verbose=verbose, message=message)
    return stream_type.stream(
        **self.get_stream_kwargs(data=data, step=step, verbose=verbose, **kwargs)
    )
def get_dict_output_field_types(self, struct: Union[Struct, Auto] = AUTO) -> dict:
    """Merge the output field types reported by every description.

    :param struct: input struct passed to each description; when not
        defined, ``get_input_struct()`` is used.
    :return: one dict built by updating with each description's result in
        the order of ``get_descriptions()`` (later entries override earlier ones).
    """
    input_struct = Auto.delayed_acquire(struct, self.get_input_struct)
    merged_types = {}
    for description in self.get_descriptions():
        field_types = description.get_dict_output_field_types(input_struct)
        merged_types.update(field_types)
    return merged_types