def set_logger(
        self,
        logger: Union[Logger, Auto] = AUTO,
        selection_logger: Union[Logger, Auto] = AUTO,
) -> None:
    """Store the main logger and the selection logger, resolving AUTO defaults."""
    # NOTE(review): both AUTO-defaults are looked up on the *logger* argument
    # itself; when logger is AUTO this resolves to None — confirm intended.
    fallback_logger = getattr(logger, 'get_logger', None)
    fallback_selection = getattr(logger, 'get_selection_logger', None)
    self._logger = Auto.acquire(logger, fallback_logger)
    self._selection_logger = Auto.acquire(selection_logger, fallback_selection)
def to_stream(self, data: Union[Iterable, Auto] = AUTO, name: AutoName = AUTO,
              stream_type: Union[StreamType, Auto] = AUTO, ex: OptionalFields = None,
              step: AutoCount = AUTO, **kwargs) -> Stream:
    """Build a stream over this object's items, deriving type, name and meta."""
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    stream_type = self._get_stream_type(stream_type)
    stream_class = self._get_stream_class(stream_type)
    # Resolve the item type from the class or, failing that, an empty instance.
    if hasattr(stream_class, 'get_item_type'):
        item_type = stream_class.get_item_type()
    else:
        probe = stream_class([])
        item_type = probe.get_item_type() if hasattr(probe, 'get_item_type') else AUTO
    if not Auto.is_defined(data):
        data = self._get_items_of_type(item_type, verbose=kwargs.get('verbose', AUTO), step=step)
    meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
    if 'count' not in meta and 'count' not in kwargs:
        meta['count'] = self._get_fast_count()
    meta.setdefault('source', self)
    return self._assume_stream(stream_class(data, **meta))
def to_stream(self, data: Union[Iterable, Auto] = AUTO, name: AutoName = AUTO,
              stream_type: Union[StreamType, Auto] = AUTO, ex: OptionalFields = None,
              step: AutoCount = AUTO, **kwargs) -> Stream:
    """Build a stream; SqlStream is handled here, all other types delegate upward."""
    stream_type = Auto.acquire(stream_type, StreamType.SqlStream)
    if stream_type != StreamType.SqlStream:
        return super().to_stream(
            data=data,
            name=name,
            stream_type=stream_type,
            ex=ex,
            step=step,
            **kwargs,
        )
    # SqlStream generates its own data from the query — explicit data is not allowed.
    assert not Auto.is_defined(data)
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    stream_class = stream_type.get_class()
    meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
    meta['source'] = self
    return stream_class(data, **meta)
def __init__(
        self,
        data: Iterable,
        name: AutoName = AUTO,
        check: bool = False,
        count: AutoCount = None,
        less_than: AutoCount = None,
        source: Connector = None,
        context: Context = None,
        max_items_in_memory: AutoCount = AUTO,
        tmp_files: TmpMask = AUTO,
):
    """Initialize the stream, deriving count from data size when available."""
    count = get_optional_len(data, count)
    if count and Auto.is_defined(count) and not Auto.is_defined(less_than):
        less_than = count
    # Placeholder so attribute exists even if super().__init__ touches it indirectly.
    self._tmp_files = None
    super().__init__(
        data=data,
        name=name,
        check=check,
        source=source,
        context=context,
        count=count,
        less_than=less_than,
        max_items_in_memory=max_items_in_memory,
    )
    self._tmp_files = Auto.delayed_acquire(tmp_files, sm.get_tmp_mask, self.get_name())
def simple_select(
        self,
        fields: OptionalFields,
        filters: OptionalFields = None,
        sort: OptionalFields = None,
        count: Count = None,
        stream_type: Union[StreamType, Auto] = AUTO,
        verbose: AutoBool = AUTO,
) -> Stream:
    """Run a SELECT and wrap the result rows as a Row- or RecordStream."""
    stream_type = Auto.acquire(stream_type, StreamType.RecordStream)
    stream_class = stream_type.get_class()
    rows = self.execute_select(fields=fields, filters=filters, sort=sort, count=count, verbose=verbose)
    if stream_type == StreamType.RowStream:
        items = rows
    elif stream_type == StreamType.RecordStream:
        # Zip every row with the column names to produce dict records.
        columns = self.get_columns()
        items = map(lambda r: dict(zip(columns, r)), rows)
    else:
        raise NotImplementedError
    if Auto.is_defined(count) and count < MAX_ITEMS_IN_MEMORY:
        # Small result: materialize so the stream knows its exact length.
        items = list(items)
        count = len(items)
    return stream_class(items, count=count, source=self, context=self.get_context())
def __init__(
        self,
        name: str,
        content_format: Union[ContentFormatInterface, Auto] = AUTO,
        struct: Union[Struct, Auto, None] = AUTO,
        folder: Connector = None,
        context: Context = AUTO,
        first_line_is_title: AutoBool = AUTO,
        expected_count: AutoCount = AUTO,
        caption: Optional[str] = None,
        verbose: AutoBool = AUTO,
        **kwargs
):
    """Initialize the file connector, resolving its parent folder.

    Folder resolution order: explicit `folder`, then `parent` kwarg,
    then the context's job folder, then the class default folder.
    """
    parent = kwargs.pop('parent', None)
    if folder:
        message = 'only LocalFolder supported for *File instances (got {})'.format(type(folder))
        assert isinstance(folder, ConnectorInterface) or folder.is_folder(), message
        # Explicit folder must not contradict an explicit parent kwarg.
        assert folder == parent or not Auto.is_defined(parent)
    elif Auto.is_defined(parent):
        folder = parent
    elif Auto.is_defined(context):
        folder = context.get_job_folder()
    else:
        folder = self.get_default_folder()
    self._fileholder = None
    super().__init__(
        name=name,
        content_format=content_format,
        struct=struct,
        first_line_is_title=first_line_is_title,
        expected_count=expected_count,
        caption=caption,
        parent=folder,
        context=context,
        verbose=verbose,
        **kwargs,
    )
def get_new_progress(self, name: str, count: Optional[int] = None, context: AutoContext = AUTO):
    """Create a progress tracker via the logger, if one supports it.

    Returns None implicitly when no suitable logger is available.
    """
    logger = self.get_logger()
    if not Auto.is_defined(logger) and Auto.is_defined(context):
        logger = context.get_logger()
    supports_progress = isinstance(logger, ExtendedLoggerInterface) or hasattr(logger, 'get_new_progress')
    if supports_progress:
        return logger.get_new_progress(name, count=count, context=context)
def __init__(
        self,
        mask: str,
        parent: HierarchicConnector,
        context: AutoContext = None,
        verbose: AutoBool = AUTO,
):
    """Initialize a mask connector, defaulting parent to the context's local storage."""
    if not Auto.is_defined(parent):
        if Auto.is_defined(context):
            parent = context.get_local_storage()
    # Parent must be a container; fails with AttributeError when neither
    # parent nor context was provided — presumably callers always pass one.
    assert parent.is_folder() or parent.is_storage()
    super().__init__(path=mask, parent=parent, context=context, verbose=verbose)
def set_context(self, context: AutoContext, reset: bool = False, inplace: bool = True) -> Optional[Native]:
    """Propagate the context through the parent, or adopt it as parent.

    NOTE(review): the `reset` parameter is accepted but never read here —
    confirm whether it should influence the propagation.
    """
    if context:
        parent = self.get_parent()
        if Auto.is_defined(parent):
            parent.set_context(context, reset=False, inplace=True)
        elif Auto.is_defined(context):
            self.set_parent(context, reset=False, inplace=True)
    if not inplace:
        return self
def set_verbose(self, verbose: AutoBool = AUTO, parent: AutoConnector = AUTO) -> Native:
    """Set verbosity, inheriting from the parent when not given explicitly."""
    if not Auto.is_defined(verbose):
        parent = Auto.delayed_acquire(parent, self.get_parent)
        # Prefer the parent's accessor, fall back to its attribute, then the default.
        if hasattr(parent, 'is_verbose'):
            verbose = parent.is_verbose()
        elif hasattr(parent, 'verbose'):
            verbose = parent.verbose
        else:
            verbose = DEFAULT_VERBOSE
    self._verbose = verbose
    return self
def get_count(self, allow_reopen: bool = True, allow_slow_mode: bool = True, force: bool = False) -> Count:
    """Return the line count, recounting if forced, outdated or unknown.

    Returns None implicitly when no count could be determined.
    """
    no_cached_count = not Auto.is_defined(self.get_prev_lines_count())
    must_recount = force or self.is_outdated() or no_cached_count
    if self.is_existing() and must_recount:
        count = self.get_actual_lines_count(allow_reopen=allow_reopen, allow_slow_mode=allow_slow_mode)
        self.set_count(count)
    else:
        count = self.get_prev_lines_count()
    if Auto.is_defined(count):
        return count
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream of this file's content; delegates to to_stream_type()."""
    if Auto.is_defined(data):
        kwargs['data'] = data
    stream_type = Auto.delayed_acquire(stream_type, self.get_stream_type)
    assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
    return self.to_stream_type(stream_type=stream_type, step=step, **kwargs)
def _collect_inplace(self, log: AutoBool = AUTO) -> None:
    """Materialize the stream's items into memory, updating the count."""
    estimated_count = self.get_estimated_count()
    if Auto.is_defined(estimated_count):
        # Default: only log when the collection is expected to be large.
        log = Auto.acquire(log, estimated_count > self.get_limit_items_in_memory())
    if log and estimated_count:
        self.log('Trying to collect {} items into memory from {}...'.format(estimated_count, self.__repr__()))
    self.set_data(self.get_list(), inplace=True)
    self.update_count(force=False)
    if log:
        self.log('Collected {} items into memory from {}...'.format(estimated_count, self.__repr__()))
def get_fast_lines_count(self, ending: Union[str, Auto] = AUTO, verbose: AutoBool = AUTO) -> int:
    """Count lines quickly by scanning chunks for the line ending.

    Raises ValueError for gzip files, where chunk scanning is unavailable.
    """
    if self.is_gzip():
        raise ValueError('get_fast_lines_count() method is not available for gzip-files')
    if not Auto.is_defined(ending):
        has_format = hasattr(self, 'get_content_format')
        ending = self.get_content_format().get_ending() if has_format else '\n'
    verbose = Auto.acquire(verbose, self.is_verbose())
    self.log('Counting lines in {}...'.format(self.get_name()), end='\r', verbose=verbose)
    terminator_count = sum(chunk.count(ending) for chunk in self.get_chunks())
    lines_count = terminator_count + 1
    self.set_count(lines_count)
    return lines_count
def get_group_header(self, name: Comment = AUTO, caption: Comment = AUTO, comment: Comment = None) -> Iterable[str]:
    """Yield header lines: name, caption, fields-count (for title rows), comment."""
    # Remember whether the caller asked for the default (title) header
    # before resolving AUTO values below.
    is_title_row = name == AUTO
    name = Auto.acquire(name, self.get_name())
    caption = Auto.acquire(caption, self.get_caption())
    if Auto.is_defined(name):
        yield name
    if Auto.is_defined(caption):
        yield caption
    if is_title_row:
        yield self.get_str_fields_count()
    if Auto.is_defined(comment):
        yield comment
def is_inside_folder(self, folder: Union[str, Connector, Auto] = AUTO) -> bool:
    """Report whether this object lies inside the given (or own) folder."""
    target = Auto.acquire(folder, self.get_folder())
    # Accept either a path string or a folder connector.
    path = target if isinstance(target, str) else target.get_path()
    # NOTE(review): the membership direction (own folder path inside the
    # given folder's path) looks inverted — verify against callers.
    return self.get_folder_path() in path
def disk_sort(
        self,
        key: UniKey = fs.same(),
        reverse: bool = False,
        step: AutoCount = AUTO,
        verbose: AutoBool = False,
) -> Native:
    """External merge sort: sort chunks on disk, then merge them lazily."""
    step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
    key_function = fs.composite_key(key)
    parts = self.split_to_disk_by_step(
        step=step,
        sort_each_by=key_function,
        reverse=reverse,
        verbose=verbose,
    )
    assert parts, 'streams must be non-empty'
    part_iterators = [p.get_iter() for p in parts]
    part_counts = [p.get_count() or 0 for p in parts]
    self.log('Merging {} parts... '.format(len(part_iterators)), verbose=verbose)
    merged_items = algo.merge_iter(
        part_iterators,
        key_function=key_function,
        reverse=reverse,
        # Temporary chunk files are removed once the merge is exhausted.
        post_action=self.get_tmp_files().remove_all,
    )
    result = self.stream(merged_items, count=sum(part_counts))
    return self._assume_native(result)
def insert_data(
        self,
        table: Union[Table, Name],
        data: Data,
        struct: Struct = None,
        encoding: Optional[str] = None,
        skip_errors: bool = False,
        skip_lines: Count = 0,
        skip_first_line: bool = False,
        step: AutoCount = DEFAULT_STEP,
        verbose: AutoBool = AUTO,
) -> tuple:
    """Insert data into a table; returns (initial_count, final_count)."""
    if not Auto.is_defined(skip_lines):
        skip_lines = 0
    has_struct_description = isinstance(struct, StructInterface) or hasattr(struct, 'get_struct_str')
    if not has_struct_description:
        message = 'Struct as {} is deprecated, use FlatStruct instead'.format(type(struct))
        self.log(msg=message, level=LoggingLevel.Warning)
        struct = FlatStruct(struct or [])
    stream = self._get_struct_stream_from_data(
        data, struct=struct,
        encoding=encoding, skip_first_line=skip_first_line, verbose=verbose,
    )
    if skip_lines:
        stream = stream.skip(skip_lines)
    if stream.get_stream_type() != StreamType.StructStream:
        # Coerce to a structured stream, keeping the known count in meta.
        stream = stream.structure(
            struct,
            skip_bad_rows=True,
            verbose=True,
        ).update_meta(count=stream.get_count())
    initial_count = stream.get_estimated_count() + skip_lines
    final_count = self.insert_struct_stream(
        table, stream,
        skip_errors=skip_errors, step=step, verbose=verbose,
    )
    return initial_count, final_count
def force_upload_table(
        self,
        table: Union[Table, Name],
        struct: Struct,
        data: Data,
        encoding: Optional[str] = None,
        step: AutoCount = DEFAULT_STEP,
        skip_lines: Count = 0,
        skip_first_line: bool = False,
        max_error_rate: float = 0.0,
        verbose: AutoBool = AUTO,
) -> Table:
    """(Re)create a table, upload data into it and verify the error rate.

    The table is dropped and re-created unless skip_lines is set (resume mode).
    After the upload, the written count is compared with a SELECT COUNT to
    compute error_rate; an AssertionError is raised when it exceeds
    max_error_rate (checks are skipped when max_error_rate is None).
    """
    verbose = Auto.acquire(verbose, self.verbose)
    table_name, struct = self._get_table_name_and_struct(table, struct)
    if not skip_lines:
        self.create_table(table_name, struct=struct, drop_if_exists=True, verbose=verbose)
    skip_errors = (max_error_rate is None) or (max_error_rate > DEFAULT_ERRORS_THRESHOLD)
    initial_count, write_count = self.insert_data(
        table, struct=struct, data=data, encoding=encoding, skip_first_line=skip_first_line,
        step=step, skip_lines=skip_lines, skip_errors=skip_errors, verbose=verbose,
    )
    write_count += (skip_lines if isinstance(skip_lines, int) else 0)  # can be None or Auto
    result_count = self.select_count(table)
    if write_count:
        error_rate = (write_count - result_count) / write_count
        message = 'Check counts: {} initial, {} uploaded, {} written, {} error_rate'
    else:
        error_rate = 1.0
        message = 'ERR: Data {} and/or Table {} is empty.'.format(data, table)
    self.log(message.format(initial_count, write_count, result_count, error_rate), verbose=verbose)
    if max_error_rate is not None:
        message = 'Too many errors or skipped lines ({} > {})'.format(error_rate, max_error_rate)
        # Fix: use <= so a perfect upload (error_rate == 0.0) passes at the
        # default max_error_rate of 0.0; the original strict `<` rejected it
        # and contradicted the "({} > {})" failure message above.
        assert error_rate <= max_error_rate, message
    return self.table(table, struct=struct)
def get_struct_from_source(
        self,
        set_struct: bool = False,
        use_declared_types: bool = True,
        verbose: AutoBool = AUTO,
) -> Struct:
    """Detect the struct from the source's title row, optionally applying it.

    Raises AssertionError when the underlying file/object does not exist.
    """
    assert self.is_existing(), 'For detect struct file/object must be existing: {}'.format(self.get_path())
    verbose = Auto.acquire(verbose, self.is_verbose())
    declared_types = dict()
    if use_declared_types:
        # Reuse previously declared field types as hints for detection.
        declared_format = self.get_declared_format()
        if isinstance(declared_format, FlatStructFormat) or hasattr(declared_format, 'get_struct'):
            declared_struct = declared_format.get_struct()
            if isinstance(declared_struct, StructInterface) or hasattr(declared_struct, 'get_types_dict'):
                declared_types = declared_struct.get_types_dict()
    struct = self._get_struct_from_source(types=declared_types, verbose=verbose)
    message = 'Struct for {} detected by title row: {}'.format(self.get_name(), struct.get_struct_str(None))
    self.log(message, end='\n', verbose=verbose)
    if set_struct:
        self.set_struct(struct, inplace=True)
    return struct
def execute_if_exists(
        self,
        query: str,
        table: Union[Table, Name],
        message_if_yes: Optional[str] = None,
        message_if_no: Optional[str] = None,
        stop_if_no: bool = False,
        verbose: AutoBool = AUTO,
) -> Optional[Iterable]:
    """Run the query only when the table exists; log or raise otherwise.

    Any '{}' placeholder in query/messages is filled with the table name.
    Raises ValueError when the table is missing and stop_if_no is set.
    """
    verbose = Auto.acquire(verbose, message_if_yes or message_if_no)
    table_name = self._get_table_name(table)
    if self.exists_table(table_name, verbose=verbose):
        if '{}' in query:
            query = query.format(table_name)
        result = self.execute(query, verbose=verbose)
        if message_if_yes:
            if '{}' in message_if_yes:
                message_if_yes = message_if_yes.format(table_name)
            self.log(message_if_yes, verbose=verbose)
        return result
    if message_if_no and '{}' in message_if_no:
        message_if_no = message_if_no.format(table_name)
    if stop_if_no:
        raise ValueError(message_if_no)
    if message_if_no:
        self.log(message_if_no, verbose=verbose)
def __init__(
        self,
        fields: Iterable,
        name: StructName = None,
        caption: Optional[str] = None,
        default_type: Type = AUTO,
        exclude_duplicates: bool = False,
        reassign_struct_name: bool = False,
):
    """Build a struct from a mix of field descriptions, nested groups and lists."""
    name = Auto.acquire(name, get_generated_name(prefix='FieldGroup'))
    self._caption = caption or ''
    super().__init__(name=name, data=list())
    add_kwargs = dict(
        default_type=default_type,
        exclude_duplicates=exclude_duplicates,
        reassign_struct_name=reassign_struct_name,
        inplace=True,
    )
    for member in fields:
        if isinstance(member, StructInterface):  # FieldGroup
            self.add_fields(member.get_fields_descriptions(), **add_kwargs)
        elif isinstance(member, list):  # not tuple (tuple can be old-style FieldDescription
            self.add_fields(*member, **add_kwargs)
        elif member:
            self.append_field(member, **add_kwargs)
def get_struct_repr_lines(self, example: Optional[dict] = None, delimiter: str = COLUMN_DELIMITER,
                          select_fields: Optional[Array] = None, count: Optional[int] = None) -> Generator:
    """Yield human-readable description lines for the struct, one per field.

    Group rows are rendered via get_group_header(); `select_fields` are
    marked in the validity column; `count` limits the number of fields shown.
    """
    columns, template = self._get_describe_template(example)
    use_tabs = delimiter == '\t'
    yield '\t'.join(columns) if use_tabs else template.format(*columns)
    for (n, type_name, name, caption, is_valid) in self.get_struct_description(include_header=False):
        if type_name == GROUP_TYPE_STR:
            yield ''
            yield from self.get_group_header(name, caption=caption)
        else:
            if name in (select_fields or []):
                # Highlight selected fields in the validity marker column.
                is_valid = '>' if is_valid == '.' else str(is_valid).upper()
            if example:
                row = (is_valid, n, type_name, name, str(example.get(name)), caption)
            else:
                row = (is_valid, n, type_name, name, caption)
            yield '\t'.join(row) if use_tabs else template.format(*row)
        if Auto.is_defined(count):
            if n >= count - 1:
                break
def get_types_list(self, dialect: Union[DialectType, Auto] = DialectType.String) -> list:
    """Return each field's type, translated to the dialect when one is given."""
    if not Auto.is_defined(dialect):
        return [field.get_type() for field in self.get_fields()]
    return [field.get_type_in(dialect) for field in self.get_fields()]
def hist(
        data: Data,
        *fields,
        in_memory: AutoBool = AUTO,
        step: Count = DEFAULT_STEP,
        logger: Union[LoggerInterface, Auto] = AUTO,
        msg: Optional[Message] = None,
) -> RegularStream:
    """Build a histogram stream (field/value/count/share records) over fields."""
    stream = _stream(data)
    total_count = stream.get_count()
    in_memory = Auto.acquire(in_memory, stream.is_in_memory())
    logger = Auto.acquire(logger, stream.get_logger, delayed=True)
    # if in_memory:
    if in_memory or len(fields) > 1:
        hist_records = get_hist_records(stream, fields, in_memory=in_memory, logger=logger, msg=msg)
        stream = stream.stream(hist_records, stream_type='RecordStream')
    else:
        if len(fields) > 1:
            stream = stream.tee_stream()
        f = fields[0]
        if logger:
            logger.log('Calc hist for field {}...'.format(f))
        # Lazy path: group by the single field and count occurrences.
        stream = stream.to_stream(
            stream_type='RecordStream',
            columns=fields,
        ).select(
            f,
        ).group_by(
            f,
            values=['-'],
            step=step,
        ).select(
            field=lambda r, k=f: k,
            value=f,
            count=('-', len),
        ).sort('value')
    if not total_count:
        stream = stream.to_memory()
        total_count = sum(stream.filter(field=fields[0]).get_one_column_values('count'))
    stream = stream.select(
        '*',
        total_count=fs.const(total_count),
        share=('count', 'total_count', lambda c, t: c / t if t else None),
    )
    return _assume_native(stream)
def __init__(
        self,
        name: Name,
        content_format: Union[ContentFormatInterface, ContentType, Auto] = AUTO,
        struct: Union[StructInterface, Auto, None] = AUTO,
        first_line_is_title: AutoBool = AUTO,
        parent: Parent = None,
        context: AutoContext = AUTO,
        streams: Links = None,
        expected_count: AutoCount = AUTO,
        caption: Optional[str] = None,
        verbose: AutoBool = AUTO,
        **kwargs
):
    """Initialize a leaf connector, normalizing content_format and struct.

    content_format may arrive as AUTO (detected by name), a legacy enum-like
    value (deprecated, warned), a string, a ContentType, or a ready
    ContentFormatInterface; kwargs are forwarded into the format object.
    """
    self._declared_format = None
    self._detected_format = None
    self._modification_ts = None
    self._count = expected_count
    self._caption = caption
    super().__init__(name=name, parent=parent, context=context, children=streams, verbose=verbose)
    content_format = Auto.delayed_acquire(content_format, self._get_detected_format_by_name, name, **kwargs)
    allowed_classes = ContentType, ContentFormatInterface, str
    uses_deprecated_format = hasattr(content_format, 'get_value') and not isinstance(content_format, allowed_classes)
    if uses_deprecated_format:
        msg = 'LeafConnector({}, {}): content_format as {} is deprecated, use ContentType or ContentFormat instead'
        self.log(msg.format(name, content_format, content_format.__class__.__name__), level=30)
        content_format = content_format.get_value()
    if isinstance(content_format, str):
        content_format = ContentType(content_format)  # ContentType.detect(content_format) ?
    if isinstance(content_format, ContentType):  # tmp fix
        content_class = content_format.get_class()
        content_format = content_class(**kwargs)
    elif isinstance(content_format, ContentFormatInterface):
        content_format.set_inplace(**kwargs)
    else:
        if kwargs:
            msg = 'LeafConnector: kwargs allowed for ContentType only, not for {}, got kwargs={}'
            raise ValueError(msg.format(content_format, kwargs))
    assert isinstance(content_format, ContentFormatInterface), 'Expect ContentFormat, got {}'.format(content_format)
    self.set_content_format(content_format, inplace=True)
    self.set_first_line_title(first_line_is_title)
    if struct is not None:
        if struct == AUTO:
            struct = self._get_detected_struct(use_declared_types=False)
        if Auto.is_defined(struct, check_name=False):
            self.set_struct(struct, inplace=True)
def show(self, count: Optional[int] = None, as_dataframe: Union[bool, Auto] = AUTO) -> Optional[DataFrame]:
    """Display the object as a DataFrame or as a plain-text description."""
    as_dataframe = Auto.acquire(as_dataframe, get_use_objects_for_output())
    if not as_dataframe:
        return self.describe(as_dataframe=False)
    return self.get_dataframe()
def format(self, value, skip_errors: bool = False) -> str:
    """Format the value through the field's representation, or via str()."""
    representation = self.get_representation()
    if not Auto.is_defined(representation):
        return str(value)
    try:
        return representation.format(value, skip_errors=skip_errors)
    except AttributeError:
        # Older representations do not accept skip_errors.
        return representation.format(value)
def get_detected_format(
        self,
        detect: bool = True,
        force: bool = False,
        skip_missing: bool = True,
) -> ContentFormatInterface:
    """Return the detected content format, (re)detecting it when needed."""
    needs_detection = force or (detect and not Auto.is_defined(self._detected_format))
    if needs_detection:
        self.reset_detected_format(use_declared_types=True, skip_missing=skip_missing)
    return self._detected_format
def insert_rows(
        self,
        table: str,
        rows: Iterable,
        columns: Array,
        step: int = DEFAULT_STEP,
        skip_errors: bool = False,
        expected_count: Count = None,
        return_count: bool = True,
        verbose: AutoBool = AUTO,
) -> Count:
    """Insert rows into a table, committing in batches of `step`.

    Two modes: fast batch insert (default), or per-row execution with error
    skipping when skip_errors is set. Returns the index of the last enumerated
    row when return_count is True (original convention, kept for callers).
    """
    assert isinstance(columns, ARRAY_TYPES), 'list or tuple expected, got {}'.format(columns)
    verbose = Auto.acquire(verbose, self.verbose)
    if isinstance(rows, Sized):
        count = len(rows)
    else:
        count = expected_count
    conn = self.connect(reconnect=True)
    cur = conn.cursor()
    use_fast_batch_method = not skip_errors
    query_args = dict(table=table)
    if use_fast_batch_method:
        query_template = 'INSERT INTO {table} VALUES ({values});'
        placeholders = ['%({})s'.format(c) for c in columns]
    else:  # elif skip_errors:
        query_template = 'INSERT INTO {table} ({columns}) VALUES ({values})'
        placeholders = ['%s' for _ in columns]
        query_args['columns'] = ', '.join(columns)
    query_args['values'] = ', '.join(placeholders)
    query = query_template.format(**query_args)
    message = verbose if isinstance(verbose, str) else 'Commit {}b to {}'.format(step, table)
    progress = self.get_new_progress(message, count=count)
    progress.start()
    records_batch = list()
    n = 0
    for n, row in enumerate(rows):
        if use_fast_batch_method:
            current_record = {k: v for k, v in zip(columns, row)}
            records_batch.append(current_record)
        elif skip_errors:
            try:
                cur.execute(query, row)
            # Fix: original `except TypeError or IndexError` evaluated the
            # boolean expression to TypeError, so IndexError was never caught;
            # a parenthesized tuple catches both.
            except (TypeError, IndexError) as e:  # TypeError: not all arguments converted during string formatting
                self.log('Error line: {}'.format(str(row)), level=LoggingLevel.Debug, verbose=verbose)
                self.log('{}: {}'.format(e.__class__.__name__, e), level=LoggingLevel.Error)
        if (n + 1) % step == 0:
            if use_fast_batch_method:
                self.execute_batch(query, records_batch, step, cursor=cur)
                records_batch = list()
            if not progress.get_position():
                progress.update(0)
            conn.commit()
            progress.update(n)
            gc.collect()
    if use_fast_batch_method:
        # Flush the tail batch that did not reach a full step.
        self.execute_batch(query, records_batch, step, cursor=cur)
    conn.commit()
    progress.finish(n)
    if return_count:
        return n