def set_logger(
        self,
        logger: Union[Logger, Auto] = AUTO,
        selection_logger: Union[Logger, Auto] = AUTO,
) -> None:
    """Set the main and selection loggers, resolving AUTO defaults."""
    # NOTE(review): both fallbacks call getattr() on the *logger argument*;
    # when logger is AUTO the fallback is None, so AUTO resolves to None here.
    # A fallback on self (e.g. self.get_logger) looks more likely — confirm.
    self._logger = Auto.acquire(logger, getattr(logger, 'get_logger', None))
    self._selection_logger = Auto.acquire(selection_logger, getattr(logger, 'get_selection_logger', None))
def get_group_header(self, name: Comment = AUTO, caption: Comment = AUTO, comment: Comment = None) -> Iterable[str]:
    """Yield header lines: name, caption, fields count (title-row calls only), comment."""
    show_count = name == AUTO  # only the default (title-row) call reports the fields count
    resolved_name = Auto.acquire(name, self.get_name())
    resolved_caption = Auto.acquire(caption, self.get_caption())
    for value in (resolved_name, resolved_caption):
        if Auto.is_defined(value):
            yield value
    if show_count:
        yield self.get_str_fields_count()
    if Auto.is_defined(comment):
        yield comment
def __init__(
        self,
        fields: Iterable,
        name: StructName = None,
        caption: Optional[str] = None,
        default_type: Type = AUTO,
        exclude_duplicates: bool = False,
        reassign_struct_name: bool = False,
):
    """Build a field group from a mixed iterable of fields, field lists and nested structs."""
    name = Auto.acquire(name, get_generated_name(prefix='FieldGroup'))
    self._caption = caption or ''
    super().__init__(name=name, data=list())
    for field_or_group in fields:
        kwargs = dict(
            default_type=default_type,
            exclude_duplicates=exclude_duplicates,
            reassign_struct_name=reassign_struct_name,
            inplace=True,
        )
        if isinstance(field_or_group, StructInterface):  # FieldGroup: merge its field descriptions
            self.add_fields(field_or_group.get_fields_descriptions(), **kwargs)
        elif isinstance(field_or_group, list):  # not tuple (tuple can be old-style FieldDescription)
            self.add_fields(*field_or_group, **kwargs)
        elif field_or_group:  # skip falsy entries (None, empty)
            self.append_field(field_or_group, **kwargs)
def force_upload_table(
        self,
        table: Union[Table, Name],
        struct: Struct,
        data: Data,
        encoding: Optional[str] = None,
        step: AutoCount = DEFAULT_STEP,
        skip_lines: Count = 0,
        skip_first_line: bool = False,
        max_error_rate: float = 0.0,
        verbose: AutoBool = AUTO,
) -> Table:
    """Recreate the table (unless resuming via skip_lines) and upload data into it.

    Raises AssertionError when the observed error rate reaches max_error_rate.
    """
    verbose = Auto.acquire(verbose, self.verbose)
    table_name, struct = self._get_table_name_and_struct(table, struct)
    if not skip_lines:
        # fresh upload: drop and recreate the target table
        self.create_table(table_name, struct=struct, drop_if_exists=True, verbose=verbose)
    # without a strict threshold, per-row errors are skipped instead of aborting
    skip_errors = (max_error_rate is None) or (max_error_rate > DEFAULT_ERRORS_THRESHOLD)
    initial_count, write_count = self.insert_data(
        table, struct=struct, data=data, encoding=encoding,
        skip_first_line=skip_first_line, step=step,
        skip_lines=skip_lines, skip_errors=skip_errors, verbose=verbose,
    )
    write_count += (skip_lines if isinstance(skip_lines, int) else 0)  # can be None or Auto
    result_count = self.select_count(table)
    if write_count:
        error_rate = (write_count - result_count) / write_count
        message = 'Check counts: {} initial, {} uploaded, {} written, {} error_rate'
    else:
        # already formatted: the later .format() call is then a no-op (no placeholders left)
        error_rate = 1.0
        message = 'ERR: Data {} and/or Table {} is empty.'.format(data, table)
    self.log(message.format(initial_count, write_count, result_count, error_rate), verbose=verbose)
    if max_error_rate is not None:
        message = 'Too many errors or skipped lines ({} > {})'.format(error_rate, max_error_rate)
        assert error_rate < max_error_rate, message
    return self.table(table, struct=struct)
def execute_if_exists(
        self,
        query: str,
        table: Union[Table, Name],
        message_if_yes: Optional[str] = None,
        message_if_no: Optional[str] = None,
        stop_if_no: bool = False,
        verbose: AutoBool = AUTO,
) -> Optional[Iterable]:
    """Execute query ('{}' is replaced by the table name) only when the table exists.

    Returns the query result, or None when the table is absent.
    Raises ValueError when the table is absent and stop_if_no is set.
    """
    # verbose defaults to the yes/no message; a str value acts as a log message
    verbose = Auto.acquire(verbose, message_if_yes or message_if_no)
    table_name = self._get_table_name(table)
    table_exists = self.exists_table(table_name, verbose=verbose)
    if table_exists:
        if '{}' in query:
            query = query.format(table_name)
        result = self.execute(query, verbose=verbose)
        if message_if_yes:
            if '{}' in message_if_yes:
                message_if_yes = message_if_yes.format(table_name)
            self.log(message_if_yes, verbose=verbose)
        return result
    else:
        if message_if_no and '{}' in message_if_no:
            message_if_no = message_if_no.format(table_name)
        if stop_if_no:
            # NOTE(review): message_if_no may be None here — the ValueError then carries no text
            raise ValueError(message_if_no)
        else:
            if message_if_no:
                self.log(message_if_no, verbose=verbose)
def simple_select(
        self,
        fields: OptionalFields,
        filters: OptionalFields = None,
        sort: OptionalFields = None,
        count: Count = None,
        stream_type: Union[StreamType, Auto] = AUTO,
        verbose: AutoBool = AUTO,
) -> Stream:
    """Run a SELECT and wrap the result rows in a row- or record-stream."""
    stream_type = Auto.acquire(stream_type, StreamType.RecordStream)
    rows = self.execute_select(fields=fields, filters=filters, sort=sort, count=count, verbose=verbose)
    if stream_type == StreamType.RowStream:
        items = rows
    elif stream_type == StreamType.RecordStream:
        columns = self.get_columns()
        items = map(lambda r: dict(zip(columns, r)), rows)
    else:
        raise NotImplementedError
    # small result sets are materialized so the exact count is known
    if Auto.is_defined(count) and count < MAX_ITEMS_IN_MEMORY:
        items = list(items)
        count = len(items)
    return stream_type.get_class()(items, count=count, source=self, context=self.get_context())
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream over this object; defaults to a SqlStream bound to self."""
    stream_type = Auto.acquire(stream_type, StreamType.SqlStream)
    if stream_type == StreamType.SqlStream:
        # a SqlStream produces its own data from the source, so explicit data is rejected
        assert not Auto.is_defined(data)
        name = Auto.delayed_acquire(name, self._get_generated_stream_name)
        stream_class = stream_type.get_class()
        meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
        meta['source'] = self
        # NOTE: step is intentionally not forwarded on this branch
        return stream_class(data, **meta)
    else:
        return super().to_stream(
            data=data,
            name=name,
            stream_type=stream_type,
            ex=ex,
            step=step,
            **kwargs,
        )
def is_inside_folder(self, folder: Union[str, Connector, Auto] = AUTO) -> bool:
    """Return True if this object's folder path lies inside the given folder (default: its own)."""
    target = Auto.acquire(folder, self.get_folder())
    # a str is used as a path directly; anything else (e.g. LocalFolder) must expose get_path()
    path = target if isinstance(target, str) else target.get_path()
    # NOTE(review): substring containment, not component-wise comparison — '/a/b' matches '/a/bc'
    return self.get_folder_path() in path
def get_struct_from_source(
        self,
        set_struct: bool = False,
        use_declared_types: bool = True,
        verbose: AutoBool = AUTO,
) -> Struct:
    """Detect the struct from the source's title row, optionally applying declared types.

    The file/object must already exist. When set_struct is True, the detected
    struct is also stored on self.
    """
    assert self.is_existing(), 'For detect struct file/object must be existing: {}'.format(self.get_path())
    verbose = Auto.acquire(verbose, self.is_verbose())
    declared_types = dict()
    if use_declared_types:
        # take field types from the declared format, when it can provide a struct
        declared_format = self.get_declared_format()
        if isinstance(declared_format, FlatStructFormat) or hasattr(declared_format, 'get_struct'):
            declared_struct = declared_format.get_struct()
            if isinstance(declared_struct, StructInterface) or hasattr(declared_struct, 'get_types_dict'):
                declared_types = declared_struct.get_types_dict()
    struct = self._get_struct_from_source(types=declared_types, verbose=verbose)
    message = 'Struct for {} detected by title row: {}'.format(self.get_name(), struct.get_struct_str(None))
    self.log(message, end='\n', verbose=verbose)
    if set_struct:
        self.set_struct(struct, inplace=True)
    return struct
def show(self, count: Optional[int] = None, as_dataframe: Union[bool, Auto] = AUTO) -> Optional[DataFrame]:
    """Display the object as a pandas DataFrame, or fall back to a text description."""
    use_dataframe = Auto.acquire(as_dataframe, get_use_objects_for_output())
    # `count` is accepted for interface compatibility but unused on the dataframe path
    if not use_dataframe:
        return self.describe(as_dataframe=False)
    return self.get_dataframe()
def hist(
        data: Data,
        *fields,
        in_memory: AutoBool = AUTO,
        step: Count = DEFAULT_STEP,
        logger: Union[LoggerInterface, Auto] = AUTO,
        msg: Optional[Message] = None,
) -> RegularStream:
    """Build a histogram record-stream (field, value, count, share) over the given fields."""
    stream = _stream(data)
    total_count = stream.get_count()
    in_memory = Auto.acquire(in_memory, stream.is_in_memory())
    logger = Auto.acquire(logger, stream.get_logger, delayed=True)
    # if in_memory:
    if in_memory or len(fields) > 1:
        # multi-field (or in-memory) histogram: computed record-wise in one pass
        stream = stream.stream(
            get_hist_records(stream, fields, in_memory=in_memory, logger=logger, msg=msg),
            stream_type='RecordStream',
        )
    else:
        # single-field iterable case: group-by aggregation
        stream = stream if len(fields) <= 1 else stream.tee_stream()
        f = fields[0]
        if logger:
            logger.log('Calc hist for field {}...'.format(f))
        stream = stream.to_stream(
            stream_type='RecordStream',
            columns=fields,
        ).select(
            f,
        ).group_by(
            f,
            values=['-'],
            step=step,
        ).select(
            field=lambda r, k=f: k,  # k=f binds the field name now (avoids late-binding)
            value=f,
            count=('-', len),
        ).sort('value')
    if not total_count:
        # unknown total: materialize and sum the per-value counts
        stream = stream.to_memory()
        total_count = sum(stream.filter(field=fields[0]).get_one_column_values('count'))
    stream = stream.select(
        '*',
        total_count=fs.const(total_count),
        share=('count', 'total_count', lambda c, t: c / t if t else None),
    )
    return _assume_native(stream)
def insert_rows(
        self,
        table: str,
        rows: Iterable,
        columns: Array,
        step: int = DEFAULT_STEP,
        skip_errors: bool = False,
        expected_count: Count = None,
        return_count: bool = True,
        verbose: AutoBool = AUTO,
) -> Count:
    """Insert rows into table in batches of `step` rows.

    When skip_errors is False a fast batch method is used; when True, rows are
    executed one by one and failing rows are logged and skipped.
    Returns the last enumerate index when return_count is set.
    NOTE(review): for non-empty input this is count-1 — confirm callers expect it.

    Fix: the per-row handler used `except TypeError or IndexError`, which
    evaluates to `except TypeError` only; IndexError was never caught.
    It now catches both exception types.
    """
    assert isinstance(columns, ARRAY_TYPES), 'list or tuple expected, got {}'.format(columns)
    verbose = Auto.acquire(verbose, self.verbose)
    if isinstance(rows, Sized):
        count = len(rows)
    else:
        count = expected_count
    conn = self.connect(reconnect=True)
    cur = conn.cursor()
    use_fast_batch_method = not skip_errors
    query_args = dict(table=table)
    if use_fast_batch_method:
        query_template = 'INSERT INTO {table} VALUES ({values});'
        placeholders = ['%({})s'.format(c) for c in columns]
    else:  # elif skip_errors:
        query_template = 'INSERT INTO {table} ({columns}) VALUES ({values})'
        placeholders = ['%s' for _ in columns]
        query_args['columns'] = ', '.join(columns)
    query_args['values'] = ', '.join(placeholders)
    query = query_template.format(**query_args)
    message = verbose if isinstance(verbose, str) else 'Commit {}b to {}'.format(step, table)
    progress = self.get_new_progress(message, count=count)
    progress.start()
    records_batch = list()
    n = 0
    for n, row in enumerate(rows):
        if use_fast_batch_method:
            current_record = {k: v for k, v in zip(columns, row)}
            records_batch.append(current_record)
        elif skip_errors:
            try:
                cur.execute(query, row)
            except (TypeError, IndexError) as e:  # TypeError: not all arguments converted during string formatting
                self.log('Error line: {}'.format(str(row)), level=LoggingLevel.Debug, verbose=verbose)
                self.log('{}: {}'.format(e.__class__.__name__, e), level=LoggingLevel.Error)
        if (n + 1) % step == 0:
            # flush the current batch and commit the transaction
            if use_fast_batch_method:
                self.execute_batch(query, records_batch, step, cursor=cur)
                records_batch = list()
            if not progress.get_position():
                progress.update(0)
            conn.commit()
            progress.update(n)
            gc.collect()
    if use_fast_batch_method:
        # flush the final, possibly partial, batch
        self.execute_batch(query, records_batch, step, cursor=cur)
    conn.commit()
    progress.finish(n)
    if return_count:
        return n
def describe(
        self,
        *filter_args,
        count: Optional[int] = EXAMPLE_ROW_COUNT,
        columns: Optional[Array] = None,
        show_header: bool = True,
        struct_as_dataframe: bool = False,
        safe_filter: bool = True,
        actualize: AutoBool = AUTO,
        output: AutoOutput = AUTO,
        **filter_kwargs
):
    """Print a summary of the file: headers, validation state, struct and example rows.

    Returns a DataFrame when struct_as_dataframe is set, otherwise the result
    of show_example() (when an example stream and count are available) or None.
    """
    if show_header:
        for line in self.get_str_headers():
            self.output_line(line, output=output)
    example_item, example_stream, example_comment = dict(), None, ''
    if self.is_existing():
        # refresh cached metadata unless explicitly disabled
        if Auto.acquire(actualize, not self.is_actual()):
            self.actualize()
        if self.is_empty():
            message = '[EMPTY] file is empty, expected {} columns:'.format(self.get_column_count())
        else:
            message = self.get_validation_message()
            example_tuple = self._prepare_examples(safe_filter=safe_filter, filters=filter_args, **filter_kwargs)
            example_item, example_stream, example_comment = example_tuple
    else:
        message = '[NOT_EXISTS] file is not created yet, expected {} columns:'.format(self.get_column_count())
    if show_header:
        self.output_line('{} {}'.format(self.get_datetime_str(), message), output=output)
        if self.get_invalid_fields_count():
            line = 'Invalid columns: {}'.format(get_str_from_args_kwargs(*self.get_invalid_columns()))
            self.output_line(line, output=output)
        self.output_blank_line(output=output)
    struct = self.get_struct()
    struct_dataframe = struct.describe(
        show_header=False,
        as_dataframe=struct_as_dataframe,
        example=example_item,
        output=output,
        comment=example_comment,
    )
    if struct_dataframe is not None:
        return struct_dataframe
    if example_stream and count:
        return self.show_example(
            count=count,
            example=example_stream,
            columns=columns,
            comment=example_comment,
        )
def __init__(
        self,
        name: Union[Name, Auto] = AUTO,
        logger: Union[LoggerInterface, Auto] = AUTO,
        skip_not_implemented: bool = True
):
    """Initialize with a logger and a name defaulting to the module-level NAME."""
    resolved_name = Auto.acquire(name, NAME)
    self._logger = logger
    self._tmp_folder = None
    self._local_storage = None
    self._skip_not_implemented = skip_not_implemented
    super().__init__(name=resolved_name)
def file(
        self,
        suffix: Union[Suffix, Auto],
        content_format: Union[ContentType, ContentFormatInterface, Auto] = AUTO,
        filetype: Union[ContentType, ContentFormatInterface, Auto] = AUTO,  # deprecated argument
        **kwargs
) -> Connector:
    """Return the file connector for the mask-derived filename of the given suffix."""
    resolved = Auto.acquire(suffix, self.get_suffix())
    template = 'suffix must be defined, got argument {}, default {}'
    assert resolved, template.format(suffix, self.get_suffix())
    filename = self.get_mask().format(resolved)
    return super().file(filename, content_format=content_format, filetype=filetype, **kwargs)
def grant_permission(self, name: str, permission='SELECT', group=DEFAULT_GROUP, verbose: AutoBool = AUTO) -> None:
    """Grant the given permission on a database object to a user group."""
    verbose = Auto.acquire(verbose, self.verbose)
    query = 'GRANT {permission} ON {name} TO {group};'.format(name=name, permission=permission, group=group)
    # replace a bare True with a short descriptive message for the log line
    log_flag = 'Grant access:' if verbose is True else verbose
    self.execute(query, get_data=False, commit=True, verbose=log_flag)
def __init__(
        self,
        data: Any,
        name: AutoName = AUTO,
        source: Connector = None,
        context: Context = None,
        check: bool = False,
):
    """Initialize, deriving name and context from the source (or globals) when not given."""
    default_name = source.get_name() if source else get_generated_name()
    name = Auto.acquire(name, default_name)
    if source and not context:
        context = source.get_context()
    # final fallback: the global context from the stream manager
    context = context or sm.get_context()
    super().__init__(name=name, data=data, source=source, context=context, check=check)
def _collect_inplace(self, log: AutoBool = AUTO) -> None:
    """Materialize the stream's items into memory in place, optionally logging progress."""
    expected = self.get_estimated_count()
    if Auto.is_defined(expected):
        # by default, log only when the expected size exceeds the in-memory limit
        log = Auto.acquire(log, expected > self.get_limit_items_in_memory())
    if log and expected:
        self.log('Trying to collect {} items into memory from {}...'.format(expected, self.__repr__()))
    self.set_data(self.get_list(), inplace=True)
    self.update_count(force=False)
    if log:
        self.log('Collected {} items into memory from {}...'.format(expected, self.__repr__()))
def map_to(self, function: Callable, stream_type: OptStreamType = AUTO) -> Native:
    """Apply function to every item, returning a stream of the given (or same) type."""
    stream_type = Auto.acquire(stream_type, self.get_stream_type, delayed=True)
    mapped_items = map(function, self.get_iter())
    result = self._assume_native(self.stream(mapped_items, stream_type=stream_type))
    # preserve the materialization property of the source stream
    return result.to_memory() if self.is_in_memory() else result
def get_fast_lines_count(self, ending: Union[str, Auto] = AUTO, verbose: AutoBool = AUTO) -> int:
    """Count lines by scanning raw chunks for the line ending; caches via set_count().

    Not available for gzip files (raises ValueError).
    NOTE(review): the result is endings-found + 1, so a file whose final line
    is terminated is counted one high; an ending split across a chunk boundary
    would be missed — confirm get_chunks() splits on safe boundaries.
    """
    if self.is_gzip():
        raise ValueError('get_fast_lines_count() method is not available for gzip-files')
    if not Auto.is_defined(ending):
        # prefer the ending declared by the content format, default to newline
        if hasattr(self, 'get_content_format'):
            ending = self.get_content_format().get_ending()
        else:
            ending = '\n'
    verbose = Auto.acquire(verbose, self.is_verbose())
    self.log('Counting lines in {}...'.format(self.get_name()), end='\r', verbose=verbose)
    count_n_symbol = sum(chunk.count(ending) for chunk in self.get_chunks())
    count_lines = count_n_symbol + 1
    self.set_count(count_lines)
    return count_lines
def __init__(
        self,
        name: Union[Name, Auto] = AUTO,
        stream_config: Union[dict, Auto] = AUTO,
        conn_config: Union[dict, Auto] = AUTO,
        logger: Union[Logger, Auto, None] = AUTO,
        clear_tmp: bool = False,
):
    """Set up the context: configs, instance registries and manager back-links."""
    self.logger = logger
    self.stream_config = Auto.acquire(stream_config, DEFAULT_STREAM_CONFIG)
    self.conn_config = Auto.acquire(conn_config, dict())
    self.stream_instances = dict()
    self.conn_instances = dict()
    super().__init__(Auto.acquire(name, NAME))
    # register this context with the stream- and connector-managers
    self.sm = sm
    self.sm.set_context(self)
    self.ct = ct
    self.ct.set_context(self)
    if clear_tmp:
        self.clear_tmp_files()
def disconnect(self, skip_errors: bool = False, verbose=AUTO) -> Count:
    """Close the database connection, if open; return 1 when a connection was closed.

    With skip_errors, an already-closed connection is logged as a warning
    instead of raising. Returns None implicitly when no connection was open.
    """
    verbose = Auto.acquire(verbose, self.verbose)
    if self.is_connected():
        if not psycopg2:
            raise ImportError('psycopg2 must be installed (pip install psycopg2)')
        if skip_errors:
            try:
                self.connection.close()
            except psycopg2.OperationalError:
                message = 'Connection to {} already closed.'.format(self.host)
                self.log(message, level=LoggingLevel.Warning, verbose=verbose)
        else:
            self.connection.close()
        self.connection = None
        return 1
def get_items_of_type(
        self,
        item_type: Union[ItemType, Auto],
        verbose: AutoBool = AUTO,
        message: AutoName = AUTO,
        step: AutoCount = AUTO,
) -> Iterable:
    """Read the file's lines and parse them into items of the requested type."""
    item_type = Auto.acquire(item_type, self.get_default_item_type())
    verbose = Auto.acquire(verbose, self.is_verbose())
    content_format = self.get_content_format()
    assert isinstance(content_format, ParsedFormat)
    count = self.get_count(allow_slow_mode=False)
    if isinstance(verbose, str):
        # a str verbose acts as a custom message: log it, or pass it on as `message`
        if Auto.is_defined(message):
            self.log(verbose, verbose=bool(verbose))
        else:
            message = verbose
    elif (count or 0) > 0:
        template = '{count} lines expected from file {name}...'
        msg = template.format(count=count, name=self.get_name())
        self.log(msg, verbose=verbose)
    # the title row, when present, is consumed by the reader rather than parsed
    lines = self.get_lines(skip_first=self.is_first_line_title(), step=step, verbose=verbose, message=message)
    items = content_format.get_items_from_lines(lines, item_type=item_type)
    return items
def write_lines(self, lines: Iterable, verbose: AutoBool = AUTO) -> Native:
    """Write the given lines to the file, separated by the configured line ending.

    For gzip files both the ending and the lines are encoded before writing.
    Returns self for chaining.

    Fix: the count was derived from a loop-variable sentinel (`n = 0` then
    `count = n + 1`), so an empty iterable was reported and cached as 1 line;
    the written lines are now counted directly (empty input yields 0).
    """
    verbose = Auto.acquire(verbose, self.is_verbose())
    is_gzip = self.is_gzip()
    ending = self.get_ending().encode(self.get_encoding()) if is_gzip else self.get_ending()
    self.open('w', allow_reopen=True)
    count = 0
    for item in lines:
        if count > 0:
            # separator goes between lines, not after the last one
            self.get_fileholder().write(ending)
        line = str(item).encode(self.get_encoding()) if is_gzip else str(item)
        self.get_fileholder().write(line)
        count += 1
    self.close()
    self.set_count(count)
    self.log('Done. {} rows has written into {}'.format(count, self.get_name()), verbose=verbose)
    return self
def stream(
        self,
        stream_type: Union[StreamType, Stream, Name],
        name: Union[Name, Auto] = AUTO,
        check: bool = True,
        **kwargs
) -> Stream:
    """Create (or adopt) a stream, register it under `name` in this context, and return it."""
    name = Auto.acquire(name, get_generated_name('Stream'))
    if sm.is_stream(stream_type):
        new_stream = stream_type  # already a stream instance: adopt it as-is
    else:
        new_stream = sm.stream(stream_type, **kwargs)
    new_stream = new_stream.set_name(name, register=False)
    new_stream = new_stream.fill_meta(context=self, check=check, **self.stream_config)
    self.stream_instances[name] = new_stream
    return new_stream
def progress(
        self,
        expected_count: AutoCount = AUTO,
        step: AutoCount = AUTO,
        message: str = 'Progress',
) -> Native:
    """Wrap the stream's items in a progress logger when an ExtLogger is available."""
    count = Auto.acquire(expected_count, self.get_count()) or self.get_estimated_count()
    logger = self.get_logger()
    if isinstance(logger, ExtLogger):
        items = logger.progress(self.get_items(), name=message, count=count, step=step)
    else:
        # plain logger: announce once, iterate without per-item progress
        if logger:
            logger.log(msg=message, level=LoggingLevel.Info)
        items = self.get_items()
    return self._assume_native(self.stream(items))
def __init__(
        self,
        descriptions: Array,
        target_item_type: ItemType = ItemType.Auto,
        input_item_type: ItemType = ItemType.Auto,
        input_struct: Struct = None,
        logger: Logger = None,
        selection_logger: Union[Logger, Auto] = AUTO,
):
    """Store the selection pipeline settings; derived properties start as AUTO (lazy)."""
    self._descriptions = descriptions
    self._target_item_type = target_item_type
    self._input_item_type = input_item_type
    self._input_struct = input_struct
    self._logger = logger
    # default the selection logger to the main logger's own one, when it has it
    fallback = getattr(logger, 'get_selection_logger', None)
    self._selection_logger = Auto.acquire(selection_logger, fallback)
    # computed lazily on first use
    self._has_trivial_multiple_selectors = AUTO
    self._output_field_names = AUTO
def __init__(
        self,
        data: Iterable,
        name: AutoName = AUTO,
        source: Source = None,
        context: Context = None,
        count: Count = None,
        less_than: Count = None,
        check: bool = False,
        max_items_in_memory: AutoCount = AUTO,
):
    """Iterable stream with optional known/estimated counts and item-type checking."""
    self._count = count
    self._less_than = less_than or count  # the upper bound defaults to the exact count
    self.check = check
    self.max_items_in_memory = Auto.acquire(max_items_in_memory, MAX_ITEMS_IN_MEMORY)
    items = self._get_typing_validated_items(data, context=context) if check else data
    super().__init__(
        data=items,
        name=name,
        source=source,
        context=context,
        check=False,  # validation already performed above, when requested
    )
def create_table(
        self,
        table: Union[Table, Name],
        struct: Struct,
        drop_if_exists: bool = False,
        verbose: AutoBool = AUTO,
) -> Table:
    """Create the table from the given struct, optionally dropping an existing one first."""
    verbose = Auto.acquire(verbose, self.verbose)
    table_name, struct_str = self._get_table_name_and_struct_str(table, struct, check_struct=True)
    if drop_if_exists:
        self.drop_table(table_name, verbose=verbose)
    query = 'CREATE TABLE {name} ({struct});'.format(name=table_name, struct=struct_str)
    self.execute(
        query,
        get_data=False,
        commit=True,
        verbose='Creating table:' if verbose is True else verbose,
    )
    self.post_create_action(table_name, verbose=verbose)
    self.log('Table {name} is created.'.format(name=table_name), verbose=verbose)
    return self.table(table, struct=struct) if struct else self.table(table)
def get_lines(
        self,
        count: Optional[int] = None,
        skip_first: bool = False,
        allow_reopen: bool = True,
        check: bool = True,
        verbose: AutoBool = AUTO,
        message: AutoName = AUTO,
        step: AutoCount = AUTO,
) -> Generator:
    """Open the file and yield its lines, optionally wrapped in a progress logger.

    A str `message` may contain '{}', which is replaced by the file name.
    """
    if check and not self.is_gzip():
        assert not self.is_empty(), 'for get_lines() file must be non-empty: {}'.format(self)
    self.open(allow_reopen=allow_reopen)
    lines = self.get_next_lines(count=count, skip_first=skip_first, close=True)
    verbose = Auto.acquire(verbose, self.is_verbose())
    if verbose or Auto.is_defined(message):
        if not Auto.is_defined(message):
            message = 'Reading {}'
        if '{}' in message:
            message = message.format(self.get_name())
        logger = self.get_logger()
        assert hasattr(logger, 'progress'), '{} has no progress in {}'.format(self, logger)
        if not count:
            count = self.get_count(allow_slow_mode=False)
        # wrap the generator so each consumed line advances the progress bar
        lines = self.get_logger().progress(lines, name=message, count=count, step=step)
    return lines