def __init__(
        self,
        name: OptName = arg.AUTO,
        level: Level = arg.AUTO,
        formatter: Union[Formatter, arg.Auto] = arg.AUTO,
        loggers: SubLoggers = arg.AUTO,
        context: Context = None,
        file: Optional[FileOrName] = None,
):
    """Initialize a logger group holding a base logger plus optional sub-loggers.

    :param name: group/base-logger name; defaults to DEFAULT_LOGGER_NAME.
    :param level: logging level (coerced to LoggingLevel if given as a raw value).
    :param formatter: message formatter; defaults to DEFAULT_FORMATTER.
    :param loggers: sub-loggers as a {name: logger} dict or a list of loggers.
    :param context: optional shared context passed to the parent constructor.
    :param file: optional log file set via set_file() after construction.
    """
    name = arg.acquire(name, DEFAULT_LOGGER_NAME)
    level = arg.acquire(level, DEFAULT_LOGGING_LEVEL)
    formatter = arg.acquire(formatter, DEFAULT_FORMATTER)
    if not isinstance(level, LoggingLevel):
        level = LoggingLevel(level)
    if isinstance(loggers, list):
        # FIX: build a name -> logger mapping (was inverted: logger -> name),
        # so that `name not in loggers` and `loggers[name] = base_logger`
        # below operate on consistent string keys.
        loggers = {i.get_name(): i for i in loggers}
    elif not arg.is_defined(loggers):
        loggers = dict()
    if name not in loggers:
        # register a base logger for this group's own name
        level_value = arg.get_value(level)
        base_logger = self.build_base_logger(name, level_value, formatter)
        loggers[name] = base_logger
    self._level = level
    super().__init__(name=name, children=loggers, context=context)
    if file:
        self.set_file(file)
def get_lines(
        self,
        count: Optional[int] = None,
        skip_first: bool = False,
        allow_reopen: bool = True,
        check: bool = True,
        verbose: AutoBool = AUTO,
        message: Union[str, Auto] = AUTO,
        step: AutoCount = AUTO,
) -> Iterable:
    """Read lines from the file, optionally wrapped in a progress-reporting iterator.

    :param count: maximum number of lines to read (None = all).
    :param skip_first: skip the first line (e.g. a header row).
    :param allow_reopen: allow reopening the file if it is already open.
    :param check: assert the file is non-empty before reading (skipped for gzip,
        where emptiness cannot be checked cheaply).
    :param verbose: show progress; defaults to the file's own verbosity.
    :param message: progress-bar label; '{}' is replaced with the file name.
    :param step: progress update step, forwarded to the progress logger.
    :return: iterable of lines (lazy; the underlying file is closed when exhausted).
    """
    if check and not self.is_gzip():
        # assert self.get_count(allow_reopen=True) > 0
        assert not self.is_empty(), 'for get_lines() file must be non-empty: {}'.format(self)
    self.open(allow_reopen=allow_reopen)
    # close=True: the iterator closes the file when it is fully consumed
    lines = self.get_next_lines(count=count, skip_first=skip_first, close=True)
    verbose = arg.acquire(verbose, self.is_verbose())
    if verbose or arg.is_defined(message):
        message = arg.acquire(message, 'Reading {}')
        if '{}' in message:
            message = message.format(self.get_name())
        logger = self.get_logger()
        assert hasattr(logger, 'progress'), '{} has no progress in {}'.format(self, logger)
        if not count:
            # total needed for the progress bar; avoid slow full scan of gzip
            count = self.get_count(allow_slow_gzip=False)
        lines = self.get_logger().progress(lines, name=message, count=count, step=step)
    return lines
def to_line_stream(
        self,
        delimiter: Union[str, Auto] = AUTO,
        columns: Columns = AUTO,
        add_title_row: Union[bool, Auto] = AUTO,
) -> LineStream:
    """Serialize each item to a text line, yielding a LineStream.

    Row streams default to tab-separated lines; record streams are first
    converted to row streams (with an optional title row); other streams
    are stringified item-by-item.
    """
    stream_type = self.get_stream_type()
    default_delimiter = '\t' if stream_type == StreamType.RowStream else None
    delimiter = arg.acquire(delimiter, default_delimiter)
    stream = self
    if stream.get_stream_type() == StreamType.RecordStream:
        # records need a column order before they can be joined into lines
        assert isinstance(stream, RegularStream) or hasattr(stream, 'get_columns'), 'got {}'.format(stream)
        columns = arg.acquire(columns, stream.get_columns, delayed=True)
        add_title_row = arg.acquire(add_title_row, True)
        stream = stream.to_row_stream(columns=columns, add_title_row=add_title_row)
    serialize = delimiter.join if delimiter else str
    line_stream = self.stream(
        stream._get_mapped_items(serialize),
        stream_type=StreamType.LineStream,
    )
    return self._assume_native(line_stream)
def set_logger(
        self,
        logger: Union[Logger, Auto] = AUTO,
        selection_logger: Union[Logger, Auto] = AUTO,
) -> None:
    """Attach a logger and a selection logger to this object.

    :param logger: logger to store; when AUTO, a fallback is derived (see note).
    :param selection_logger: selection logger; when AUTO, derived from `logger`.

    NOTE(review): both fallbacks call getattr on `logger` itself; when `logger`
    is left as AUTO the sentinel has no 'get_logger'/'get_selection_logger'
    attribute, so the fallback silently resolves to None. The fallback may have
    been intended to come from `self` or the context — confirm against callers.
    """
    self._logger = arg.acquire(logger, getattr(logger, 'get_logger', None))
    self._selection_logger = arg.acquire(selection_logger, getattr(logger, 'get_selection_logger', None))
def get_group_header(self, name: Comment = AUTO, caption: Comment = AUTO, comment: Comment = None) -> Iterable[str]:
    """Yield header lines for this group: name, caption, fields count, comment.

    The fields-count line is emitted only when the caller did not override
    the name (i.e. the default title row is being generated).
    """
    name_was_auto = name == arg.AUTO
    resolved_name = arg.acquire(name, self.get_name())
    resolved_caption = arg.acquire(caption, self.get_caption())
    for header_line in (resolved_name, resolved_caption):
        if arg.is_defined(header_line):
            yield header_line
    if name_was_auto:
        yield self.get_str_fields_count()
    if arg.is_defined(comment):
        yield comment
def group_by(
        self,
        *keys,
        values: Columns = None,
        step: AutoCount = AUTO,
        as_pairs: bool = False,
        take_hash: bool = True,
        verbose: bool = True,
) -> Stream:
    """Group items by key fields: sort the stream first, then fold adjacent groups.

    :param keys: key fields (or a single field group exposing get_field_names()).
    :param values: value fields to keep per group.
    :param step: chunk size for the external sort; defaults to max_items_in_memory.
    :param as_pairs: emit (key, group) pairs instead of flat grouped records.
    :param take_hash: hash composite keys while sorting (not used for pairs).
    """
    keys = arg.update(keys)
    keys = arg.get_names(keys)
    values = arg.get_names(values)
    if hasattr(keys[0], 'get_field_names'):  # if isinstance(keys[0], FieldGroup)
        keys = keys[0].get_field_names()
    step = arg.acquire(step, self.max_items_in_memory)
    sort_key = keys if as_pairs else get_key_function(keys, take_hash=take_hash)
    sorted_stream = self.sort(sort_key, step=step, verbose=verbose)
    return sorted_stream.sorted_group_by(keys, values=values, as_pairs=as_pairs)
def to_column_file(
        self,
        filename,
        delimiter='\t',
        encoding=AUTO,
        gzip=False,
        check=AUTO,
        verbose=True,
        return_stream=True,
):
    """Write the stream to a delimited text file, optionally returning a stream over it.

    :param filename: destination file name.
    :param delimiter: column separator (tab by default).
    :param encoding: text encoding; defaults to this stream's encoding.
    :param gzip: write gzip-compressed output.
    :param check: forwarded to to_text_file().
    :param return_stream: when True, return a row stream reading the written file
        with this stream's meta re-applied; otherwise return None.
    """
    encoding = arg.acquire(encoding, self.get_encoding())
    meta = self.get_meta()
    if not gzip:
        # FIX: use pop with a default — meta is not guaranteed to contain
        # 'count', and an unconditional pop would raise KeyError.
        meta.pop('count', None)
    stream_csv_file = self.to_line_stream(
        delimiter=delimiter,
    ).to_text_file(
        filename,
        encoding=encoding,
        gzip=gzip,
        check=check,
        verbose=verbose,
        return_stream=return_stream,
    )
    if return_stream:
        return stream_csv_file.to_row_stream(
            delimiter=delimiter,
        ).update_meta(**meta)
def folder(self, name: str, folder_type: Union[FolderType, Auto] = AUTO, **kwargs) -> ConnectorInterface:
    """Create a child folder connector and register it on this connector.

    :param name: folder name; used to auto-detect the folder type when not given.
    :param folder_type: explicit folder type; detected from the name by default.
    :return: the newly created folder connector.
    """
    detected_type = FolderType.detect_by_name(name)
    resolved_type = arg.acquire(folder_type, detected_type)
    folder_cls = FolderType(resolved_type).get_class()
    child = folder_cls(name, parent=self, **kwargs)
    self.add_folder(child)
    return child
def execute_if_exists(
        self,
        query: str,
        table: Union[Table, Name],
        message_if_yes: Optional[str] = None,
        message_if_no: Optional[str] = None,
        stop_if_no: bool = False,
        verbose: AutoBool = AUTO,
) -> Optional[Iterable]:
    """Execute the query only when the table exists; otherwise log or raise.

    '{}' placeholders in the query and messages are filled with the table name.

    :param stop_if_no: raise ValueError(message_if_no) when the table is absent.
    :return: the execute() result when the table exists, else None.
    """
    verbose = arg.acquire(verbose, message_if_yes or message_if_no)
    table_name = self._get_table_name(table)
    if not self.exists_table(table_name, verbose=verbose):
        # missing-table path: format the message once, then raise or log
        if message_if_no and '{}' in message_if_no:
            message_if_no = message_if_no.format(table_name)
        if stop_if_no:
            raise ValueError(message_if_no)
        if message_if_no:
            self.log(message_if_no, verbose=verbose)
        return None
    if '{}' in query:
        query = query.format(table_name)
    result = self.execute(query, verbose=verbose)
    if message_if_yes:
        if '{}' in message_if_yes:
            message_if_yes = message_if_yes.format(table_name)
        self.log(message_if_yes, verbose=verbose)
    return result
def __init__(
        self,
        data: Iterable,
        name: AutoName = AUTO,
        check: bool = False,
        count: AutoCount = None,
        less_than: AutoCount = None,
        source: Connector = None,
        context: Context = None,
        max_items_in_memory: AutoCount = AUTO,
        tmp_files: TmpMask = AUTO,
):
    """Initialize the stream, deriving count/bounds and the tmp-file mask.

    :param data: underlying iterable of items.
    :param count: known item count; measured from data when it has a length.
    :param less_than: upper bound on the count; defaults to the count itself.
    :param max_items_in_memory: in-memory threshold; defaults to sm.MAX_ITEMS_IN_MEMORY.
    :param tmp_files: tmp-file mask; lazily derived from the stream name by default.
    """
    count = arg.get_optional_len(data, count)
    if not less_than:
        less_than = count
    self.max_items_in_memory = arg.acquire(max_items_in_memory, sm.MAX_ITEMS_IN_MEMORY)
    super().__init__(
        data=data,
        name=name,
        check=check,
        source=source,
        context=context,
        count=count,
        less_than=less_than,
    )
    # delayed: the tmp mask is only built if/when actually needed
    self._tmp_files = arg.delayed_acquire(tmp_files, sm.get_tmp_mask, self.get_name())
def create_table(
        self,
        table: Union[Table, Name],
        struct: Struct,
        drop_if_exists: bool = False,
        verbose: AutoBool = AUTO,
) -> Table:
    """Create a table with the given structure and return its Table object.

    :param table: table object or name.
    :param struct: column structure (validated while building the DDL).
    :param drop_if_exists: drop an existing table with the same name first.
    """
    verbose = arg.acquire(verbose, self.verbose)
    table_name, struct_str = self._get_table_name_and_struct_str(table, struct, check_struct=True)
    if drop_if_exists:
        self.drop_table(table_name, verbose=verbose)
    message = 'Creating table:'
    query = 'CREATE TABLE {name} ({struct});'.format(name=table_name, struct=struct_str)
    self.execute(
        query,
        get_data=False,
        commit=True,
        # verbose=True shows the fixed message; any other value passes through
        verbose=message if verbose is True else verbose,
    )
    self.post_create_action(table_name, verbose=verbose)
    self.log('Table {name} is created.'.format(name=table_name), verbose=verbose)
    return self.table(table, struct=struct) if struct else self.table(table)
def disk_sort(
        self,
        key: UniKey = fs.same(),
        reverse: bool = False,
        step: AutoCount = AUTO,
        verbose: AutoBool = False,
) -> Native:
    """External merge sort: spill sorted chunks to disk, then k-way merge them.

    :param key: sort key (single field, composite, or function); NOTE(review):
        the default `fs.same()` is evaluated once at def time — harmless if the
        returned function is stateless, confirm.
    :param reverse: sort in descending order.
    :param step: max items held in memory per chunk; defaults to max_items_in_memory.
    :return: a new stream yielding items in sorted order (lazy merge).
    """
    step = arg.acquire(step, self.max_items_in_memory)
    key_function = fs.composite_key(key)
    # each part is sorted in memory and written to its own tmp file
    stream_parts = self.split_to_disk_by_step(
        step=step,
        sort_each_by=key_function,
        reverse=reverse,
        verbose=verbose,
    )
    assert stream_parts, 'streams must be non-empty'
    iterables = [f.get_iter() for f in stream_parts]
    counts = [f.get_count() or 0 for f in stream_parts]
    self.log('Merging {} parts... '.format(len(iterables)), verbose=verbose)
    return self.stream(
        algo.merge_iter(
            iterables,
            key_function=key_function,
            reverse=reverse,
            # tmp files are removed only after the merge is fully consumed
            post_action=self.get_tmp_files().remove_all,
        ),
        count=sum(counts),
    )
def reset_modification_timestamp(self, timestamp: Union[float, Auto, None] = AUTO) -> Native:
    """Store the given modification timestamp (current filesystem value by default).

    :param timestamp: explicit timestamp, or AUTO to re-read it from the file.
    :return: self, for chaining.
    """
    resolved_ts = arg.acquire(timestamp, self.get_modification_timestamp(reset=False))
    self._modification_ts = resolved_ts
    return self
def __init__(
        self,
        name: Union[str, arg.Auto] = arg.AUTO,
        ignore_warnings: bool = False,
):
    """Create a logger with the given name (DEFAULT_LOGGER_NAME when AUTO).

    :param ignore_warnings: suppress warning-level output when True.
    """
    self._name = arg.acquire(name, DEFAULT_LOGGER_NAME)
    self._ignore_warnings = ignore_warnings
def reset_selection_logger(self, name: OptName = arg.AUTO, **kwargs) -> Optional[SelectionLoggerInterface]:
    """Request a fresh selection logger from the context and attach it.

    :return: the new selection logger, or None when no context is available.
    """
    logger_name = arg.acquire(name, SELECTION_LOGGER_NAME)
    context = self.get_context()
    if not context:
        return None
    new_logger = context.get_new_selection_logger(logger_name, **kwargs)
    if new_logger:
        self.set_selection_logger(new_logger)
    return new_logger
def log(self, msg, level=arg.AUTO, end=arg.AUTO, verbose=arg.AUTO) -> None:
    """Forward a message to the attached logger, if one is set.

    :param msg: message text.
    :param level: logging level (AUTO = logger default).
    :param end: line terminator (AUTO = logger default).
    :param verbose: verbosity override; defaults to this object's verbosity.
    """
    logger = self.get_logger()
    if logger is not None:
        logger.log(
            # FIX: reuse the already-fetched logger instead of calling
            # self.get_logger() a second time.
            logger=logger,
            msg=msg,
            level=level,
            end=end,
            verbose=arg.acquire(verbose, self.verbose),
        )
def get_year_and_week_from_week_abs(week_abs: int, min_year: Union[int, arg.Auto] = arg.AUTO) -> tuple:
    """Convert an absolute week number (counted from min_year) into (year, week).

    :param week_abs: weeks elapsed since the start of min_year.
    :param min_year: epoch year; defaults to the module-level _min_year.
    :return: (year, week) tuple.
    """
    base_year = arg.acquire(min_year, _min_year)
    # int(...) truncates toward zero, matching the original semantics
    years_passed = int(week_abs / WEEKS_IN_YEAR)
    week = week_abs - years_passed * WEEKS_IN_YEAR
    return base_year + years_passed, week
def get_week_abs_from_year_and_week(
        year: int,
        week: int,
        min_year: Union[int, arg.Auto] = arg.AUTO,
) -> int:
    """Convert a (year, week) pair into an absolute week number counted from min_year.

    :param min_year: epoch year; defaults to get_min_year().
    """
    base_year = arg.acquire(min_year, get_min_year())
    return (year - base_year) * WEEKS_IN_YEAR + week
def disk_sort_by_key(self, reverse=False, step=AUTO) -> Native:
    """External sort by the pair key, spilling chunks of `step` items to disk."""
    chunk_size = arg.acquire(step, self.max_items_in_memory)
    sorted_stream = self.disk_sort(
        key=self._get_key,
        reverse=reverse,
        step=chunk_size,
    )
    return self._assume_native(sorted_stream)
def is_inside_folder(self, folder: Union[str, Connector, Auto] = AUTO) -> bool:
    """Check whether this connector is located inside the given folder.

    :param folder: folder path string or folder connector; defaults to
        this connector's own folder (self.get_folder()).
    """
    folder_obj = arg.acquire(folder, self.get_folder())
    if isinstance(folder_obj, str):
        folder_path = folder_obj
    else:  # elif isinstance(folder_obj, LocalFolder)
        folder_path = folder_obj.get_path()
    # NOTE(review): this tests whether our own folder path is a substring of
    # the target folder's path; for an "is inside" relation one would expect
    # the reverse containment (target path inside ours / ours starting with
    # target). Confirm the intended direction against call sites.
    return self.get_folder_path() in folder_path
def get_selection_logger(self, name: OptName = arg.AUTO, **kwargs) -> Optional[SelectionLoggerInterface]:
    """Return the named child selection logger, creating it on demand.

    When the child already exists and kwargs are given, its meta is updated;
    otherwise a fresh selection logger is built via reset_selection_logger().
    """
    logger_name = arg.acquire(name, SELECTION_LOGGER_NAME)
    existing = self.get_child(logger_name)
    if not existing:
        return self.reset_selection_logger(logger_name, **kwargs)
    if kwargs:
        existing.set_meta(logger_name, **kwargs)
    return existing
def get_new_progress(self, name: Name, count: Count = None, context: OptContext = arg.AUTO) -> ProgressInterface:
    """Build a Progress bound to this logger and register it as a child.

    :param name: progress-bar label.
    :param count: expected total item count, if known.
    :param context: context for the progress; lazily taken from this logger by default.
    """
    resolved_context = arg.acquire(context, self.get_context, delayed=True)
    progress = Progress(
        name=name,
        count=count,
        logger=self,
        context=resolved_context,
    )
    # check=False: allow registering even if a child with this name exists
    self.add_child(progress, check=False)
    return progress
def __init__(self, name: Union[Name, Auto] = AUTO, logger: Union[LoggerInterface, Auto] = AUTO, skip_not_implemented: bool = True):
    """Initialize the object with a name, an optional logger, and lazy-filled state.

    :param name: object name; defaults to the module-level NAME.
    :param logger: logger to use; NOTE(review): stored as-is (possibly the AUTO
        sentinel) without arg.acquire — presumably resolved lazily by an
        accessor elsewhere; confirm.
    :param skip_not_implemented: tolerate unimplemented operations when True.
    """
    self._logger = logger
    # local storage and tmp folder are created lazily on first use
    self._local_storage = None
    self._skip_not_implemented = skip_not_implemented
    self._tmp_folder = None
    super().__init__(name=arg.acquire(name, NAME))
def to_stream(self, stream_type: AutoStreamType = AUTO, *args, **kwargs) -> Stream:
    """Convert to a stream of the given type by dispatching to to_<suffix>().

    :param stream_type: target stream type; defaults to this object's own type.
    :return: whatever the matched to_<suffix>() conversion method returns.
    """
    stream_type = arg.acquire(stream_type, self.get_stream_type())
    method_suffix = StreamType.of(stream_type).get_method_suffix()
    method_name = 'to_{}'.format(method_suffix)
    stream_method = self.__getattribute__(method_name)
    # NOTE(review): stream_type is forwarded positionally as the first argument
    # of the dispatched to_<suffix>() method — verify those methods actually
    # accept it (several converters in this family take no such parameter).
    return stream_method(stream_type, *args, **kwargs)
def can_be_in_memory(self, step: AutoCount = AUTO) -> bool:
    """Tell whether this stream can be materialized in memory.

    True when it already is in memory, when no limit applies, or when the
    estimated item count fits within the limit; False when the count is unknown.
    """
    limit = arg.acquire(step, self.max_items_in_memory)
    if self.is_in_memory() or limit is None:
        return True
    estimated = self.get_estimated_count()
    return estimated is not None and estimated <= limit
def __init__(self, name: Name, value: Union[Value, arg.Auto] = arg.AUTO, update: bool = False):
    """Initialize the named value once; later calls are no-ops unless update=True.

    When the class auto-derives values (_auto_value), an AUTO value falls
    back to the name itself.
    """
    # short-circuit preserved: _is_initialized() is only consulted when
    # update is False
    should_init = update or not self._is_initialized()
    if should_init:
        resolved_name = arg.get_name(name)
        if self._auto_value:
            value = arg.acquire(value, resolved_name)
        self.name = resolved_name
        self.value = value
def format_message(
        self,
        *messages,
        max_len: Union[int, arg.Auto] = arg.AUTO,
        truncate: bool = True,
) -> str:
    """Join message parts with spaces, optionally truncating to max_len.

    :param messages: message fragments (flattened via arg.update).
    :param max_len: line-length limit; defaults to self.max_line_len.
    :param truncate: append TRUNCATED_SUFFIX when the text exceeds the limit.
    """
    parts = arg.update(messages)
    limit = arg.acquire(max_len, self.max_line_len)
    text = SPACE.join(str(part) for part in parts)
    if truncate and len(text) > limit:
        # reserve two characters for the truncation marker
        text = text[:limit - 2] + TRUNCATED_SUFFIX
    return text
def update_with_step(self, position, step=arg.AUTO):
    """Record the new position, refreshing the display only on step boundaries.

    The display is updated when a step boundary is crossed, when the jump
    since the last position is at least one step, or when the expected count
    has been exceeded (pool finished).
    """
    step = arg.acquire(step, DEFAULT_STEP)
    increment = position - (self.position or 0)
    self.position = position
    on_boundary = (self.position + 1) % step == 0
    big_jump = increment >= step
    expected = self.expected_count
    if not arg.is_defined(expected):
        expected = 0
    pool_finished = 0 < expected < (self.position + 1)
    if on_boundary or big_jump or pool_finished:
        self.update_now(position)
def map_to_type(self, function: Callable, stream_type: AutoStreamType = AUTO) -> Stream:
    """Apply `function` to each item, returning a stream of the given type.

    In-memory streams are materialized again after mapping so the result
    stays in memory.
    """
    target_type = arg.acquire(stream_type, self.get_stream_type())
    mapped = self.stream(
        map(function, self.get_items()),
        stream_type=target_type,
    )
    if hasattr(self, 'is_in_memory') and self.is_in_memory():
        return mapped.to_memory()
    return mapped
def map_to(self, function: Callable, stream_type: OptStreamType = AUTO) -> Native:
    """Apply `function` to each item lazily, preserving in-memory materialization.

    :param stream_type: target type; lazily defaults to this stream's own type.
    """
    target_type = arg.acquire(stream_type, self.get_stream_type, delayed=True)
    mapped = self.stream(
        map(function, self.get_iter()),
        stream_type=target_type,
    )
    if self.is_in_memory():
        mapped = mapped.to_memory()
    return self._assume_native(mapped)