def bucket(self, name, access_key=AUTO, secret_key=AUTO):
    """Return the bucket registered under *name*, or build a new S3Bucket.

    The lookup goes through ``self.get_buckets().get(name)``. When that yields
    nothing (or a falsy object), a ``ct.S3Bucket`` bound to this storage is
    created instead. Credentials are resolved with ``arg.undefault`` against
    ``self.access_key`` / ``self.secret_key``.
    """
    known_bucket = self.get_buckets().get(name)
    if known_bucket:
        return known_bucket
    return ct.S3Bucket(
        name=name,
        storage=self,
        access_key=arg.undefault(access_key, self.access_key),
        secret_key=arg.undefault(secret_key, self.secret_key),
    )
def get_struct_rows(self, rows, struct=arg.AUTO, skip_bad_rows=False, skip_bad_values=False, verbose=True):
    """Yield rows converted according to *struct*.

    :param rows: iterable of rows (each row is iterated value by value).
    :param struct: struct description; resolved with ``arg.undefault``
        against ``self.get_struct()``.
    :param skip_bad_rows: deprecated branch only; a row whose conversion
        raises ValueError is logged and skipped instead of propagating.
    :param skip_bad_values: deprecated branch only; forwarded to
        ``apply_struct_to_row`` when *skip_bad_rows* is false.
    :param verbose: when true, this object is passed as logger to
        ``apply_struct_to_row`` and skipped rows are logged verbosely.

    NOTE(review): for a StructInterface the skip_* flags are not consulted;
    converter errors propagate to the consumer.
    """
    struct = arg.undefault(struct, self.get_struct())
    if isinstance(struct, StructInterface):  # actual approach
        converters = struct.get_converters('str', 'py')
        for r in rows:
            # zip() stops at the shorter of (row, converters); a fresh list is
            # built per row, so no defensive copy is needed before yielding.
            yield [converter(value) for value, converter in zip(r, converters)]
    else:  # deprecated approach
        logger = self if verbose else None  # loop-invariant
        for r in rows:
            if skip_bad_rows:
                # Keep only the conversion inside the try, so the handler cannot
                # catch a ValueError thrown into the generator at the yield.
                try:
                    converted_row = apply_struct_to_row(r, struct, False, logger=logger)
                except ValueError:
                    self.log(['Skip bad row:', r], verbose=verbose)
                else:
                    yield converted_row
            else:
                yield apply_struct_to_row(r, struct, skip_bad_values, logger=logger)
def get_lines(self, items: Iterable, item_type: ItemType, add_title_row: AutoBool = AUTO) -> Generator:
    """Lazily produce formatted lines for *items*, optionally after a title line.

    ``add_title_row`` is resolved with ``arg.undefault`` against
    ``self.is_first_line_title()``. When a title is requested, the format must
    itself declare a title line (checked by assertion); the title is the
    struct's column list formatted as ``ItemType.Row``.
    """
    with_title = arg.undefault(add_title_row, self.is_first_line_title())
    if with_title:
        assert self.is_first_line_title()
        columns = self.get_struct().get_columns()
        yield self.get_formatted_item(columns, item_type=ItemType.Row)
    yield from (
        self.get_formatted_item(item, item_type=item_type)
        for item in items
    )
def log(self, msg, level=arg.DEFAULT, logger=None, end=arg.DEFAULT, verbose=False, truncate=False) -> None:
    """Store *msg* on this object and, if requested, forward it to the parent ``log``.

    :param msg: message to record; a plain ``str`` is wrapped into a
        ``DetailedMessage`` first, anything else is stored as given.
    :param level: logging level; resolved with ``arg.undefault`` against
        ``LoggingLevel.Debug``.
    :param logger: forwarded to ``super().log``; a truthy value triggers forwarding.
    :param end: forwarded to ``super().log`` unchanged.
    :param verbose: a truthy value triggers forwarding to ``super().log``.
    :param truncate: forwarded to ``super().log`` unchanged.

    The return annotation is ``None`` rather than ``typing.NoReturn``: the
    method returns normally, and ``NoReturn`` would tell type checkers that
    code after a call to it is unreachable.
    """
    level = arg.undefault(level, LoggingLevel.Debug)
    if isinstance(msg, str):
        msg = DetailedMessage(message=msg)
    # The message is always recorded, even when nothing is forwarded.
    self.add_message(msg)
    if logger or verbose:
        super().log(msg, level, logger, end, verbose, truncate)
def write_items(
        self,
        items: Iterable,
        item_type: Union[ItemType, Auto] = AUTO,
        add_title_row: AutoBool = AUTO,
        verbose: AutoBool = AUTO,
) -> Native:
    """Format *items* into lines with the content format and write them.

    ``item_type`` is resolved with ``arg.undefault`` against
    ``self.get_default_item_type()``. The content format must be a
    ``ParsedFormat`` (asserted). ``add_title_row`` is passed to its
    ``get_lines`` and ``verbose`` to ``self.write_lines``, whose result is
    returned.
    """
    resolved_item_type = arg.undefault(item_type, self.get_default_item_type())
    parsed_format = self.get_content_format()
    assert isinstance(parsed_format, ParsedFormat)
    return self.write_lines(
        parsed_format.get_lines(
            items,
            item_type=resolved_item_type,
            add_title_row=add_title_row,
        ),
        verbose=verbose,
    )
def hist(data: Data, *fields, in_memory=arg.DEFAULT, step=1000000, logger=arg.DEFAULT, msg=None) -> Stream:
    """Build a record stream of value counts for *fields*, with totals and shares.

    :param data: input wrapped into a stream by ``_stream``.
    :param fields: field(s) to count values for.
    :param in_memory: resolved with ``arg.undefault`` against ``stream.is_in_memory()``.
    :param step: forwarded to ``group_by`` on the single-field, not-in-memory path.
    :param logger: resolved with ``arg.undefault`` against ``stream.get_logger``
        (passed as a callable with ``delayed=True``).
    :param msg: forwarded to ``get_hist_records``.
    :returns: the stream with ``total_count`` and ``share`` columns added,
        passed through ``_assume_native``.

    NOTE(review): ``fields[0]`` is read on the single-field path and whenever
    the stream count is falsy, so calling without fields can raise IndexError.
    """
    stream = _stream(data)
    total_count = stream.get_count()
    in_memory = arg.undefault(in_memory, stream.is_in_memory())
    logger = arg.undefault(logger, stream.get_logger, delayed=True)
    # Several fields are always delegated to get_hist_records, not only
    # in-memory data.
    if in_memory or len(fields) > 1:
        stream = stream.stream(
            get_hist_records(stream, fields, in_memory=in_memory, logger=logger, msg=msg),
            stream_type='RecordStream',
        )
    else:
        # At most one field here, so the stream is consumed once and no tee is needed.
        f = fields[0]
        if logger:
            logger.log('Calc hist for field {}...'.format(f))
        stream = stream.to_stream(
            stream_type='RecordStream',
        ).group_by(
            f,
            values=['-'],
            step=step,
        ).select(
            field=lambda r, k=f: k,  # k=f binds the field name at definition time
            value=f,
            count=('-', len),
        ).sort('value')
    if not total_count:
        # Count missing or zero: materialise the histogram and sum its 'count'
        # column over the records selected by field=fields[0].
        stream = stream.to_memory()
        total_count = sum(stream.filter(field=fields[0]).get_one_column_values('count'))
    stream = stream.select(
        '*',
        total_count=fs.const(total_count),
        share=('count', 'total_count', lambda c, t: c / t if t else None),  # no division by zero
    )
    return _assume_native(stream)
def run(
        self,
        operations: Union[list, arg.DefaultArgument] = arg.DEFAULT,
        if_not_yet: bool = True,
        options: Optional[dict] = None,
):
    """Run the given operations one after another.

    :param operations: operations to run; resolved with ``arg.undefault``
        against ``self.get_queue()``. Each entry is passed through
        ``self.get_operation`` before anything is run.
    :param if_not_yet: when true, an operation exposing ``run_if_not_yet``
        is run through it; otherwise ``run_now`` is used.
    :param options: caller-supplied options, passed as ``upd`` to
        ``self.get_options`` for every operation.

    The merged options are kept in a per-operation local. Rebinding
    ``options`` inside the loop would feed one operation's merged result
    into the next operation as ``upd``.
    """
    operations = arg.undefault(operations, self.get_queue())
    resolved_operations = [self.get_operation(op) for op in operations]
    for operation in resolved_operations:
        operation_options = self.get_options(including=operation, upd=options)
        if if_not_yet and hasattr(operation, 'run_if_not_yet'):
            operation.run_if_not_yet(options=operation_options)
        else:
            operation.run_now(options=operation_options)
def __init__(
        self,
        name: str = arg.DEFAULT,
        source: Source = None,
        context: Context = None,
        check: bool = True,
):
    """Initialise the object and attach it to a context when one is available.

    ``name`` is resolved with ``arg.undefault`` against a name generated from
    ``self._get_default_name_prefix()``. A defined *context* is either set on
    the defined *source* or, when no source is given, used as the source.
    After the parent initialiser runs, the object is put into its context if
    ``self.get_context()`` is defined.
    """
    generated_name = arg.get_generated_name(self._get_default_name_prefix())
    resolved_name = arg.undefault(name, generated_name)
    if arg.is_defined(context):
        if not arg.is_defined(source):
            source = context
        else:
            source.set_context(context)
    super().__init__(name=resolved_name, source=source, check=check)
    own_context = self.get_context()
    if arg.is_defined(own_context):
        self.put_into_context(check=check)
def __init__(
        self,
        name: Name,
        operations: Optional[dict] = None,
        queue: Optional[list] = None,
        options: Optional[dict] = None,
        context: OptContext = arg.DEFAULT,
):
    """Initialise with named operations as children and a context as parent.

    ``context`` is resolved with ``arg.undefault`` against ``ct.get_context()``.
    A missing (or empty) *queue* / *options* is replaced by a fresh empty
    list / dict.
    """
    parent_context = arg.undefault(context, ct.get_context())
    super().__init__(name=name, children=operations, parent=parent_context)
    self._queue = queue if queue else []
    self._options = options if options else {}
def run_now(
        self,
        return_stream: bool = True,
        stream_type: OptStreamType = arg.DEFAULT,
        options: Options = None,
        verbose: bool = True,
) -> Stream:
    """Read the source as a stream, apply the procedure (if any) and write to the destination.

    ``stream_type`` is resolved with ``arg.undefault`` against
    ``self.get_stream_type()``. With a procedure present, it is either called
    on the stream directly (when ``self.has_apply_to_stream()``) or handed to
    ``stream.apply_to_data``. Both receive ``self.get_kwargs(upd=options)`` as
    keyword arguments. Returns the result of ``write_to`` on the destination.
    """
    resolved_stream_type = arg.undefault(stream_type, self.get_stream_type())
    stream = self.get_src().to_stream(stream_type=resolved_stream_type)
    if verbose:
        self.log('Running operation: {}'.format(self.get_name()))
    if not self.has_procedure():
        return stream.write_to(self.get_dst(), return_stream=return_stream)
    if self.has_apply_to_stream():
        procedure = self.get_procedure()
        stream = procedure(stream, **self.get_kwargs(upd=options))
    else:
        apply_to_data = stream.apply_to_data
        stream = apply_to_data(self.get_procedure(), **self.get_kwargs(upd=options))
    return stream.write_to(self.get_dst(), return_stream=return_stream)
def __init__(self, name: str = arg.DEFAULT, source: Optional[SourcedInterface] = None, check: bool = True):
    """Initialise the object, remember its source and register it when a source is defined.

    ``name`` is resolved with ``arg.undefault`` against a name generated from
    ``self._get_default_name_prefix()``. *check* is forwarded to
    ``self.register``.
    """
    generated_name = arg.get_generated_name(self._get_default_name_prefix())
    super().__init__(name=arg.undefault(name, generated_name))
    self._source = source
    if not arg.is_defined(source):
        return
    self.register(check=check)