Exemple #1
0
 def bucket(self, name, access_key=AUTO, secret_key=AUTO):
     """Return the bucket stored under ``name`` or build a new ``S3Bucket``.

     The lookup goes through ``self.get_buckets().get(name)``; any falsy
     result is treated as "no such bucket". In that case a new
     ``ct.S3Bucket`` bound to this storage is created, with each key
     resolved through ``arg.undefault`` against this storage's own
     ``access_key`` / ``secret_key``.

     NOTE(review): this method does not itself add the new bucket to the
     collection returned by ``get_buckets()``; whether ``S3Bucket`` registers
     itself is not visible here.
     """
     existing = self.get_buckets().get(name)
     if existing:
         return existing
     return ct.S3Bucket(
         name=name,
         storage=self,
         access_key=arg.undefault(access_key, self.access_key),
         secret_key=arg.undefault(secret_key, self.secret_key),
     )
Exemple #2
0
 def get_struct_rows(self,
                     rows,
                     struct=arg.AUTO,
                     skip_bad_rows=False,
                     skip_bad_values=False,
                     verbose=True):
     """Lazily yield the given rows converted according to ``struct``.

     ``struct`` is resolved through ``arg.undefault`` with
     ``self.get_struct()`` as the fallback.

     * If the struct is a ``StructInterface``, every row becomes a new list
       built by pairing its values with ``struct.get_converters('str', 'py')``
       (pairing stops at the shorter of the two). ``skip_bad_rows``,
       ``skip_bad_values`` and ``verbose`` are not used on this path.
     * Otherwise (deprecated path) each row goes through
       ``apply_struct_to_row``. With ``skip_bad_rows`` set, a ``ValueError``
       makes the row be logged and dropped; without it, ``skip_bad_values``
       is forwarded to ``apply_struct_to_row``.
     """
     struct = arg.undefault(struct, self.get_struct())
     if isinstance(struct, StructInterface):  # actual approach
         converters = struct.get_converters('str', 'py')
         for row in rows:
             yield [convert(cell) for cell, convert in zip(row, converters)]
         return
     # deprecated approach
     row_logger = self if verbose else None
     for row in rows:
         if not skip_bad_rows:
             yield apply_struct_to_row(row,
                                       struct,
                                       skip_bad_values,
                                       logger=row_logger)
             continue
         try:
             yield apply_struct_to_row(row, struct, False, logger=row_logger)
         except ValueError:
             self.log(['Skip bad row:', row], verbose=verbose)
Exemple #3
0
 def get_lines(self,
               items: Iterable,
               item_type: ItemType,
               add_title_row: AutoBool = AUTO) -> Generator:
     """Yield formatted lines for ``items``, optionally preceded by a title line.

     ``add_title_row`` is resolved through ``arg.undefault`` with
     ``self.is_first_line_title()`` as the fallback. When it ends up truthy,
     the format must itself report a title line (checked with ``assert``),
     and the struct's columns are formatted as an ``ItemType.Row`` and
     emitted first. Every item is then formatted with the given
     ``item_type``.
     """
     add_title_row = arg.undefault(add_title_row,
                                   self.is_first_line_title())
     if add_title_row:
         assert self.is_first_line_title()
         header = self.get_struct().get_columns()
         yield self.get_formatted_item(header, item_type=ItemType.Row)
     yield from (
         self.get_formatted_item(item, item_type=item_type)
         for item in items
     )
Exemple #4
0
 def log(self,
         msg,
         level=arg.DEFAULT,
         logger=None,
         end=arg.DEFAULT,
         verbose=False,
         truncate=False) -> None:
     """Record ``msg`` on this object and optionally forward it to the parent logger.

     A plain string is wrapped into a ``DetailedMessage`` first; any other
     value is kept as given. The message is always stored through
     ``self.add_message``. It is passed on to ``super().log`` only when a
     ``logger`` is supplied or ``verbose`` is truthy.

     ``level`` is resolved through ``arg.undefault`` with
     ``LoggingLevel.Debug`` as the fallback; ``end`` and ``truncate`` are
     forwarded to the parent implementation unchanged.

     The method returns normally, so it is annotated ``-> None``; the former
     ``NoReturn`` annotation told type checkers that code following a call
     to ``log`` is unreachable.
     """
     level = arg.undefault(level, LoggingLevel.Debug)
     if isinstance(msg, str):
         msg = DetailedMessage(message=msg)
     self.add_message(msg)
     if logger or verbose:
         super().log(msg, level, logger, end, verbose, truncate)
Exemple #5
0
 def write_items(
     self,
     items: Iterable,
     item_type: Union[ItemType, Auto] = AUTO,
     add_title_row: AutoBool = AUTO,
     verbose: AutoBool = AUTO,
 ) -> Native:
     """Format ``items`` into lines with this object's content format and write them.

     ``item_type`` is resolved through ``arg.undefault`` with
     ``self.get_default_item_type()`` as the fallback. The content format
     must be a ``ParsedFormat`` (checked with ``assert``); its ``get_lines``
     produces the lines, which are handed to ``self.write_lines`` together
     with ``verbose``. The result of ``write_lines`` is returned.
     """
     resolved_type = arg.undefault(item_type, self.get_default_item_type())
     parsed_format = self.get_content_format()
     assert isinstance(parsed_format, ParsedFormat)
     return self.write_lines(
         parsed_format.get_lines(
             items,
             item_type=resolved_type,
             add_title_row=add_title_row,
         ),
         verbose=verbose,
     )
Exemple #6
0
def hist(data: Data, *fields, in_memory=arg.DEFAULT, step=1000000, logger=arg.DEFAULT, msg=None) -> Stream:
    """Build a histogram stream over the given field(s) of ``data``.

    Two strategies are used:

    * in-memory data, or more than one field: records are produced by
      ``get_hist_records`` and wrapped into a ``RecordStream``;
    * otherwise (a single field, not in memory): the stream is grouped by
      that field and reduced to records with ``field``, ``value`` and
      ``count``, sorted by ``value``.

    Every resulting record is then extended with ``total_count`` and
    ``share`` (``count / total_count``, or ``None`` when the total is falsy).
    If the source stream reports no count, the histogram is pulled into
    memory and the total is computed as the sum of ``count`` over the records
    of the first field.

    :param data: anything accepted by ``_stream``.
    :param fields: field(s) to build the histogram for.
    :param in_memory: resolved through ``arg.undefault`` with
        ``stream.is_in_memory()`` as the fallback.
    :param step: forwarded to ``group_by`` on the single-field path.
    :param logger: resolved through ``arg.undefault`` from
        ``stream.get_logger`` (passed with ``delayed=True``).
    :param msg: forwarded to ``get_hist_records``.
    :raises IndexError: when no fields are given and either the data is not
        in memory or the stream reports no count (``fields[0]`` is accessed).
    """
    stream = _stream(data)
    total_count = stream.get_count()
    in_memory = arg.undefault(in_memory, stream.is_in_memory())
    logger = arg.undefault(logger, stream.get_logger, delayed=True)
    if in_memory or len(fields) > 1:
        stream = stream.stream(
            get_hist_records(stream, fields, in_memory=in_memory, logger=logger, msg=msg),
            stream_type='RecordStream',
        )
    else:
        # At most one field can reach this branch, so the stream is consumed
        # exactly once and never needs to be tee'd.
        f = fields[0]
        if logger:
            logger.log('Calc hist for field {}...'.format(f))
        stream = stream.to_stream(
            stream_type='RecordStream',
        ).group_by(
            f,
            values=['-'],
            step=step,
        ).select(
            field=lambda r, k=f: k,  # constant column holding the field name
            value=f,
            count=('-', len),
        ).sort('value')
    if not total_count:
        # The source could not report its size: derive the total from the
        # histogram itself, using the counts of the first field only.
        stream = stream.to_memory()
        total_count = sum(stream.filter(field=fields[0]).get_one_column_values('count'))
    stream = stream.select(
        '*',
        total_count=fs.const(total_count),
        share=('count', 'total_count', lambda c, t: c / t if t else None),
    )
    return _assume_native(stream)
Exemple #7
0
 def run(
     self,
     operations: Union[list, arg.DefaultArgument] = arg.DEFAULT,
     if_not_yet: bool = True,
     options: Optional[dict] = None,
 ):
     """Run the given operations one after another.

     ``operations`` is resolved through ``arg.undefault`` with
     ``self.get_queue()`` as the fallback. Every entry is resolved with
     ``self.get_operation`` before the first operation starts.

     For each operation the effective options are built by
     ``self.get_options(including=operation, upd=options)``, always starting
     from the caller-supplied ``options``. The merged options of one
     operation are therefore never passed on as ``upd`` for the next one.

     When ``if_not_yet`` is truthy and the operation provides
     ``run_if_not_yet``, that method is used; otherwise ``run_now`` is called.
     """
     operations = arg.undefault(operations, self.get_queue())
     resolved_operations = [self.get_operation(op) for op in operations]
     for operation in resolved_operations:
         # Keep the `options` argument untouched so every operation is
         # configured from the same caller-supplied base.
         operation_options = self.get_options(including=operation, upd=options)
         if if_not_yet and hasattr(operation, 'run_if_not_yet'):
             operation.run_if_not_yet(options=operation_options)
         else:
             operation.run_now(options=operation_options)
Exemple #8
0
 def __init__(
         self, name: str = arg.DEFAULT,
         source: Source = None,
         context: Context = None,
         check: bool = True,
 ):
     """Initialise the object with a name, an optional source and an optional context.

     ``name`` is resolved through ``arg.undefault``; the fallback is a name
     generated from ``self._get_default_name_prefix()``.

     When ``context`` is defined (per ``arg.is_defined``): a defined
     ``source`` receives it through ``set_context``; otherwise the context
     itself is used as the source. After the parent initialiser has run, the
     object is put into its context if ``self.get_context()`` is defined.
     """
     generated_name = arg.get_generated_name(self._get_default_name_prefix())
     name = arg.undefault(name, generated_name)
     if arg.is_defined(context):
         if not arg.is_defined(source):
             source = context
         else:
             source.set_context(context)
     super().__init__(name=name, source=source, check=check)
     if arg.is_defined(self.get_context()):
         self.put_into_context(check=check)
Exemple #9
0
 def __init__(
     self,
     name: Name,
     operations: Optional[dict] = None,
     queue: Optional[list] = None,
     options: Optional[dict] = None,
     context: OptContext = arg.DEFAULT,
 ):
     """Initialise the object with its operations, run queue and options.

     ``context`` is resolved through ``arg.undefault`` with
     ``ct.get_context()`` as the fallback and passed to the parent
     initialiser as ``parent``; ``operations`` are passed as ``children``.
     A falsy ``queue`` or ``options`` is replaced by a fresh empty list or
     dict respectively.
     """
     parent_context = arg.undefault(context, ct.get_context())
     super().__init__(name=name, children=operations, parent=parent_context)
     self._queue = queue if queue else list()
     self._options = options if options else dict()
Exemple #10
0
 def run_now(
     self,
     return_stream: bool = True,
     stream_type: OptStreamType = arg.DEFAULT,
     options: Options = None,
     verbose: bool = True,
 ) -> Stream:
     """Execute this operation: read the source, apply the procedure, write the result.

     ``stream_type`` is resolved through ``arg.undefault`` with
     ``self.get_stream_type()`` as the fallback and used to open the source
     as a stream. If a procedure is set, it is either called directly on the
     stream (when ``has_apply_to_stream()`` is truthy) or passed to
     ``stream.apply_to_data``; in both cases the keyword arguments come from
     ``self.get_kwargs(upd=options)``. The resulting stream is written to the
     destination, and the value of ``write_to`` is returned.
     """
     resolved_type = arg.undefault(stream_type, self.get_stream_type())
     stream = self.get_src().to_stream(stream_type=resolved_type)
     if verbose:
         self.log('Running operation: {}'.format(self.get_name()))
     if self.has_procedure():
         applies_to_stream = self.has_apply_to_stream()
         procedure = self.get_procedure()
         procedure_kwargs = self.get_kwargs(upd=options)
         if applies_to_stream:
             stream = procedure(stream, **procedure_kwargs)
         else:
             stream = stream.apply_to_data(procedure, **procedure_kwargs)
     return stream.write_to(self.get_dst(), return_stream=return_stream)
Exemple #11
0
 def __init__(self, name: str = arg.DEFAULT, source: Optional[SourcedInterface] = None, check: bool = True):
     """Initialise the object with a name and an optional source.

     ``name`` is resolved through ``arg.undefault``; the fallback is a name
     generated from ``self._get_default_name_prefix()``. The source is stored
     as ``self._source``, and ``self.register(check=check)`` is called only
     when the source is defined (per ``arg.is_defined``).
     """
     generated_name = arg.get_generated_name(self._get_default_name_prefix())
     super().__init__(name=arg.undefault(name, generated_name))
     self._source = source
     if not arg.is_defined(source):
         return
     self.register(check=check)