Exemple #1
0
 def file(
         self,
         name: str,
         content_format: Union[ContentFormatInterface, ContentType, Auto] = AUTO,
         filetype: Union[ContentType, Auto] = AUTO,  # deprecated argument
         **kwargs
 ) -> ConnectorInterface:
     """Return the child registered under `name`, building a new one when needed.

     An existing child is returned untouched when no extra keyword arguments
     are given. Otherwise an object of the default child class is built,
     added as a child of this folder and returned.

     :param name: key of the child; also used as the filename by default.
     :param content_format: forwarded to the child class constructor.
     :param filetype: deprecated alias of content_format; a warning is logged when it is used.
     :param kwargs: extra constructor arguments; 'filename' overrides the name of the new object.
     :raises ValueError: when both filetype and content_format are defined.
     :raises TypeError: when the child class constructor raises TypeError.
     """
     file = self.get_children().get(name)
     if kwargs or not file:
         filename = kwargs.pop('filename', name)
         file_class = self.get_default_child_obj_class()  # LocalFile
         assert file_class, "connector class or type name aren't detected"
         if arg.is_defined(filetype):
             if arg.is_defined(content_format):
                 msg = 'Only one of arguments allowed: filetype (got {}) or content_format (got {})'
                 raise ValueError(msg.format(filetype, content_format))
             msg = 'LocalFolder.file(): filetype-argument is deprecated, use content_format instead'
             self.log(level=LoggingLevel.Warning, msg=msg, stacklevel=1)
             if isinstance(filetype, (ContentType, Auto, str)):
                 content_format = filetype
             else:  # temporary workaround for deprecated FileType class
                 content_format = filetype.get_value()
         try:
             file = file_class(filename, content_format=content_format, folder=self, **kwargs)
         except TypeError as e:
             # Prefix the message with the class name and keep the original
             # error as the explicit cause so its traceback is not lost.
             raise TypeError('{}.{}'.format(file_class.__name__, e)) from e
         self.add_child(file)
     return file
Exemple #2
0
 def stream(self, data, source=arg.AUTO, context=arg.AUTO, *args, **kwargs):
     """Build an object of the class given by self.get_class() around `data`.

     `source` and `context` are forwarded as keyword arguments only when
     they are defined; positional and other keyword arguments pass through.
     """
     for key, value in (('source', source), ('context', context)):
         if arg.is_defined(value):
             kwargs[key] = value
     return self.get_class()(data, *args, **kwargs)
Exemple #3
0
 def update(self, position, step=None, message=None):
     """Move to `position`, renaming first when `message` is defined.

     A step of 1 or an undefined step triggers update_now(); any other
     step is delegated to update_with_step().
     """
     if arg.is_defined(message):
         self.set_name(message, inplace=True)
     immediate = step == 1 or not arg.is_defined(step)
     if not immediate:
         self.update_with_step(position, step)
     else:
         self.update_now(position)
Exemple #4
0
 def stack(cls,
           *iter_streams,
           how: How = 'vertical',
           name=AUTO,
           context=None,
           **kwargs):
     """Combine several streams of the same type into one.

     The first stream (copied when it has a copy() method) is the base;
     name and context are applied to it when defined. Each following stream
     is appended with add_stream() when how is 'vertical', otherwise joined
     with join(how=how, **kwargs). Returns None when no streams are given.
     """
     iter_streams = arg.update(iter_streams)
     assert cls.is_same_stream_type(iter_streams), \
         'concat(): streams must have same type: {}'.format(iter_streams)
     stacked = None
     for stream in iter_streams:
         assert isinstance(stream, StreamInterface)
         if stacked is not None:
             if how == 'vertical':
                 stacked = stacked.add_stream(stream)
             else:
                 stacked = stacked.join(stream, how=how, **kwargs)
         else:
             stacked = stream.copy() if hasattr(stream, 'copy') else stream
             if arg.is_defined(name):
                 stacked.set_name(name)
             if arg.is_defined(context):
                 stacked.set_context(context)
         gc.collect()  # runs after every processed stream
     return stacked
Exemple #5
0
 def set_context(self, context: AutoContext, reset: bool = False, inplace: bool = True) -> Optional[Native]:
     """Attach a context as the parent of this object.

     When `context` is undefined and no context is currently available,
     the default context is used. The `reset` argument is not used by this
     body: set_parent() is always called with reset=False.

     :return: self when inplace is false, otherwise None.
     """
     if not arg.is_defined(context) and not self.get_context(skip_missing=True):
         context = self.get_default_context()
     if arg.is_defined(context):
         self.set_parent(context, reset=False, inplace=True)
     return None if inplace else self
Exemple #6
0
 def get_kwargs(self,
                ex: Optional[Iterable] = None,
                upd: Options = None) -> dict:
     """Collect connectors and options into one dict of keyword arguments.

     Options are applied after connectors, so they win on equal keys.

     :param ex: keys to leave out of the result; keys that are not present are ignored.
     :param upd: extra items applied last, overriding collected values.
     :return: a new dict.
     """
     kwargs = dict()
     kwargs.update(self.get_connectors())
     kwargs.update(self.get_options())
     if arg.is_defined(ex):
         for k in ex:
             # Excluding a key that is absent is a no-op rather than a KeyError.
             kwargs.pop(k, None)
     if arg.is_defined(upd):
         kwargs.update(upd)
     return kwargs
Exemple #7
0
 def get_group_header(self, name: Comment = AUTO, caption: Comment = AUTO, comment: Comment = None) -> Iterable[str]:
     """Yield the header lines of a group, skipping undefined parts.

     Order: name, caption, the fields count (only when `name` was passed
     equal to arg.AUTO) and finally the comment. Name and caption fall back
     to self.get_name() / self.get_caption() through arg.acquire().
     """
     title_requested = name == arg.AUTO
     resolved_name = arg.acquire(name, self.get_name())
     resolved_caption = arg.acquire(caption, self.get_caption())
     for text in (resolved_name, resolved_caption):
         if arg.is_defined(text):
             yield text
     if title_requested:
         yield self.get_str_fields_count()
     if arg.is_defined(comment):
         yield comment
Exemple #8
0
 def get_count(self,
               allow_reopen: bool = True,
               allow_slow_gzip: bool = True,
               force: bool = False) -> Optional[int]:
     """Return the lines count, recounting only when it is required.

     A recount happens when the object exists and either `force` is set,
     is_changed_by_another() is true, or the previous count is undefined;
     the fresh value is stored with set_count(). Otherwise the previous
     count is used.

     :return: the count when it is defined, otherwise None.
     """
     if force:
         must_recount = True
     elif self.is_changed_by_another():
         must_recount = True
     else:
         must_recount = not arg.is_defined(self.get_prev_lines_count())
     if self.is_existing() and must_recount:
         count = self.get_actual_lines_count(allow_reopen=allow_reopen, allow_slow_gzip=allow_slow_gzip)
         self.set_count(count)
     else:
         count = self.get_prev_lines_count()
     return count if arg.is_defined(count) else None
Exemple #9
0
 def set_context(self,
                 context: AutoContext,
                 reset: bool = False,
                 inplace: bool = True) -> Optional[Native]:
     """Pass the context to the parent, or use it as the parent when there is none.

     An undefined context changes nothing and returns self. The `reset`
     argument is not used by this body: nested calls always get reset=False.

     :return: self when the context is undefined or inplace is false, otherwise None.
     """
     if not arg.is_defined(context):
         return self
     parent = self.get_parent()
     if arg.is_defined(parent):
         parent.set_context(context, reset=False, inplace=True)
     else:
         self.set_parent(context, reset=False, inplace=True)
     return None if inplace else self
Exemple #10
0
 def __init__(
     self,
     mask: str,
     parent: HierarchicConnector,
     context: AutoContext = None,
     verbose: AutoBool = AUTO,
 ):
     """Initialize the connector with `mask` as its path.

     When `parent` is undefined and a context is defined, the local storage
     of the context becomes the parent. The parent must report itself as a
     folder or a storage.
     """
     if not arg.is_defined(parent) and arg.is_defined(context):
         parent = context.get_local_storage()
     assert parent.is_folder() or parent.is_storage()
     init_kwargs = dict(path=mask, parent=parent, context=context, verbose=verbose)
     super().__init__(**init_kwargs)
Exemple #11
0
 def __init__(
         self, name: str = arg.DEFAULT,
         source: Source = None,
         context: Context = None,
         check: bool = True,
 ):
     """Initialize the object and register it in its context when one is available.

     A default name is generated from self._get_default_name_prefix().
     A defined context is set on the source when the source is defined,
     otherwise the context itself is used as the source.
     """
     generated_name = arg.get_generated_name(self._get_default_name_prefix())
     name = arg.undefault(name, generated_name)
     has_context = arg.is_defined(context)
     if has_context and arg.is_defined(source):
         source.set_context(context)
     elif has_context:
         source = context
     super().__init__(name=name, source=source, check=check)
     if arg.is_defined(self.get_context()):
         self.put_into_context(check=check)
Exemple #12
0
 def to_stream(
         self,
         data: Data = AUTO,
         stream_type: AutoStreamType = AUTO,
         ex: OptionalFields = None,
         **kwargs
 ) -> Stream:
     """Build a stream of the requested type from this object.

     :param data: items of the new stream; when undefined they are taken from
         get_items_of_type() if this object has that method, else from get_data().
     :param stream_type: a type name, a class, a StreamType or any object with
         get_class(); when undefined, self.get_stream_type() is used.
     :param ex: forwarded to get_compatible_meta().
     :param kwargs: meta items overriding the compatible meta.
     :raises TypeError: when stream_type cannot be resolved to a class.
     """
     stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
     if isinstance(stream_type, str):
         stream_class = StreamType(stream_type).get_class()
     elif isclass(stream_type):
         stream_class = stream_type
     elif isinstance(stream_type, StreamType) or hasattr(stream_type, 'get_class'):
         stream_class = stream_type.get_class()
     else:
         template = 'AnyStream.to_stream(data, stream_type): expected StreamType, got {}'
         raise TypeError(template.format(stream_type))
     if not arg.is_defined(data):
         if hasattr(self, 'get_items_of_type'):
             data = self.get_items_of_type(stream_class.get_item_type())
         else:
             data = self.get_data()
     meta = self.get_compatible_meta(stream_class, ex=ex)
     meta.update(kwargs)
     # Fill in count and source only when neither the meta nor kwargs supplied them.
     if 'count' not in meta:
         meta['count'] = self.get_count()
     if 'source' not in meta:
         meta['source'] = self.get_source()
     return self._assume_stream(stream_class(data, **meta))
Exemple #13
0
 def map(self, function: Callable, to: Union[StreamType, Auto] = AUTO) -> Native:
     """Apply `function` through the parent class implementation of map().

     A defined `to` is deprecated: a message is logged at level 30 and the
     call is routed to map_to_type() of the parent class instead.
     """
     if not arg.is_defined(to):
         return self._assume_native(super().map(function))
     self.log('to-argument for map() is deprecated, use map_to_type() method instead', level=30)
     mapped = super().map_to_type(function, stream_type=to)
     return self._assume_native(mapped)
Exemple #14
0
 def __init__(
         self,
         name: OptName = arg.AUTO,
         level: Level = arg.AUTO,
         formatter: Union[Formatter, arg.Auto] = arg.AUTO,
         loggers: SubLoggers = arg.AUTO,
         context: Context = None,
         file: Optional[FileOrName] = None,
 ):
     """Initialize the logger and make sure a base logger exists under its name.

     :param name: logger name; DEFAULT_LOGGER_NAME when undefined.
     :param level: LoggingLevel or a value accepted by LoggingLevel();
         DEFAULT_LOGGING_LEVEL when undefined.
     :param formatter: DEFAULT_FORMATTER when undefined.
     :param loggers: sub-loggers as a list (keyed here by each item's
         get_name()) or a mapping used as is; an undefined value gives an
         empty dict. A mapping passed by the caller receives the base
         logger entry when `name` is missing from it.
     :param context: forwarded to the parent initializer.
     :param file: when truthy, passed to set_file() after initialization.
     """
     name = arg.acquire(name, DEFAULT_LOGGER_NAME)
     level = arg.acquire(level, DEFAULT_LOGGING_LEVEL)
     formatter = arg.acquire(formatter, DEFAULT_FORMATTER)
     if not isinstance(level, LoggingLevel):
         level = LoggingLevel(level)
     if isinstance(loggers, list):
         # Key by name: the membership test and the assignment below both
         # treat logger names as keys of this mapping.
         loggers = {i.get_name(): i for i in loggers}
     elif not arg.is_defined(loggers):
         loggers = dict()
     if name not in loggers:
         level_value = arg.get_value(level)
         base_logger = self.build_base_logger(name, level_value, formatter)
         loggers[name] = base_logger
     self._level = level
     super().__init__(name=name, children=loggers, context=context)
     if file:
         self.set_file(file)
Exemple #15
0
 def __init__(
     self,
     name: Name,
     src: Connector,
     dst: Connector,
     procedure: Optional[Callable],
     options: Options = None,
     apply_to_stream: bool = True,
     stream_type: OptStreamType = arg.DEFAULT,
     context: Context = arg.DEFAULT,
 ):
     """Initialize an operation with one source and one destination connector.

     The connectors are passed to the parent initializer as a dict keyed by
     SRC_ID and DST_ID; undefined options are replaced with an empty dict.
     """
     options = options if arg.is_defined(options) else dict()
     connectors = {SRC_ID: src, DST_ID: dst}
     super().__init__(
         name=name,
         connectors=connectors,
         procedure=procedure,
         options=options,
         apply_to_stream=apply_to_stream,
         stream_type=stream_type,
         context=context,
     )
Exemple #16
0
def csv_reader(delimiter: Union[str, arg.Auto, None] = arg.AUTO,
               *args,
               **kwargs) -> Callable:
    """Return a callable that builds a csv reader over an iterable of lines.

    :param delimiter: passed to csv.reader() when defined; otherwise the
        csv module default is used.
    :param args: extra positional arguments for csv.reader().
    :param kwargs: extra keyword arguments for csv.reader().
    :return: csv.reader itself when there is nothing to bind, otherwise a
        one-argument function applying the bound arguments.
    """
    if arg.is_defined(delimiter):
        kwargs['delimiter'] = delimiter
    if not (args or kwargs):
        return csv.reader
    # Extra arguments are forwarded even when the delimiter is undefined,
    # instead of being silently dropped.
    return lambda a: csv.reader(a, *args, **kwargs)
Exemple #17
0
 def get_parsed_line(
     self,
     line: str,
     item_type: Union[ItemType, Auto] = AUTO,
     struct: Union[Array, StructInterface, Auto] = AUTO,
 ) -> Item:
     """Parse one text line into an item of the requested type.

     Line items are returned unparsed. Otherwise the line is split with the
     parser from fs.csv_loads() using this object's delimiter, and values
     are converted when `struct` is a StructInterface. Row, Any and Auto
     item types return the row; Record and StructRow items are built from
     the row and the struct (column indexes when the struct is undefined).

     :raises ValueError: for any other item type.
     """
     item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
     if item_type == ItemType.Line:
         return line
     parse = fs.csv_loads(delimiter=self.get_delimiter())
     row = parse(line)
     if isinstance(struct, StructInterface):
         convert_row = self._get_row_converter(converters=struct.get_converters())
         row = convert_row(row)
     if item_type in (ItemType.Row, ItemType.Any, ItemType.Auto):
         return row
     if not arg.is_defined(struct, check_name=False):
         # No struct given: fall back to positional column indexes.
         struct = list(range(len(row)))
     if item_type == ItemType.Record:
         return {arg.get_name(field): value for field, value in zip(struct, row)}
     if item_type == ItemType.StructRow:
         return ItemType.StructRow.build(data=row, struct=struct)
     template = 'item_type {} is not supported for {}.parse_lines()'
     raise ValueError(template.format(item_type, self.__class__.__name__))
Exemple #18
0
def get_field_value_from_item(field: FieldID,
                              item: ConcreteItem,
                              item_type: ItemType = ItemType.Auto,
                              skip_errors: bool = False,
                              logger=None,
                              default: Value = None):
    """Extract the value of `field` from `item`.

    :param field: field identifier; STAR returns the whole item.
    :param item: item to read from.
    :param item_type: type of the item; detected from the item when it is
        Auto or undefined. A string is converted to ItemType.
    :param skip_errors: when true, a failed lookup returns `default`
        instead of raising; also forwarded as skip_unsupported_types.
    :param logger: when truthy, receives the error message if errors are skipped.
    :param default: value used for a missing field.
    :raises IndexError: when the lookup raises IndexError or TypeError and
        skip_errors is false.
    """
    if field == STAR:
        return item
    if item_type == ItemType.Auto or not arg.is_defined(item_type):
        item_type = ItemType.detect(item, default='any')
    if isinstance(item_type, str):
        item_type = ItemType(item_type)
    else:
        item_type = ItemType(item_type.value)
    try:
        return item_type.get_value_from_item(
            item=item,
            field=field,
            default=default,
            skip_unsupported_types=skip_errors,
        )
    except (IndexError, TypeError) as e:
        # The name bound by `as` is removed when the except block ends,
        # so keep a separate reference for the message built below.
        error = e
    msg = 'Field {} does no exists in current item ({})'.format(field, error)
    if skip_errors:
        if logger:
            logger.log(msg)
        return default
    raise IndexError(msg) from error
Exemple #19
0
 def get_child_class_by_name_and_type(self, name: str, filetype: Union[FileType, Auto] = AUTO) -> Type:
     """Return the child class for an explicit file type or one guessed from the name.

     :return: the class from FileType(filetype) when filetype is defined;
         otherwise the class of the type found by get_type_by_name(), or
         None when no type is found.
     """
     if arg.is_defined(filetype):
         return FileType(filetype).get_class()
     supposed_type = self.get_type_by_name(name)
     return supposed_type.get_class() if supposed_type else None
Exemple #20
0
 def get_lines(
     self,
     count: Optional[int] = None,
     skip_first: bool = False,
     allow_reopen: bool = True,
     check: bool = True,
     verbose: AutoBool = AUTO,
     message: Union[str, Auto] = AUTO,
     step: AutoCount = AUTO,
 ) -> Iterable:
     """Open the file and return its lines, optionally wrapped in a progress tracker.

     :param count: forwarded to get_next_lines(); a falsy value is replaced by
         get_count(allow_slow_gzip=False) for the progress tracker.
     :param skip_first: forwarded to get_next_lines().
     :param allow_reopen: forwarded to open().
     :param check: when true and the file is not gzip, assert that it is not empty.
     :param verbose: falls back to self.is_verbose() when undefined.
     :param message: progress name; a '{}' placeholder is filled with the file name.
     :param step: forwarded to the logger's progress().
     """
     if check and not self.is_gzip():
         assert not self.is_empty(), 'for get_lines() file must be non-empty: {}'.format(self)
     self.open(allow_reopen=allow_reopen)
     lines = self.get_next_lines(count=count, skip_first=skip_first, close=True)
     verbose = arg.acquire(verbose, self.is_verbose())
     if not (verbose or arg.is_defined(message)):
         return lines
     message = arg.acquire(message, 'Reading {}')
     if '{}' in message:
         message = message.format(self.get_name())
     logger = self.get_logger()
     assert hasattr(logger, 'progress'), '{} has no progress in {}'.format(self, logger)
     if not count:
         count = self.get_count(allow_slow_gzip=False)
     return self.get_logger().progress(lines, name=message, count=count, step=step)
Exemple #21
0
 def get_dataframe(self, columns: Columns = None) -> DataFrame:
     """Build a DataFrame from the items, optionally restricted to `columns`.

     Returns None when `pd` is falsy or get_use_objects_for_output() is false.
     """
     if not (pd and get_use_objects_for_output()):
         return None
     frame = DataFrame(self.get_items())
     if arg.is_defined(columns):
         frame = frame[arg.get_names(columns)]
     return frame
Exemple #22
0
 def describe(
         self, example: Optional[dict] = None,
         as_dataframe: bool = False,
         separate_by_tabs: bool = False,
         show_header: bool = True,
         comment: Comment = None,
         select_fields: Optional[Array] = None,
         logger: Union[ExtLogger, Auto] = AUTO,
 ) -> Optional[DataFrame]:
     """Output a description of the struct, line by line.

     Lines go to logger.log() when the logger is defined, otherwise to print().

     :param example: when truthy, a value column taken from it is added to every field row.
     :param as_dataframe: when true, return self.show() after the optional header.
     :param separate_by_tabs: join cells with tabs instead of using the template.
     :param show_header: output the group header followed by an empty line first.
     :param comment: forwarded to the top group header.
     :param select_fields: names of fields whose validity mark is highlighted.
     :return: the result of self.show() when as_dataframe is true, otherwise None.
     """
     log = logger.log if arg.is_defined(logger) else print
     if show_header:
         for line in self.get_group_header(comment=comment):
             log(line)
         log('')
     if as_dataframe:
         return self.show()
     columns, template = self._get_describe_template(example)

     def render(cells):
         # One output line: tab-separated or formatted by the template.
         return '\t'.join(cells) if separate_by_tabs else template.format(*cells)

     log(render(columns))
     for (n, type_name, name, caption, is_valid) in self.get_struct_description(include_header=False):
         if type_name == GROUP_TYPE_STR:
             log('')
             for line in self.get_group_header(name, caption=caption):
                 log(line)
             continue
         if name in (select_fields or []):
             is_valid = '>' if is_valid == '.' else str(is_valid).upper()
         if example:
             row = (is_valid, n, type_name, name, str(example.get(name)), caption)
         else:
             row = (is_valid, n, type_name, name, caption)
         log(render(row))
Exemple #23
0
 def __init__(
     self,
     name: str,
     content_format: Union[ContentFormatInterface, Auto] = AUTO,
     struct: Union[Struct, Auto, None] = AUTO,
     folder: Connector = None,
     context: Context = AUTO,
     expected_count: AutoCount = AUTO,
     verbose: AutoBool = AUTO,
 ):
     """Initialize the file connector inside a folder.

     A truthy `folder` must be a ConnectorInterface instance or report
     is_folder(). Without a folder, the job folder of a defined context is
     used, otherwise self.get_default_folder(). The folder is passed to the
     parent initializer as `parent`.
     """
     if not folder:
         if arg.is_defined(context):
             folder = context.get_job_folder()
         else:
             folder = self.get_default_folder()
     else:
         message = 'only LocalFolder supported for *File instances (got {})'.format(type(folder))
         assert isinstance(folder, ConnectorInterface) or folder.is_folder(), message
     self._fileholder = None
     init_kwargs = dict(
         name=name,
         content_format=content_format,
         struct=struct,
         expected_count=expected_count,
         parent=folder,
         context=context,
         verbose=verbose,
     )
     super().__init__(**init_kwargs)
Exemple #24
0
 def map(self, function: Callable, to: AutoStreamType = AUTO) -> Native:
     """Apply `function` through the parent class implementation of map().

     A defined `to` is deprecated: a warning is sent to the logger and the
     call is routed to map_to() of the parent class instead.
     """
     if not arg.is_defined(to):
         return self._assume_native(super().map(function))
     self.get_logger().warning('to-argument for map() is deprecated, use map_to() instead')
     mapped = super().map_to(function, stream_type=to)
     return self._assume_native(mapped)
Exemple #25
0
 def append_field(
         self,
         field: Field,
         default_type: FieldType = FieldType.Any,
         before: bool = False,
         exclude_duplicates: bool = True,
         reassign_struct_name: bool = False,
         inplace: bool = True,
 ) -> Optional[Native]:
     """Add one field to the struct, at the end or at the beginning.

     :param field: a field object, a name, an array of AdvancedField positional
         arguments, or a dict of AdvancedField keyword arguments.
     :param default_type: type used when the field is given by name only.
     :param before: insert the field first instead of last.
     :param exclude_duplicates: return self unchanged when the field name is already present.
     :param reassign_struct_name: overwrite the field's group name and caption
         even when its group name is already defined.
     :param inplace: forwarded to set_fields().
     :return: self for a skipped duplicate, otherwise the result of set_fields().
     :raises TypeError: for an unsupported `field` value.
     """
     if self._is_field(field):
         field_desc = field
     elif isinstance(field, str):
         field_desc = AdvancedField(field, default_type)
     elif isinstance(field, ARRAY_TYPES):
         field_desc = AdvancedField(*field)
     elif isinstance(field, dict):
         field_desc = AdvancedField(**field)
     else:
         raise TypeError('Expected field, str or dict, got {} as {}'.format(field, type(field)))
     if exclude_duplicates and field_desc.get_name() in self.get_field_names():
         return self
     if isinstance(field_desc, AdvancedField):
         if reassign_struct_name or not arg.is_defined(field_desc.get_group_name()):
             # Bind the field to this struct as its group.
             field_desc.set_group_name(self.get_name(), inplace=True)
             field_desc.set_group_caption(self.get_caption(), inplace=True)
     fields = [field_desc] + self.get_fields() if before else self.get_fields() + [field_desc]
     return self.set_fields(fields, inplace=inplace)
Exemple #26
0
 def get_context(self) -> Parent:
     """Find the context by walking up through the parent.

     :return: the parent when it is a context, the parent's own context when
         the parent has get_context(), otherwise None (also when the parent
         is undefined).
     """
     parent = self.get_parent()
     if not arg.is_defined(parent):
         return None
     if parent.is_context():
         return parent
     if hasattr(parent, 'get_context'):
         return parent.get_context()
     return None
Exemple #27
0
 def to_stream(self,
               data: Union[Iterable, Auto] = AUTO,
               name: AutoName = AUTO,
               stream_type: Union[StreamType, Auto] = AUTO,
               ex: OptionalFields = None,
               step: AutoCount = AUTO,
               **kwargs) -> Stream:
     """Build a stream over this object's items.

     :param data: items of the stream; when undefined they are read with
         _get_items_of_type() using the item type of the stream class.
     :param name: stream name; generated lazily when undefined.
     :param stream_type: resolved through _get_stream_type() and _get_stream_class().
     :param ex: forwarded to get_compatible_meta().
     :param step: forwarded to _get_items_of_type().
     :param kwargs: forwarded to get_compatible_meta(); 'verbose' is also
         used when reading the items.
     """
     name = arg.delayed_acquire(name, self._get_generated_stream_name)
     stream_type = self._get_stream_type(stream_type)
     stream_class = self._get_stream_class(stream_type)
     item_type = stream_class.get_item_type() if hasattr(stream_class, 'get_item_type') else AUTO
     if not arg.is_defined(data):
         verbose = kwargs.get('verbose', AUTO)
         data = self._get_items_of_type(item_type, verbose=verbose, step=step)
     meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
     # Fill in count and source only when the meta does not have them yet.
     if 'count' not in meta:
         meta['count'] = self._get_fast_count()
     if 'source' not in meta:
         meta['source'] = self
     return self._assume_stream(stream_class(data, **meta))
Exemple #28
0
 def stream(self,
            data: Iterable,
            stream_type: AutoStreamType = AUTO,
            ex: OptionalArguments = None,
            save_name: bool = True,
            save_count: bool = True,
            **kwargs) -> Stream:
     """Build a new stream around `data`, carrying over this stream's meta.

     :param data: items of the new stream.
     :param stream_type: a type name or an object with get_class(); when
         undefined, the class of this stream and its full meta are used.
     :param ex: forwarded to get_compatible_meta() when stream_type is defined.
     :param save_name: when false, 'name' is dropped from the carried meta.
     :param save_count: when false, 'count' is dropped from the carried meta.
     :param kwargs: meta items overriding the carried meta.
     """
     if arg.is_defined(stream_type):
         if isinstance(stream_type, str):
             stream_class = StreamType(stream_type).get_class()
         else:
             stream_class = stream_type.get_class()
         meta = self.get_compatible_meta(stream_class, ex=ex)
     else:
         stream_class = self.__class__
         meta = self.get_meta()
     # The meta may lack these keys (e.g. after filtering for compatibility),
     # so dropping them must not raise KeyError.
     if not save_name:
         meta.pop('name', None)
     if not save_count:
         meta.pop('count', None)
     meta.update(kwargs)
     if 'context' not in meta:
         meta['context'] = self.get_context()
     stream = stream_class(data, **meta)
     return stream
Exemple #29
0
 def __init__(
         self,
         path: str,
         path_is_relative: AutoBool = AUTO,
         parent: AutoConnector = AUTO,
         context: AutoContext = None,
         verbose: AutoBool = AUTO,
 ):
     """Initialize the connector with `path` as its name.

     An undefined parent is replaced by the local storage of a defined
     context, otherwise by self.get_default_storage(). The context is used
     only for that lookup here; it is not passed to the parent initializer.
     An undefined `path_is_relative` is derived from arg.is_absolute_path(path).
     """
     if not arg.is_defined(parent):
         parent = context.get_local_storage() if arg.is_defined(context) else self.get_default_storage()
     parent = self._assume_native(parent)
     relative_by_default = not arg.is_absolute_path(path)
     self._path_is_relative = arg.acquire(path_is_relative, relative_by_default)
     super().__init__(name=path, parent=parent, verbose=verbose)
Exemple #30
0
 def get_lines(self,
               items: Iterable,
               item_type: ItemType,
               add_title_row: AutoBool = AUTO) -> Generator:
     """Yield every item formatted with get_formatted_item().

     A defined `add_title_row` must be falsy. Because this is a generator,
     that check runs when iteration starts, not when the method is called.
     """
     if arg.is_defined(add_title_row):
         assert not add_title_row, 'title_row available in FlatStructFormat only'
     yield from (self.get_formatted_item(item, item_type=item_type) for item in items)