Exemple #1
0
 def __init__(
         self,
         name: str,
         content_format: Union[ContentFormatInterface, Auto] = AUTO,
         struct: Union[Struct, Auto, None] = AUTO,
         folder: Connector = None,
         context: Context = AUTO,
         first_line_is_title: AutoBool = AUTO,
         expected_count: AutoCount = AUTO,
         caption: Optional[str] = None,
         verbose: AutoBool = AUTO,
         **kwargs
 ):
     """Initialise the object and pick the connector handed to the base class as ``parent``.

     The parent is chosen in this order: the explicit ``folder`` argument,
     the ``parent`` keyword popped from ``kwargs``, ``context.get_job_folder()``,
     and finally ``self.get_default_folder()``.

     When ``folder`` is given it must be a ``ConnectorInterface`` or report
     ``is_folder()``, and it must equal ``parent`` if a parent was also passed.
     """
     explicit_parent = kwargs.pop('parent', None)
     if folder:
         wrong_folder_message = 'only LocalFolder supported for *File instances (got {})'.format(type(folder))
         assert isinstance(folder, ConnectorInterface) or folder.is_folder(), wrong_folder_message
         # A folder and a parent may both be passed only when they are the same object.
         assert folder == explicit_parent or not Auto.is_defined(explicit_parent)
         resolved_folder = folder
     elif Auto.is_defined(explicit_parent):
         resolved_folder = explicit_parent
     elif Auto.is_defined(context):
         resolved_folder = context.get_job_folder()
     else:
         resolved_folder = self.get_default_folder()
     self._fileholder = None
     super().__init__(
         name=name,
         content_format=content_format,
         struct=struct,
         first_line_is_title=first_line_is_title,
         expected_count=expected_count,
         caption=caption,
         parent=resolved_folder,
         context=context,
         verbose=verbose,
         **kwargs,
     )
Exemple #2
0
 def __init__(
     self,
     data: Iterable,
     name: AutoName = AUTO,
     check: bool = False,
     count: AutoCount = None,
     less_than: AutoCount = None,
     source: Connector = None,
     context: Context = None,
     max_items_in_memory: AutoCount = AUTO,
     tmp_files: TmpMask = AUTO,
 ):
     """Initialise the stream, deriving ``count`` and ``less_than`` from ``data`` when possible.

     ``count`` is passed through ``get_optional_len(data, count)``; when the
     result is truthy and defined and ``less_than`` is not defined,
     ``less_than`` takes the same value. ``_tmp_files`` is set to ``None``
     before the base initialiser runs and is resolved afterwards.
     """
     count = get_optional_len(data, count)
     if count and Auto.is_defined(count):
         if not Auto.is_defined(less_than):
             less_than = count
     self._tmp_files = None
     super().__init__(
         data=data, name=name, check=check,
         source=source, context=context,
         count=count, less_than=less_than,
         max_items_in_memory=max_items_in_memory,
     )
     # Resolved only after the base initialiser, because it uses self.get_name().
     self._tmp_files = Auto.delayed_acquire(
         tmp_files,
         sm.get_tmp_mask,
         self.get_name(),
     )
Exemple #3
0
 def get_new_progress(
         self,
         name: str,
         count: Optional[int] = None,
         context: AutoContext = AUTO,
 ):
     """Ask the available logger for a new progress object.

     Uses ``self.get_logger()``; if that logger is not defined and a
     ``context`` is given, the context's logger is used instead. Returns
     ``None`` when the chosen logger has no ``get_new_progress``.
     """
     logger = self.get_logger()
     if Auto.is_defined(context):
         if not Auto.is_defined(logger):
             logger = context.get_logger()
     can_make_progress = isinstance(logger, ExtendedLoggerInterface) or hasattr(logger, 'get_new_progress')
     if not can_make_progress:
         return None
     return logger.get_new_progress(name, count=count, context=context)
Exemple #4
0
 def __init__(
         self,
         mask: str,
         parent: HierarchicConnector,
         context: AutoContext = None,
         verbose: AutoBool = AUTO,
 ):
     """Initialise the connector with ``mask`` as its path.

     When ``parent`` is not defined but ``context`` is, the parent is taken
     from ``context.get_local_storage()``. The parent must report either
     ``is_folder()`` or ``is_storage()``.
     """
     parent_is_missing = not Auto.is_defined(parent)
     if parent_is_missing and Auto.is_defined(context):
         parent = context.get_local_storage()
     assert parent.is_folder() or parent.is_storage()
     super().__init__(
         path=mask,
         parent=parent,
         context=context,
         verbose=verbose,
     )
Exemple #5
0
 def set_context(
         self,
         context: AutoContext,
         reset: bool = False,
         inplace: bool = True,
 ) -> Optional[Native]:
     """Attach ``context`` to this object through its parent chain.

     If a parent is defined, the context is forwarded to it; otherwise the
     context itself is set as the parent. The ``reset`` argument is accepted
     but not used here. Returns ``self`` only when ``inplace`` is false,
     otherwise ``None``.
     """
     if context:
         current_parent = self.get_parent()
         if Auto.is_defined(current_parent):
             current_parent.set_context(context, reset=False, inplace=True)
         elif Auto.is_defined(context):
             self.set_parent(context, reset=False, inplace=True)
     return None if inplace else self
Exemple #6
0
 def get_count(
         self,
         allow_reopen: bool = True,
         allow_slow_mode: bool = True,
         force: bool = False,
 ) -> Count:
     """Return the lines count, recounting when needed.

     A recount is wanted when ``force`` is set, when ``self.is_outdated()``
     is true, or when the previous count is not defined. It is performed only
     if ``self.is_existing()``; otherwise the previous count is used.
     Returns ``None`` when the resulting count is not defined.
     """
     needs_recount = force or self.is_outdated()
     if not needs_recount:
         needs_recount = not Auto.is_defined(self.get_prev_lines_count())
     if self.is_existing() and needs_recount:
         lines_count = self.get_actual_lines_count(
             allow_reopen=allow_reopen,
             allow_slow_mode=allow_slow_mode,
         )
         self.set_count(lines_count)
     else:
         lines_count = self.get_prev_lines_count()
     return lines_count if Auto.is_defined(lines_count) else None
Exemple #7
0
 def get_group_header(
         self,
         name: Comment = AUTO,
         caption: Comment = AUTO,
         comment: Comment = None,
 ) -> Iterable[str]:
     """Yield the header lines: name, caption, fields count and comment.

     ``name`` and ``caption`` default to ``self.get_name()`` and
     ``self.get_caption()``. The fields-count line is yielded only when
     ``name`` was left as ``AUTO``. Undefined items are skipped.
     """
     show_fields_count = name == AUTO
     header_name = Auto.acquire(name, self.get_name())
     header_caption = Auto.acquire(caption, self.get_caption())
     for text in (header_name, header_caption):
         if Auto.is_defined(text):
             yield text
     if show_fields_count:
         yield self.get_str_fields_count()
     if Auto.is_defined(comment):
         yield comment
Exemple #8
0
 def to_stream(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         stream_type: Union[StreamType, Auto] = AUTO,
         ex: OptionalFields = None,
         step: AutoCount = AUTO,
         **kwargs
 ) -> Stream:
     """Build a stream of the requested type over this object's items.

     The stream name and stream type are resolved from defaults when not
     given. If ``data`` is not defined, items are read via
     ``self._get_items_of_type()``. The meta comes from
     ``self.get_compatible_meta()`` and is completed with ``count`` and
     ``source`` when those keys are absent.
     """
     name = Auto.delayed_acquire(name, self._get_generated_stream_name)
     stream_type = self._get_stream_type(stream_type)
     stream_class = self._get_stream_class(stream_type)
     if hasattr(stream_class, 'get_item_type'):
         item_type = stream_class.get_item_type()
     else:
         # Fall back to an empty instance when the class itself cannot report the item type.
         probe = stream_class([])
         item_type = probe.get_item_type() if hasattr(probe, 'get_item_type') else AUTO
     if not Auto.is_defined(data):
         data = self._get_items_of_type(
             item_type,
             verbose=kwargs.get('verbose', AUTO),
             step=step,
         )
     meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
     if not ('count' in meta or 'count' in kwargs):
         meta['count'] = self._get_fast_count()
     if 'source' not in meta:
         meta['source'] = self
     return self._assume_stream(stream_class(data, **meta))
Exemple #9
0
 def get_struct_repr_lines(self,
                           example: Optional[dict] = None,
                           delimiter: str = COLUMN_DELIMITER,
                           select_fields: Optional[Array] = None,
                           count: Optional[int] = None) -> Generator:
     """Yield text lines describing the struct: a header line, then one line per item.

     :param example: optional record; when given, each field row also
         carries ``str(example.get(name))``.
     :param delimiter: when equal to a tab, rows are tab-joined; any other
         value makes rows go through the template returned by
         ``self._get_describe_template()``.
     :param select_fields: names of fields to highlight; for these the
         validity mark becomes ``'>'`` (if it was ``'.'``) or its upper-cased
         string form.
     :param count: when defined, iteration stops after the item whose
         number ``n`` reaches ``count - 1``.
     :return: generator of strings.
     """
     columns, template = self._get_describe_template(example)
     separate_by_tabs = delimiter == '\t'
     if separate_by_tabs:
         yield '\t'.join(map(str, columns))
     else:
         yield template.format(*columns)
     selected = select_fields or []
     for n, type_name, name, caption, is_valid in self.get_struct_description(include_header=False):
         if type_name == GROUP_TYPE_STR:
             # Groups are rendered as an empty separator line followed by their own header.
             yield ''
             yield from self.get_group_header(name, caption=caption)
         else:
             if name in selected:
                 is_valid = '>' if is_valid == '.' else str(is_valid).upper()
             if example:
                 value = str(example.get(name))
                 row = (is_valid, n, type_name, name, value, caption)
             else:
                 row = (is_valid, n, type_name, name, caption)
             if separate_by_tabs:
                 # str.join() accepts only strings, while n (and possibly
                 # is_valid or caption) is not one, so stringify every cell.
                 yield '\t'.join(map(str, row))
             else:
                 yield template.format(*row)
         if Auto.is_defined(count):
             if n >= count - 1:
                 break
Exemple #10
0
 def simple_select(
     self,
     fields: OptionalFields,
     filters: OptionalFields = None,
     sort: OptionalFields = None,
     count: Count = None,
     stream_type: Union[StreamType, Auto] = AUTO,
     verbose: AutoBool = AUTO,
 ) -> Stream:
     """Run ``execute_select()`` and wrap its rows into a stream.

     ``stream_type`` defaults to ``StreamType.RecordStream``; in that case
     every row is zipped with ``self.get_columns()`` into a dict. For
     ``StreamType.RowStream`` the rows are passed as they are. Any other
     type raises ``NotImplementedError``. When ``count`` is defined and below
     ``MAX_ITEMS_IN_MEMORY`` the items are materialised into a list and
     ``count`` becomes its length.
     """
     stream_type = Auto.acquire(stream_type, StreamType.RecordStream)
     stream_class = stream_type.get_class()
     rows = self.execute_select(
         fields=fields, filters=filters, sort=sort,
         count=count, verbose=verbose,
     )
     if stream_type == StreamType.RowStream:
         items = rows
     elif stream_type == StreamType.RecordStream:
         column_names = self.get_columns()

         def row_to_record(row):
             return dict(zip(column_names, row))

         items = map(row_to_record, rows)
     else:
         raise NotImplementedError
     if Auto.is_defined(count) and count < MAX_ITEMS_IN_MEMORY:
         items = list(items)
         count = len(items)
     return stream_class(
         items,
         count=count,
         source=self,
         context=self.get_context(),
     )
Exemple #11
0
 def to_stream(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         stream_type: Union[StreamType, Auto] = AUTO,
         ex: OptionalFields = None,
         step: AutoCount = AUTO,
         **kwargs
 ) -> Stream:
     """Build a stream from this object; ``StreamType.SqlStream`` is the default type.

     For the SQL stream type ``data`` must not be defined, and the stream is
     created directly with this object as its ``source``. Every other stream
     type is delegated to the parent implementation.
     """
     stream_type = Auto.acquire(stream_type, StreamType.SqlStream)
     is_sql_stream = stream_type == StreamType.SqlStream
     if not is_sql_stream:
         return super().to_stream(
             data=data, name=name, stream_type=stream_type,
             ex=ex, step=step,
             **kwargs,
         )
     assert not Auto.is_defined(data)
     name = Auto.delayed_acquire(name, self._get_generated_stream_name)
     stream_class = stream_type.get_class()
     meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
     meta['source'] = self
     return stream_class(data, **meta)
Exemple #12
0
 def insert_data(
         self,
         table: Union[Table, Name],
         data: Data,
         struct: Struct = None,
         encoding: Optional[str] = None,
         skip_errors: bool = False,
         skip_lines: Count = 0,
         skip_first_line: bool = False,
         step: AutoCount = DEFAULT_STEP,
         verbose: AutoBool = AUTO,
 ) -> tuple:
     """Insert ``data`` into ``table`` and return ``(initial_count, final_count)``.

     ``initial_count`` is the estimated count of the prepared input stream
     plus ``skip_lines``; ``final_count`` is whatever
     ``self.insert_struct_stream()`` returns. A ``struct`` that is neither a
     ``StructInterface`` nor has ``get_struct_str`` is logged as deprecated
     and wrapped into ``FlatStruct``.
     """
     if not Auto.is_defined(skip_lines):
         skip_lines = 0
     is_struct_description = isinstance(struct, StructInterface) or hasattr(struct, 'get_struct_str')
     if not is_struct_description:
         deprecation_message = 'Struct as {} is deprecated, use FlatStruct instead'.format(type(struct))
         self.log(msg=deprecation_message, level=LoggingLevel.Warning)
         struct = FlatStruct(struct or [])
     stream = self._get_struct_stream_from_data(
         data,
         struct=struct,
         encoding=encoding,
         skip_first_line=skip_first_line,
         verbose=verbose,
     )
     if skip_lines:
         stream = stream.skip(skip_lines)
     if stream.get_stream_type() != StreamType.StructStream:
         structured = stream.structure(struct, skip_bad_rows=True, verbose=True)
         # The count is taken from the stream as it was before structuring.
         stream = structured.update_meta(count=stream.get_count())
     initial_count = stream.get_estimated_count() + skip_lines
     final_count = self.insert_struct_stream(
         table,
         stream,
         skip_errors=skip_errors,
         step=step,
         verbose=verbose,
     )
     return initial_count, final_count
Exemple #13
0
 def get_types_list(
         self,
         dialect: Union[DialectType, Auto] = DialectType.String,
 ) -> list:
     """Return a list with the type of every field from ``self.get_fields()``.

     When ``dialect`` is defined each type comes from
     ``field.get_type_in(dialect)``; otherwise from ``field.get_type()``.
     """
     if Auto.is_defined(dialect):
         def type_of(field):
             return field.get_type_in(dialect)
     else:
         def type_of(field):
             return field.get_type()
     return list(map(type_of, self.get_fields()))
Exemple #14
0
 def get_detected_format(
         self,
         detect: bool = True,
         force: bool = False,
         skip_missing: bool = True,
 ) -> ContentFormatInterface:
     """Return ``self._detected_format``, running detection first when required.

     Detection runs when ``force`` is set, or when ``detect`` is set and the
     stored format is not defined yet.
     """
     needs_detection = force or (detect and not Auto.is_defined(self._detected_format))
     if needs_detection:
         self.reset_detected_format(
             use_declared_types=True,
             skip_missing=skip_missing,
         )
     return self._detected_format
Exemple #15
0
 def format(self, value, skip_errors: bool = False) -> str:
     """Format ``value`` with this object's representation, or with ``str()`` if there is none.

     The representation is first called with the ``skip_errors`` keyword; on
     ``AttributeError`` the call is repeated without it.
     """
     representation = self.get_representation()
     if not Auto.is_defined(representation):
         return str(value)
     try:
         return representation.format(value, skip_errors=skip_errors)
     except AttributeError:
         return representation.format(value)
Exemple #16
0
 def _get_native_struct(self,
                        raw_struct: Struct,
                        save_if_not_yet: bool = False,
                        verbose: AutoBool = AUTO) -> Struct:
     """Convert ``raw_struct`` of any supported kind into a native struct object.

     Supported inputs:

     * ``None`` -> ``None``;
     * a ``StructInterface`` instance -> returned unchanged;
     * an object with ``get_fields`` -> wrapped into the struct class;
     * a list/tuple (``ARRAY_TYPES``, deprecated) -> wrapped into the struct
       class when at least one item is itself an array, otherwise treated as
       a title row of column names;
     * ``AUTO`` -> taken from the source or from the title row, when this
       object offers the corresponding methods (else ``None``).

     :param raw_struct: struct description in one of the forms above.
     :param save_if_not_yet: forwarded as ``set_struct`` to the detection methods.
     :param verbose: defaults to ``self.is_verbose()`` when that method exists.
     :raises TypeError: for any other kind of ``raw_struct``.
     """
     if hasattr(self, 'is_verbose') and not Auto.is_defined(verbose):
         verbose = self.is_verbose()
     if raw_struct is None:
         native_struct = None
     elif isinstance(raw_struct, StructInterface):
         native_struct = raw_struct
     elif hasattr(raw_struct, 'get_fields'):
         struct_class = self._get_struct_class()
         native_struct = struct_class(raw_struct)
     elif isinstance(raw_struct, ARRAY_TYPES):
         if verbose:
             msg = 'Struct as list is deprecated, use FlatStruct(StructInterface) class instead'
             if hasattr(self, 'get_logger'):
                 logger = self.get_logger()
                 logger.warning(msg,
                                category=DeprecationWarning,
                                stacklevel=2)
             elif hasattr(self, 'log'):
                 self.log(msg=msg, level=30)
             else:
                 print(msg)
         column_names = raw_struct
         # any() rather than max(): max() raises ValueError on an empty
         # sequence, so an empty list/tuple struct used to crash here.
         has_types_descriptions = any(
             isinstance(f, ARRAY_TYPES) for f in raw_struct
         )
         if has_types_descriptions:
             struct_class = self._get_struct_class()
             native_struct = struct_class(raw_struct)
         else:
             native_struct = self._get_struct_detected_by_title_row(
                 column_names)
     elif raw_struct == AUTO:
         native_struct = None
         if hasattr(self, 'get_struct_from_source'):
             native_struct = self.get_struct_from_source(
                 set_struct=save_if_not_yet, verbose=verbose)
         elif hasattr(self, 'is_first_line_title'):
             if self.is_first_line_title():
                 if hasattr(self, 'get_detected_struct_by_title_row'):
                     native_struct = self.get_detected_struct_by_title_row(
                         set_struct=save_if_not_yet,
                         verbose=verbose,
                     )
                 elif hasattr(self, 'get_title_row'):
                     title_row = self.get_title_row(close=True)
                     native_struct = self._get_struct_detected_by_title_row(
                         title_row)
     else:
         message = 'struct must be FlatStruct(StructInterface), got {}'.format(
             type(raw_struct))
         raise TypeError(message)
     return native_struct
Exemple #17
0
 def get_logger(self, create_if_not_yet=True) -> LoggerInterface:
     """Return the stored logger, or a new one when none is stored.

     A stored logger that has ``get_context`` but reports no context gets
     this object set as its context (if it has ``set_context``). Without a
     stored logger, ``self.get_new_logger()`` is returned when
     ``create_if_not_yet`` is true, otherwise ``None``.
     """
     logger = self._logger
     if not Auto.is_defined(logger, check_name=False):
         return self.get_new_logger() if create_if_not_yet else None
     has_context_getter = isinstance(logger, ExtendedLoggerInterface) or hasattr(logger, 'get_context')
     if has_context_getter and not logger.get_context() and hasattr(logger, 'set_context'):
         logger.set_context(self)
     return self._logger
Exemple #18
0
 def map(self, function: Callable, to: OptStreamType = AUTO) -> Native:
     """Apply ``function`` to the stream items and return the resulting stream.

     Passing ``to`` is deprecated: it logs a warning and delegates to
     ``map_to()``. If this stream is in memory and the result has
     ``to_memory``, the result is brought into memory as well.
     """
     if Auto.is_defined(to):
         deprecation_note = 'to-argument for map() is deprecated, use map_to() instead'
         self.get_logger().warning(deprecation_note)
         mapped = self.map_to(function, stream_type=to)
     else:
         mapped = super().map(function)
     if self.is_in_memory():
         if hasattr(mapped, 'to_memory'):
             mapped = mapped.to_memory()
     return self._assume_native(mapped)
Exemple #19
0
 def reset_struct_to_initial(
         self,
         verbose: bool = True,
         message: Optional[str] = None,
 ) -> Native:
     """Switch the struct back to the initial one and return ``self.struct(...)``'s result.

     When ``verbose`` is true, every line of the comparison between the
     current and the initial struct is logged. ``message`` defaults to
     ``self.__repr__()``.
     """
     if not Auto.is_defined(message):
         message = self.__repr__()
     initial_struct = self.get_initial_struct()
     if verbose:
         comparison_lines = self.get_struct().get_struct_comparison_iter(
             initial_struct,
             message=message,
         )
         for comparison_line in comparison_lines:
             self.log(comparison_line)
     return self.struct(initial_struct)
Exemple #20
0
 def get_logger(self, create_if_not_yet: bool = True) -> Optional[Logger]:
     """Return ``self.logger``, creating and storing a new logger when none is defined.

     A defined logger with both ``get_context`` and ``set_context`` and no
     context gets this object as its context. If no logger is defined and
     ``create_if_not_yet`` is false, ``None`` is returned.
     """
     logger = self.logger
     if Auto.is_defined(logger):
         is_context_aware = hasattr(logger, 'get_context') and hasattr(logger, 'set_context')
         if is_context_aware and not logger.get_context():
             logger.set_context(self)
         return logger
     if not create_if_not_yet:
         return None
     new_logger = lg.get_logger(context=self)
     self.set_logger(new_logger, inplace=True)
     return new_logger
Exemple #21
0
 def _get_stream_type(
         self,
         stream_type: Union[StreamType, Auto] = AUTO,
 ) -> StreamType:
     """Return ``stream_type`` if defined, else the first available default.

     Defaults are tried in this order: ``self.get_stream_type()``,
     ``self.get_default_stream_type()``, and finally
     ``StreamType.detect()`` applied to ``self.get_default_item_type()``.
     """
     if Auto.is_defined(stream_type):
         return stream_type
     if hasattr(self, 'get_stream_type'):
         return self.get_stream_type()
     if hasattr(self, 'get_default_stream_type'):
         return self.get_default_stream_type()
     default_item_type = self.get_default_item_type()
     return StreamType.detect(default_item_type)
Exemple #22
0
 def set_verbose(
         self,
         verbose: AutoBool = AUTO,
         parent: AutoConnector = AUTO,
 ) -> Native:
     """Store the verbose flag in ``self._verbose`` and return ``self``.

     When ``verbose`` is not defined it is inherited from the parent
     (``parent.is_verbose()`` or ``parent.verbose``), falling back to
     ``DEFAULT_VERBOSE``. ``parent`` defaults to ``self.get_parent()``.
     """
     if Auto.is_defined(verbose):
         resolved = verbose
     else:
         parent = Auto.delayed_acquire(parent, self.get_parent)
         if hasattr(parent, 'is_verbose'):
             resolved = parent.is_verbose()
         elif hasattr(parent, 'verbose'):
             resolved = parent.verbose
         else:
             resolved = DEFAULT_VERBOSE
     self._verbose = resolved
     return self
Exemple #23
0
 def get_lines(
         self,
         count: Optional[int] = None,
         skip_first: bool = False,
         allow_reopen: bool = True,
         check: bool = True,
         verbose: AutoBool = AUTO,
         message: AutoName = AUTO,
         step: AutoCount = AUTO,
 ) -> Generator:
     """Open the file and return an iterable over its lines.

     With ``check`` set, a non-gzip file must be non-empty. When ``verbose``
     is true or a ``message`` is defined, the lines are wrapped into the
     logger's ``progress()``; ``message`` defaults to ``'Reading {}'`` and a
     ``{}`` placeholder in it is filled with ``self.get_name()``.
     """
     if check and not self.is_gzip():
         assert not self.is_empty(), 'for get_lines() file must be non-empty: {}'.format(self)
     self.open(allow_reopen=allow_reopen)
     line_iter = self.get_next_lines(count=count, skip_first=skip_first, close=True)
     verbose = Auto.acquire(verbose, self.is_verbose())
     if not (verbose or Auto.is_defined(message)):
         return line_iter
     progress_title = message if Auto.is_defined(message) else 'Reading {}'
     if '{}' in progress_title:
         progress_title = progress_title.format(self.get_name())
     logger = self.get_logger()
     assert hasattr(logger, 'progress'), '{} has no progress in {}'.format(self, logger)
     if not count:
         count = self.get_count(allow_slow_mode=False)
     return self.get_logger().progress(
         line_iter,
         name=progress_title,
         count=count,
         step=step,
     )
Exemple #24
0
 def get_fast_lines_count(
         self,
         ending: Union[str, Auto] = AUTO,
         verbose: AutoBool = AUTO,
 ) -> int:
     """Count lines by counting ``ending`` occurrences in the file chunks, plus one.

     ``ending`` defaults to the content format's ending when
     ``get_content_format`` exists, otherwise to ``'\\n'``. The result is
     stored via ``self.set_count()`` and returned.

     Raises ``ValueError`` for gzip files.
     """
     if self.is_gzip():
         raise ValueError('get_fast_lines_count() method is not available for gzip-files')
     if not Auto.is_defined(ending):
         ending = self.get_content_format().get_ending() if hasattr(self, 'get_content_format') else '\n'
     verbose = Auto.acquire(verbose, self.is_verbose())
     self.log('Counting lines in {}...'.format(self.get_name()), end='\r', verbose=verbose)
     endings_found = 0
     for chunk in self.get_chunks():
         endings_found += chunk.count(ending)
     lines_count = endings_found + 1
     self.set_count(lines_count)
     return lines_count
Exemple #25
0
 def to_stream(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         stream_type: Union[StreamType, Auto] = AUTO,
         ex: OptionalFields = None,
         step: AutoCount = AUTO,
         **kwargs
 ) -> Stream:
     """Delegate to ``self.to_stream_type()`` with the resolved stream type.

     A defined ``data`` is forwarded as the ``data`` keyword. ``stream_type``
     defaults to ``self.get_stream_type()``. ``ex`` is not supported and must
     be empty; ``name`` is accepted but not used here.
     """
     if Auto.is_defined(data):
         kwargs.update(data=data)
     resolved_type = Auto.delayed_acquire(stream_type, self.get_stream_type)
     assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
     return self.to_stream_type(stream_type=resolved_type, step=step, **kwargs)
Exemple #26
0
 def _collect_inplace(self, log: AutoBool = AUTO) -> None:
     """Replace the stream data with ``self.get_list()`` and refresh the count.

     When the estimated count is defined, ``log`` defaults to whether that
     count exceeds ``self.get_limit_items_in_memory()``. Both log messages
     report the estimated count taken before collecting.
     """
     estimated_count = self.get_estimated_count()
     if Auto.is_defined(estimated_count):
         is_over_limit = estimated_count > self.get_limit_items_in_memory()
         log = Auto.acquire(log, is_over_limit)
     if log and estimated_count:
         start_template = 'Trying to collect {} items into memory from {}...'
         self.log(start_template.format(estimated_count, self.__repr__()))
     self.set_data(self.get_list(), inplace=True)
     self.update_count(force=False)
     if log:
         done_template = 'Collected {} items into memory from {}...'
         self.log(done_template.format(estimated_count, self.__repr__()))
Exemple #27
0
 def assert_not_empty(
         self,
         message: Union[Auto, str, None] = AUTO,
         skip_error: bool = False,
 ) -> Native:
     """Warn, and by default raise, when the stream is empty; return ``self`` otherwise.

     An iterator stream is collected in place first. ``message`` defaults to
     ``'Empty stream: {}'`` and a ``{}`` placeholder in it is filled with
     this stream.

     Raises ``ValueError`` for an empty stream unless ``skip_error`` is true.
     """
     if self.is_iter():
         self._collect_inplace()
     text = message if Auto.is_defined(message) else 'Empty stream: {}'
     if '{}' in text:
         text = text.format(self)
     if not self.is_empty():
         return self
     self.get_logger().warning(msg=text, stacklevel=2)
     if skip_error:
         return self
     raise ValueError(text)
Exemple #28
0
 def get_struct_from_database(
     self,
     types: AutoLinks = AUTO,
     set_struct: bool = False,
     skip_missing: bool = False,
     verbose: AutoBool = AUTO,
 ) -> StructInterface:
     """Build a ``FlatStruct`` from ``self.describe_table()`` and return it.

     Defined ``types`` are applied to the struct in place. With
     ``set_struct`` the struct is also stored on this object.

     Raises ``ValueError`` when the struct is empty and ``skip_missing`` is false.
     """
     table_description = self.describe_table(verbose=verbose)
     struct = FlatStruct(table_description)
     if struct.is_empty():
         if not skip_missing:
             raise ValueError('Can not get struct for non-existing table {}'.format(self))
     if Auto.is_defined(types):
         struct.set_types(types, inplace=True)
     if set_struct:
         self.set_struct(struct, inplace=True)
     return struct
Exemple #29
0
 def _get_item_type(
         self,
         stream: Union[StreamType, RegularStream, Auto] = AUTO,
 ) -> ItemType:
     """Return the item type for ``stream`` (a stream type, a stream class/object, or ``AUTO``).

     A ``StreamType`` (or anything with ``get_class``) is resolved through
     ``self._get_stream_class(stream)``; another defined value is used as
     is; ``AUTO`` falls back to ``self._get_stream_class()``.
     """
     has_class_getter = isinstance(stream, StreamType) or hasattr(stream, 'get_class')
     if has_class_getter:
         stream_class = self._get_stream_class(stream)
     elif Auto.is_defined(stream):
         stream_class = stream
     else:
         stream_class = self._get_stream_class()
     assert isinstance(stream_class, RegularStream) or hasattr(stream_class, 'get_item_type')
     if not hasattr(stream_class, 'get_item_type'):
         # Ask an empty instance when the resolved object itself has no get_item_type.
         return stream_class([]).get_item_type()
     return stream_class.get_item_type()
Exemple #30
0
 def validate_fields(self, initial: bool = True) -> Native:
     """Validate the struct read from the source against the expected struct; return ``self``.

     With ``initial`` set, the expected struct is a copy of the initial
     struct, or (when that is not defined) the struct from the source, which
     is then also stored. Otherwise the current struct is expected. The
     validated struct is stored via ``self.set_struct()``.
     """
     if not initial:
         expected = self.get_struct()
     else:
         declared = self.get_initial_struct()
         if Auto.is_defined(declared):
             expected = declared.copy()
         else:
             expected = self.get_struct_from_source(set_struct=True, verbose=True)
     source_struct = self.get_struct_from_source(set_struct=False, verbose=False)
     native_struct = self._get_native_struct(source_struct)
     validated = native_struct.validate_about(expected)
     self.set_struct(validated, inplace=True)
     return self