Exemple #1
0
 def to_stream(self,
               data: Union[Iterable, Auto] = AUTO,
               name: AutoName = AUTO,
               stream_type: Union[StreamType, Auto] = AUTO,
               ex: OptionalFields = None,
               step: AutoCount = AUTO,
               **kwargs) -> Stream:
     """Wrap this object's items (or the given data) into a stream.

     :param data: items for the stream; when not defined they are obtained
         from ``self._get_items_of_type()``.
     :param name: stream name; resolved through ``Auto.delayed_acquire``
         with ``self._get_generated_stream_name`` as the fallback.
     :param stream_type: requested stream type, normalised by
         ``self._get_stream_type()``.
     :param ex: forwarded to ``self.get_compatible_meta()``.
     :param step: forwarded to ``self._get_items_of_type()``.
     :param kwargs: extra meta forwarded to ``self.get_compatible_meta()``;
         ``verbose`` is also read from here when items are fetched.
     :return: result of ``self._assume_stream()`` on the new stream.
     """
     name = Auto.delayed_acquire(name, self._get_generated_stream_name)
     stream_type = self._get_stream_type(stream_type)
     stream_class = self._get_stream_class(stream_type)

     # Ask the class for its item type; if it cannot answer, probe an
     # empty instance, and finally fall back to AUTO.
     if hasattr(stream_class, 'get_item_type'):
         item_type = stream_class.get_item_type()
     else:
         probe = stream_class([])
         item_type = probe.get_item_type() if hasattr(probe, 'get_item_type') else AUTO

     if not Auto.is_defined(data):
         verbose = kwargs.get('verbose', AUTO)
         data = self._get_items_of_type(item_type, verbose=verbose, step=step)

     meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
     # Only fill in count/source when the caller did not supply them.
     if not ('count' in meta or 'count' in kwargs):
         meta['count'] = self._get_fast_count()
     if 'source' not in meta:
         meta['source'] = self
     return self._assume_stream(stream_class(data, **meta))
Exemple #2
0
 def to_stream(self,
               data: Union[Iterable, Auto] = AUTO,
               name: AutoName = AUTO,
               stream_type: Union[StreamType, Auto] = AUTO,
               ex: OptionalFields = None,
               step: AutoCount = AUTO,
               **kwargs) -> Stream:
     """Build a stream over this object, using ``StreamType.SqlStream`` as the default type.

     For the SQL stream type no explicit ``data`` may be passed: the stream
     class is instantiated with the (undefined) ``data`` placeholder and
     meta whose ``source`` is this object. Any other stream type is
     delegated to the parent implementation.
     """
     stream_type = Auto.acquire(stream_type, StreamType.SqlStream)
     is_sql_stream = stream_type == StreamType.SqlStream
     if not is_sql_stream:
         return super().to_stream(
             data=data,
             name=name,
             stream_type=stream_type,
             ex=ex,
             step=step,
             **kwargs,
         )

     assert not Auto.is_defined(data)
     name = Auto.delayed_acquire(name, self._get_generated_stream_name)
     sql_stream_class = stream_type.get_class()
     meta = self.get_compatible_meta(sql_stream_class, name=name, ex=ex, **kwargs)
     meta['source'] = self
     return sql_stream_class(data, **meta)
Exemple #3
0
 def disk_sort(
     self,
     key: UniKey = fs.same(),
     reverse: bool = False,
     step: AutoCount = AUTO,
     verbose: AutoBool = False,
 ) -> Native:
     """Sort the stream through parts produced by ``split_to_disk_by_step()``.

     Each part is sorted by the composite key while being split, then the
     parts are merged with ``algo.merge_iter()``; ``remove_all`` of the
     tmp-files object is passed as the merge's ``post_action``.

     :param key: key description passed to ``fs.composite_key()``.
     :param reverse: forwarded to both the split and the merge.
     :param step: part size; resolved through ``Auto.delayed_acquire`` with
         ``self.get_limit_items_in_memory`` as the fallback.
     :param verbose: forwarded to the split and to the log call.
     :return: result of ``self._assume_native()`` on the merged stream.
     """
     step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
     key_function = fs.composite_key(key)
     parts = self.split_to_disk_by_step(
         step=step,
         sort_each_by=key_function,
         reverse=reverse,
         verbose=verbose,
     )
     assert parts, 'streams must be non-empty'

     iterables = [part.get_iter() for part in parts]
     # Unknown (falsy) part counts contribute zero to the total.
     total_count = sum(part.get_count() or 0 for part in parts)
     self.log('Merging {} parts... '.format(len(iterables)), verbose=verbose)

     remove_tmp_files = self.get_tmp_files().remove_all
     merged_items = algo.merge_iter(
         iterables,
         key_function=key_function,
         reverse=reverse,
         post_action=remove_tmp_files,
     )
     return self._assume_native(self.stream(merged_items, count=total_count))
Exemple #4
0
 def __init__(
     self,
     data: Iterable,
     name: AutoName = AUTO,
     check: bool = False,
     count: AutoCount = None,
     less_than: AutoCount = None,
     source: Connector = None,
     context: Context = None,
     max_items_in_memory: AutoCount = AUTO,
     tmp_files: TmpMask = AUTO,
 ):
     """Initialise the stream and its tmp-files mask.

     ``count`` is first passed through ``get_optional_len(data, count)``;
     a truthy, defined count is also used as ``less_than`` when that bound
     was not given. ``_tmp_files`` is set to None before the parent
     initialiser runs and resolved afterwards, because its fallback
     (``sm.get_tmp_mask``) is given ``self.get_name()``.
     """
     count = get_optional_len(data, count)
     has_known_count = count and Auto.is_defined(count)
     if has_known_count and not Auto.is_defined(less_than):
         less_than = count

     # Placeholder so the attribute exists while the parent initialiser runs.
     self._tmp_files = None
     super().__init__(
         data=data,
         name=name,
         check=check,
         source=source,
         context=context,
         count=count,
         less_than=less_than,
         max_items_in_memory=max_items_in_memory,
     )
     stream_name = self.get_name()
     self._tmp_files = Auto.delayed_acquire(tmp_files, sm.get_tmp_mask, stream_name)
Exemple #5
0
 def can_be_in_memory(self, step: AutoCount = AUTO) -> bool:
     """Tell whether the stream's items may be held in memory.

     True when the stream is already in memory or there is no limit
     (``step`` is None); otherwise the estimated count must be known and
     must not exceed ``step``.
     """
     step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
     if self.is_in_memory() or step is None:
         return True
     estimated_count = self.get_estimated_count()
     # An unknown size is treated as "does not fit".
     if estimated_count is None:
         return False
     return estimated_count <= step
Exemple #6
0
 def write_items(
         self,
         items: Iterable,
         item_type: Union[ItemType, Auto] = AUTO,
         add_title_row: AutoBool = AUTO,
         verbose: AutoBool = AUTO,
 ) -> Native:
     """Convert items to lines with the content format and write them.

     :param items: items to write.
     :param item_type: type of the items; resolved through
         ``Auto.delayed_acquire`` with ``self.get_default_item_type`` as
         the fallback.
     :param add_title_row: forwarded to the format's ``get_lines()``.
     :param verbose: forwarded to ``self.write_lines()``.
     :return: whatever ``self.write_lines()`` returns.
     """
     item_type = Auto.delayed_acquire(item_type, self.get_default_item_type)
     parsed_format = self.get_content_format()
     assert isinstance(parsed_format, ParsedFormat)
     return self.write_lines(
         parsed_format.get_lines(items, item_type=item_type, add_title_row=add_title_row),
         verbose=verbose,
     )
Exemple #7
0
 def _get_detected_struct(
     self,
     set_struct: bool = False,
     use_declared_types: AutoBool = AUTO,  # ?
     verbose: AutoBool = AUTO,
 ) -> Optional[StructInterface]:
     """Return the struct obtained from ``self.get_struct_from_database()``.

     When the result is not a ``StructInterface`` and verbosity is on, a
     deprecation warning is logged; the value is returned unchanged either
     way. ``use_declared_types`` is not used by this implementation.
     """
     struct = self.get_struct_from_database(set_struct=set_struct)
     if isinstance(struct, StructInterface):
         return struct
     if Auto.delayed_acquire(verbose, self.is_verbose):
         template = 'Struct as {} is deprecated. Use items.FlatStruct instead.'
         self.log(msg=template.format(type(struct)), level=LoggingLevel.Warning)
     return struct
Exemple #8
0
 def set_verbose(self,
                 verbose: AutoBool = AUTO,
                 parent: AutoConnector = AUTO) -> Native:
     """Store the verbosity flag, inheriting it from the parent when not given.

     An undefined ``verbose`` is taken from the parent's ``is_verbose()``
     method, else from its ``verbose`` attribute, else from
     ``DEFAULT_VERBOSE``.

     :return: self, so the call can be chained.
     """
     if Auto.is_defined(verbose):
         self._verbose = verbose
         return self

     parent = Auto.delayed_acquire(parent, self.get_parent)
     if hasattr(parent, 'is_verbose'):
         self._verbose = parent.is_verbose()
     elif hasattr(parent, 'verbose'):
         self._verbose = parent.verbose
     else:
         self._verbose = DEFAULT_VERBOSE
     return self
Exemple #9
0
 def get_count(self,
               in_memory: AutoBool = AUTO,
               final: bool = False) -> Count:
     """Return the number of items in the stream.

     In-memory mode materialises the items with ``self.get_list()``, stores
     the list and its length on the instance and returns the length.
     Otherwise the result is ``self.final_count()`` when ``final`` is set,
     else ``self.get_expected_count()``.
     """
     in_memory = Auto.delayed_acquire(in_memory, self.is_in_memory)
     if not in_memory:
         return self.final_count() if final else self.get_expected_count()

     items = self.get_list()
     self._count = len(items)
     self._data = items
     return self._count
Exemple #10
0
 def to_stream(
         self,
         data: Union[Iterable, Auto] = AUTO,
         name: AutoName = AUTO,
         stream_type: Union[StreamType, Auto] = AUTO,
         ex: OptionalFields = None,
         step: AutoCount = AUTO,
         **kwargs
 ) -> Stream:
     """Build a stream from this object by delegating to ``self.to_stream_type()``.

     :param data: explicit items for the stream; forwarded as ``data`` only
         when defined.
     :param name: explicit stream name; forwarded as ``name`` only when
         defined (previously this argument was accepted but silently
         dropped).
     :param stream_type: requested stream type; resolved through
         ``Auto.delayed_acquire`` with ``self.get_stream_type`` as the
         fallback.
     :param ex: not supported here; must be empty.
     :param step: forwarded to ``self.to_stream_type()``.
     :param kwargs: forwarded to ``self.to_stream_type()``.
     :raises AssertionError: when ``ex`` is non-empty.
     """
     # Reject the unsupported argument before doing any other work.
     assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
     if Auto.is_defined(data):
         kwargs['data'] = data
     # Only forward an explicit name, so the default naming stays untouched.
     if Auto.is_defined(name):
         kwargs['name'] = name
     stream_type = Auto.delayed_acquire(stream_type, self.get_stream_type)
     return self.to_stream_type(stream_type=stream_type, step=step, **kwargs)
Exemple #11
0
 def __init__(
         self,
         name: Name,
         content_format: Union[ContentFormatInterface, ContentType, Auto] = AUTO,
         struct: Union[StructInterface, Auto, None] = AUTO,
         first_line_is_title: AutoBool = AUTO,
         parent: Parent = None,
         context: AutoContext = AUTO,
         streams: Links = None,
         expected_count: AutoCount = AUTO,
         caption: Optional[str] = None,
         verbose: AutoBool = AUTO,
         **kwargs
 ):
     """Initialise the connector and normalise its content format and struct.

     :param name: connector name; also passed to the format-detection fallback.
     :param content_format: a ContentFormatInterface object, a ContentType,
         a str, or AUTO; it is normalised below into a
         ContentFormatInterface instance.
     :param struct: struct to set; None skips struct handling entirely,
         AUTO triggers ``self._get_detected_struct()``.
     :param first_line_is_title: passed to ``self.set_first_line_title()``.
     :param parent: passed to the parent initialiser.
     :param context: passed to the parent initialiser.
     :param streams: passed to the parent initialiser as ``children``.
     :param expected_count: stored as ``self._count``.
     :param caption: stored as ``self._caption``.
     :param verbose: passed to the parent initialiser.
     :param kwargs: options for the content format (constructor arguments
         for a ContentType's class, or ``set_inplace()`` arguments for an
         existing format object); also passed to the detection fallback.
     :raises ValueError: when kwargs are given but content_format is
         neither a ContentType nor a ContentFormatInterface.
     :raises AssertionError: when content_format could not be normalised
         into a ContentFormatInterface.
     """
     # Private fields are set before the parent initialiser is called.
     self._declared_format = None
     self._detected_format = None
     self._modification_ts = None
     self._count = expected_count
     self._caption = caption
     super().__init__(name=name, parent=parent, context=context, children=streams, verbose=verbose)
     # Resolved through Auto.delayed_acquire with name-based detection as the fallback.
     content_format = Auto.delayed_acquire(content_format, self._get_detected_format_by_name, name, **kwargs)
     suit_classes = ContentType, ContentFormatInterface, str
     # Objects exposing get_value() that are not one of the supported classes
     # are treated as a deprecated way to pass the format.
     is_deprecated_class = hasattr(content_format, 'get_value') and not isinstance(content_format, suit_classes)
     if is_deprecated_class:
         msg = 'LeafConnector({}, {}): content_format as {} is deprecated, use ContentType or ContentFormat instead'
         # NOTE(review): level=30 is presumably the warning level - confirm against the logger.
         self.log(msg.format(name, content_format, content_format.__class__.__name__), level=30)
         content_format = content_format.get_value()
     if isinstance(content_format, str):
         content_format = ContentType(content_format)  # ContentType.detect(content_format) ?
     if isinstance(content_format, ContentType):  # tmp fix
         # Instantiate the format class of this content type with the given options.
         content_class = content_format.get_class()
         content_format = content_class(**kwargs)
     elif isinstance(content_format, ContentFormatInterface):
         # An existing format object receives the options in place.
         content_format.set_inplace(**kwargs)
     else:
         if kwargs:
             msg = 'LeafConnector: kwargs allowed for ContentType only, not for {}, got kwargs={}'
             raise ValueError(msg.format(content_format, kwargs))
     assert isinstance(content_format, ContentFormatInterface), 'Expect ContentFormat, got {}'.format(content_format)
     self.set_content_format(content_format, inplace=True)
     self.set_first_line_title(first_line_is_title)
     # struct=None means "do not touch the struct at all".
     if struct is not None:
         if struct == AUTO:
             struct = self._get_detected_struct(use_declared_types=False)
         # The detected struct is only set when Auto.is_defined() accepts it.
         if Auto.is_defined(struct, check_name=False):
             self.set_struct(struct, inplace=True)
Exemple #12
0
 def _get_linked_fields_descriptions(
         self,
         fields: Union[Iterable, Auto] = AUTO,
         group_name: str = 'used',
         prefix: str = '    - ',
         max_len: int = JUPYTER_LINE_LEN,
 ) -> Generator:
     """Yield a header line followed by one description line per field.

     :param fields: fields to describe; resolved through
         ``Auto.delayed_acquire`` with ``self.get_linked_fields`` as the
         fallback.
     :param group_name: word used in the header line.
     :param prefix: text put before every field line.
     :param max_len: longer lines are cropped and end with ``CROP_SUFFIX``.
     """
     fields = list(Auto.delayed_acquire(fields, self.get_linked_fields))
     yield '{count} {name} fields:'.format(count=len(fields), name=group_name)
     for field in fields:
         has_one_line_repr = isinstance(field, DescribeMixin) or hasattr(field, 'get_one_line_repr')
         if has_one_line_repr:
             line = prefix + field.get_one_line_repr(max_len=120)
         else:
             line = prefix + repr(field)
         if len(line) > max_len:
             # Leave room for the suffix so the cropped line is max_len long.
             line = line[:max_len - len(CROP_SUFFIX)] + CROP_SUFFIX
         yield line
Exemple #13
0
 def get_detected_struct_by_title_row(
         self,
         set_struct: bool = False,  # deprecated argument
         types: Union[dict, Auto, None] = AUTO,
         verbose: AutoBool = AUTO,  # deprecated argument
 ) -> Struct:
     """Detect the struct from the title row of an existing object.

     :param set_struct: when true, the detected struct is also set on this
         object in place.
     :param types: forwarded to ``self._get_struct_detected_by_title_row()``.
     :param verbose: resolved through ``Auto.delayed_acquire`` with
         ``self.is_verbose`` as the fallback, then passed to the log call.
     :return: the detected struct.
     :raises AssertionError: when the first line is not a title row or the
         object does not exist.
     """
     assert self.is_first_line_title(), 'Can detect struct by title row only if first line is a title row'
     assert self.is_existing(), 'For detect struct file/object must be existing: {}'.format(self.get_path())
     verbose = Auto.delayed_acquire(verbose, self.is_verbose)
     detected = self._get_struct_detected_by_title_row(self.get_title_row(close=True), types=types)
     template = 'Struct for {} detected by title row: {}'
     self.log(template.format(self.get_name(), detected.get_struct_str(None)), end='\n', verbose=verbose)
     if set_struct:
         self.set_struct(detected, inplace=True)
     return detected
Exemple #14
0
 def get_items_of_type(
     self,
     item_type: Union[ItemType, Auto],
     verbose: AutoBool = AUTO,
     message: AutoName = AUTO,
     step: AutoCount = AUTO,
 ) -> Iterable:
     """Return the table's rows converted to the requested item type.

     :param item_type: target item type (Row, StructRow, Record or Line);
         resolved through ``Auto.delayed_acquire`` with
         ``self.get_item_type`` as the fallback.
     :param verbose: forwarded to ``self.get_rows()``.
     :param message: progress caption; may contain ``{}`` placeholders
         (filled with count and name, in that order) or ``{count}`` /
         ``{name}`` placeholders.
     :param step: when truthy and the logger is an
         ``ExtendedLoggerInterface``, the items are wrapped in the logger's
         progress iterator with this step.
     :return: lazily converted items.
     :raises ValueError: for an item type rows cannot be converted to.
     """
     item_type = Auto.delayed_acquire(item_type, self.get_item_type)
     rows = self.get_rows(verbose=verbose)
     if item_type == ItemType.Row:
         items = rows
     elif item_type == ItemType.StructRow:
         row_class = ItemType.StructRow.get_class()
         items = map(lambda i: row_class(i, self.get_struct()), rows)
     elif item_type == ItemType.Record:
         # Column names are the keys and row values the values; the original
         # zipped them the other way round and built {value: column} dicts.
         # Fetched once (not per row) and materialised so they can be reused.
         columns = tuple(self.get_columns())
         items = map(lambda r: dict(zip(columns, r)), rows)
     elif item_type == ItemType.Line:
         items = map(lambda r: '\t'.join([str(v) for v in r]), rows)
     else:
         raise ValueError(
             'Table.get_items_of_type(): cannot convert Rows to {}'.
             format(item_type))
     if step:
         logger = self.get_logger()
         if isinstance(logger, ExtendedLoggerInterface):
             count = self._get_fast_count()
             if not Auto.is_defined(message):
                 message = 'Downloading {count} lines from {name}'
             # Positional placeholders are filled first, named ones afterwards.
             if '{}' in message:
                 message = message.format(count, self.get_name())
             if '{' in message:
                 message = message.format(count=count, name=self.get_name())
             items = logger.progress(items,
                                     name=message,
                                     count=count,
                                     step=step,
                                     context=self.get_context())
     return items
Exemple #15
0
 def describe(
         self, *filters,
         take_struct_from_source: bool = False,
         count: Count = DEFAULT_SHOW_COUNT,
         columns: Columns = None,
         allow_collect: bool = True,
         show_header: bool = True,
         struct_as_dataframe: bool = False,
         delimiter: str = ' ',
         output=AUTO,
         **filter_kwargs
 ):
     """Print headers, validate the struct against example items and show them.

     :param filters: positional filters forwarded to ``self.example()``.
     :param take_struct_from_source: use ``self.get_source_struct()`` as the
         expected struct (only consulted when this object has no
         ``get_struct`` attribute).
     :param count: number of example items; also passed to the example's
         ``get_detected_struct()``.
     :param columns: not used by this implementation.
     :param allow_collect: not used by this implementation.
     :param show_header: when true, lines of ``self.get_str_headers()`` are
         written to the output first.
     :param struct_as_dataframe: return the struct description instead of
         the demo example; only honoured when
         ``get_use_objects_for_output()`` is truthy.
     :param delimiter: forwarded to the struct's ``describe()``.
     :param output: output target; resolved through ``Auto.delayed_acquire``
         with ``self.get_logger`` as the fallback.
     :param filter_kwargs: keyword filters forwarded to ``self.example()``.
     :return: the struct description when ``struct_as_dataframe`` is in
         effect, otherwise the example's demo output.
     :raises AssertionError: when the expected or the detected struct lacks
         the interface used below.
     """
     output = Auto.delayed_acquire(output, self.get_logger)
     if show_header:
         for line in self.get_str_headers():
             self.output_line(line, output=output)
     example = self.example(*filters, **filter_kwargs, count=count)
     if hasattr(self, 'get_struct'):
         expected_struct = self.get_struct()
         source_str = 'native'
     elif take_struct_from_source:
         expected_struct = self.get_source_struct()
         source_str = 'from source {}'.format(self.get_source().__repr__())
     else:
         expected_struct = self.get_detected_struct()
         source_str = 'detected from example items'
     expected_struct = fc.FlatStruct.convert_to_native(expected_struct)
     detected_struct = example.get_detected_struct(count)
     assert isinstance(expected_struct, fc.FlatStruct) or hasattr(expected_struct, 'describe'), expected_struct
     # The original repeated the expected_struct check here (copy-paste), so
     # detected_struct was never validated. Check the object, and the method,
     # that the next line actually uses.
     assert isinstance(detected_struct, fc.FlatStruct) or hasattr(detected_struct, 'validate_about'), detected_struct
     detected_struct.validate_about(expected_struct)
     validation_message = '{} {}'.format(source_str, expected_struct.get_validation_message())
     struct_as_dataframe = struct_as_dataframe and get_use_objects_for_output()
     struct_dataframe = expected_struct.describe(
         as_dataframe=struct_as_dataframe, show_header=False, output=output,
         delimiter=delimiter, example=example.get_one_item(), comment=validation_message,
     )
     if struct_as_dataframe:
         return struct_dataframe
     else:
         return example.get_demo_example(as_dataframe=get_use_objects_for_output())
Exemple #16
0
 def sort(self,
          *keys,
          reverse: bool = False,
          step: AutoCount = AUTO,
          verbose: AutoBool = True) -> Native:
     """Sort the stream by the given keys, in memory or through the disk.

     :param keys: sort keys, normalised with ``update()``; with no keys the
         items themselves are used as the key (``fs.same()``).
     :param reverse: forwarded to the chosen sort implementation.
     :param step: memory limit; resolved through ``Auto.delayed_acquire``
         with ``self.get_limit_items_in_memory`` as the fallback and passed
         on to the disk sort.
     :param verbose: forwarded to the chosen sort implementation.
     :return: result of ``self._assume_native()`` on the sorted stream.
     """
     keys = update(keys)
     step = Auto.delayed_acquire(step, self.get_limit_items_in_memory)
     key_function = fs.same() if len(keys) == 0 else fs.composite_key(keys)

     fits_in_memory = self.can_be_in_memory(step=step) or step is None
     if fits_in_memory:
         sorted_stream = self.memory_sort(key_function, reverse=reverse, verbose=verbose)
     else:
         sorted_stream = self.disk_sort(key_function, reverse=reverse, step=step, verbose=verbose)
     return self._assume_native(sorted_stream)
Exemple #17
0
 def get_stream_kwargs(self,
                       data: Union[Iterable, Auto] = AUTO,
                       name: AutoName = AUTO,
                       verbose: AutoBool = AUTO,
                       step: AutoCount = AUTO,
                       message: AutoName = AUTO,
                       **kwargs) -> dict:
     """Collect the keyword arguments for building a stream over this object.

     The result always holds ``data``, ``name``, ``source`` (this object),
     ``count`` and ``context``; entries from ``kwargs`` are applied last
     and therefore override them.

     :param data: items for the stream; when not defined they are obtained
         from ``self._get_items_of_type()`` using ``verbose``, ``step`` and
         ``message``.
     :param name: stream name; resolved through ``Auto.delayed_acquire``
         with ``self._get_generated_stream_name`` as the fallback.
     """
     name = Auto.delayed_acquire(name, self._get_generated_stream_name)
     if not Auto.is_defined(data):
         data = self._get_items_of_type(
             self._get_item_type(),
             verbose=verbose,
             step=step,
             message=message,
         )
     stream_kwargs = {
         'data': data,
         'name': name,
         'source': self,
         'count': self._get_fast_count(),
         'context': self.get_context(),
     }
     stream_kwargs.update(kwargs)
     return stream_kwargs
Exemple #18
0
 def to_stream_type(
     self,
     stream_type: StreamType,
     step: AutoCount = AUTO,
     verbose: AutoBool = AUTO,
     message: Union[str, Auto, None] = AUTO,
     **kwargs,
 ) -> Stream:
     """Build a stream of the given type over this object's items.

     :param stream_type: target stream type; resolved through
         ``Auto.delayed_acquire`` with ``self._get_stream_type`` as the
         fallback.
     :param step: forwarded to item fetching and to ``get_stream_kwargs()``.
     :param verbose: forwarded to item fetching and to ``get_stream_kwargs()``.
     :param message: forwarded to ``self._get_items_of_type()``.
     :param kwargs: extra stream arguments; a ``data`` entry, if present,
         is used as the stream's items instead of fetching them.
     :return: stream built by ``stream_type.stream()``.
     """
     stream_type = Auto.delayed_acquire(stream_type, self._get_stream_type)
     item_type = self._get_item_type(stream_type)

     # Struct rows need a struct: supply this object's one unless given.
     needs_struct = (
         item_type == ItemType.StructRow
         and hasattr(self, 'get_struct')
         and 'struct' not in kwargs
     )
     if needs_struct:
         kwargs['struct'] = self.get_struct()

     data = kwargs.pop('data', None)
     if not Auto.is_defined(data):
         data = self._get_items_of_type(item_type, step=step, verbose=verbose, message=message)
     return stream_type.stream(
         **self.get_stream_kwargs(data=data, step=step, verbose=verbose, **kwargs)
     )
Exemple #19
0
 def get_dict_output_field_types(self, struct: Union[Struct, Auto] = AUTO) -> dict:
     """Merge the output field types reported by every description.

     :param struct: input struct; resolved through ``Auto.delayed_acquire``
         with ``self.get_input_struct`` as the fallback.
     :return: dict combining each description's
         ``get_dict_output_field_types(struct)``; later descriptions
         override earlier ones on equal keys.
     """
     struct = Auto.delayed_acquire(struct, self.get_input_struct)
     field_types = {}
     for description in self.get_descriptions():
         field_types.update(description.get_dict_output_field_types(struct))
     return field_types