Beispiel #1
0
 def sorted_group_by(
     self,
     *keys,
     values: Columns = None,
     as_pairs: bool = False,
     skip_missing: bool = False,
 ) -> Stream:
     """Group the items of this stream by the given keys.

     No sorting is done in this method itself; judging by its name it
     presumably expects the stream to be ordered by the same keys already
     (TODO confirm against _get_groups).

     :param keys: key fields, normalised via arg.update() and arg.get_names().
     :param values: optional fields; when truthy, every group is passed
         through ms.fold_lists() and the result is typed as a RecordStream.
     :param as_pairs: forwarded to _get_groups(); also selects whether the
         groups are wrapped in a KeyValueStream (True) or a RowStream (False).
     :param skip_missing: forwarded to ms.fold_lists().
     :return: the grouped stream, materialised with to_memory() when this
         stream is in memory, otherwise given an estimated count.
     """
     key_names = arg.get_names(arg.update(keys))
     value_names = arg.get_names(values)
     grouped_items = self._get_groups(
         get_key_function(key_names),
         as_pairs=as_pairs,
     )

     if as_pairs:
         result = sm.KeyValueStream(
             grouped_items,
             value_stream_type=StreamType.RowStream,
         )
     else:
         result = sm.RowStream(grouped_items, check=False)

     if value_names:
         def fold_group(group):
             return ms.fold_lists(
                 group, key_names, value_names, skip_missing=skip_missing)

         result = result.map_to_type(
             fold_group,
             stream_type=StreamType.RecordStream,
         )

     if self.is_in_memory():
         return result.to_memory()

     # Grouping cannot yield more items than the source, so the source
     # count is used as the estimate for the lazy result.
     source_count = self.get_count() or self.get_estimated_count()
     result.set_estimated_count(source_count)
     return result
Beispiel #2
0
 def group_by(
     self,
     *keys,
     values: Columns = None,
     step: AutoCount = AUTO,
     as_pairs: bool = False,
     take_hash: bool = True,
     verbose: bool = True,
 ) -> Stream:
     """Sort this stream by the given keys, then group the sorted result.

     :param keys: key fields; if the first resolved key exposes
         get_field_names(), its field names replace the whole key list.
     :param values: forwarded (as names) to sorted_group_by().
     :param step: sort step; resolved against self.max_items_in_memory
         through arg.acquire().
     :param as_pairs: when True the key names themselves are given to
         sort(); otherwise a key function built by get_key_function() is.
     :param take_hash: forwarded to get_key_function().
     :param verbose: forwarded to sort().
     :return: the stream produced by sorted_group_by() on the sorted stream.
     """
     key_names = arg.get_names(arg.update(keys))
     value_names = arg.get_names(values)

     # NOTE(review): indexing [0] presumably raises IndexError when no keys
     # are passed; confirm whether callers rely on that.
     first_key = key_names[0]
     if hasattr(first_key, 'get_field_names'):  # i.e. a field group
         key_names = first_key.get_field_names()

     chunk_size = arg.acquire(step, self.max_items_in_memory)
     sort_key = (
         key_names if as_pairs
         else get_key_function(key_names, take_hash=take_hash)
     )
     return self.sort(
         sort_key,
         step=chunk_size,
         verbose=verbose,
     ).sorted_group_by(
         key_names,
         values=value_names,
         as_pairs=as_pairs,
     )
Beispiel #3
0
 def sorted_group_by(
     self,
     *keys,
     values: Columns = None,
     as_pairs: bool = False,
     skip_missing: bool = False,
 ) -> Stream:
     """Group the items of this stream by the given keys.

     No sorting is done in this method itself; judging by its name it
     presumably expects the stream to be ordered by the same keys already
     (TODO confirm against _get_groups).

     :param keys: key fields or callables, normalised via arg.update() and
         arg.get_names(..., or_callable=True).
     :param values: optional fields; when truthy, the groups are mapped with
         the mapper built by fs.fold_lists() into a RecordStream.
     :param as_pairs: forwarded to _get_groups(); also selects whether the
         groups are wrapped in a KeyValueStream (True) or a RowStream (False).
     :param skip_missing: forwarded to fs.fold_lists().
     :return: the grouped stream, materialised with to_memory() when this
         stream is in memory, otherwise given an estimated count.
     """
     key_names = arg.get_names(arg.update(keys), or_callable=True)
     value_names = arg.get_names(values)
     grouped_items = self._get_groups(
         self._get_key_function(key_names),
         as_pairs=as_pairs,
     )

     if as_pairs:
         result = sm.KeyValueStream(
             grouped_items,
             value_stream_type=StreamType.RowStream,
         )
     else:
         result = sm.RowStream(grouped_items, check=False)

     if value_names:
         mapper = fs.fold_lists(
             keys=key_names,
             values=value_names,
             skip_missing=skip_missing,
             item_type=self.get_item_type(),
         )
         result = result.map_to_type(
             mapper,
             stream_type=StreamType.RecordStream,
         )

     if self.is_in_memory():
         return result.to_memory()

     # The source count serves as the estimate for the lazy result.
     source_count = self.get_count() or self.get_estimated_count()
     result.set_estimated_count(source_count)
     return result
Beispiel #4
0
 def group_by(
     self,
     *keys,
     values: Optional[Iterable] = None,
     as_pairs: bool = False,
 ) -> Stream:
     """Sort this stream by the given keys and group the sorted result.

     :param keys: key fields, resolved with arg.get_names() and then
         arg.update().
     :param values: forwarded (as names) to sorted_group_by().
     :param as_pairs: forwarded to sorted_group_by().
     :return: the stream returned by sorted_group_by().
     """
     # NOTE(review): names are resolved before arg.update() here; confirm
     # that this order is intended when a single list of keys is passed.
     key_names = arg.update(arg.get_names(keys))
     value_names = arg.get_names(values)
     sorted_stream = self.sort(*key_names)
     return sorted_stream.sorted_group_by(
         *key_names,
         values=value_names,
         as_pairs=as_pairs,
     )
Beispiel #5
0
 def get_dataframe(self, columns: Columns = None) -> DataFrame:
     """Build a DataFrame from the items of this stream.

     :param columns: optional columns; when defined (arg.is_defined), the
         frame is restricted to the corresponding column names.
     :return: the DataFrame, or None when pd is falsy or
         get_use_objects_for_output() returns a falsy value.
     """
     if not (pd and get_use_objects_for_output()):
         return None
     frame = DataFrame(self.get_items())
     if not arg.is_defined(columns):
         return frame
     column_names = arg.get_names(columns)
     return frame[column_names]
Beispiel #6
0
 def map_side_join(
         self,
         right: Native,
         key: UniKey,
         how: How = JoinType.Left,
         right_is_uniq: bool = True,
         inplace: bool = False,
 ) -> Optional[Native]:
     """Join this stream with another one through algo.map_side_join().

     :param right: stream whose items form the right side of the join.
     :param key: key field(s); resolved to names and wrapped into a
         composite key function.
     :param how: join type; anything that is not a JoinType is converted
         with JoinType(how).
     :param right_is_uniq: passed to the algorithm as uniq_right.
     :param inplace: when True the joined items replace the items of this
         stream and nothing is returned.
     :return: a new stream carrying this stream's compatible static meta,
         or None when inplace is True.
     """
     key_names = arg.update([arg.get_names(key)])
     join_type = how if isinstance(how, JoinType) else JoinType(how)

     left_items = self.get_items()
     right_items = right.get_items()
     joined = algo.map_side_join(
         iter_left=left_items,
         iter_right=right_items,
         key_function=fs.composite_key(key_names),
         merge_function=fs.merge_two_items(),
         dict_function=fs.items_to_dict(),
         how=join_type,
         uniq_right=right_is_uniq,
     )
     if self.is_in_memory():
         joined = list(joined)

     if inplace:
         self.set_items(joined, count=self.get_count(), inplace=True)
         return None

     result = self.stream(joined).set_meta(
         **self.get_compatible_static_meta())
     return self._assume_native(result)
Beispiel #7
0
 def get_rows(
     self,
     columns: Union[Columns, Auto] = AUTO,
     add_title_row=False,
 ) -> Iterable:
     """Yield one list of values per item, in the order of the columns.

     :param columns: columns to extract; resolved through
         arg.delayed_acquire() with self.get_columns as the fallback and
         then converted to names.
     :param add_title_row: when truthy the list of column names is yielded
         first, before any data row.
     :return: generator of rows; each value is read with item.get(name).
     """
     column_names = arg.get_names(
         arg.delayed_acquire(columns, self.get_columns))
     if add_title_row:
         yield column_names
     for record in self.get_items():
         yield list(map(record.get, column_names))
Beispiel #8
0
 def get_struct_comparison_iter(self, other: StructInterface, message: Optional[str] = None) -> Iterable:
     """Yield human-readable lines describing how this struct differs from other.

     :param other: struct passed to get_struct_comparison_dict().
     :param message: optional text appended to repr(self) in the title.
     :return: generator of strings: either a summary line followed by the
         added/removed field lists, or a single "nothing changes" line.
     """
     title = (
         '{} {}'.format(self.__repr__(), message)
         if arg.is_defined(message)
         else self.__repr__()
     )
     comparison = self.get_struct_comparison_dict(other)
     counts = dict((name, len(items)) for name, items in comparison.items())
     added_names = arg.get_names(comparison.get('added'))
     removed_names = arg.get_names(comparison.get('removed'))

     if not (added_names or removed_names):
         yield '{}: Struct is actual, will not be changed'.format(title)
         return

     # The summary template reads the 'saved', 'added' and 'removed' counts.
     yield '{}: {saved} fields will be saved, {added} added, {removed} removed'.format(title, **counts)
     details = (
         ('Added {} fields: {}', added_names),
         ('Removed {} fields: {}', removed_names),
     )
     for template, names in details:
         if names:
             yield template.format(len(names), ', '.join(names))
Beispiel #9
0
 def get_dataframe(self, columns: Optional[Iterable] = None) -> DataFrame:
     """Build a DataFrame from the items of this stream.

     :param columns: optional columns; when truthy they are resolved to
         names and the frame is built with, and restricted to, those names.
     :return: the DataFrame, or None when pd is falsy or
         get_use_objects_for_output() returns a falsy value.
     """
     if pd and get_use_objects_for_output():
         if columns:
             # Resolve names BEFORE building the frame: passing the raw
             # column descriptors to DataFrame labels the frame with them,
             # so the selection by name below would not match (and a
             # one-shot iterable would already be consumed).
             column_names = arg.get_names(columns)
             dataframe = DataFrame(self.get_items(), columns=column_names)
             dataframe = dataframe[column_names]
         else:
             dataframe = DataFrame(self.get_items())
         return dataframe
Beispiel #10
0
def unfold_lists(fields, number_field='n', default_value=0) -> Callable:
    """Build a record mapper around ms.unfold_lists().

    :param fields: fields to unfold; converted to names once, up front.
    :param number_field: forwarded to ms.unfold_lists().
    :param default_value: forwarded to ms.unfold_lists().
    :return: a generator function that takes one record and yields whatever
        ms.unfold_lists() produces for it.
    """
    options = dict(
        fields=arg.get_names(fields),
        number_field=number_field,
        default_value=default_value,
    )

    def func(record: dict) -> Iterable:
        yield from ms.unfold_lists(record, **options)

    return func
Beispiel #11
0
 def _get_uniq_records(self, *keys) -> Iterable:
     """Yield records, dropping those whose key equals the previous record's key.

     Only consecutive records are compared, so duplicates are removed
     completely only if equal keys are adjacent in the stream.

     :param keys: key fields, normalised via arg.update() and arg.get_names().
     :return: generator of records.
     """
     extract_key = get_key_function(arg.get_names(arg.update(keys)))
     previous = AUTO  # sentinel for "no record seen yet"
     for record in self.get_records():
         current = extract_key(record)
         key_changed = current != previous
         previous = current
         if key_changed:
             yield record
Beispiel #12
0
 def get_dict(
     self,
     key: Union[Field, Columns],
     value: Union[Field, Columns, None] = None,
     of_lists: bool = False,
     skip_errors: bool = False,
 ) -> dict:
     """Convert this stream to a dict through a key-value stream.

     :param key: key field(s); converted to names before use.
     :param value: value description, passed to to_key_value_stream() as is.
     :param of_lists: forwarded to the key-value stream's get_dict().
     :param skip_errors: forwarded to to_key_value_stream().
     :return: the dict produced by the key-value stream.
     """
     key_names = arg.get_names(key)
     pairs = self.to_key_value_stream(
         key_names,
         value,
         skip_errors=skip_errors,
     )
     return pairs.get_dict(of_lists=of_lists)
Beispiel #13
0
 def _get_key_function(self, descriptions: Array, take_hash: bool = False) -> Callable:
     """Build a function that extracts a key from a row.

     :param descriptions: key descriptions; converted to names first.
     :param take_hash: when True the returned function yields hash(key)
         instead of the key itself.
     :return: callable taking a row; built on sf.value_from_row for a single
         description and on sf.row_from_row for several.
     :raises ValueError: if no description is given.
     """
     names = arg.get_names(descriptions)
     count = len(names)
     if count == 0:
         raise ValueError('key must be defined')

     if count == 1:
         extract_key = fs.partial(sf.value_from_row, names[0])
     else:
         extract_key = fs.partial(sf.row_from_row, names)

     if not take_hash:
         return extract_key

     def hashed_key(row):
         return hash(extract_key(row))

     return hashed_key
Beispiel #14
0
 def remove_fields(self, *fields, multiple: bool = False, inplace: bool = True):
     """Remove the given fields from this struct.

     :param fields: fields (or their names) to remove; normalised via
         arg.update() and arg.get_names().
     :param multiple: in-place mode only: when False, stop after the first
         removed field; when True, remove every matching field.
     :param inplace: when True the list returned by get_fields() is mutated
         (presumably the struct's own list; verify) and None is returned;
         otherwise a new object is built with make_new() from all
         non-matching fields.
     :return: None in in-place mode, else the result of make_new().
     """
     removing_field_names = arg.get_names(arg.update(fields))
     existing_fields = self.get_fields()
     if inplace:
         # Iterate over a snapshot: removing from the list that is being
         # iterated shifts the remaining items, so the element right after
         # each removed one would be skipped when multiple=True.
         for field in list(existing_fields):
             if arg.get_name(field) in removing_field_names:
                 existing_fields.remove(field)
                 if not multiple:
                     break
     else:
         new_fields = [
             f for f in existing_fields
             if arg.get_name(f) not in removing_field_names
         ]
         return self.make_new(new_fields)
Beispiel #15
0
 def map_side_join(
     self,
     right: Native,
     key: UniKey,
     how: How = JoinType.Left,
     right_is_uniq: bool = True,
 ) -> Native:
     """Join this stream with another one through algo.map_side_join().

     :param right: stream whose items form the right side of the join.
     :param key: key field(s); resolved to names and wrapped into a
         composite key function.
     :param how: join type; anything that is not a JoinType is converted
         with JoinType(how).
     :param right_is_uniq: passed to the algorithm as uniq_right.
     :return: a new stream with the joined items and this stream's static
         meta; items are materialised into a list if this stream is in
         memory.
     """
     key_names = arg.update([arg.get_names(key)])
     join_type = how if isinstance(how, JoinType) else JoinType(how)

     left_items = self.get_items()
     right_items = right.get_items()
     joined = algo.map_side_join(
         iter_left=left_items,
         iter_right=right_items,
         key_function=fs.composite_key(key_names),
         merge_function=fs.merge_two_items(),
         dict_function=fs.items_to_dict(),
         how=join_type,
         uniq_right=right_is_uniq,
     )
     if self.is_in_memory():
         joined = list(joined)

     result = self.stream(joined)
     result = result.set_meta(**self.get_static_meta())
     return self._assume_native(result)
Beispiel #16
0
 def get_records(self, columns: AutoColumns = AUTO) -> Generator:
     """Yield one dict per row, keyed by column name.

     :param columns: columns naming the row values; AUTO means
         self.get_columns().
     :return: generator of dicts pairing column names with row values
         positionally (zip stops at the shorter of the two).
     """
     selected = self.get_columns() if columns == AUTO else columns
     names = arg.get_names(selected)
     for row in self.get_rows():
         yield dict(zip(names, row))
Beispiel #17
0
 def get_field_names(self) -> list:
     """Return the names of this object's fields, as resolved by arg.get_names()."""
     fields = self.get_fields()
     return arg.get_names(fields)
Beispiel #18
0
 def get_records(self, columns: Union[Iterable, Auto] = AUTO) -> Iterable:
     """Return the items of this stream, optionally restricted to some columns.

     :param columns: AUTO returns the items unchanged; otherwise the columns
         are converted to names and applied with select() first.
     :return: whatever get_items() returns on the chosen stream.
     """
     source = self if columns == AUTO else self.select(*arg.get_names(columns))
     return source.get_items()