def agg(self, stats):
    grouping_schema = StructType([
        field
        for col in self.grouping_cols
        for field in col.find_fields_in_schema(self.jdf.bound_schema)
    ])

    # Fold every row into a GroupedStats accumulator, then merge the
    # per-partition accumulators together.
    aggregated_stats = self.jdf.aggregate(
        GroupedStats(
            self.grouping_cols,
            stats,
            pivot_col=self.pivot_col,
            pivot_values=self.pivot_values,
        ),
        lambda grouped_stats, row: grouped_stats.merge(
            row, self.jdf.bound_schema
        ),
        lambda grouped_stats_1, grouped_stats_2: grouped_stats_1.mergeStats(
            grouped_stats_2, self.jdf.bound_schema
        ),
    )

    # Build one output row per group key, including the subtotal keys, where
    # GROUPED marks the grouping columns that were rolled up.
    data = []
    all_stats = self.add_subtotals(aggregated_stats)
    for group_key in all_stats.group_keys:
        key = [
            (str(key), None if value is GROUPED else value)
            for key, value in zip(self.grouping_cols, group_key)
        ]
        grouping = tuple(value is GROUPED for value in group_key)
        key_as_row = row_from_keyed_values(key).set_grouping(grouping)
        data.append(row_from_keyed_values(key + [
            (
                str(stat),
                stat.with_pre_evaluation_schema(self.jdf.bound_schema)
                    .eval(key_as_row, grouping_schema),
            )
            for pivot_value in all_stats.pivot_values
            for stat in get_pivoted_stats(
                all_stats.groups[group_key][pivot_value], pivot_value
            )
        ]))

    # One output column per pivot value when a single stat is requested,
    # one per (pivot value, stat) pair otherwise.
    if self.pivot_col is not None:
        if len(stats) == 1:
            new_schema = StructType(grouping_schema.fields + [
                StructField(str(pivot_value), DataType(), True)
                for pivot_value in self.pivot_values
            ])
        else:
            new_schema = StructType(grouping_schema.fields + [
                StructField('{0}_{1}'.format(pivot_value, stat), DataType(), True)
                for pivot_value in self.pivot_values
                for stat in stats
            ])
    else:
        new_schema = StructType(grouping_schema.fields + [
            StructField(str(stat), DataType(), True) for stat in stats
        ])

    # noinspection PyProtectedMember
    return self.jdf._with_rdd(
        self.jdf._sc.parallelize(data), schema=new_schema
    )
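# A minimal usage sketch for agg() above, assuming this class backs the
# DataFrame.groupBy(...) API as in PySpark; `spark`, the input data and the
# shown result are illustrative, not taken from the source:
df = spark.createDataFrame([('a', 1), ('a', 2), ('b', 3)], ['key', 'value'])
df.groupBy('key').agg({'value': 'sum'}).collect()
# -> [Row(key='a', sum(value)=3), Row(key='b', sum(value)=3)]
# With a pivot column set, one output column is produced per pivot value,
# or one per (pivot value, stat) pair when several stats are requested.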
def merge_rows_joined_on_values(left, right, left_schema, right_schema, how, on):
    left_names = left_schema.names
    right_names = right_schema.names

    left_on_fields, right_on_fields = get_on_fields(left_schema, right_schema, on)

    # The join-key values come from whichever side is present.
    on_parts = [
        (on_field, left[on_field] if left is not None else right[on_field])
        for on_field in on
    ]

    # Outer joins fill the missing side with nulls.
    if left is None and how in (FULL_JOIN, RIGHT_JOIN):
        left = create_row(left_names, [None for _ in left_names])
    if right is None and how in (LEFT_JOIN, FULL_JOIN):
        right = create_row(right_names, [None for _ in right_names])

    left_parts = (
        (field.name, value)
        for field, value in zip(left_schema.fields, left)
        if field not in left_on_fields
    )
    if how in (INNER_JOIN, CROSS_JOIN, LEFT_JOIN, FULL_JOIN, RIGHT_JOIN):
        right_parts = (
            (field.name, value)
            for field, value in zip(right_schema.fields, right)
            if field not in right_on_fields
        )
    elif how in (LEFT_SEMI_JOIN, LEFT_ANTI_JOIN):
        # Semi and anti joins only return columns from the left side.
        right_parts = ()
    else:
        raise IllegalArgumentException(
            "Argument 'how' cannot be '{0}'".format(how))

    return row_from_keyed_values(
        itertools.chain(on_parts, left_parts, right_parts))
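# A hedged sketch of merge_rows_joined_on_values(); the schemas and row are
# made up, and create_row plus the LEFT_JOIN constant are the helpers used
# by the function above (module imports assumed):
left_schema = StructType([StructField('id', LongType(), True),
                          StructField('name', StringType(), True)])
right_schema = StructType([StructField('id', LongType(), True),
                           StructField('age', LongType(), True)])
left = create_row(['id', 'name'], [1, 'alice'])
# A left join with no matching right row null-fills the right columns:
merge_rows_joined_on_values(left, None, left_schema, right_schema,
                            LEFT_JOIN, ['id'])
# -> Row(id=1, name='alice', age=None)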
def decode_record(item):
    """Recursively convert nested dicts into Rows and decode lists element-wise."""
    if isinstance(item, list):
        return [decode_record(e) for e in item]
    if isinstance(item, dict):
        return row_from_keyed_values(
            (key, decode_record(value)) for key, value in item.items()
        )
    return item
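# Illustrative call: nested dicts become Rows and lists are decoded
# element-wise (values are made up):
decode_record({'user': {'name': 'alice'}, 'visits': [{'day': 1}, {'day': 2}]})
# -> Row(user=Row(name='alice'), visits=[Row(day=1), Row(day=2)])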
def resolve_partitions(patterns):
    """
    Given a list of patterns, returns all the files matching or in folders
    matching one of them.

    The matching files are returned as a dict mapping each file path to its
    partition keys and values, as a Row if any were encountered, else None.

    In addition to this dict, return a schema for the partition keys if the
    data was partitioned, else None.

    :type patterns: list of str
    :rtype: Tuple[Dict[str, Optional[Row]], Optional[StructType]]
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))

    # Parse Hive-style partition folders (e.g. "year=2020") from each path.
    partitions = {}
    for file_path in file_paths:
        if '=' in file_path:
            row = row_from_keyed_values(
                folder.split('=')
                for folder in file_path.split('/')[:-1]
                if folder.count('=') == 1
            )
            partitions[file_path] = row
        else:
            partitions[file_path] = None

    partitioning_field_sets = set(
        p.__fields__ for p in partitions.values() if p is not None
    )
    if len(partitioning_field_sets) > 1:
        raise Exception(
            'Conflicting directory structures detected while reading {0}. '
            'All partitions must have the same partitioning fields, '
            'found fields {1}'.format(
                ','.join(patterns),
                ' and also '.join(
                    str(fields) for fields in partitioning_field_sets
                ),
            )
        )

    if partitioning_field_sets:
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                'Unable to parse these malformed folders: {1} of {0}'.format(
                    file_paths,
                    [path for path, value in partitions.items() if value is None],
                )
            )
        partitioning_fields = partitioning_field_sets.pop()
        partition_schema = guess_schema_from_strings(
            partitioning_fields, partitions.values(), options={}
        )
    else:
        partition_schema = None

    return partitions, partition_schema
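# A hedged usage sketch for resolve_partitions(); the directory layout is
# hypothetical. Given Hive-style partition folders such as:
#
#   data/year=2020/month=1/part-0.csv
#   data/year=2020/month=2/part-0.csv
#
# each file maps to Row(year='2020', month='1') (or month='2'), and the
# returned schema is guessed from those string values:
partitions, partition_schema = resolve_partitions(['data/*'])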
def get_as_rows(self, stats=('count', 'mean', 'stddev', 'min', 'max')):
    """
    Provide a list of Rows in the same format as those of the Dataset
    returned by Dataset.stats()
    """
    return [
        row_from_keyed_values(
            [('summary', stat)] + [
                (col_name, self.get_stat(self.column_stat_helpers[col_name], stat))
                for col_name in self.col_names
            ]
        )
        for stat in stats
    ]
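# Illustrative output shape, assuming a hypothetical helper `stats_helper`
# built over a single 'age' column; the values are made up and the format
# mirrors DataFrame.describe():
stats_helper.get_as_rows()
# -> [Row(summary='count', age='3'), Row(summary='mean', age='2.0'),
#     Row(summary='stddev', age='1.0'), Row(summary='min', age='1'),
#     Row(summary='max', age='3')]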
def get_select_output_field_lists(self, partition, non_generators,
                                  initialized_cols, generator):
    output_rows = []
    for row in partition:
        base_row_fields = []
        for col in non_generators:
            output_cols, output_values = resolve_column(
                col, row, schema=self.bound_schema)
            base_row_fields += zip(output_cols, output_values[0])

        if generator is not None:
            generated_row_fields = self.get_generated_row_fields(
                generator, row, initialized_cols, base_row_fields)
            for generated_row in generated_row_fields:
                output_rows.append(
                    row_from_keyed_values(generated_row,
                                          metadata=row.get_metadata()))
        else:
            output_rows.append(
                row_from_keyed_values(base_row_fields,
                                      metadata=row.get_metadata()))

    return output_rows
def test_session_create_data_frame_from_list_with_col_names(self):
    df = self.spark.createDataFrame(
        [(0.0, [1.0, 0.8]),
         (1.0, [0.0, 0.0]),
         (2.0, [0.5, 0.5])],
        ['label', 'features'],
    )
    self.assertEqual(df.count(), 3)
    self.assertListEqual(
        df.collect(),
        [
            row_from_keyed_values([('label', 0.0), ('features', [1.0, 0.8])]),
            row_from_keyed_values([('label', 1.0), ('features', [0.0, 0.0])]),
            row_from_keyed_values([('label', 2.0), ('features', [0.5, 0.5])]),
        ],
    )
    self.assertEqual(
        df.schema,
        StructType([
            StructField('label', DoubleType(), True),
            StructField('features', ArrayType(DoubleType(), True), True),
        ]),
    )
def drop(self, cols):
    positions_to_drop = []
    for col in cols:
        if isinstance(col, str):
            if col == '*':
                continue
            col = parse(col)
        try:
            positions_to_drop.append(
                col.find_position_in_schema(self.bound_schema))
        except ValueError:
            pass

    new_schema = StructType([
        field
        for i, field in enumerate(self.bound_schema.fields)
        if i not in positions_to_drop
    ])

    return self._with_rdd(
        self.rdd().map(lambda row: row_from_keyed_values([
            (field, row[i])
            for i, field in enumerate(row.__fields__)
            if i not in positions_to_drop
        ])),
        new_schema,
    )
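# A hedged usage sketch for drop(), assuming the DataFrame API delegates
# here as in PySpark (`spark` and the data are illustrative):
df = spark.createDataFrame([(1, 'a', True)], ['id', 'name', 'flag'])
df.drop('flag').columns
# -> ['id', 'name']
# Note from the code above: unknown column names are silently ignored
# (the ValueError is swallowed) and '*' is skipped entirely.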
def mapper(row):
    keyed_values = [
        (new_name, row[old])
        for new_name, old in zip(new_names, row.__fields__)
    ]
    return row_from_keyed_values(keyed_values)
def mapper(row):
    keyed_values = [
        (new, row[col]) if col == existing else (col, row[col])
        for col in row.__fields__
    ]
    return row_from_keyed_values(keyed_values)
def change_col_order(row):
    return row_from_keyed_values([
        (field.name, row[field.name])
        for field in self.bound_schema.fields
    ])
def change_col_names(row):
    return row_from_keyed_values([
        (field.name, value)
        for field, value in zip(self.bound_schema.fields, row)
    ])