Example #1
    def agg(self, stats):
        grouping_schema = StructType([
            field for col in self.grouping_cols
            for field in col.find_fields_in_schema(self.jdf.bound_schema)
        ])

        aggregated_stats = self.jdf.aggregate(
            GroupedStats(
                self.grouping_cols,
                stats,
                pivot_col=self.pivot_col,
                pivot_values=self.pivot_values,
            ),
            lambda grouped_stats, row: grouped_stats.merge(
                row, self.jdf.bound_schema),
            lambda grouped_stats_1, grouped_stats_2: grouped_stats_1.mergeStats(
                grouped_stats_2, self.jdf.bound_schema),
        )

        data = []
        all_stats = self.add_subtotals(aggregated_stats)
        for group_key in all_stats.group_keys:
            key = [(str(key), None if value is GROUPED else value)
                   for key, value in zip(self.grouping_cols, group_key)]
            grouping = tuple(value is GROUPED for value in group_key)

            key_as_row = row_from_keyed_values(key).set_grouping(grouping)
            data.append(row_from_keyed_values(key + [
                (
                    str(stat),
                    stat.with_pre_evaluation_schema(
                        self.jdf.bound_schema).eval(key_as_row, grouping_schema),
                )
                for pivot_value in all_stats.pivot_values
                for stat in get_pivoted_stats(
                    all_stats.groups[group_key][pivot_value], pivot_value)
            ]))

        if self.pivot_col is not None:
            if len(stats) == 1:
                new_schema = StructType(grouping_schema.fields + [
                    StructField(str(pivot_value), DataType(), True)
                    for pivot_value in self.pivot_values
                ])
            else:
                new_schema = StructType(grouping_schema.fields + [
                    StructField('{0}_{1}'.format(pivot_value, stat),
                                DataType(), True)
                    for pivot_value in self.pivot_values for stat in stats
                ])
        else:
            new_schema = StructType(
                grouping_schema.fields +
                [StructField(str(stat), DataType(), True) for stat in stats])

        # noinspection PyProtectedMember
        return self.jdf._with_rdd(self.jdf._sc.parallelize(data),
                                  schema=new_schema)
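
The pivoted schema above names its output columns by combining each pivot value with each statistic. A quick, self-contained trace of that naming rule (the pivot values and stats below are hypothetical):

pivot_values = ['2019', '2020']
stats = ['count', 'sum']

columns = ['{0}_{1}'.format(pivot_value, stat)
           for pivot_value in pivot_values for stat in stats]
print(columns)  # ['2019_count', '2019_sum', '2020_count', '2020_sum']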
Example #2
def merge_rows_joined_on_values(left, right, left_schema, right_schema, how,
                                on):
    left_names = left_schema.names
    right_names = right_schema.names

    left_on_fields, right_on_fields = get_on_fields(left_schema, right_schema,
                                                    on)

    on_parts = [(on_field,
                 left[on_field] if left is not None else right[on_field])
                for on_field in on]

    # For outer joins, pad the missing side with an all-None row so its
    # columns still appear in the merged output.
    if left is None and how in (FULL_JOIN, RIGHT_JOIN):
        left = create_row(left_names, [None for _ in left_names])
    if right is None and how in (LEFT_JOIN, FULL_JOIN):
        right = create_row(right_names, [None for _ in right_names])

    left_parts = ((field.name, value)
                  for field, value in zip(left_schema.fields, left)
                  if field not in left_on_fields)

    if how in (INNER_JOIN, CROSS_JOIN, LEFT_JOIN, FULL_JOIN, RIGHT_JOIN):
        right_parts = ((field.name, value)
                       for field, value in zip(right_schema.fields, right)
                       if field not in right_on_fields)
    elif how in (LEFT_SEMI_JOIN, LEFT_ANTI_JOIN):
        right_parts = ()
    else:
        raise IllegalArgumentException(
            "Argument 'how' cannot be '{0}'".format(how))

    return row_from_keyed_values(
        itertools.chain(on_parts, left_parts, right_parts))
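
When one side of an outer join has no matching row, it is padded with Nones before its fields are merged, as done above with create_row. A toy trace of the values that padding produces (the column names are hypothetical):

left_names = ['id', 'amount']
padded_values = [None for _ in left_names]
print(list(zip(left_names, padded_values)))
# [('id', None), ('amount', None)]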
Example #3
def decode_record(item):
    if isinstance(item, list):
        return [decode_record(e) for e in item]
    if isinstance(item, dict):
        return row_from_keyed_values(
            (key, decode_record(value)) for key, value in item.items())
    return item
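
All of these examples rely on row_from_keyed_values building a Row whose field order follows the (key, value) pairs. Below is a minimal stand-in sketch of that contract (an assumption for illustration, not the library's implementation), followed by a decode_record round trip:

class Row(tuple):
    # Simplified stand-in: a tuple that remembers its column names and
    # supports lookup by name. The library's Row also carries metadata
    # and grouping information.
    def __new__(cls, fields, values):
        row = super().__new__(cls, values)
        row.__fields__ = list(fields)
        return row

    def __getitem__(self, item):
        if isinstance(item, str):
            item = self.__fields__.index(item)
        return super().__getitem__(item)


def row_from_keyed_values(keyed_values):
    # Build a Row from (column name, value) pairs, preserving order.
    keyed_values = list(keyed_values)
    return Row([key for key, _ in keyed_values],
               [value for _, value in keyed_values])


nested = {'user': {'name': 'Ada', 'active': True}, 'scores': [1, 2]}
decoded = decode_record(nested)
print(decoded['user']['name'])  # 'Ada' -- nested dicts become Rows too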
Example #4
def resolve_partitions(patterns):
    """
    Given a list of patterns, returns all the files matching or in folders matching
    one of them.

    The file are returned in a list of tuple of 2 elements:
    - The first tuple is the file path
    - The second being the partition keys and values if any were encountered else None

    In addition to this list, return, if the data was partitioned, a schema for the
    partition keys, else None

    :type patterns: list of str
    :rtype: Tuple[List[str], List[Optional[Row]], Optional[StructType]]
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))
    partitions = {}
    for file_path in file_paths:
        if '=' in file_path:
            row = row_from_keyed_values(
                folder.split('=') for folder in file_path.split('/')[:-1]
                if folder.count('=') == 1)
            partitions[file_path] = row
        else:
            partitions[file_path] = None

    partitioning_field_sets = set(p.__fields__ for p in partitions.values()
                                  if p is not None)
    if len(partitioning_field_sets) > 1:
        raise Exception(
            'Conflicting directory structures detected while reading {0}. '
            'All partitions must have the same partitioning fields, found fields {1}'
            .format(
                ','.join(patterns),
                ' and also '.join(
                    str(fields) for fields in partitioning_field_sets),
            ))

    if partitioning_field_sets:
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                'Unable to parse the following malformed folders: {1} of {0}'.format(
                    file_paths,
                    [
                        path
                        for path, value in partitions.items() if value is None
                    ],
                ))
        partitioning_fields = partitioning_field_sets.pop()
        partition_schema = guess_schema_from_strings(partitioning_fields,
                                                     partitions.values(),
                                                     options={})
    else:
        partition_schema = None

    return partitions, partition_schema
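
A quick trace of the folder-parsing expression above on a hypothetical Hive-style path; only folders of the form key=value contribute partition pairs:

file_path = 'data/year=2021/month=05/part-0000.csv'
pairs = [folder.split('=') for folder in file_path.split('/')[:-1]
         if folder.count('=') == 1]
print(pairs)  # [['year', '2021'], ['month', '05']]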
Example #5
    def get_as_rows(self, stats=('count', 'mean', 'stddev', 'min', 'max')):
        """
        Provide a list of Rows with the same format as those in the
        Dataset returned by Dataset.stats()
        """
        return [
            row_from_keyed_values(
                [('summary', stat)] +
                [(col_name,
                  self.get_stat(self.column_stat_helpers[col_name], stat))
                 for col_name in self.col_names]) for stat in stats
        ]
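
Each stat yields one summary Row with a leading 'summary' column. Its shape, using the stand-in Row from the sketch after Example #3 (the column names and values here are hypothetical):

summary = row_from_keyed_values(
    [('summary', 'mean')] + [('age', '24.5'), ('height', '175.0')])
print(summary.__fields__)  # ['summary', 'age', 'height']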
Example #6
    def get_select_output_field_lists(self, partition, non_generators,
                                      initialized_cols, generator):
        output_rows = []
        for row in partition:
            base_row_fields = []
            for col in non_generators:
                output_cols, output_values = resolve_column(
                    col, row, schema=self.bound_schema)
                base_row_fields += zip(output_cols, output_values[0])

            if generator is not None:
                generated_row_fields = self.get_generated_row_fields(
                    generator, row, initialized_cols, base_row_fields)
                for generated_row in generated_row_fields:
                    output_rows.append(
                        row_from_keyed_values(generated_row,
                                              metadata=row.get_metadata()))
            else:
                output_rows.append(
                    row_from_keyed_values(base_row_fields,
                                          metadata=row.get_metadata()))
        return output_rows
Example #7
    def test_session_create_data_frame_from_list_with_col_names(self):
        df = self.spark.createDataFrame(
            [(0.0, [1.0, 0.8]), (1.0, [0.0, 0.0]), (2.0, [0.5, 0.5])], ['label', 'features'],
        )
        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(),
            [
                row_from_keyed_values([('label', 0.0), ('features', [1.0, 0.8])]),
                row_from_keyed_values([('label', 1.0), ('features', [0.0, 0.0])]),
                row_from_keyed_values([('label', 2.0), ('features', [0.5, 0.5])]),
            ],
        )

        self.assertEqual(
            df.schema,
            StructType(
                [
                    StructField('label', DoubleType(), True),
                    StructField('features', ArrayType(DoubleType(), True), True),
                ]
            ),
        )
Example #8
    def drop(self, cols):
        positions_to_drop = []
        for col in cols:
            if isinstance(col, str):
                if col == '*':
                    continue
                col = parse(col)
            try:
                positions_to_drop.append(
                    col.find_position_in_schema(self.bound_schema))
            except ValueError:
                pass

        new_schema = StructType([
            field for i, field in enumerate(self.bound_schema.fields)
            if i not in positions_to_drop
        ])

        return self._with_rdd(
            self.rdd().map(lambda row: row_from_keyed_values(
                [(field, row[i]) for i, field in enumerate(row.__fields__)
                 if i not in positions_to_drop])),
            new_schema,
        )
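
The mapper passed to rdd().map above keeps only the values whose positional index survived. A small trace using the stand-in Row from the sketch after Example #3 (positions_to_drop is hypothetical here):

positions_to_drop = [1]  # drop the second column
row = row_from_keyed_values([('a', 1), ('b', 2), ('c', 3)])
kept = row_from_keyed_values(
    [(field, row[i]) for i, field in enumerate(row.__fields__)
     if i not in positions_to_drop])
print(kept.__fields__)  # ['a', 'c']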
Example #9
    def mapper(row):
        # new_names is a closure variable holding the replacement
        # column names, in order.
        keyed_values = [(new_name, row[old])
                        for new_name, old in zip(new_names, row.__fields__)]
        return row_from_keyed_values(keyed_values)
Example #10
    def mapper(row):
        # Rename the column `existing` to `new`; both are closure variables.
        keyed_values = [(new, row[col]) if col == existing else (col, row[col])
                        for col in row.__fields__]
        return row_from_keyed_values(keyed_values)
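
Applying the Example #10 mapper with the stand-in Row from the sketch after Example #3; existing and new are the closure variables a withColumnRenamed-style call would bind (hypothetical values below):

existing, new = 'label', 'target'
row = row_from_keyed_values([('label', 1.0), ('features', [0.5, 0.5])])
print(mapper(row).__fields__)  # ['target', 'features']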
Example #11
    def change_col_order(row):
        # Re-emit the row's values in the order defined by the bound schema.
        return row_from_keyed_values([(field.name, row[field.name])
                                      for field in self.bound_schema.fields])
Example #12
    def change_col_names(row):
        # Relabel the row's values with the bound schema's field names,
        # keeping positional order.
        return row_from_keyed_values([
            (field.name, value)
            for field, value in zip(self.bound_schema.fields, row)
        ])