Example #1
    def agg(self, stats):
        grouping_schema = StructType([
            field for col in self.grouping_cols
            for field in col.find_fields_in_schema(self.jdf.bound_schema)
        ])

        aggregated_stats = self.jdf.aggregate(
            GroupedStats(self.grouping_cols,
                         stats,
                         pivot_col=self.pivot_col,
                         pivot_values=self.pivot_values),
            # seqOp: fold one row into the accumulated stats
            lambda grouped_stats, row: grouped_stats.merge(
                row, self.jdf.bound_schema),
            # combOp: merge the accumulated stats of two partitions
            lambda grouped_stats_1, grouped_stats_2:
                grouped_stats_1.mergeStats(grouped_stats_2,
                                           self.jdf.bound_schema))

        data = []
        all_stats = self.add_subtotals(aggregated_stats)
        for group_key in all_stats.group_keys:
            key = [(str(key), None if value is GROUPED else value)
                   for key, value in zip(self.grouping_cols, group_key)]
            grouping = tuple(value is GROUPED for value in group_key)

            key_as_row = row_from_keyed_values(key).set_grouping(grouping)
            data.append(
                row_from_keyed_values(key + [
                    (str(stat),
                     stat.with_pre_evaluation_schema(self.jdf.bound_schema)
                         .eval(key_as_row, grouping_schema))
                    for pivot_value in all_stats.pivot_values
                    for stat in get_pivoted_stats(
                        all_stats.groups[group_key][pivot_value], pivot_value)
                ]))

        if self.pivot_col is not None:
            if len(stats) == 1:
                new_schema = StructType(grouping_schema.fields + [
                    StructField(str(pivot_value), DataType(), True)
                    for pivot_value in self.pivot_values
                ])
            else:
                new_schema = StructType(grouping_schema.fields + [
                    StructField("{0}_{1}".format(pivot_value, stat),
                                DataType(), True)
                    for pivot_value in self.pivot_values for stat in stats
                ])
        else:
            new_schema = StructType(
                grouping_schema.fields +
                [StructField(str(stat), DataType(), True) for stat in stats])

        # noinspection PyProtectedMember
        return self.jdf._with_rdd(self.jdf._sc.parallelize(data),
                                  schema=new_schema)
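The two lambdas passed to aggregate follow the usual seqOp/combOp pattern: the first folds a single row into a partition's accumulated GroupedStats, the second merges the accumulators of two partitions. A minimal sketch of the same shape, with plain integers standing in for the stats objects:

from functools import reduce

def merge_row(acc, row):        # seqOp: fold one value into an accumulator
    return acc + row

def merge_accs(acc1, acc2):     # combOp: merge two partition accumulators
    return acc1 + acc2

partitions = [[1, 2], [3, 4]]
partials = [reduce(merge_row, part, 0) for part in partitions]
print(reduce(merge_accs, partials))  # 10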
Example #2
def merge_rows_joined_on_values(left, right, left_schema, right_schema, how,
                                on):
    left_names = left_schema.names
    right_names = right_schema.names

    left_on_fields, right_on_fields = get_on_fields(left_schema, right_schema,
                                                    on)

    # take the values of the "on" columns from whichever side is present
    on_parts = [(on_field,
                 left[on_field] if left is not None else right[on_field])
                for on_field in on]

    # outer joins: pad the missing side with a row of nulls
    if left is None and how in (FULL_JOIN, RIGHT_JOIN):
        left = create_row(left_names, [None for _ in left_names])
    if right is None and how in (LEFT_JOIN, FULL_JOIN):
        right = create_row(right_names, [None for _ in right_names])

    left_parts = ((field.name, value)
                  for field, value in zip(left_schema.fields, left)
                  if field not in left_on_fields)

    if how in (INNER_JOIN, CROSS_JOIN, LEFT_JOIN, FULL_JOIN, RIGHT_JOIN):
        right_parts = ((field.name, value)
                       for field, value in zip(right_schema.fields, right)
                       if field not in right_on_fields)
    elif how in (LEFT_SEMI_JOIN, LEFT_ANTI_JOIN):
        right_parts = ()
    else:
        raise IllegalArgumentException(
            "Argument 'how' cannot be '{0}'".format(how))

    return row_from_keyed_values(
        itertools.chain(on_parts, left_parts, right_parts))
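To see what the null padding above produces, here is a rough sketch of the LEFT_JOIN case with plain dicts standing in for Rows (column names are made up for illustration):

left = {"id": 1, "name": "Ada"}
right = None                      # no matching right row was found
right_names = ["id", "score"]
on = ["id"]

on_parts = [(field, left[field]) for field in on]
if right is None:                 # LEFT_JOIN pads the right side with nulls
    right = {name: None for name in right_names}
merged = dict(on_parts
              + [(k, v) for k, v in left.items() if k not in on]
              + [(k, v) for k, v in right.items() if k not in on])
print(merged)  # {'id': 1, 'name': 'Ada', 'score': None}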
Example #3
def decode_record(item):
    if isinstance(item, list):
        return [decode_record(e) for e in item]
    if isinstance(item, dict):
        return row_from_keyed_values(
            (key, decode_record(value)) for key, value in item.items())
    return item
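To see the recursion at work, here is a self-contained sketch where an OrderedDict stands in for the Row built by row_from_keyed_values (the stand-in is for illustration only):

from collections import OrderedDict

def row_from_keyed_values(keyed_values):  # stand-in for illustration
    return OrderedDict(keyed_values)

def decode_record(item):
    if isinstance(item, list):
        return [decode_record(e) for e in item]
    if isinstance(item, dict):
        return row_from_keyed_values(
            (key, decode_record(value)) for key, value in item.items())
    return item

record = {"name": "Ada", "scores": [{"math": 0.9}, {"cs": 1.0}]}
print(decode_record(record))
# 'name' -> 'Ada', 'scores' -> a list of two nested OrderedDicts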
Example #4
    def test_session_create_data_frame_from_list_with_col_names(self):
        df = self.spark.createDataFrame([(0.0, [1.0, 0.8]), (1.0, [0.0, 0.0]),
                                         (2.0, [0.5, 0.5])],
                                        ["label", "features"])
        self.assertEqual(df.count(), 3)
        self.assertListEqual(df.collect(), [
            row_from_keyed_values([("label", 0.0), ("features", [1.0, 0.8])]),
            row_from_keyed_values([("label", 1.0), ("features", [0.0, 0.0])]),
            row_from_keyed_values([("label", 2.0), ("features", [0.5, 0.5])]),
        ])

        self.assertEqual(
            df.schema,
            StructType([
                StructField("label", DoubleType(), True),
                StructField("features", ArrayType(DoubleType(), True), True)
            ]))
Example #5
def resolve_partitions(patterns):
    """
    Given a list of patterns, returns all the files matching or in folders matching
    one of them.

    The file are returned in a list of tuple of 2 elements:
    - The first tuple is the file path
    - The second being the partition keys and values if any were encountered else None

    In addition to this list, return, if the data was partitioned, a schema for the
    partition keys, else None

    :type patterns: list of str
    :rtype: Tuple[List[str], List[Optional[Row]], Optional[StructType]]
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))
    partitions = {}
    for file_path in file_paths:
        if "=" in file_path:
            row = row_from_keyed_values(
                folder.split("=")
                for folder in file_path.split("/")[:-1]
                if folder.count("=") == 1
            )
            partitions[file_path] = row
        else:
            partitions[file_path] = None

    partitioning_field_sets = set(p.__fields__ for p in partitions.values() if p is not None)
    if len(partitioning_field_sets) > 1:
        raise Exception(
            "Conflicting directory structures detected while reading {0}. "
            "All partitions must have the same partitioning fields, found fields {1}".format(
                ",".join(patterns),
                " and also ".join(
                    str(fields) for fields in partitioning_field_sets
                )
            )
        )

    if partitioning_field_sets:
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                "Unable to parse those malformed folders: {1} of {0}".format(
                    file_paths,
                    [path for path, value in partitions.items() if value is None]
                )
            )
        partitioning_fields = partitioning_field_sets.pop()
        partition_schema = guess_schema_from_strings(
            partitioning_fields, partitions.values(), options={}
        )
    else:
        partition_schema = None

    return partitions, partition_schema
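The partition detection boils down to splitting Hive-style key=value folder names out of each path. A standalone sketch of that core, on a hypothetical path:

file_path = "data/year=2021/month=01/part-0000.csv"
pairs = [
    folder.split("=")
    for folder in file_path.split("/")[:-1]   # folders only, not the file
    if folder.count("=") == 1
]
print(pairs)  # [['year', '2021'], ['month', '01']]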
Example #6
    def get_as_rows(self, stats=("count", "mean", "stddev", "min", "max")):
        """
        Provide a list of Rows in the same format as those in the
        Dataset returned by Dataset.stats()
        """
        return [
            row_from_keyed_values(
                [("summary", stat)] +
                [(col_name,
                  self.get_stat(self.column_stat_helpers[col_name], stat))
                 for col_name in self.col_names])
            for stat in stats
        ]
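The result mirrors the output of DataFrame.describe(): one Row per statistic, led by a "summary" column. A sketch of that shape with dicts and a dummy stat lookup (both stand-ins for illustration):

stats = ("count", "mean")
col_names = ["age", "height"]

def get_stat(col_name, stat):    # dummy lookup, for illustration only
    return "<{0} of {1}>".format(stat, col_name)

rows = [
    dict([("summary", stat)]
         + [(c, get_stat(c, stat)) for c in col_names])
    for stat in stats
]
print(rows[0])  # {'summary': 'count', 'age': '<count of age>', ...}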
Example #7
    def get_select_output_field_lists(self, partition, non_generators,
                                      initialized_cols, generator):
        output_rows = []
        for row in partition:
            base_row_fields = []
            for col in non_generators:
                output_cols, output_values = resolve_column(
                    col, row, schema=self.bound_schema)
                base_row_fields += zip(output_cols, output_values[0])

            if generator is not None:
                generated_row_fields = self.get_generated_row_fields(
                    generator, row, initialized_cols, base_row_fields)
                for generated_row in generated_row_fields:
                    output_rows.append(
                        row_from_keyed_values(generated_row,
                                              metadata=row.get_metadata()))
            else:
                output_rows.append(
                    row_from_keyed_values(base_row_fields,
                                          metadata=row.get_metadata()))
        return output_rows
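The generator branch is what lets a function such as explode emit several output rows per input row while the non-generator columns are repeated. A minimal sketch of that expansion with plain key/value pairs:

row = {"id": 1, "vals": [10, 20]}
base_row_fields = [("id", row["id"])]          # non-generator columns
generated = [base_row_fields + [("val", v)]    # one output row per element
             for v in row["vals"]]
print(generated)
# [[('id', 1), ('val', 10)], [('id', 1), ('val', 20)]]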
Example #8
    def drop(self, cols):
        positions_to_drop = []
        for col in cols:
            if isinstance(col, str):
                if col == "*":
                    continue
                col = parse(col)
            try:
                positions_to_drop.append(
                    col.find_position_in_schema(self.bound_schema))
            except ValueError:
                pass

        new_schema = StructType([
            field for i, field in enumerate(self.bound_schema.fields)
            if i not in positions_to_drop
        ])

        return self._with_rdd(
            self.rdd().map(lambda row: row_from_keyed_values(
                [(field, row[i]) for i, field in enumerate(row.__fields__)
                 if i not in positions_to_drop])), new_schema)
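The per-row part of drop simply keeps the values whose position survived the schema filter. Sketched with a tuple of field names and a dict standing in for a Row:

fields = ("a", "b", "c")
row = {"a": 1, "b": 2, "c": 3}
positions_to_drop = [1]                        # drop column "b"
kept = [(field, row[field])
        for i, field in enumerate(fields)
        if i not in positions_to_drop]
print(kept)  # [('a', 1), ('c', 3)]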
Example #9
def mapper(row):
    keyed_values = [(new_name, row[old])
                    for new_name, old in zip(new_names, row.__fields__)]
    return row_from_keyed_values(keyed_values)
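This mapper renames every column positionally, in the style of toDF. A sketch with a dict standing in for the Row (names are invented):

row = {"a": 1, "b": 2}           # row.__fields__ would be ("a", "b")
new_names = ["x", "y"]
renamed = [(new, row[old])       # dict iteration follows insertion order
           for new, old in zip(new_names, row)]
print(renamed)  # [('x', 1), ('y', 2)]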
Example #10
def mapper(row):
    keyed_values = [(new, row[col]) if col == existing else (col, row[col])
                    for col in row.__fields__]
    return row_from_keyed_values(keyed_values)
Example #11
def change_col_order(row):
    return row_from_keyed_values([(field.name, row[field.name])
                                  for field in self.bound_schema.fields])
Example #12
def change_col_names(row):
    return row_from_keyed_values([
        (field.name, value)
        for field, value in zip(self.bound_schema.fields, row)
    ])