def setUp(self):
    """Create a small random DataFrame fixture: an 'id' column plus nine feature columns."""
    session = SparkSession(sparkContext=self.sc)
    # 100 rows x 10 columns of random integers drawn from [0, 9).
    values = np.random.randint(0, 9, [100, 10])
    make_row = pyspark.Row('id', *['feature_{}'.format(i) for i in range(9)])
    self.x_rdd = self.sc.parallelize([make_row(*v) for v in values.tolist()])
    self.x_df = session.createDataFrame(self.x_rdd)
def convert_rdd_raw(row):
    """Convert a (file path, feature string) pair into a labelled pyspark Row.

    ``row[0]`` is the full path to a file; its class name becomes the label.
    ``row[1]`` is a string such as ``'[1.0,2.0,...]'`` parsed into a list of floats.
    """
    label = get_classname_from_filename(row[0])
    stripped = row[1].strip('[]')
    feature_values = [float(token) for token in stripped.split(',')]
    return pyspark.Row(label=label, features=feature_values)
def test_do_cartesian(self):
    """Every vector's cartesian similarity with itself (the i == j diagonal) must be 1.0."""
    # A SparkSession must exist for toDF() to work, even though it is not referenced again.
    spark_session = sql.SparkSession(self.sc)
    rows = self.sc.parallelize(self.test_data).map(
        lambda rec: pyspark.Row(id=rec[0],
                                label=rec[1],
                                vector=ml_linalg.DenseVector(rec[2])))
    df = rows.toDF()
    cart = do_cartesian(sc=self.sc, df=df, id_col='id', feature_col='vector')
    diagonal = cart.filter(lambda e: e.i == e.j).map(lambda e: e.value).collect()
    for value in diagonal:
        self.assertEqual(1.0, value)
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data confirms with unischema definition types and encodes the data using
    the codec specified by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match name of fields in the unischema.
    :return: a single pyspark.Row object
    :raises ValueError: if the dictionary keys do not match the schema, or a non-nullable
        field is given a None value.
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an
    # unexpected side effect.
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError(
            'Dictionary fields \n{}\n do not match schema fields \n{}'.format(
                '\n'.join(sorted(copy_row_dict.keys())),
                '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None and not schema_field.nullable:
            # BUG FIX: original message never called .format(), leaving a literal '{}'
            # placeholder, and read "got passes a None value".
            raise ValueError(
                'Field {} is not "nullable", but got a None value'.format(field_name))
        if schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(
                schema_field, value) if value is not None else None
        else:
            # Convert numpy scalars to native Python types; other values pass through.
            if isinstance(value, np.generic):
                encoded_dict[field_name] = value.tolist()
            else:
                encoded_dict[field_name] = value

    # Build the Row with values ordered to match the schema's column order.
    field_list = list(unischema.fields.keys())
    value_list = [encoded_dict[name] for name in field_list]
    row = pyspark.Row(*value_list)
    # pyspark.Row field names must be set after construction for positional rows.
    row.__fields__ = field_list
    return row
def dict_to_row(schema, row_dict):
    """Encode a feature dictionary into a pyspark.Row following *schema*.

    IMAGE fields are interpreted as file paths and stored as the raw file bytes;
    NDARRAY fields are serialized with numpy's compressed npz format; all other
    values pass through unchanged.

    :param schema: mapping of field name -> field descriptor exposing ``feature_type``
    :param row_dict: mapping of field name -> value; keys must match ``schema``
    :return: a pyspark.Row with one attribute per schema field
    :raises ValueError: if the dictionary keys do not match the schema keys
    """
    import pyspark

    # Explicit raise instead of `assert` so validation survives `python -O`,
    # consistent with dict_to_spark_row raising ValueError on a field mismatch.
    # Also avoids building the error message eagerly on the success path.
    if set(row_dict.keys()) != set(schema.keys()):
        raise ValueError(
            'Dictionary fields \n{}\n do not match schema fields \n{}'.format(
                '\n'.join(sorted(row_dict.keys())), '\n'.join(schema.keys())))

    row = {}
    for name, value in row_dict.items():
        field = schema[name]
        if field.feature_type == FeatureType.IMAGE:
            # value is a path on disk; store the raw encoded image bytes.
            with open(value, "rb") as f:
                row[name] = bytearray(f.read())
        elif field.feature_type == FeatureType.NDARRAY:
            # Serialize the array into an in-memory compressed npz buffer.
            buf = BytesIO()
            np.savez_compressed(buf, arr=value)
            row[name] = bytearray(buf.getvalue())
        else:
            row[name] = value
    return pyspark.Row(**row)
def _load_match(provider: str, match_id: str) -> List[pyspark.Row]:
    """Load a single match from *provider* and return its events as pyspark Rows.

    Each row is tagged with the match id via a ``match`` field.
    """
    dataset = datasets.load(provider, match_id=match_id)
    frame = kh.fix_kloppy_dataframe(dataset.to_pandas(all_passes=True))
    records = frame.to_dict(orient="records")
    return [pyspark.Row(match=match_id, **record) for record in records]