def setUp(self):
    """Create a small random DataFrame fixture: an 'id' column plus nine feature columns."""
    session = SparkSession(sparkContext=self.sc)
    # 100 rows x 10 columns of random integers drawn from [0, 9).
    values = np.random.randint(0, 9, [100, 10])
    make_row = pyspark.Row('id', *['feature_{}'.format(i) for i in range(9)])
    self.x_rdd = self.sc.parallelize([make_row(*v) for v in values.tolist()])
    self.x_df = session.createDataFrame(self.x_rdd)
def convert_rdd_raw(row):
    """Convert a (file path, feature string) pair into a labelled pyspark Row.

    ``row[0]`` is the full path to a file; its class name becomes the label.
    ``row[1]`` is a string such as ``'[1.0,2.0,...]'`` parsed into a list of floats.
    """
    label = get_classname_from_filename(row[0])
    stripped = row[1].strip('[]')
    feature_values = [float(token) for token in stripped.split(',')]
    return pyspark.Row(label=label, features=feature_values)
def test_do_cartesian(self):
    """Every vector's cartesian similarity with itself (the i == j diagonal) must be 1.0."""
    # A SparkSession must exist for toDF() to work, even though it is not referenced again.
    spark_session = sql.SparkSession(self.sc)
    rows = self.sc.parallelize(self.test_data).map(
        lambda rec: pyspark.Row(id=rec[0],
                                label=rec[1],
                                vector=ml_linalg.DenseVector(rec[2])))
    df = rows.toDF()
    cart = do_cartesian(sc=self.sc, df=df, id_col='id', feature_col='vector')
    diagonal = cart.filter(lambda e: e.i == e.j).map(lambda e: e.value).collect()
    for value in diagonal:
        self.assertEqual(1.0, value)
def dict_to_spark_row(unischema, row_dict):
    """Converts a single row into a spark Row object.

    Verifies that the data confirms with unischema definition types and encodes the data using
    the codec specified by the unischema.

    The parameters are keywords to allow use of functools.partial.

    :param unischema: an instance of Unischema object
    :param row_dict: a dictionary where the keys match name of fields in the unischema.
    :return: a single pyspark.Row object
    :raises ValueError: if the dictionary keys do not match the schema, or a non-nullable
        field is given a None value.
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark

    assert isinstance(unischema, Unischema)
    # Add null fields. Be careful not to mutate the input dictionary - that would be an
    # unexpected side effect.
    copy_row_dict = copy.copy(row_dict)
    insert_explicit_nulls(unischema, copy_row_dict)

    if set(copy_row_dict.keys()) != set(unischema.fields.keys()):
        raise ValueError(
            'Dictionary fields \n{}\n do not match schema fields \n{}'.format(
                '\n'.join(sorted(copy_row_dict.keys())),
                '\n'.join(unischema.fields.keys())))

    encoded_dict = {}
    for field_name, value in copy_row_dict.items():
        schema_field = unischema.fields[field_name]
        if value is None and not schema_field.nullable:
            # BUG FIX: original message never called .format(), leaving a literal '{}'
            # placeholder, and read "got passes a None value".
            raise ValueError(
                'Field {} is not "nullable", but got a None value'.format(field_name))
        if schema_field.codec:
            encoded_dict[field_name] = schema_field.codec.encode(
                schema_field, value) if value is not None else None
        else:
            # Convert numpy scalars to native Python types; other values pass through.
            if isinstance(value, np.generic):
                encoded_dict[field_name] = value.tolist()
            else:
                encoded_dict[field_name] = value

    # Build the Row with values ordered to match the schema's column order.
    field_list = list(unischema.fields.keys())
    value_list = [encoded_dict[name] for name in field_list]
    row = pyspark.Row(*value_list)
    # pyspark.Row field names must be set after construction for positional rows.
    row.__fields__ = field_list
    return row
def dict_to_row(schema, row_dict):
    """Encode a feature dictionary into a pyspark.Row following *schema*.

    IMAGE fields are interpreted as file paths and stored as the raw file bytes;
    NDARRAY fields are serialized with numpy's compressed npz format; all other
    values pass through unchanged.

    :param schema: mapping of field name -> field descriptor exposing ``feature_type``
    :param row_dict: mapping of field name -> value; keys must match ``schema``
    :return: a pyspark.Row with one attribute per schema field
    :raises ValueError: if the dictionary keys do not match the schema keys
    """
    import pyspark

    # Explicit raise instead of `assert` so validation survives `python -O`,
    # consistent with dict_to_spark_row raising ValueError on a field mismatch.
    # Also avoids building the error message eagerly on the success path.
    if set(row_dict.keys()) != set(schema.keys()):
        raise ValueError(
            'Dictionary fields \n{}\n do not match schema fields \n{}'.format(
                '\n'.join(sorted(row_dict.keys())), '\n'.join(schema.keys())))

    row = {}
    for name, value in row_dict.items():
        field = schema[name]
        if field.feature_type == FeatureType.IMAGE:
            # value is a path on disk; store the raw encoded image bytes.
            with open(value, "rb") as f:
                row[name] = bytearray(f.read())
        elif field.feature_type == FeatureType.NDARRAY:
            # Serialize the array into an in-memory compressed npz buffer.
            buf = BytesIO()
            np.savez_compressed(buf, arr=value)
            row[name] = bytearray(buf.getvalue())
        else:
            row[name] = value
    return pyspark.Row(**row)
def _load_match(provider: str, match_id: str) -> List[pyspark.Row]:
    """Load a single match from *provider* and return its events as pyspark Rows.

    Each row is tagged with the match id via a ``match`` field.
    """
    dataset = datasets.load(provider, match_id=match_id)
    frame = kh.fix_kloppy_dataframe(dataset.to_pandas(all_passes=True))
    records = frame.to_dict(orient="records")
    return [pyspark.Row(match=match_id, **record) for record in records]