def test_convert_row_to_dict(self):
    """Check that Rows nested in list and map fields stay reachable via ``asDict()``.

    The check is made twice: once on a locally built Row, and once on the
    Row read back through a SQL query over a temp table built from it.
    """
    nested = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
    self.assertEqual(1, nested.asDict()['l'][0].a)

    # Round-trip the row through a DataFrame and a SQL query.
    rdd = self.sc.parallelize([nested])
    frame = rdd.toDF()
    frame.registerTempTable("test")
    fetched = self.sqlCtx.sql("select l, d from test").head()

    self.assertEqual(1, fetched.asDict()["l"][0].a)
    self.assertEqual(1.0, fetched.asDict()['d']['key'].c)
def take_log_in_all_columns(row: types.Row):
    """Return a new Row holding the natural log of every field of *row*.

    Each output field is named ``log(<original name>)``. Every value is
    passed straight to ``math.log``, so any value it rejects (``None``,
    zero, negatives, non-numbers) raises from there.
    """
    logged = {}
    for name, value in row.asDict().items():
        logged[f'log({name})'] = math.log(value)
    return types.Row(**logged)
def nulls(row: T.Row) -> T.Row:
    """Return a copy of *row* with an added ``nullcnt`` field.

    ``nullcnt`` is the number of fields of the incoming row whose value is
    ``None``; it is computed before the new field is added.
    """
    fields = row.asDict()
    fields['nullcnt'] = sum(1 for value in fields.values() if value is None)
    return T.Row(**fields)
def mse(row: T.Row) -> T.Row:
    """Return a copy of *row* with an added ``mse`` (squared error) field.

    The error is computed from the ``Sales_Pred`` and ``sales`` fields:

    * ``Sales_Pred`` is ``None``: a diagnostic is printed and ``mse`` is 0.0.
    * ``sales`` is ``None``: ``mse`` is ``Sales_Pred ** 2`` (the same as
      treating the missing actual value as zero).
    * otherwise: ``mse`` is ``(Sales_Pred - sales) ** 2``.

    Raises:
        KeyError: if ``Sales_Pred`` is absent, or if ``sales`` is absent
            while ``Sales_Pred`` is not ``None``.
    """
    d = row.asDict()
    predicted = d['Sales_Pred']

    if predicted is None:
        print("'Sales_Pred'=None")
        # Use a float zero so this branch does not put an int into a column
        # that the other branches presumably fill with floats.
        squared_error = 0.0
    else:
        actual = d['sales']
        if actual is None:
            squared_error = predicted ** 2
        else:
            squared_error = (predicted - actual) ** 2

    d['mse'] = squared_error
    return T.Row(**d)
def getMapColumnQuery(row: Row) -> str:
    """Build the SELECT-list expression for one mapped column.

    Reads the ``xpath`` and ``column_alias`` fields of *row*. The alias is
    stringified, stripped, and has spaces replaced by underscores.

    * If the stringified xpath is blank, equals ``'None'``, or contains a
      ``?``, the expression is ``CAST(NULL AS STRING) AS <alias>``.
    * Otherwise, when the module-level ``_xpath_return_type`` equals
      ``'string'``, the XPATH result is joined with ``_sep.value``, trimmed
      and cast to STRING.
    * Otherwise the bare ``XPATH(line, "<xpath>") AS <alias>`` is returned.

    Returns:
        The SQL fragment as a string.
    """
    row_dict = row.asDict()
    xpath_value = row_dict["xpath"]
    alias = str(row_dict["column_alias"]).strip().replace(" ", "_")

    # str(None) is the literal 'None', so a missing xpath shows up as that
    # text rather than as a None object.
    xpath_text = str(xpath_value).strip()
    if xpath_text in ('', 'None') or '?' in xpath_text:
        return f'CAST(NULL AS STRING) AS {alias}'

    # Use == rather than calling __eq__ directly: the dunder returns the
    # truthy NotImplemented sentinel when the operand is not a str.
    if _xpath_return_type == 'string':
        return (
            f'CAST(TRIM(CONCAT_WS("{_sep.value}", XPATH(line, "{xpath_value}"))) AS STRING) '
            f'AS {alias}'
        )
    return f'XPATH(line, "{xpath_value}") AS {alias}'
def getColumnsAndSerdeXpath(row: Row) -> dict:
    """Build the DDL column definition and SerDe xpath property for one row.

    Reads the ``column_alias`` and ``xpath`` fields of *row*. The column
    name is the alias, stripped, with spaces replaced by underscores.

    * Alias is ``None`` or blank: a notice is printed and both returned
      strings are empty.
    * Xpath is ``None`` or blank: the SerDe property points at the
      placeholder ``/PleaseCorrectTheXpath/@InvalidColumn``.
    * Otherwise the SerDe property carries the row's xpath unchanged.

    Returns:
        A dict with keys ``'column_datatype'`` (``<name> ARRAY<STRING>``)
        and ``'serde_property'`` (``"column.xpath.<name>"="<xpath>"``).
    """
    ddl_tail_serde_property = ""
    ddl_body_column_and_datatype = ""
    row_dict = row.asDict()

    raw_alias = row_dict["column_alias"]
    # Test the raw value for None: str(None) is 'None', which is neither
    # None nor blank, so a stringified check would never catch it.
    if raw_alias is None or not str(raw_alias).strip():
        print('Skipping empty/null column name')
    else:
        column_name = str(raw_alias).strip().replace(" ", "_")
        raw_xpath = row_dict["xpath"]
        if raw_xpath is None or not str(raw_xpath).strip():
            # No usable xpath: emit the placeholder so the gap is visible.
            xpath = "/PleaseCorrectTheXpath/@InvalidColumn"
        else:
            xpath = raw_xpath
        ddl_tail_serde_property = f'"column.xpath.{column_name}"="{xpath}"'
        ddl_body_column_and_datatype = f'{column_name} ARRAY<STRING>'

    return {
        'column_datatype': ddl_body_column_and_datatype,
        'serde_property': ddl_tail_serde_property,
    }
def keyvalues(row: T.Row) -> ((str, str), float):
    """Split *row* into a ``(key, value)`` pair.

    The key is the tuple ``(store_id, dept_id, year)`` taken from the row's
    fields; the value is the row's ``mse`` field.
    """
    fields = row.asDict()
    group_key = tuple(fields[name] for name in ("store_id", "dept_id", "year"))
    return group_key, fields["mse"]
def astraining(row: Row) -> Row:
    """Convert *row* into a ``Row(label=..., features=...)`` training record.

    ``label`` is the row's ``sales`` field. ``features`` is a list of every
    remaining field value, in the order ``asDict()`` yields them, with the
    ``Sales_Pred`` and ``sales`` fields excluded.

    Raises:
        KeyError: if the row has no ``Sales_Pred`` or no ``sales`` field.
    """
    # Convert once; the dict is a fresh copy, so mutating it is safe.
    fields = row.asDict()
    del fields['Sales_Pred']
    label = fields.pop('sales')
    return Row(label=label, features=list(fields.values()))