def load_sentence_data_frame(sc, dataPath):
    """Load a sentence-embedding CSV and rebuild the dense vector column.

    Parameters
    ----------
    sc : SparkContext
        Active Spark context used to build the SQLContext.
    dataPath : str
        Path to a CSV with columns ``id``, ``sentence`` and ``vector``,
        where ``vector`` is a bracketed, space-separated string such as
        ``"[0.1 0.2 0.3]"`` (assumed from the regex/split below — confirm
        against the producer of this file).

    Returns
    -------
    DataFrame
        Columns ``id``, ``sentence``, ``vector`` (original string) and
        ``_vector`` (``Vectors.dense`` built from the parsed doubles).
    """
    # Reuse a single SQLContext for both read and createDataFrame.
    sql_ctx = SQLContext(sc)
    df = sql_ctx.read.format('com.databricks.spark.csv') \
        .options(header='true', inferschema='true') \
        .load(dataPath)

    # Strip the surrounding "[" and "]" from the serialized vector.
    # Raw string avoids the invalid-escape warning of "[\]\[]".
    # (The original first copied `vector` to `_vector` via withColumn and
    # immediately overwrote it here — the copy was redundant and is dropped.)
    df = df.select(
        df['id'], df['sentence'], df['vector'],
        regexp_replace(df['vector'], r"[\]\[]", "").alias("_vector"))

    # Split the cleaned string on spaces and cast the tokens to doubles.
    df = df.select(
        df['id'], df['sentence'], df['vector'],
        split(df['_vector'], " ").cast("array<double>").alias("_vector"))

    # One output Row per input row, so map() is the correct primitive; the
    # original used flatMap over a one-element set, which only worked
    # because Row happens to be hashable.
    rows = df.rdd.map(lambda x: Row(
        x['id'], x['sentence'], x['vector'], Vectors.dense(x['_vector'])))

    # Back to a DataFrame with the original column names restored.
    return sql_ctx.createDataFrame(rows) \
        .selectExpr("_1 as id", "_2 as sentence",
                    "_3 as vector", "_4 as _vector")
# In[13]:

# Parse `date_str` by trying each supported format in order and keeping the
# first that succeeds. F.to_date returns NULL when the string does not match
# the given format, so coalesce() over the ordered attempts is exactly
# equivalent to the original four-level when/otherwise pyramid (whose
# innermost when without otherwise also produced NULL when nothing matched),
# while parsing each format once instead of twice (isNotNull + value).
_DATE_FORMATS = ["yyyy-MM-dd", "yyyy MM dd", "yyyy MMM dd", "E, dd MMMM yy"]
df = df.withColumn(
    "date",
    F.coalesce(*[F.to_date(F.col("date_str"), fmt) for fmt in _DATE_FORMATS]),
)

# In[14]: