import csv

from pyspark.sql import SQLContext


def load_csv(spark, table):
    """
    :param spark: Spark session
    :param table: table object describing the CSV path and its fields
    :return: Spark DataFrame
    """
    reader = csv.reader(open(table.path, "r"), delimiter=table.delimiter)
    # Map each configured field name to a converter: float for numeric
    # fields, None for fields that should stay as strings.
    un_order_header = dict()
    for field_name in table.all_fields:
        if field_name not in table:
            un_order_header[field_name] = None
            continue
        field = table[field_name]
        if field.field_type == 'numeric':
            un_order_header[field_name] = float
        else:
            un_order_header[field_name] = None
    # Read the header row and pick the converter for each column in order.
    header = []
    col_type = []
    for row in reader:
        for r in row:
            if r not in un_order_header:
                raise Exception("column %s not found in configuration" % r)
            header.append(r)
            col_type.append(un_order_header[r])
        break
    col_num = len(header)
    i = 1
    data = list()
    for row in reader:
        if len(row) != col_num:
            raise Exception(
                "data inconsistent with header: line %d, expected %d columns, found %d"
                % (i, col_num, len(row)))
        line = list()
        for r, nm, tp in zip(row, header, col_type):
            if tp is None:
                line.append(r)
            else:
                try:
                    r = r.strip()
                    # Empty numeric cells become NULL instead of failing.
                    if r == '':
                        line.append(None)
                    else:
                        line.append(tp(r))
                except ValueError:
                    raise Exception(
                        "line %d, column %s cannot be converted to float: %s"
                        % (i, nm, r))
        data.append(tuple(line))
        i += 1
    rdd = spark.sparkContext.parallelize(data)
    df = SQLContext(spark.sparkContext).createDataFrame(rdd, header)
    print("%s loaded!" % table.name)
    df.show()
    return df
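# A minimal usage sketch, not part of the repo: load_csv expects a "table"
# object exposing path, delimiter, name, all_fields, membership tests, and
# per-name field lookup with a field_type attribute. The Field and Table
# classes below are hypothetical stand-ins for illustration only.
class Field(object):
    def __init__(self, field_type):
        self.field_type = field_type


class Table(object):
    def __init__(self, name, path, delimiter, fields):
        self.name = name
        self.path = path
        self.delimiter = delimiter
        self.fields = fields            # dict: field name -> Field
        self.all_fields = list(fields)

    def __contains__(self, field_name):
        return field_name in self.fields

    def __getitem__(self, field_name):
        return self.fields[field_name]


# Example: a CSV with one string column and one numeric column.
# table = Table("users", "/tmp/users.csv", ",",
#               {"name": Field("string"), "age": Field("numeric")})
# df = load_csv(spark, table)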
from pyspark import SparkConf, SparkContext


def kmeans_training(master_url):
    sf = SparkConf() \
        .setMaster(master_url) \
        .setAppName("SparkSessionZipsExample") \
        .set("spark.executor.memory", "8g")
    sc = SparkContext(conf=sf)
    data = sc.textFile("hdfs://master32:9000/vectors/word_vector_sh.vec")

    def get_word_vec(line):
        # Each line is "<word> v1 v2 ...": skip the leading word token and
        # parse the rest as floats; short lines fall back to a zero vector.
        x = []
        tokens = line.split(" ")
        if len(tokens) >= 100:
            for i, token in enumerate(tokens):
                if i == 0 or token == "":
                    continue
                x.append(float(token))
        else:
            x = [0.0] * 100
        # Return a plain list: createDataFrame cannot infer a schema from a
        # numpy array, but treats a list as one row of float columns.
        return x

    tmp = data.map(get_word_vec)
    df = SQLContext(sc).createDataFrame(tmp)
    df.show()
    return
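# As written, kmeans_training only parses the vectors into a DataFrame; the
# actual k-means fit is not implemented here. A minimal sketch of that
# missing step, using pyspark.ml.clustering.KMeans and assuming the parsed
# rows are wrapped into a single "features" vector column first (fit_kmeans
# is a hypothetical helper name):
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors


def fit_kmeans(sc, vectors_rdd, k=10):
    # vectors_rdd: RDD of lists of floats, e.g. the output of get_word_vec.
    rows = vectors_rdd.map(lambda v: (Vectors.dense(v),))
    df = SQLContext(sc).createDataFrame(rows, ["features"])
    # KMeans reads the "features" column by default.
    model = KMeans(k=k, seed=1).fit(df)
    return model


# Hypothetical usage with the sc and tmp built inside kmeans_training:
# model = fit_kmeans(sc, tmp, k=10)
# print(model.clusterCenters())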
from pyspark.sql import functions as F


# In[15]:

df.printSchema()


# In[16]:

df.show()


# In[17]:

# date_sub subtracts the given number of days from a date column.
df.withColumn("date_sub_10", F.date_sub("date", 10)).show()


# In[18]:

# date_add adds days; 10 here to match the "date_add_10" column name.
df.withColumn("date_add_10", F.date_add("date", 10)).show()