Example #1
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrame, HiveContext  #SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

sconf = SparkConf().setMaster("local[32]").setAppName("TradeDataCount").set(
    "spark.driver.maxResultSize", "32g").set(
        "spark.shuffle.consolidateFiles", "true"
    )  #spark://10.160.5.48:7077 or local[*] to use as many threads as cores
sc = SparkContext(conf=sconf)
hc = HiveContext(sc)

dataset = sc.pickleFile('raw_dataset_rdd.pickle')
dataset = dataset.toDF()

holidays = ["2016-05-30", "2016-07-04",
            "2016-09-05"]  #better to exclude files when doing intial parsing?
data = (dataset.select([
    'ProductName', 'Maturity', 'Date', 'TimeStamp',
    hour("TimeStamp").alias("Hour"), 'Price', 'Quantity'
]).where((dataset.Date.isin(holidays) == False)).cache())
print "Raw data is %d rows." % data.count()

max_date = data.select(max('Date')).first()[0]
print "Max date is %s." % max_date

data.registerTempTable("RawData")
#check holidays and weather index gone
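# A small hedged follow-up (not part of the original): with the temp table
# registered above, the HiveContext can be queried directly, e.g. trades per hour.
hc.sql("SELECT Hour, COUNT(*) AS trades FROM RawData GROUP BY Hour ORDER BY Hour").show()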
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle,math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)

input = sys.argv[1]

# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

train = sc.pickleFile(input+'/bow_train/part-00000')
test = sc.pickleFile(input+'/bow_test/part-00000')
parsedtrain = train.map(parsePoint).filter(lambda lp: len(lp.features) != 0)
parsedtest = test.map(parsePoint).filter(lambda lp: len(lp.features) != 0).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain,categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE=math.sqrt(val_err)

print("Root Mean Squared Error Test= " + str(RMSE))

# Imports assumed for this snippet (not shown in the original):
import os
import gc
import configparser
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.types as typ
import pyspark.sql.functions as fn


class SparkFEProcess:
    def __init__(self):

        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_cross") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # Initialize related parameters
        # # bins_dict stores the binning scheme of the relevant columns, used when processing the test data
        # self.bins_dict={}

    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        '''
        Set the log level.
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path",
                                        "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):

        sqlContext = SQLContext(self.sc)
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_single and to build the cross features')
        train_file_path = rootPath + 'actLog_train_single'
        test_file_path = rootPath + 'actLog_test_single'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # adjust the labels (schema)

        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('duration_time_count_bin',
             typ.IntegerType()),  # produced redundantly in step1_single; not actually needed, dropped after reading
            ('duration_time_count_ratio', typ.DoubleType())
        ]

        actionLogSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd,
                                                     actionLogSchema)
        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd,
                                                    actionLogSchema)
        df_actLog_train = df_actLog_train.drop('duration_time_count_bin').drop(
            'duration_time_count_ratio')
        df_actLog_test = df_actLog_test.drop('duration_time_count_bin').drop(
            'duration_time_count_ratio')

        # df_actLog_train.show(5,truncate=False)
        df_actLog_train.printSchema()
        # df_actLog_test.show(5,truncate=False)
        df_actLog_test.printSchema()

        return df_actLog_train, df_actLog_test

    def bining(self, sqlContext, df, col, percent_list):
        '''
        :param sqlContext:
        :param df:
        :param col:  column to be binned
        :return:
        '''
        pandas_df = df.toPandas()
        bins = []
        for percent in percent_list:
            bins.append(np.percentile(pandas_df.loc[:, col],
                                      percent))  # at least `percent`% of the values are <= this cut point
        print(col + ' bin edges:')
        print(bins)
        pandas_df.loc[:, col] = np.digitize(pandas_df.loc[:, col],
                                            bins,
                                            right=True)
        # rename the column in the pandas DataFrame
        pandas_df.rename(columns={col: col + '_bin'}, inplace=True)
        df_spark = sqlContext.createDataFrame(pandas_df)
        # df_spark.show()
        return df_spark
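    # A small illustration (not in the original) of what np.digitize does above,
    # assuming bin edges [0, 10, 100] and right=True: each value is mapped to the
    # index of the first edge that is >= the value, e.g.
    #   np.digitize([0, 5, 10, 200], [0, 10, 100], right=True) -> array([0, 1, 1, 3])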

    # def city_col_deal(self,df,col):
    #     df_city_score=df.groupBy(col).avg('finish', 'like') \
    #         .withColumnRenamed("avg(finish)","avg_finish").withColumnRenamed("avg(like)","avg_like")
    #     df_city_score=df_city_score.withColumn(col+'_score', df_city_score.avg_finish*0.7+df_city_score.avg_like*0.3)\
    #                           .select(col,fn.bround(col+'_score', scale=4).alias(col+'_score'))
    #     return df_city_score

    def dropUnuseCols(self, df, unuse_col):
        for col in unuse_col:
            df = df.drop(col)
        return df

    def data_explore(self, df_train, df_test):

        sqlContext = SQLContext(self.sc)

        print('-------- 2. statistical features: count / ratio / nunique / ctr related features')
        print("compute cross-feature counts and category-preference ratios")
        count_feats_list = []
        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel', 'music_id', 'device']
        count_feats_list.extend([[u_col, a_col] for u_col in users
                                 for a_col in authors])

        users = ['author_id']
        authors = ['channel', 'user_city', 'item_city', 'music_id']
        count_feats_list.extend([[u_col, a_col] for u_col in users
                                 for a_col in authors])

        count_feats_list.append(['uid', 'channel', 'device'])
        count_feats_list.append(['author_id', 'item_city', 'music_id'])
        print("计算count的字段有以下这些")
        print(count_feats_list)

        for i in range(len(count_feats_list)):
            group_cols = count_feats_list[i]
            new_feature = '_'.join(group_cols)
            print("根据上述保存的df_train 和df_test 再处理2维交叉变量")
            if len(group_cols) == 2:
                print("开始处理2维交叉变量")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType())))
                df2 = df_train.groupby(new_feature).count()\
                       .withColumnRenamed('count',new_feature+'_count')
                #category-preference ratio
                count_min = df2.select(fn.min(df2[new_feature +
                                                  '_count'])).collect()[0][0]
                count_max = df2.select(fn.max(df2[new_feature +
                                                  '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df2 = df2.withColumn(
                    new_feature + '_count_ratio',
                    fn.bround(
                        ((df2[new_feature + '_count'] - fn.lit(count_min)) /
                         ((fn.lit(count_max) - fn.lit(count_min)).cast(
                             typ.IntegerType()))),
                        scale=3))

                if new_feature == "uid_author_id":  #用户看了这个用户发布的视频 超过2个
                    percent_list = [0, 90, 95, 98, 100]
                if new_feature == "uid_music_id":
                    percent_list = [0, 75, 90, 95, 98, 100]
                if new_feature == "uid_device":
                    percent_list = [0, 25, 50, 75, 90, 100]
                if new_feature == "author_id_user_city":
                    percent_list = [0, 75, 90, 95, 98, 100]
                if new_feature == "author_id_music_id":
                    percent_list = [0, 75, 90, 95, 98, 100]
                else:
                    percent_list = [0, 50, 75, 90, 95, 100]

                df2 = self.bining(sqlContext, df2, new_feature + '_count',
                                  percent_list)
                print("查看df2_2")
                df2.show(1, truncate=False)
                df_train = df_train.join(df2, new_feature,
                                         'left').drop(new_feature)
                print("train")
                df_train.show(1, truncate=False)  # ratio is a continuous variable in the range 0-1
                df_train.printSchema()
                df_test = df_test.join(df2, new_feature,
                                       'left').drop(new_feature)  # join first, then drop
                print("test")
                df_test.show(1, truncate=False)

            if len(group_cols) == 3:
                print("开始处理3维交叉变量")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType()),
                        df_train[group_cols[2]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType()),
                        df_test[group_cols[2]].cast(typ.StringType())))

                df3 = df_train.groupby(new_feature).count()\
                       .withColumnRenamed('count',new_feature+'_count')

                #category-preference ratio
                count_min = df3.select(fn.min(df3[new_feature +
                                                  '_count'])).collect()[0][0]
                count_max = df3.select(fn.max(df3[new_feature +
                                                  '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df3 = df3.withColumn(
                    new_feature + '_count_ratio',
                    fn.bround(
                        ((df3[new_feature + '_count'] - fn.lit(count_min)) /
                         ((fn.lit(count_max) - fn.lit(count_min)).cast(
                             typ.IntegerType()))),
                        scale=3))
                # print("查看df3_1")
                # df3.show(5,truncate=False)
                percent_list = [0, 50, 75, 90, 95, 100]
                df3 = self.bining(sqlContext, df3, new_feature + '_count',
                                  percent_list)
                print("查看df3_2")
                df3.show(1, truncate=False)
                df_train = df_train.join(df3, new_feature,
                                         'left').drop(new_feature)
                print("train")
                df_train.show(1, truncate=False)
                df_train.printSchema()
                df_test = df_test.join(df3, new_feature,
                                       'left').drop(new_feature)
                print("test")
                df_test.show(1, truncate=False)

        print("交叉特征处理结束")
        print("查看train的表结构")
        df_train.printSchema()
        # print("删除没有必要的列")
        # unuse_col=['item_city','user_city','device','author_id','music_id',]  #'uid','item_id'这两列不能删除,后面提交结果的时候应该要用到
        # df_train=self.dropUnuseCols(df_train,unuse_col)
        # df_test=self.dropUnuseCols(df_test,unuse_col)

        print("表中含有为null的字段,主要产生在leftjoin的时候")
        # df_train=df_train.na.fill({'uid_author_id_count_bin':1,'uid_author_id_count_ratio':0,\
        #                            'uid_item_city_count_bin':1,'uid_item_city_count_ratio':0,\
        #                            'uid_channel_count_bin':1,'uid_channel_count_ratio':0,\
        #                            'uid_music_id_count_bin':1,'uid_music_id_count_ratio':0,\
        #                            'uid_device_count_bin':1,'uid_device_count_ratio':0,\
        #                            'author_id_channel_count_bin':1,'author_id_channel_count_ratio':0,\
        #                            'author_id_user_city_count_bin':1,'author_id_user_city_count_ratio':0,\
        #                            'author_id_item_city_count_bin':1,'author_id_item_city_count_ratio':0,\
        #                            'author_id_music_id_count_bin':1,'author_id_music_id_count_ratio':0,\
        #                            'uid_channel_device_count_bin':1,'uid_channel_device_count_ratio':0,\
        #                            'author_id_item_city_music_id_bin':1,'author_id_item_city_music_id_ratio':0
        #                            })
        df_train = df_train.na.fill({
            'user_city_count_bin': 1,
            'user_city_count_ratio': 0
        })
        # user_city_count_bin and device_count_bin are the two columns that were missed in step1_single
        df_test=df_test.na.fill({'user_city_count_bin':1,'user_city_count_ratio':0,\
                                 'device_count_bin':-1,'device_count_ratio':0,\
                                   'uid_author_id_count_bin':1,'uid_author_id_count_ratio':0,\
                                   'uid_item_city_count_bin':1,'uid_item_city_count_ratio':0,\
                                   'uid_channel_count_bin':1,'uid_channel_count_ratio':0,\
                                   'uid_music_id_count_bin':1,'uid_music_id_count_ratio':0,\
                                   'uid_device_count_bin':1,'uid_device_count_ratio':0,\
                                   'author_id_channel_count_bin':1,'author_id_channel_count_ratio':0,\
                                   'author_id_user_city_count_bin':1,'author_id_user_city_count_ratio':0,\
                                   'author_id_item_city_count_bin':1,'author_id_item_city_count_ratio':0,\
                                   'author_id_music_id_count_bin':1,'author_id_music_id_count_ratio':0,\
                                   'uid_channel_device_count_bin':1,'uid_channel_device_count_ratio':0,\
                                   'author_id_item_city_music_id_count_bin':1,'author_id_item_city_music_id_count_ratio':0
                                   })

        print("查看test缺失值")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in df_test.columns]).show()
        print("查看train缺失值")  #以防万一,可能会漏掉哪个字段
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c +
                                                                 '_missing')
                       for c in df_train.columns]).show()

        print('------- 5. save the preprocessing results -------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_single_cross'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_single_cross'
        os.system("hadoop fs -rm -r {}".format(
            train_file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
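# A minimal driver sketch (not part of the original), assuming the config file and
# HDFS paths referenced in resource/config.ini are available:
if __name__ == "__main__":
    spark_job = SparkFEProcess()
    df_train, df_test = spark_job.data_describe()
    spark_job.data_explore(df_train, df_test)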
Example #4
from PredictionsHandlerFlask import NewsPrediction
import json
# Imports assumed for this snippet (not shown in the original); DataSetMakerV2 is a
# project-local helper whose import is omitted here.
import pickle
from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    conf = SparkConf()
    #conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'

    myClassifierOnevsOne = pickle.load(open('myClassifierOnevsOne.p', 'rb'))

    dataSetMaker = DataSetMakerV2(n=200000)

    sc = SparkContext(conf=conf)

    newsRDD = sc.pickleFile(dataDirectory + '/2015-05-040')

    print('%d news' % newsRDD.count())

    for news in newsRDD.collect():
        try:
            print(str(news))
        except:
            pass
Example #5
# The code from here on is wrapped in a try-finally block to ensure sc.stop() cleans up in
# the event of an exception or a ctrl-C termination

    try:
        sc = SparkContext(conf=config, appName="acka630")

        vecdir = sys.argv[2] if sys.argv[2][-1] == "/" else sys.argv[2] + "/"
        metadir = sys.argv[3]

        # Load TFIDF vectors from the specified directory, and cache as these will be used
        # each time a new subject/fold is run

        tfidfVectorsAll = sc.parallelize([], 16)
        for tfidf in os.listdir(vecdir):
            if tfidf[:5] != 'TFIDF': continue
            vectors = sc.pickleFile(vecdir + tfidf)
            tfidfVectorsAll = tfidfVectorsAll.union(vectors)

        numVectors = tfidfVectorsAll.cache().count()

        print "numVectors:", numVectors

        # Generate a list of all file IDs and collect to Python

        fileIdList = tfidfVectorsAll.keys().collect()

        # Filter the metadata by file ID to just keep relevant file-subject pairs

        metaData = sc.pickleFile(metadir) \
                     .filter(lambda x: int(x[0]) in fileIdList)
Example #6
# Imports assumed for this snippet (not shown in the original); test(), fone(),
# accuracy(), fetchDataToFile() and extractFeature() are project-local helpers.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD
from pyspark.mllib.tree import DecisionTree, RandomForest


def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
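# Hypothetical sketches (not in the original) of the undefined helpers used above,
# assuming test() should return an RDD of (label, prediction) pairs:
def test(model, datardd):
    # predict on the whole feature RDD at once, then zip the labels back on
    predictions = model.predict(datardd.map(lambda _: _['feature']))
    return datardd.map(lambda _: _['label']).zip(predictions)

def accuracy(resultrdd):
    return resultrdd.filter(lambda lp: lp[0] == lp[1]).count() / float(resultrdd.count())

def fone(resultrdd):
    tp = resultrdd.filter(lambda lp: lp[0] == 1 and lp[1] == 1).count()
    fp = resultrdd.filter(lambda lp: lp[0] == 0 and lp[1] == 1).count()
    false_neg = resultrdd.filter(lambda lp: lp[0] == 1 and lp[1] == 0).count()
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + false_neg) if tp + false_neg else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0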
Example #7
def create_wordbag(x):
	wordbag = []
	if(x['eval_content']) is None:
		return wordbag	
	twitter = Twitter()
	for text in twitter.pos(x['eval_content'], stem = True):
		tag = text[1]
		if tag in unneeded:
			continue

		word = text[0]
		wordbag.append(word)
	return wordbag
	

documents = sqlContext.createDataFrame(
    sc.pickleFile('merged_file/part-00000').map(
        lambda x: [x['eval_id'], x['no'], create_wordbag(x), x['professor'],
                   x['lec_code'][:4], x['lec_code'][5], x['eval_total'], x['eval_id']]),
    ['eval_id', 'no', 'words', 'prof_name', 'department', 'grade', 'eval_total', 'eval_id'])

#users = sqlContext.createDataFrame(sc.pickleFile('merged_file').map(lambda x : (x['mb_no'],x['lec_code'][:4])),['user','department']).orderBy('department')
#for u in users.select('department','user').take(10000):
#	print u
'''
professors = documents.select('prof_name').distinct()
department = documents.select('department').distinct()
#grade	1/2/3/4
eval_total = documents.select('eval_total').distinct() # 1/2/3/4/5

for e in eval_total.collect():
	print e
'''

Example #8
    ## ****  you can skip this part and directly load the data in the next part ****** ##
    print "calculating tfidf ..."
    tfidf, tags = create_tfidf(sc)
    dimention = 1000
    print "reducing tfidf to " + str(dimention) + "..."

    save_file = './data/10k_reducedRDD'
    # use this path if you want to load the reduced RDD directly in the following section.
    # reduced = reduce_tfidf(tfidf, dimention)
    # # use below to save the reduce tfidf RDD
    # if os.path.exists(save_file):
    #     shutil.rmtree(save_file, ignore_errors=True)
    # reduced.saveAsPickleFile(save_file)

    # use below to load reduced tfidf RDD directly.
    reduced = sc.pickleFile(save_file)
    # processed = sc.pickleFile('./data/10k_processedRDD')

    #tune parameters below
    K = 50
    KeepAllTag = False
    # you can run KNN to get the results yourself: 1k posts take about 60 s, 10k posts about 20 min.
    KNN(reduced,tags, K, KeepAllTag)
    # or... you can load the KNN results directly; there are 1k and 10k versions in ./data
    confution_matrix = load_obj('10k_confution_matrix')
    perfomanceMatrix = load_obj('10k_perfomanceMatrix')
    # someone needs to use this data to make figures showing our results.


    # comment next line out, if you want to save. (note: change the path accordingly)
    # reduced.saveAsPickleFile('./data/1k_reducedRDD')
from pyspark import SparkContext

sc = SparkContext()
tmpFile = "sparkdata/srm.data.samples"

r = sc.pickleFile(tmpFile, 5).collect()
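# A minimal counterpart sketch (not in the original): how a pickle file like the
# one above can be produced with saveAsPickleFile; the output path is illustrative.
sc.parallelize(range(100)).saveAsPickleFile("sparkdata/srm.data.samples.copy", 5)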

Example #10
from gensim.models.doc2vec import Doc2Vec
from math import exp
from threading import Thread, Event

# Imports assumed for this snippet (not shown in the original):
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.tree import RandomForestModel
from pyspark.mllib.feature import IDF, HashingTF

sc = SparkContext()
sqlContext = SQLContext(sc)

# this is a large object; broadcast it so every worker node gets a single cached copy
gmod_broadcast = sc.broadcast( Doc2Vec.load("/data/_hndata/doc2vec_model/hn") ) 

tfidf_model = RandomForestModel.load(sc, "hdfs:///hndata/hnrrmodel_tfidf")

doc2vec_model = RandomForestModel.load(sc, "hdfs:///hndata/rrscoremodel")
doc2vec_model2 = RandomForestModel.load(sc, "hdfs:///hndata/rrscoremodel2")

tf = sc.pickleFile("hdfs:///hndata/tf_pickle")
idf = IDF().fit(tf)
hashingTF = HashingTF(1000)



def pred_tfidf(docs):
    sents = sc.parallelize(docs).map(lambda d: d.strip().split())
    new_tf = hashingTF.transform(sents)
    tfidf = idf.transform(new_tf)
    return tfidf_model.predict(tfidf)

def pred_doc2vec(docs, takelog=True, cased=False):
    sents = sc.parallelize(docs) \
              .map(lambda d: (d.lower() if not cased else d).strip().split())
    def loadDoc2vec(sents):
if __name__ == "__main__":
    #print(ageToGroup)
    print(dayInterval)

    aviser=['an','ba','dt','fb','havis','oa','rb',\
        'ta','tb','nordlys','firda','glomdalen',\
        'mossavis','ringblad','sb','sa',\
        'tk','op','ostlendingen']

    conf=SparkConf().setAppName('konsumprofiler').setMaster("local[8]").set('spark.app.id','200')

    sc=SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    user_map=sc.pickleFile('/home/erlenda/data/konsum/konsumprofil-rdd')
    print(user_map.first())

    total_pvs=user_map.map(lambda x:x.pvs).collect()
    #total_visits=user_map.map(lambda x:(x.a_user_key,1.)).collect()
    #print(total_pvs[:1000])
    percs_pageviews=np.percentile(total_pvs,[20.,40.,60.,80.])
    #percs_visits=np.percentile(total_visits,[20.,40.,60.,80.])

    print('Quantiles pageviews:',percs_pageviews)
    percs_pageviews_top=np.percentile(total_pvs,[95.,98.,99.])
    #print('Quantiles visits:',percs_visits)



Example #12
from pyspark.mllib.feature import IDF
import datetime
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.feature import Normalizer

# Imports assumed for this snippet (not shown in the original):
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


conf = SparkConf()

conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


path = "/Users/sradhakr/Desktop/Assignment3/Assignment3"

train_featureScoreTimeRDD=sc.pickleFile(path+'trainDataRDD',10)
val_featureScoreTimeRDD=sc.pickleFile(path+'valDataRDD',10)

norm = Normalizer(2)



train_featureScoreTimeRDD=sc.pickleFile(path+'trainDataRDD',10)
val_featureScoreTimeRDD=sc.pickleFile(path+'valDataRDD',10)


train_featuresRDD = train_featureScoreTimeRDD.map(lambda (feature, score): feature)

# zip the normalized features with the scores taken from the original (feature, score) RDD
trainfeatureScoreNormRDD = norm.transform(train_featuresRDD).zip(
    train_featureScoreTimeRDD.map(lambda (feature, score): score))

unneeded = [u'Unknown', u'KoreanParticle',u'Hashtag', u'ScreenName' ,u'Number', u'Alpha', u'Foreign',u'Punctuation', u'Suffix', u'Eomi', u'PreEomi' ,u'Josa', u'Exclamation']

def create_wordbag(x):
	wordbag = []
	if(x['eval_content']) is None:
		return wordbag	
	twitter = Twitter()
	for text in twitter.pos(x['eval_content'], stem = True):
		tag = text[1]
		if tag in unneeded:
			continue

		word = text[0]
		wordbag.append(word)
	return wordbag
documents = sc.pickleFile('merged_file').map(lambda x : (x['no'],create_wordbag(x)))
htf = HashingTF()
tf_id = documents.mapValues(htf.transform)
#tf_id.cache()
#for a in tf_id.take(100):
#	print a
#tf = htf.transform(documents.values())
#tf.cache()
idf = IDF().fit(tf_id.values())
#idf.cache()

tfidf_id = idf.transform(tf_id.values())
#tf_id.mapValues(idf.transform)

print type(tfidf_id)
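# A hedged alternative (not in the original): idf.transform(tf_id.values()) above drops
# the document ids, despite the tfidf_id name; zipping the keys back on keeps them paired.
tfidf_with_id = tf_id.keys().zip(idf.transform(tf_id.values()))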
Example #14
                      (cv_data_rdd, out_cv_data)]:
        url = sparkutil.util.s3n_url(S3_BUCKET, S3_PATH, name)
        sparkutil.util.s3n_delete(url)
        rdd.saveAsPickleFile(url)
        pickle.dump({'url': url}, open(name, 'w'))

    sc.stop()


@jobs_limit(1)
@transform(spark_run_experiments, suffix('.samples'), '.samples.pickle')
def get_samples((exp_samples, exp_cvdata, exp_inits), out_filename):
    sample_metadata = pickle.load(open(exp_samples, 'r'))

    sc = SparkContext()
    results_rdd = sc.pickleFile(sample_metadata['url'])

    sparkutil.util.save_rdd_elements(results_rdd, out_filename, S3_BUCKET,
                                     S3_PATH)

    sc.stop()


@jobs_limit(1)
@transform(spark_run_experiments, suffix('.samples'), '.cvdata.pickle')
def get_cvdata((exp_samples, exp_cvdata, exp_inits), out_filename):
    cvdata_metadata = pickle.load(open(exp_cvdata, 'r'))

    sc = SparkContext()
    results_rdd = sc.pickleFile(cvdata_metadata['url'])
    pickle.dump(results_rdd.collect(), open(out_filename, 'w'))
Example #16
# Imports assumed for this snippet (not shown in the original); `sc` is assumed to be an
# existing SparkContext.
import re
import numpy as np
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

path = "/Users/sradhakr/Desktop/Assignment3/Assignment3"


df = sqlContext.read.json(path+'/reviews_Pet_Supplies_p1.json')

reviewDF = df.select("overall", "reviewText", "reviewTime")

def removePunctuation(text):
   return re.sub("[^a-zA-Z]", " ", text)


cleanedReviewRDD = reviewDF.map(lambda row: (row.overall, removePunctuation(row.reviewText).lower().split(), row.reviewTime ))

reviewRDD = sc.pickleFile(path+'/P2CleanedRDD', 10)

uniqueWordsRDD = reviewRDD.flatMap(lambda words: words).distinct().map(lambda word: (word, 1))

word2VecRDD = sqlContext.read.parquet(path+"/word2vec/data")

wordsFeaturesDict = sc.broadcast(uniqueWordsRDD.join(word2VecRDD.rdd).map(lambda (key, (dummy,features)):(key, features)).collectAsMap())


def getFeature(word):
	if wordsFeaturesDict.value.has_key(word):
		return np.array(wordsFeaturesDict.value[word])
	else:
		return []	
	
year_end = gui.endYear
print  year_end
num_of_segments = 5

predict_end = gui.predYear  # predict till year



conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("damu1000")
conf.set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)

# read data
lines = sc.pickleFile(".//result")
# filter by country, indicator and period. sort by period
lines = lines.filter(lambda x: x[0] == country and x[2] == indicator and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a, b, c, d, e): d, True).cache()
lines.take(1)
if not lines.take(1):
    print "Index not present for this country. Stopping"
    sys.exit()
print lines.collect()
x = lines.map(lambda (a, b, c, d, e): (d))  # getting x values in 2D RDD
y = lines.map(lambda (a, b, c, d, e): float(e))  # getting y values in 2D RDD


# num_of_segments = x.count() / 5 #averaging at around 5 points per segment
#--------------------------------------- Find out "break" points in pattern--------------------------------------------------------------------

# assign indexes to y values, increment by 1 and 2 so that elements can be joined to find out diff later.
from pyspark import SparkContext,SparkConf

sc=SparkContext(conf=SparkConf().setAppName("Batsman Runs"))

batsmanData=sc.pickleFile("deliveries.pickle")
batsmanData=batsmanData.map(lambda s:(s['batsman'],int(s['batsman_runs']))).reduceByKey(lambda a,b:a+b).sortByKey()
for item in batsmanData.collect():
	print(item)
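# A small hedged variation (not in the original): print only the ten highest
# run-scorers instead of every batsman, using takeOrdered on the summed totals.
for item in batsmanData.takeOrdered(10, key=lambda kv: -kv[1]):
    print(item)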
# Imports assumed for this snippet (not shown in the original); do_it(), hash_rating()
# and combine_join_results() are project-local helpers.
import os
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel


def main(comment_dir, submission_dir, output_dir, author, n):
    # spark specific setup
    conf = SparkConf().setAppName('Subreddit Recommender')
    sc = SparkContext(conf=conf)

    model = None
    author_id_rdd = None
    subreddit_id_rdd = None

    MODEL_PFP = output_dir + '/pickles/model'
    AUTHOR_ID_PFP = output_dir + '/pickles/author_id_rdd'
    SUBREDDIT_ID_PFP = output_dir + '/pickles/subreddit_id_rdd'

    if os.path.isdir(MODEL_PFP) and os.path.isdir(SUBREDDIT_ID_PFP) and os.path.isdir(AUTHOR_ID_PFP):
        print 'Loading model...',
        model = MatrixFactorizationModel.load(sc, MODEL_PFP)
        author_id_rdd = sc.pickleFile(AUTHOR_ID_PFP)
        subreddit_id_rdd = sc.pickleFile(SUBREDDIT_ID_PFP)
        print 'Done!'
    else:
        print 'Model not found :('
        print 'This will take a while...'

        # ((author, subreddit), comment_rank)
        comment_rdd = do_it(sc, comment_dir)
        # ((author, subreddit), submission_rank)
        submission_rdd = do_it(sc, submission_dir)
        # ((author, subreddit),(comment_rank, submission_rank))
        total_rdd = submission_rdd.fullOuterJoin(comment_rdd)
        # (author, subreddit, comment_rank + submission_rank)
        sum_rdd = total_rdd.map(combine_join_results).cache()

        author_id_rdd, subreddit_id_rdd, translated_rdd = hash_rating(sum_rdd, sc)

        print 'Training...',
        model = ALS.train(translated_rdd, 1)
        print 'Saving...',
        model.save(sc, MODEL_PFP)
        author_id_rdd.saveAsPickleFile(AUTHOR_ID_PFP)
        subreddit_id_rdd.saveAsPickleFile(SUBREDDIT_ID_PFP)
        print 'Done!'

    print 'Getting recommendations...'

    wanted_author_id = author_id_rdd.filter(lambda (a, a_id): str(a) == str(author)).collect()
    wanted_author_id = int(wanted_author_id[0][1])

    products_ratings = model.recommendProducts(wanted_author_id, int(n))

    wanted_subreddit_ids = map(lambda x: x.product, products_ratings)
    wanted_subredits = subreddit_id_rdd.filter(lambda (sub, s_id): s_id in wanted_subreddit_ids).collect()
    wanted_subredits = map(lambda (sub, s_id): sub, wanted_subredits)

    print 'author:', author
    print 'Recommended subreddits:'
    print wanted_subredits

    fp_out = open(output_dir + '/recommendation.txt', 'w')
    fp_out.write('Recommendations for /u/' + author + ':\n')
    for i, s in enumerate(wanted_subredits):
        fp_out.write(str(i) + ': /r/' + str(s) + '\n')
    fp_out.close()
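# A hypothetical entry point (not in the original), assuming the five arguments are
# passed straight from the command line:
if __name__ == '__main__':
    import sys
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])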
Example #20
    'linear': LinearClassifier(3, 32, 32, 10, 20),
    'nn'    : NNClassifier(3, 32, 32, 10, 5),
    'cnn'   : CNNClassifier(3, 32, 32, 10, 3),
  }
  classifier = classifiers[name]

  """ set spark context and RDDs """
  master = open("/root/spark-ec2/cluster-url").read().strip()
  slaves = sum(1 for line in open("/root/spark-ec2/slaves"))
  conf = SparkConf()
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  conf.set("spark.eventLog.enabled", "TRUE")
  conf.set("spark.default.parallelism", str(slaves * 2))
  conf.set("spark.akka.frameSize", "50")
  sc = SparkContext(master=master, environment={'PYTHONPATH':os.getcwd()}, conf=conf)
  trainData = sc.pickleFile("s3n://61c-cnn/" + data, slaves * 4)\
                .persist(StorageLevel.MEMORY_AND_DISK_SER)

  """ run clssifier """
  log = open('ec2-' + name + data.strip('train') + '.log', 'w')
  sys.stdout = Log(sys.stdout, log)
  if name == 'cnn':
    classifier.load('snapshot/' + name + '/')
  s = time()
  classifier.train(trainData, [], datanum, is_ec2=True)
  e = time()
  """ skip validation """
  print '[CS61C Project 4] training performance: %.2f imgs / sec' % \
    ((datanum * classifier.iternum) / (e - s))
  print '[CS61C Project 4] time elapsed: %.2f min' % ((e - s) / 60.0)

  trainData.unpersist()
        dist = dist + p
    if len(v1) == 0:
        dist = sum(v[1] ** 2 for v in v2)
    return dist


# ----------------------------------------------------------------

if __name__ == "__main__":
    conf = SparkConf()
    conf.setMaster("local[2]")
    conf.setAppName("ItemBased")
    conf.set("spark.executor.memory", "4g")

    sc = SparkContext(conf=conf)
    sourceFile = sys.argv[1] if len(sys.argv) > 1 else "data/sample1k.txt"
    similarity = sys.argv[2] if len(sys.argv) > 2 else "cos_sim"

    rawdata = sc.textFile(sourceFile)
    users = (
        rawdata.map(toRowKey)
        .aggregateByKey([], fill_row, assemble_row)
        .map(sort_row)
        .sortBy(lambda x: len(x[1]), ascending=False)
    )
    testdata = users.take(10)

    sim = sc.pickleFile(similarity)
    error = evaluate(testdata, sim)
    print(error)
Example #22
# Imports assumed for this snippet (not shown in the original); `path` is assumed to be
# defined as in the related snippet above.
import re
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf()

conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

df = sqlContext.read.json(path+'/reviews_Pet_Supplies_p1.json')

reviewDF = df.select("overall", "reviewText", "reviewTime")

def removePunctuation(text):
   return re.sub("[^a-zA-Z]", " ", text)

cleanedReviewRDD = reviewDF.map(lambda row: (row.overall, removePunctuation(row.reviewText).lower().split(), row.reviewTime ))

wordsClustersRDD= sc.pickleFile(path+'/WordClustersRDD',10)

reviewRDDWithIndex = cleanedReviewRDD.zipWithIndex().map(lambda (row, index):(index, row)).cache()
reviewTextWithIndex = reviewRDDWithIndex.map(lambda (index,(score, words, time)): (index, words))
reviewScoreTimeWithIndex = reviewRDDWithIndex.map(lambda (index,(score, words, time)): (index, (score, time)))

def getKey(item):
    return item[0]


def createSparseVector(histogram):
	indexList = []
	countList = []
	for histogramIndex, count in sorted(histogram, key=getKey):
		indexList.append(histogramIndex)
		countList.append(count)
Example #23
from pyspark import SparkConf,SparkContext

sc=SparkContext(conf=SparkConf().setAppName("Bowler Extras"))

bowlerData=sc.pickleFile('deliveries.pickle')
bowlerData=bowlerData.map(lambda s:(s['bowler'],int(s['wide_runs'])+int(s['bye_runs'])+int(s['legbye_runs'])+int(s['noball_runs']))).reduceByKey(lambda a,b:a+b).sortByKey()
for item in bowlerData.collect():
	print(item)
Example #24
# Imports assumed for this snippet (not shown in the original); textToRDDCsv(),
# train_rnn(), map_target() and quiet_logs() are project-local helpers.
import argparse
import json
import multiprocessing
import sys
import time
from pyspark import SparkConf, SparkContext


def main():
    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Distributed RNN-LSTM built on Spark and Tensorflow')
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to dataset')
    parser.add_argument('-t',
                        '--target',
                        type=str,
                        required=True,
                        help='Path to target classes')
    parser.add_argument('-m',
                        '--master',
                        type=str,
                        help='host of master node',
                        default='local')
    parser.add_argument('-sem',
                        '--sparkexecmemory',
                        type=str,
                        help='Spark executor memory',
                        default='4g')
    parser.add_argument('-p',
                        '--partitions',
                        type=int,
                        help='Number of minibatch for dataset',
                        default=4)
    parser.add_argument('-hl',
                        '--numHidden',
                        type=int,
                        help='Number of hidden layers',
                        default=1)
    parser.add_argument('-e',
                        '--epoch',
                        type=int,
                        help='Number of training epoch',
                        default=1)
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        help='Output path',
                        default='temp')
    parser.add_argument('-lp',
                        '--loadPickle',
                        action='store_true',  # type=bool does not parse "False" correctly; use a flag instead
                        help='Load weights from a pickle file')
    parser.add_argument('-lo',
                        '--loadOp',
                        type=str,
                        help='Operation to execute after load',
                        default='reduce')
    parser.add_argument('-gp',
                        '--graphPath',
                        type=str,
                        help='Graph path',
                        default='tmp/graph_default')

    args = vars(parser.parse_args())
    input_path = args['input']
    target_path = args['target']
    master_host = args['master']
    sem = args['sparkexecmemory']
    partitions = args['partitions']
    hidden = args['numHidden']
    epoch = args['epoch']
    output = args['output']
    load = args['loadPickle']
    load_op = args['loadOp'].split('|')
    graphPath = args['graphPath']

    global COUNT_RUN
    COUNT_RUN = 1

    # Initialize spark
    # Substitute 4 with max supported
    workers = partitions if partitions == multiprocessing.cpu_count(
    ) else partitions % multiprocessing.cpu_count()
    workers_master = '[%d]' % workers
    conf = SparkConf().setMaster(master_host +
                                 workers_master).setAppName("RNN-LSTM").set(
                                     "spark.executor.memory", sem)

    print 'Total workers: ', workers_master
    print 'Spark executor memory: ', sem

    sc = SparkContext(conf=conf)
    quiet_logs(sc)

    with open(target_path, 'rb') as t_f:
        target = json.load(t_f)

    target = map_target(target)

    if not load:
        # Read dataset into RDD as csv
        training_rdd = textToRDDCsv(sc, input_path, partitions)
        minibatch_rdd = training_rdd.partitionBy(partitions + 1)  # FOR NOW OK

        # It is simple to extend multilayer lstm to support different settings
        # on multiple layers
        multilayer_props = [
            dict(layer_name='1',
                 layer_type='lstm',
                 dim_size=-1,
                 num_hidden=hidden,
                 normalize=True)
        ]
        start = time.time()

        weights_rdd = minibatch_rdd.mapPartitions(
            lambda x: train_rnn(
                x, multilayer_props, epoch=epoch, target=target), True)

        # Return weights and average them
        weights_rdd = weights_rdd.filter(lambda x: len(x) == 2)
        #weights = weights_rdd.saveAsPickleFile(output + '_raw')

        out = weights_rdd.filter(lambda x: len(x) == 2)
        # Mean row by row
        weights_mean_rdd = out.groupByKey().mapValues(
            lambda x: sum(x) / float(len(x)))  #
        if (output == 'temp'):
            print 'No output directory defined using temp'
        weights_mean_rdd.collect()
        print 'RNN-LSTM - Total Processing Time (with weight averaging): %f' % (
            time.time() - start)
        print 'RNN-LSTM - Total Processing Time (with repartition) %f' % (
            time.time() - start)

    else:
        if 'reduce' in load_op:
            print 'REDUCING'
            weights_rdd = sc.pickleFile(input_path + '_raw', partitions)
            print weights_rdd.collect()
            out = weights_rdd.filter(lambda x: len(x) == 2)

            # Mean row by row; keep the RDD so the optional 'save' step below can reuse it
            weights_mean_rdd = out.groupByKey().mapValues(
                lambda x: sum(x) / float(len(x)))
            c = weights_mean_rdd.collect()
            for i, d in enumerate(c):
                print i, " - ", d

                print
                print
            if 'save' in load_op:
                if (output == 'temp'):
                    print 'No output directory defined using temp'
                weights_mean_rdd.saveAsPickleFile(output + '_mean')
                print weights_mean_rdd.collect()
        # SHOULD CONTINUE
    sys.exit(0)
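# Hypothetical invocation (not in the original), based on the argparse options above;
# the script name is illustrative:
#   spark-submit rnn_lstm_spark.py -i data/train.csv -t data/target.json \
#       -m local -p 4 -hl 2 -e 5 -o results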
Example #25
#        start_map = time()
#        RDD_IDF = sc.pickleFile('IDF_RAW.RDD')
##        RDD_IDF = RDD_IDF.map(lambda (word, count): (word, ( N / count)))
#        RDD_IDF = RDD_IDF.map(lambda (word, count): (word, np.log( N / count)))
#        save_name = 'IDF.RDD'
#        call(["rm", "-rf", save_name])
#        RDD_IDF.saveAsPickleFile(save_name)
#        finish_map = time()
#        print "Mapped, took ",finish_map - start_map

    # Part 1f - create TF.IDF
    if MODE == 'TF.IDF':
        print ""
        print "CREATING TF.IDFs"
        print >> runtimes_file,"CREATING TF.IDF"
        RDD_IDF = sc.pickleFile('IDF.RDD')
        TFIDF.create_tfidf(sc, RDD_IDF, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics)

    # Part 1d/e/f - Do the whole thing for 'stop_after' files
    if MODE == 'ALL':
        print ""
        print "CREATING TFs"
        print >> runtimes_file,"CREATING TFs"
        TF.create_tf(data_path, sc, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics)
        print ""
        print "CREATING IDF"
        print >> runtimes_file,"CREATING IDF"
        RDD_IDF = sc.parallelize([])
        RDD_IDF = IDF.create_idf(sc, RDD_IDF, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics)
        print ""
        print "CREATING TF.IDFs"
    else:
        print 'Using Pre-Pickled Files\n'
    # End Timer for this phase
    WordFreq_Time = time() - WordFreq_Time
    print('############ Processing Completed ##############')
    print('################################################\n')

    print('################################################')
    print('############## Word Freq to IDF RDD ############\n')
    # Start Timer
    IDF_Time = time()
    # Ascertain if Section has already been completed
    if len(getDirectory(directory[3])) < 1:
        allFolders = getDirectory(directory[2])
        # Load in Word Frequency Pickles into one RDD
        IDF = sc.union([sc.pickleFile(i) for i in allFolders])
        # Rearrange RDD into correct the correct format
        IDF = IDF.flatMap(lambda x_y: [(pair[0], [[x_y[0], str(pair[1])]]) for pair in x_y[1]]) \
                 .reduceByKey(add) \
                 .map(lambda x_y1: (x_y1[0], len(x_y1[1]), float(N), x_y1[1])) \
                 .map(lambda x_y_z_a: (x_y_z_a[0], np.log2(x_y_z_a[2] / x_y_z_a[1]), x_y_z_a[3])) \
                 .repartition(8)
        # Save IDF RDD as a Pickle File
        IDF.saveAsPickleFile(directory[4], 50)
    else:
        print 'Using Pre-Pickled Files\n'
    # End Timer for this phase
    IDF_Time = time() - IDF_Time
    print('############ Processing Completed ##############')
    print('################################################\n')
Example #27
    # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
    """

    # print("Test clustering")
    # group_res = group_rdd.collect()
    # # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
    # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9

    print("Working on clustering")

    cluster_rdd = group_rdd.map(
        lambda x: cluster_two_pass(x[1], x[0], st, global_dict.value))

    cluster_rdd.saveAsPickleFile(
        path_save_res)  # save all the cluster to the hard drive
    cluster_rdd_reload = sc.pickleFile(
        path_save_res).collect()  # here we have all the clusters in memory
    # first_dict = cluster_rdd_reload[0]
    print("clustering done")

    # plot all the clusters
    # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5)
    """
        ##### query
        Current implementation: if we want to find k best matches, we give the first k best matches for given sequence length range


        The following line is for testing querying on one cluster
        # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value)

    """
    # # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
from pyspark.mllib.linalg import Vectors
import sys, math

# Imports assumed for this snippet (not shown in the original):
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setAppName('tf-idf')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

train = sys.argv[1]
test = sys.argv[2]

# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

#train data
train_data = sc.pickleFile(train)
parsedData = train_data.map(parsePoint)
#test data
test_data = sc.pickleFile(test)
parsedtestData = test_data.map(parsePoint)

# cross validation
num_iterations = 100
step_size=[0.1,10,20,300]
best_error=1000000
best_model=[0]
best_step=0
best_test_error=0
best_split=[]
best_RMSE=0
# Imports assumed for this snippet (not shown in the original):
import os
import gc
import configparser
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
import pyspark.sql.types as typ
import pyspark.sql.functions as fn


class SparkFEProcess:
    def __init__(self):

        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_2") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # Initialize related parameters
        # # bins_dict stores the binning scheme of the relevant columns, used when processing the test data
        # self.bins_dict={}

    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        '''
        Set the log level.
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path",
                                        "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        sqlContext = SQLContext(self.sc)
        print('start to read data after explore_spark_step1_cross:')
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # compare against the labels to check whether the schema is appropriate
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('uid_author_id_count_bin', typ.IntegerType()),
            ('uid_author_id_count_ratio', typ.DoubleType()),
            ('uid_item_city_count_bin', typ.IntegerType()),
            ('uid_item_city_count_ratio', typ.DoubleType()),
            ('uid_channel_count_bin', typ.IntegerType()),
            ('uid_channel_count_ratio', typ.DoubleType()),
            ('uid_music_id_count_bin', typ.IntegerType()),
            ('uid_music_id_count_ratio', typ.DoubleType()),
            ('uid_device_count_bin', typ.IntegerType()),
            ('uid_device_count_ratio', typ.DoubleType()),
            ('author_id_channel_count_bin', typ.IntegerType()),
            ('author_id_channel_count_ratio', typ.DoubleType()),
            ('author_id_user_city_count_bin', typ.IntegerType()),
            ('author_id_user_city_count_ratio', typ.DoubleType()),
            ('author_id_item_city_count_bin', typ.IntegerType()),
            ('author_id_item_city_count_ratio', typ.DoubleType()),
            ('author_id_music_id_count_bin', typ.IntegerType()),
            ('author_id_music_id_count_ratio', typ.DoubleType()),
            ('uid_channel_device_count_bin',
             typ.IntegerType()),  # renamed to uid_channel_device
            ('uid_channel_device_count_ratio',
             typ.DoubleType()),  # renamed to uid_channel_device
            ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
            ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
        ]
        actionLogSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd,
                                                    actionLogSchema)
        df_actLog_test.show(1, truncate=False)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd,
                                                     actionLogSchema)
        df_actLog_train.show(1, truncate=False)

        return df_actLog_train, df_actLog_test

    def data_explore(self, df_train, df_test):
        sqlContext = SQLContext(self.sc)

        print("对item_pub_hour进行离散化")

        def hourBin(x):
            if x >= 23 or x <= 2:
                return 1
            elif 3 <= x < 8:
                return 2
            elif 8 <= x < 12:
                return 3
            else:
                return 4

        converHourBin = udf(lambda x: hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour",
                                       converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour",
                                     converHourBin(df_test.item_pub_hour))

        print("--------1、针对uid,authorid,musicid等组合的正负样本数量统计特征--------")
        print("交叉特征的正负样本数量统计")
        posneg_feats_list = []
        # posneg_feats_list.append(["duration_time"])
        # posneg_feats_list.append(["time_day"])
        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel',
                   'music_id']  #,'item_pub_hour'

        posneg_feats_list.extend([[u_col, a_col] for u_col in users
                                  for a_col in authors])
        # posneg_feats_list.append(['uid','author_id', 'channel'])
        # posneg_feats_list.append(['uid', 'author_id', 'music_id'])
        # posneg_feats_list.append(['uid','author_id', 'channel','time_day'])
        # posneg_feats_list.append(['uid', 'author_id', 'music_id','time_day'])

        print("计算以下交叉特征的正负样本比例")  #有2、3、4维的交叉特征
        print(posneg_feats_list)

        for i in range(len(posneg_feats_list)):
            group_cols = posneg_feats_list[i]
            new_feature = '_'.join(group_cols)
            # compute the positive/negative sample ratio on df_train; for test, just join it on and fill nulls with 0 or the mean
            # positive/negative labels are determined by the fields: like, finish
            # step 1: concatenate the columns first
            print(new_feature)
            if len(group_cols) == 2:
                print("开始处理2维交叉变量")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType())))

            if len(group_cols) == 3:

                print("Start processing 3-way cross variables")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType()),
                        df_train[group_cols[2]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType()),
                        df_test[group_cols[2]].cast(typ.StringType())))
            # if len(group_cols)==4:
            #
            #     print("开始处理4维交叉变量")
            #     df_train=df_train.withColumn(new_feature, fn.concat_ws('_',df_train[group_cols[0]].cast(typ.StringType()),df_train[group_cols[1]].cast(typ.StringType()))
            #                                                      ,df_train[group_cols[2]].cast(typ.StringType()) ,df_train[group_cols[3]].cast(typ.StringType()))
            #     df_test=df_test.withColumn(new_feature, fn.concat_ws('_',df_test[group_cols[0]].cast(typ.StringType()),df_test[group_cols[1]].cast(typ.StringType()))
            #                                                      ,df_test[group_cols[2]].cast(typ.StringType()) ,df_test[group_cols[3]].cast(typ.StringType()))

            for target in ["like", "finish"]:
                df3 = df_train.select(
                    new_feature,
                    target).groupby(new_feature).count().withColumnRenamed(
                        'count', new_feature + '_count')
                df4 = df_train.select(
                    new_feature, target).where(df_train[target] == 1).groupby(
                        new_feature).count().withColumnRenamed(
                            'count', new_feature + "_count_" + target + "_1")
                df3 = df3.join(df4, new_feature, 'left').na.fill(0)
                del df4
                gc.collect()
                # print("两列相除:得到正样本的比例",target)
                df3 = df3.withColumn(
                    new_feature + "_" + target + "_pos_neg",
                    fn.col(new_feature + "_count_" + target + "_1") /
                    fn.col(new_feature + '_count'))
                df3 = df3.drop(new_feature + "_count_" + target + "_1",
                               new_feature + '_count')
                print("新的df_train", new_feature, target)
                df_train = df_train.join(df3, new_feature, "left")
                df_train.show(1)
                df_test = df_test.join(df3, new_feature,
                                       "left")  # nulls can appear here; fill missing values with 0
                print("new df_test", new_feature, target)
                df_test.show(1)
                df_test = df_test.na.fill(0)
                del df3
                gc.collect()
            if new_feature not in ["duration_time", "time_day"]:
                df_train = df_train.drop(new_feature)
                df_test = df_test.drop(new_feature)
                df_train.printSchema()
                df_test.printSchema()

        print('Final table schema; it is used as the input of the concat step'
              )  # should there be build_data_for_like / build_data_for_finish?
        df_train.printSchema()
        df_test.printSchema()

        print("查看test缺失值")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in posneg_feats_list]).show()
        print("查看train缺失值")
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c +
                                                                 '_missing')
                       for c in posneg_feats_list]).show()

        print('-------5. Save the preprocessing results-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_step2'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_step2'
        os.system("hadoop fs -rm -r {}".format(
            train_file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
Beispiel #30
0
import numpy as np

sc = SparkContext()
sqlContext = SQLContext(sc)

# raw data
df = sqlContext.read.load("hdfs:///hndata/parquet_typed", format="parquet")

scores = df.where("score IS NOT NULL") \
         .where("type='story'") \
         .where("title IS NOT NULL") \
         .map(lambda row: (row.id, row.score))

# this is an RDD of (id, <numpy array>)
docvecs = sc.pickleFile("hdfs:///hndata/docvecs_glove_pickle")

def loadVecs(score_pairs):
    '''
    Executes on workers; the gensim doc2vec model has been rsynced to each
    node in the cluster, so each worker can read its own copy.

    If the model/np-array is larger than the driver memory, we cannot use
    sc.broadcast to ship it to each worker.
    '''
    import numpy as np
    docvecs = np.load("/data/_hndata/doc2vec_model/hn.docvecs.doctag_syn0.npy", mmap_mode='r')
    return [(s, np.array(docvecs[i])) for (s,i) in score_pairs]
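
# Hypothetical usage sketch (not part of the original snippet): it assumes each
# doc id in `scores` is also the doctag index into the mmap'd docvecs array, so
# the (id, score) pairs are flipped to (score, index) before loading per partition.
scored_docvecs = scores.map(lambda id_score: (id_score[1], id_score[0])) \
                       .mapPartitions(loadVecs)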

def mergeByKey(a,b):
    '''
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName('Airport timezone'))
mainRdd = sc.pickleFile('airports_mod.pickle')
mainRdd = mainRdd.map(lambda d: (d['Tz'], 1))
mainRdd = mainRdd.reduceByKey(lambda acc, b: acc + b)
for item in mainRdd.collect():
    print(item)
    # creating a hdfs client for writing purposes
    hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)

    # opening training and test data files
    if not cluster_execution:
        learning_data_filename_training = 'file://' + learning_data_filename_training
        id_to_dataset_filename_training = 'file://' + id_to_dataset_filename_training
        if learning_data_filename_test:
            learning_data_filename_test = 'file://' + learning_data_filename_test
            id_to_dataset_filename_test = 'file://' + id_to_dataset_filename_test

    learning_data_training = sc.textFile(learning_data_filename_training +
                                         '/*').persist(
                                             StorageLevel.MEMORY_AND_DISK)
    id_to_dataset_training = sc.pickleFile(
        id_to_dataset_filename_training).persist(StorageLevel.MEMORY_AND_DISK)
    learning_data_test = sc.emptyRDD()
    id_to_dataset_test = sc.emptyRDD()
    if learning_data_filename_test:
        learning_data_test = sc.textFile(learning_data_filename_test +
                                         '/*').persist(
                                             StorageLevel.MEMORY_AND_DISK)
        id_to_dataset_test = sc.pickleFile(
            id_to_dataset_filename_test).persist(StorageLevel.MEMORY_AND_DISK)

    # taking first element and checking if information about joined dataset is present
    has_joined_data = False
    first = json.loads(learning_data_training.first())
    if 'joined_dataset' in first:
        has_joined_data = True
Beispiel #33
0
# pyspark: count occurrences of each value of a field
# spark: 2.5min, awk: 5min
sc.textFile("file_name").map(lambda x:x.split("\x01")).map(lambda x:(x[0], 1)).reduceByKey(lambda a,b:a+b).collect()

# RDD set operations
a.union(b)			# union
a.intersection(b)	# intersection
a.subtract(b)		# difference

# RDD deduplication
a.distinct()

# RDD save and load
rdd.saveAsPickleFile(file_name, partition_num)
sc.pickleFile(file_name)

# Spark key concepts
Application: the program/job submitted by the user
Driver: schedules the tasks
Job: each action operator triggers one job
Task: the unit of execution on an RDD partition
Stage: split according to wide/narrow dependencies
Wide vs narrow dependencies: "https://github.com/rohgar/scala-spark-4/wiki/Wide-vs-Narrow-Dependencies" (see the small sketch below)
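
# A small illustrative sketch (toy data, not from the original notes): map is a
# narrow dependency and stays in one stage; reduceByKey is a wide dependency,
# so it adds a shuffle boundary and a new stage; count() is an action, so it
# triggers one job whose tasks run per partition.
nums = sc.parallelize(range(100), 4)
pairs = nums.map(lambda n: (n % 10, 1))            # narrow dependency (same stage)
sums = pairs.reduceByKey(lambda a, b: a + b)       # wide dependency (shuffle -> new stage)
print(sums.count())                                # action -> one job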

# pandas DataFrame to spark DataFrame
from pyspark.sql import SparkSession
sqlContext = SparkSession\
				.builder \
	            .appName("dataFrame") \
	            .getOrCreate()
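
# A minimal conversion sketch (toy pandas DataFrame assumed, not from the
# original notes): SparkSession.createDataFrame accepts a pandas DataFrame.
import pandas as pd

pdf = pd.DataFrame({"name": ["a", "b"], "value": [1, 2]})
sdf = sqlContext.createDataFrame(pdf)   # sqlContext is the SparkSession built above
sdf.show()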
########################functions##################################
def quiet_logs(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)


region = sys.argv[1]
timeframe = sys.argv[2]

# spatial division: cell_id->region of interest


# data loading
# checking file existence
#####
sc = SparkContext()


chiamate_orarie = sc.pickleFile(
    '/peaks/hourly_presence-' + "%s-%s" % (region, timeframe))
presenze_medie = chiamate_orarie.map(lambda x: (
    (x[0][0], x[0][1], x[0][3]), x[1])).groupByKey()
os.system("$HADOOP_HOME/bin/hadoop fs -rm -r /peaks/weekly_presence-%s-%s/" %
          (region, timeframe))
presenze_medie.saveAsPickleFile(
    '/peaks/weekly_presence-' + "%s-%s" % (region, timeframe))


## peaks ##
			continue 
	
		wordbag.append((word))

	return wordbag

'''
def create_matrix(x, terms, matrix):
	
	if (x['eval_content']) is None:
		return matrix
	for text in twitter.pos(x['eval_content'], stem = True):
		matrix.index((text)
'''

terms = sc.pickleFile('merged_file').flatMap(lambda x : create_dictionary(x)).distinct()
#print terms.count()

matrix_key = terms.collect()
if len(matrix_key) % 2:
	matrix_key.append("")

matrix = dict((k, []) for k in matrix_key)



'''
f = open('dictionary_test.txt', 'w')
for m in matrix:
	f.write(m)
	f.write('	')
Beispiel #36
0
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

from pyspark import SparkConf, SparkContext

sc = SparkContext(conf = SparkConf().setAppName("My App"))
mainRdd = sc.pickleFile('matches.pickle')
mainRdd.persist()
seasonRunRdd = mainRdd.map(lambda d : (d['season'], int(d['win_by_runs']))).reduceByKey(lambda a, b: a if a > b else b).sortByKey()
print(seasonRunRdd.collect())

Beispiel #37
0
# =========== SPARK CONFIG ===========
set_master_val = "local[" + str(num_nodes) + "]"

from pyspark import SparkContext, SparkConf
#conf = SparkConf().setAppName('TrainWavenet').set("spark.driver.maxResultSize", "2G")
conf = SparkConf().setMaster(set_master_val).setAppName('TrainWavenet').set("spark.driver.maxResultSize", "2G")
sc = SparkContext(conf=conf)


# ============ DATA SETUP ===========
# s3_song_directory = "s3://waveform-storage/input_data/song_processed/Pop/part-00000"

#, minPartitions=n_data_partitions) 
train_rdd = sc.pickleFile(song_directory) \
                .flatMap(lambda x: split_song_to_train(x, data_size, data_collect_stride)) \
                .map(lambda x: (x, one_hot_encode_chunk(x))) \
                .map(lambda x: (np.array(x[0]).reshape(data_size,1), np.array(x[1])))

print("Num Partitions: ", train_rdd.getNumPartitions())


# ============ MODEL SETUP ===========
from keras.optimizers import SGD

wavenet_model = create_wavenet(stack_layers, n_output_channels, n_filter_list, num_stacks, skip=False)
adam_opt = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
wavenet_model.compile(optimizer=SGD(), loss='categorical_crossentropy')
print(wavenet_model.summary())


# ============ ELEPHAS TRAIN ===========
Beispiel #38
0
    df = sqlContext.read.format('jdbc')\
                        .options(url = MYSQL_CONNECTION_URL.value,
                                 dbtable = db+'.'+tableName.value
                                ).load()

    # CREATE STREAMING CONTEXT
    ssc = StreamingContext(sc, int(spark_batch_duration))

    # setting checkpoint
    # ssc.checkpoint(".")

    tf_val = 1048576

    # LOADING AND COMPUTING TF's TRAINING MODEL
    print('Loading TRAINING_TF_MODEL...', end="")
    tf_training = sc.pickleFile(os.getcwd() + "/Desktop/MODEL/TF/TF_MODEL_" +
                                str(tf_val))
    print('done!')

    print('Computing TF-IDF MODEL...', end="")
    idf_training = IDF(minDocFreq=5).fit(tf_training)
    print('done!')

    print('Loading Naive Bayes Model...', end="")
    NBM = NaiveBayesModel.load(
        sc,
        os.getcwd() + "/Desktop/MODEL/NBM/NaiveBayesModel_" + str(tf_val))
    print('done!')

    print('READY TO PROCESS DATA...')

    kafkaParams = {'metadata.broker.list': kafka_brokers}
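
    # Hypothetical next step (assumed, not shown above): with the old
    # spark-streaming-kafka-0-8 API, kafkaParams would typically feed a direct
    # stream; the topic list below is a placeholder.
    # from pyspark.streaming.kafka import KafkaUtils
    # stream = KafkaUtils.createDirectStream(ssc, ['tweets'], kafkaParams)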
Beispiel #39
0
                       samples - 1 if samples > 1 else samples)

    X = vstack(data)
    pca = TruncatedSVD(n_components)
    Xtransformed = pca.fit_transform(X)

    p = pd.DataFrame(Xtransformed,
                     columns=['%i' % i for i in range(n_components)],
                     index=author_index)

    output_list = []
    for k, v in zip(p.index, p.values):
        output_list.append((k, v))
    return (subreddit, output_list)


ae_flat_cat = sc.pickleFile(
    '/user/username/data/output/_jobs/author_entities_cats')

# map as ((subcat, author, entity_id), score) and compute median
medians = ae_flat_cat.map(lambda x: ((x[5][1], x[0], x[1]), x[3])).groupByKey(
).mapValues(list).map(lambda x: x[0] + (float(np.median(x[1])), ))

#group by cat
ae_grouped_by_subreddit = medians.groupBy(lambda x: (x[0])).mapValues(list)

processed_groups = ae_grouped_by_subreddit.map(
    lambda x: process_group(x)).filter(lambda x: len(x) > 0)
processed_groups.saveAsPickleFile(
    '/user/username/data/output/_jobs/processed_groups_topcats_svd')
Beispiel #40
0
15;visitor;0.0;0.0;0.0; 0.1; 0.1; 0.1;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0;0.0"""

archetipi = [(y[1], y[2:][:18]) for y in [x.split(';')
                                          for x in archetipi.split("\n")[:-1]]]

if __name__ == '__main__':
    region = sys.argv[1]
    timeframe = sys.argv[2]

    # import rdd with profiles

    sc = SparkContext()
    quiet_logs(sc)
    sc._conf.set('spark.executor.memory', '32g').set(
        'spark.driver.memory', '32g').set('spark.driver.maxResultsSize', '0')
    r = sc.pickleFile('/profiles-%s-%s' % (region, timeframe))

    # clustering!

    r_carrelli = r.flatMap(lambda x: array_carretto(x[1]))

    percentage = 0.3
    r_carrelli.sample(False, percentage, 0).filter(
        lambda l: sum(l))  # filter out passing-by (result unused; the same sampling is repeated below)
    # sample and filter out passing by
    data = r_carrelli.sample(False, percentage, 0).filter(
        lambda l: sum(l)).map(lambda x: np.array(x))

    #kmns = KMeans.train(data, 100, initializationMode="random")
    kmns = KMeans.train(data, 100, initializationMode="k-means||")
    tipi_centroidi = []
Beispiel #41
0
class SparkFEProcess:

    def __init__(self):

        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_step3") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # initialize related parameters
        # # bins_dict stores the binning scheme of the relevant columns, used when processing the test data
        # self.bins_dict={}


    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return  parser

    def init_logger(self):
        '''
        Set the log level
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)


    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        sqlContext = SQLContext(self.sc)
        print('start to read data after explore_spark_step1_cross:')
        rootPath=self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_train_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # compare with the label list to check whether the labels fit
        labels=[  ('duration_time',typ.IntegerType()),
                ('device',typ.IntegerType()),
                ('music_id',typ.IntegerType()),
                ('item_city',typ.IntegerType()),
                ('author_id',typ.IntegerType()),
                ('item_id',typ.IntegerType()),
                ('user_city',typ.IntegerType()),
                ('uid',typ.IntegerType()),
                ('channel',typ.IntegerType()),
                ('finish',typ.IntegerType()),
                ('like',typ.IntegerType()),
                ('time_day',typ.IntegerType()),
                ('item_pub_month',typ.IntegerType()),
                ('item_pub_day',typ.LongType()),
                ('item_pub_hour',typ.IntegerType()),
                ('item_pub_minute',typ.IntegerType()),
                ('uid_count_bin',typ.IntegerType()),
                ('user_city_count_bin',typ.IntegerType()),
                ('user_city_count_ratio',typ.DoubleType()),
                ('item_id_count_bin',typ.IntegerType()),
                ('item_id_count_ratio',typ.DoubleType()),
                ('author_id_count_bin',typ.IntegerType()),
                ('author_id_count_ratio',typ.DoubleType()),
                ('item_city_count_bin',typ.IntegerType()),
                ('item_city_count_ratio',typ.DoubleType()),
                ('music_id_count_bin',typ.IntegerType()),
                ('music_id_count_ratio',typ.DoubleType()),
                ('device_count_bin',typ.IntegerType()),
                ('device_count_ratio',typ.DoubleType()),
                ('uid_author_id_count_bin',typ.IntegerType()),
                ('uid_author_id_count_ratio',typ.DoubleType()),
                 ('uid_item_city_count_bin',typ.IntegerType()),
                ('uid_item_city_count_ratio',typ.DoubleType()),
                ('uid_channel_count_bin',typ.IntegerType()),
                ('uid_channel_count_ratio',typ.DoubleType()),
                ('uid_music_id_count_bin',typ.IntegerType()),
                ('uid_music_id_count_ratio',typ.DoubleType()),
                ('uid_device_count_bin',typ.IntegerType()),
                ('uid_device_count_ratio',typ.DoubleType()),
                ('author_id_channel_count_bin',typ.IntegerType()),
                ('author_id_channel_count_ratio',typ.DoubleType()),
                ('author_id_user_city_count_bin',typ.IntegerType()),
                ('author_id_user_city_count_ratio',typ.DoubleType()),
                ('author_id_item_city_count_bin',typ.IntegerType()),
                ('author_id_item_city_count_ratio',typ.DoubleType()),
                ('author_id_music_id_count_bin',typ.IntegerType()),
                ('author_id_music_id_count_ratio',typ.DoubleType()),
                ('uid_channel_device_count_bin',typ.IntegerType()),  # renamed to uid_channel_device
                ('uid_channel_device_count_ratio',typ.DoubleType()),  # renamed to uid_channel_device
                ('author_id_item_city_music_id_count_bin',typ.IntegerType()),
                ('author_id_item_city_music_id_count_ratio',typ.DoubleType()),
            ]
        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd,actionLogSchema)
        # df_actLog_test.show(1,truncate=False)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd,actionLogSchema)
        # df_actLog_train.show(1,truncate=False)


        return df_actLog_train, df_actLog_test




    def data_explore(self,df_train,df_test):

        sqlContext = SQLContext(self.sc)

        print("对item_pub_hour进行离散化")
        def hourBin(x):
            if x>=23 or x <=2:
                return 1
            elif 3<=x<8:
                return 2
            elif 8<=x<12:
                return 3
            else:
                return 4

        converHourBin=udf(lambda x :hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour", converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour", converHourBin(df_test.item_pub_hour))

        print("----1、计算统计特征:用户特征和item特征之间的条件概率---------")
        feats_list = []

        condition = ['uid']
        authors = ['music_id','item_pub_hour']  #'author_id', 'item_city', 'channel',
        feats_list.extend([[u_col, a_col] for u_col in condition for a_col in authors])
        df_tmp=df_train.select(condition)
        df2=df_tmp.groupby(condition).count().withColumnRenamed('count',condition[0]+'_count')
        # df2.show(1,truncate=False) # ['uid','uid_count']
        df2.cache()
        # df_train=df_train.join(df2,condition,'left')
        # df_train.show(1,truncate=False)
        # cannot resolve '`uid_count`' given input columns: [time, user_city, like, author_id, uid, device, music_id, finish, duration_time, channel, item_city, item_id]
        # del df2
        # gc.collect()
        for feature_group in feats_list:
            print(feature_group+[feature_group[0]+'_count'])   #+[feature_group[0]+'_count']
            df1=df_train.select(feature_group).groupby(feature_group).count()
            # df1.show(1,truncate=False)   # in theory there are still only 3 columns, not including uid_count
            df1=df1.join(df2,condition,'left')
            df1.show(1,truncate=False)   #|uid|item_pub_hour|count|uid_count
            df1=df1.withColumn(feature_group[1]+'_'+feature_group[0]+"_condition_ratio",fn.col('count')/fn.col(feature_group[0]+'_count'))
            df1=df1.drop('count').drop(feature_group[0]+'_count')
            df1.show(1,truncate=False)
            print(df_train.columns)
            print(df1.columns)
            df_train=df_train.join(df1,feature_group,"left")   #|uid|item_pub_hour|item_pub_hour_uid_condition_ratio
            df_train.show(1,truncate=False)
            df_test=df_test.join(df1,feature_group,"left").na.fill({feature_group[1]+'_'+feature_group[0]+"_condition_ratio":0})  # fill missing values in this column
            df_test.show(1,truncate=False)



        feats_list = []
        condition = ['item_id']
        authors = ['user_city', 'channel']
        feats_list.extend([[u_col, a_col] for u_col in condition for a_col in authors])

        df_tmp=df_train.select(condition)
        df2=df_tmp.groupby(condition).count().withColumnRenamed('count',condition[0]+'_count')
        # df2.show(1,truncate=False) # ['uid','uid_count']
        df2.cache()
        # df_train=df_train.join(df2,condition,'left')
        # df_train.show(1,truncate=False)
        # cannot resolve '`uid_count`' given input columns: [time, user_city, like, author_id, uid, device, music_id, finish, duration_time, channel, item_city, item_id]
        # del df2
        # gc.collect()
        for feature_group in feats_list:
            print(feature_group+[feature_group[0]+'_count'])   #+[feature_group[0]+'_count']
            df1=df_train.select(feature_group).groupby(feature_group).count()
            # df1.show(1,truncate=False)   # in theory there are still only 3 columns, not including uid_count
            df1=df1.join(df2,condition,'left')
            df1.show(1,truncate=False)
            df1=df1.withColumn(feature_group[1]+'_'+feature_group[0]+"_condition_ratio",fn.col('count')/fn.col(feature_group[0]+'_count'))
            df1=df1.drop('count').drop(feature_group[0]+'_count')
            # df1.show(5)
            df_train=df_train.join(df1,feature_group,"left")
            df_train.show(1,truncate=False)
            df_test=df_test.join(df1,feature_group,"left").na.fill({feature_group[1]+'_'+feature_group[0]+"_condition_ratio":0})  # fill missing values in this column
            df_test.show(1,truncate=False)


        df_train=df_train.drop('uid_count').drop('item_id_count')
        df_train.printSchema()
        df_test.printSchema()

        print('-------5. Save the preprocessing results-------')
        test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_step3_try'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_step3_try'
        os.system("hadoop fs -rm -r {}".format(train_file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("Innings Run"))

inningData = sc.pickleFile('deliveries.pickle')
inningData = inningData.map(lambda s: (
    (int(s['match_id']), int(s['inning'])), int(s['total_runs']))).reduceByKey(
        lambda a, b: a + b).sortByKey()
for item in inningData.collect():
    print(item)
Beispiel #43
0
from pyspark import SparkContext, SparkConf

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint


conf = (SparkConf() \
    .set("spark.driver.maxResultSize", "2g"))

sc = SparkContext(conf=conf)
pos = sc.textFile("hdfs:///movie_review/positive").map(lambda s: (True, s.lower().split()))
neg = sc.textFile("hdfs:///movie_review/negative").map(lambda s: (False, s.lower().split()))

if False:
    docvecs = sc.pickleFile("hdfs://movie_review/doctags")
else:
    from ddoc2vec import DistDoc2Vec

    data = (neg + pos).zipWithIndex().map(lambda (v, i): (i, v[0], v[1]))
    sents = data.map(lambda (a,b,c): c)

    model = Word2Vec(size=100, hs=0, negative=8)
    dd2v = DistDoc2Vec(model, learn_hidden=False, num_partitions=5, num_iterations=10)
    dd2v.build_vocab_from_rdd(sents, reset_hidden=False)
    # train word2vec in driver
    model.train(sents.collect())
    model.save("/root/doc2vec/word2vec_model/review")
    print "*** done training words ****"
    print "*** len(model.vocab): %d ****" % len(model.vocab)
    dd2v.train_sentences_cbow(data.map(lambda (i, l, v): TaggedDocument(words=v, tags=[i])))
import math
from itertools import combinations
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml import Pipeline
from pyspark.ml.regression import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

df = sqlContext.createDataFrame(sc.pickleFile("rdd1.p", 30), ["label", "cat_features","cont_features"]).cache()

to_delete = [2,3,4,5,6,7,8,96]    # this corresponds to cat2, cat3, cat4 ...
features_to_keep = list(range(116))
for idx in to_delete:
    features_to_keep.remove(idx - 1) # because to_delete starts at 1

class customTransformer:
    
    def __init__(self, inputCol, outputCol, *others):
        self.inputCol = inputCol
        self.outputCol = outputCol
        self.args = list(others)
        self.fitInfo = 0
        
    # Store information taken from the dataframe
		#carr=np.zeros(24)
		carr=[0 for x in range(18)]
		for o in obs:
			week_idx=week_ordering.index(o[0])
			idx=(week_idx-1)*6+o[1]*3+o[2]
			carr[idx]=o[3]
		tipo_utente=sorted([(c[0],euclidean(carr,list(c[1]))) for c in profiles],key=lambda x:x[1])[0][0]
		yield (munic,tipo_utente,id)



sc=SparkContext()
## user annotation

##open
r=sc.pickleFile('hdfs://hdp1.itc.unipi.it:9000/profiles/centroids%s-%s'%(region,timeframe))
cntr=r.collect()

profiles=[(x[0],x[1]) for x in cntr]


r=sc.pickleFile('hdfs://hdp1.itc.unipi.it:9000/profiles/'+"%s-%s"%(region,timeframe))

r_auto= r.flatMap(lambda x:  annota_utente(x[1],profiles)) \
   .map(lambda x: ((x[0],x[1]),1)) \
   .reduceByKey(lambda x,y:x+y)
#
## obtain (municipality, cluster_id) pairs
### final result
#
lst=r_auto.collect()
    num = int(fields[2])
    result = []
    for i in range(num - 1):
        result.append([
            bid, fields[3 + 4 * i], fields[3 + 4 * i + 1],
            fields[3 + 4 * i + 2], fields[3 + 4 * i + 3]
        ])
    return result


rdd_flat = rdd.flatMap(lambda line: flat_trend(line))
trend_df = sqlContext.createDataFrame(
    rdd_flat, ['business_id', 'start', 'end', 'rating', 'trend'])

geo_rdd = sc.pickleFile(
    '/Users/zimoli/Downloads/RBDA-MCINTOSH/Project/RBDAProject/phoenix_cate_ts'
).cache()

trend_list = sc.broadcast([[
    str(row['business_id']),
    str(row['start']),
    str(row['end']),
    str(row['rating']),
    str(row['trend'])
] for row in trend_df.collect()])

trend_map = {}


def get_trend_map(trend_map):
    for trend in trend_list.value:
                      (cv_data_rdd, out_cv_data)]:
        url = sparkutil.util.s3n_url(S3_BUCKET, S3_PATH, name)
        sparkutil.util.s3n_delete(url)
        rdd.saveAsPickleFile(url)
        pickle.dump({'url' : url}, open(name, 'w'))
    
    sc.stop()

    
@jobs_limit(1)
@transform(spark_run_experiments, suffix('.samples'), '.samples.pickle')
def get_samples((exp_samples, exp_cvdata, exp_inits), out_filename):
    sample_metadata = pickle.load(open(exp_samples, 'r'))
    
    sc = SparkContext()
    results_rdd = sc.pickleFile(sample_metadata['url'])

    sparkutil.util.save_rdd_elements(results_rdd, out_filename, S3_BUCKET, S3_PATH)
    
    sc.stop()

@jobs_limit(1) 
@transform(spark_run_experiments, suffix('.samples'), '.cvdata.pickle')
def get_cvdata((exp_samples, exp_cvdata, exp_inits), out_filename):
    cvdata_metadata = pickle.load(open(exp_cvdata, 'r'))
    
    sc = SparkContext()
    results_rdd = sc.pickleFile(cvdata_metadata['url'])
    pickle.dump(results_rdd.collect(),
                open(out_filename, 'w'))
    sc.stop()
Beispiel #48
0
df = sqlContext.read.json(path+'/reviews_Pet_Supplies_p2.json')

reviewDF = df.select("reviewText")

sc = SparkContext(appName='Word2Vec')

def removePunctuation(text):
   return re.sub("[^a-zA-Z]", " ", text)


cleanedReviewRDD = reviewDF.map(lambda row: removePunctuation(row.reviewText).lower().split())


cleanedReviewRDD.saveAsPickleFile(path+'/P2CleanedRDD',10)

reviewRDD = sc.pickleFile('/Users/sradhakr/Desktop/Assignment3/Assignment3/P2CleanedRDD', 10)

uniqueWordsRDD = reviewRDD.flatMap(lambda words: words).distinct().map(lambda word: (word, 1))

word2VecRDD = sqlContext.read.parquet(path+"/word2vec/data")

wordsFeaturesRDD = uniqueWordsRDD.join(word2VecRDD.rdd).map(lambda (key, (dummy,features)):(key, features))

# not an RDD
kMeansclusters = KMeans.train(wordsFeaturesRDD.map(lambda (key, features): features), 2000, maxIterations=50, runs=5, initializationMode="random", seed=50)


wordsClustersRDD = wordsFeaturesRDD.map(lambda (key,features): (key,kMeansclusters.predict(features)))

wordsClustersRDD.saveAsPickleFile(path+'/WordClustersRDD',10)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  7 23:50:23 2017

@author: dray
"""

from pyspark import SparkConf, SparkContext
from pprint import pprint

sc = SparkContext(conf=SparkConf().setAppName("Students read"))
marksRdd = sc.pickleFile("students.pickle").map(lambda d: (d['sid'], d[
    'marks'])).reduceByKey(lambda a, b: a + b).sortByKey()
pprint(marksRdd.collect())
Beispiel #50
0
import findspark
findspark.init()

try:
    sc.stop()
except:
    pass
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("finalproject").setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SparkSession(sparkContext=sc)

import pickle
from operator import add
allData = sc.pickleFile("./all-data.pkl")

#print allData.first()

#([["Company", "Class", "Name", "City", "State", "Country", "Date"]])


#only use if there is a problem with using None
def companyOrPersonCount(x):
    if (x[0] is None):
        return (x[2], 1)
    else:
        return (x[0], 1)


def companyCount(x):
Beispiel #51
0
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
import pickle
from numpy import array


sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')

from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt
data = normData.map(lambda x : x.idf_norm)
clusters = KMeans.train(data, 10, maxIterations=10,runs=10, initializationMode="random")
'''
def error(point):
	center = clusters.centers[clusters.predict(point)]
	return sqrt(sum([x**2 for x in (point - center)]))
'''
clusters.save(sc,'KMeansModel')
#WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
#print("Within Set Sum of Squared Error = " + str(WSSSE))

    else:
        print 'Using Pre-Pickled Files\n'
    # End Timer for this phase
    WordFreq_Time = time() - WordFreq_Time
    print('############ Processing Completed ##############')
    print('################################################\n')

    print('################################################')
    print('############## Word Freq to IDF RDD ############\n')
    # Start Timer
    IDF_Time = time()
    # Ascertain if Section has already been completed
    if len(getDirectory(directory[3])) < 1:
        allFolders = getDirectory(directory[2])
        # Load in Word Frequency Pickles into one RDD
        IDF = sc.union([sc.pickleFile(i) for i in allFolders])
        # Rearrange the RDD into the correct format
        IDF = IDF.flatMap(lambda x_y: [(pair[0], [[x_y[0], str(pair[1])]]) for pair in x_y[1]]) \
                 .reduceByKey(add) \
                 .map(lambda x_y1: (x_y1[0], len(x_y1[1]), float(N), x_y1[1])) \
                 .map(lambda x_y_z_a: (x_y_z_a[0], np.log2(x_y_z_a[2] / x_y_z_a[1]), x_y_z_a[3])) \
                 .repartition(8)
        # Save IDF RDD as a Pickle File
        IDF.saveAsPickleFile(directory[4], 50)
    else:
        print 'Using Pre-Pickled Files\n'
    # End Timer for this phase
    IDF_Time = time() - IDF_Time
    print('############ Processing Completed ##############')
    print('################################################\n')
"""
Peak detection Module
Given an hourly presence dataset (usually covering one month of activity) and a typical weekly presence dataset, it computes
the relative presence for each hour of the month, in order to identify possible presence peaks.
Usage: peak_detection.py  <spatial_division> <region> <timeframe>
--region,timeframe: names of the files stored in HDFS. E.g. Roma 11-2015
example: pyspark peak_detection.py roma 06-2015
It loads the average weekly presences from /peaks/weekly_presence-<region>-<timeframe> and the hourly presences
from /peaks/hourly_presence-<region>-<timeframe>, and stores the results into a standard csv file: rome_peaks<region>-<timeframe>-<spatial_division>.csv
"""

spatial_division = sys.argv[1]
region = sys.argv[2]
timeframe = sys.argv[3]

sc = SparkContext()


presenze_medie = sc.pickleFile(
    '/peaks/weekly_presence-' + "%s-%s" % (region, timeframe)).collectAsMap()

chiamate_orarie = sc.pickleFile(
    '/peaks/hourly_presence-' + "%s-%s" % (region, timeframe))


peaks = open('rome_peaks%s-%s-%s.csv' % (region, timeframe,
                                         spatial_division.replace(".", "").replace("/", "")), 'w')
for l in chiamate_orarie.collect():
    print >>peaks, "%s,%s,%s,%s" % (l[0][0], l[0][4], l[0][3], l[
                                    1] / np.mean(list(presenze_medie[(l[0][0], l[0][1], l[0][3])])))
Beispiel #54
0
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
args = sys.argv;
api = tweepy.API(auth,timeout=10)

filename='tweets'+str(time.time())+'.pickle'
folderurl='/user/bijoyan/tweetstore/'

list_tweets = []

for status in tweepy.Cursor(api.search,q=args[1:],lang='en',result_type='recent').items(80):
    list_tweets.append(status.text)

mainRdd = sc.parallelize(list_tweets)
mainRdd.saveAsPickleFile(folderurl+filename)
def mymap(line):
	for char in string.punctuation:
		line = line.replace(char,' ')
	blob = TextBlob(line)
	sum=0
	n=0
	for sentence in blob.sentences:
		sum+=sentence.sentiment.polarity
		n+=1
	return (sum/n)*100
mainRdd=sc.pickleFile(folderurl+filename)
mainRdd = mainRdd.map(mymap)
for emotion in mainRdd.collect():
	print(emotion)

z = 2*365	#z in formula is poverty line.
GINI=0.54115

indicator="Adjusted net national income per capita (constant 2005 US$)"	#this is yt in formula
year_start = 1971
year_end = 2015
num_of_segments = 5

conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("damu1000")
conf.set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)

#read data
lines = sc.pickleFile(".//result").filter(lambda x: x[0]==country).cache()
#filter by country, indicator and period. sort by period



#-----------------------------------------read 1st to 5th 20% income----------------------------------------------------------
#using inequality data to approximate standard deviation
income_20_1 = lines.filter(lambda x: x[0]==country and x[2]=="Income share held by lowest 20%" and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a,b,c,d,e): d, True)
years = income_20_1.map(lambda (a,b,c,d,e): float(d)  )	#years for which income share data is not null. use this later to filter average income
income_20_1 = income_20_1.map(lambda (a,b,c,d,e): float(e)  )
income_20_2 = lines.filter(lambda x: x[0]==country and x[2]=="Income share held by second 20%" and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a,b,c,d,e): d, True).map(lambda (a,b,c,d,e): float(e)  )
income_20_3 = lines.filter(lambda x: x[0]==country and x[2]=="Income share held by third 20%" and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a,b,c,d,e): d, True).map(lambda (a,b,c,d,e): float(e)  )
income_20_4 = lines.filter(lambda x: x[0]==country and x[2]=="Income share held by fourth 20%" and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a,b,c,d,e): d, True).map(lambda (a,b,c,d,e): float(e)  )
income_20_5 = lines.filter(lambda x: x[0]==country and x[2]=="Income share held by highest 20%" and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a,b,c,d,e): d, True).map(lambda (a,b,c,d,e): float(e)  )

PPP = lines.filter(lambda x: x[0]==country and x[2]=="Poverty headcount ratio at $2 a day (PPP) (% of population)" and x[4] != '' and x[3] >= year_start and x[3] <= year_end).sortBy(lambda (a,b,c,d,e): d, True)
Beispiel #56
0
def main(args):
    file_path = args.input
    # './dataset/001-SART-August2017-MB.csv'
    Server_path = ['/usr/lib/jvm/java-1.8.0-openjdk-amd64',
                   './res/saved_dataset',
                   file_path
                   ]
    Yu_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home',
               './res/saved_dataset',
               './dataset/001-SART-August2017-MB-50.csv']
    Leo_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_151.jdk/Contents/Home',
                './res/saved_dataset',
                file_path]
    Yuncong_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home',
                    './res/saved_dataset',
                    file_path]

    path = Server_path
    os.environ['JAVA_HOME'] = path[0]
    # create a spark job
    cores = args.cores
    st = args.st
    full_length = args.full_length
    sc = SparkContext('' + 'local' + '[' + str(cores) + ']' + '', "First App")

    # sc = SparkContext("local[4]", "First App")
    # st = 0.25
    new_path = re.match(r"(.*)\.csv", path[2]).group(1)
    path_save_res = path[1] + '/' + new_path + '_' + str(st)
    # if the save path already exists, reuse the pickled results instead of recomputing
    if os.path.isdir(path_save_res):
        group_rdd = sc.pickleFile(path_save_res + '/group/')
        cluster_rdd = sc.pickleFile(path_save_res + '/cluster/')
        global_dict_rdd = sc.pickleFile(path_save_res + '/dict/')
        # shutil.rmtree(path_save_res)
    else:
        # TODO
        file = path[2]
        # add test for commit
        features_to_append = [0, 1, 2, 3, 4]

        # res_list: list of raw time series data to be distributed
        # time_series_dict: a dictionary version of res_list, used for subsequence look-up
        res_list, time_series_dict, global_min, global_max = generate_source(file, features_to_append)
        print('processing dataset' + path[2])
        print("Global Max is " + str(global_max))
        print("Global Min is " + str(global_min))

        normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min, global_max)

        # TODO
        # add clustering method after grouping

        # this broadcast object can be accessed from all nodes in computer cluster
        # in order to access the value this, just use val = global_dict.value
        # for future reading data
        # NOTE that the data being broadcasted is the minmax-normalized data
        global_dict = sc.broadcast(normalized_ts_dict)
        time_series_dict = sc.broadcast(time_series_dict)
        # max(flows, key=lambda k: len(flows[k]))
        # find the key of largest length of
        # max_len_key = max(global_dict.value, key=lambda k: len(global_dict.value[k]))
        # max_length = len(global_dict.value[max_len_key])
        if full_length:
            grouping_range = (1, max([len(v) for v in global_dict.value.values()]))

        else:
            grouping_range = (89, 90)
        #         grouping_range = (1, length)

        global_dict_rdd = sc.parallelize(res_list[1:], numSlices=16)
        global_dict_rdd.saveAsPickleFile(path_save_res + '/dict/')

        # global_dict_res = global_dict_rdd.collect()
        # finish grouping here, result in a key, value pair where
        # key is the length of sub-sequence, value is the [id of source time series, start_point, end_point]
        # res_rdd = global_dict_rdd.flatMap(lambda x: get_all_subsquences(x)).collect()

        # In get_subsquences(x, 100, 110): we are grouping subsequences that are of length 90 to 110

        """
        ##### group
        group_rdd_res: list: items = (length, time series list) -> time series list: items = (id, start, end)
        """
        # add save option or not
        group_start_time = time.time()
        group_rdd = global_dict_rdd.flatMap(lambda x: get_subsquences(x, grouping_range[0], grouping_range[1])).map(
            lambda x: (x[0], [x[1:]])).reduceByKey(
            lambda a, b: a + b)
        group_rdd.saveAsPickleFile(path_save_res + '/group/')
        group_end_time = time.time()
        print('group of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1]) + ' using ' + str(
            group_end_time - group_start_time) + ' seconds')
        # group_rdd_res = group_rdd.collect()
        print("grouping done, saved to dataset")

        """
        ##### cluster

        The following code is for testing clustering operation. Cluster one group without using RDD
        4/15/19
        # print("Test clustering")
        # group_res = group_rdd.collect()
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        """

        # print("Test clustering")
        # group_res = group_rdd.collect()
        # # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9

        print("Working on clustering")
        cluster_start_time = time.time()
        cluster_rdd = group_rdd.map(lambda x: cluster(x[1], x[0], st, global_dict.value))

        cluster_rdd.saveAsPickleFile(path_save_res + '/cluster/')  # save all the cluster to the hard drive
        cluster_rdd_reload = sc.pickleFile(path_save_res + '/cluster/').collect()  # here we have all the clusters in memory
        # first_dict = cluster_rdd_reload[0]
        cluster_end_time = time.time()

        print('clustering of timeseries from ' + str(grouping_range[0]) + ' to ' + str(
            grouping_range[1]) + ' using ' + str(cluster_end_time - cluster_start_time) + ' seconds')

        print("clustering done, saved to dataset")

        # plot all the clusters
        # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5)

        """
            ##### query
            Current implementation: if we want to find k best matches, we give the first k best matches for given sequence length range


            The following line is for testing querying on one cluster
            # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value)

        """



        # print("Using Twopass")
        # total_cluster_count = 0
        # for cluster_dic in cluster_rdd.collect():
        #
        #     representative, cluster_subsequences = random.choice(list(cluster_dic.items()))
        #
        #     cluster_length = representative.get_length()
        #     total_cluster_count = total_cluster_count + len(cluster_dic.keys())
        #
        #     print("length " + str(cluster_length) + " has cluster count of " + str(len(cluster_dic.keys())))
        # print("Total cluster count is: " + str(total_cluster_count))

        # # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        # '(2013e_001)_(100-0-Back)_(B-DC8)_(232665953.1250)'
        query_id = '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        query_sequence = get_data(query_id, 24, 117, time_series_dict.value)  # get an example query
        filter_rdd = cluster_rdd.filter(lambda x: exclude_same_id(x, query_id))
        # raise exception if the query_range exceeds the grouping range
        querying_range = (90, 91)
        k = 5  # looking for k best matches
        if querying_range[0] < grouping_range[0] or querying_range[1] > grouping_range[1]:
            raise Exception("query_operations: query: Query range does not match group range")

        query_result = cluster_rdd.filter(lambda x: x).map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value)).collect()
        exclude_overlapping = True
        query_result = filter_rdd.map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value,
                                   exclude_overlapping,
                                   0.5)).collect()

        plot_query_result(query_sequence, query_result, time_series_dict.value)

    sc.stop()
def __main__():
    # Get program options
    input_path = ""
    num_learners = 1
    num_parts = 1
    output_path = '/filer/tmp1/yw298/spark/output/'
    fs = 'file:'
    save_data = 0

    # Parameters for base learner
    max_depth = None
    max_features = None
    min_samples_leaf = 1
    min_samples_split = 2

    # Parameters for coefficient fitting
    regularizer = None
    niters = 100
    reg_weight = 1.0
    step_size = 1.0
    batch_frac = 1
    
    for option in sys.argv:
        opt_val = option.split('=')
        if opt_val[0] == '--input':
            input_path = str(opt_val[1])
        elif opt_val[0] == '--fs':
            fs = str(opt_val[1])
        elif opt_val[0] == '--num_learners':
            num_learners = int(opt_val[1]) - 1
        elif opt_val[0] == '--num_parts':
            num_parts = int(opt_val[1])
        elif opt_val[0] == '--max_depth':
            max_depth = int(opt_val[1])
        elif opt_val[0] == '--max_features':
            max_features = int(opt_val[1])
        elif opt_val[0] == '--min_samples_leaf':
            min_samples_leaf = int(opt_val[1])
        elif opt_val[0] == '--min_samples_split':
            min_samples_split = int(opt_val[1])
        elif opt_val[0] == '--output':
            output_path = str(opt_val[1])
        elif opt_val[0] == '--regularizer':
            regularizer = str(opt_val[1])
        elif opt_val[0] == '--niters':
            niters = int(opt_val[1])
        elif opt_val[0] == '--reg_weight':
            reg_weight = float(opt_val[1])
        elif opt_val[0] == '--step_size':
            step_size = float(opt_val[1])
        elif opt_val[0] == '--batch_fraction':
            batch_frac = float(opt_val[1])
        elif opt_val[0] == '--save_data':
            save_data = int(opt_val[1])

    print '>>> input_path = %s' % str(input_path)
    print '>>> num_learners = %s' % str(num_learners)
    print '>>> num_parts = %s' % str(num_parts)
    print '>>> output_path = %s' % str(output_path)
    print '>>> max_depth = %s' % str(max_depth)
    print '>>> max_features = %s' % str(max_features)
    print '>>> min_samples_leaf = %s' % str(min_samples_leaf)
    print '>>> min_samples_split = %s' % str(min_samples_split)
    print '>>> regularizer = %s' % str(regularizer)
    print '>>> niters = %s' % str(niters)
    print '>>> reg_weight = %s' % str(reg_weight)
    print '>>> step_size = %s' % str(step_size)
    print '>>> file_system = %s' % str(fs)
    print '>>> batch_fraction = %s' % str(batch_frac)
    print '>>> save_data = %s' % str(save_data)

    if input_path == "":
        print >> sys.stderr, "Usage: parallel boosting training <file>"
        exit(-1)

    # Initialize Spark
    conf = SparkConf()
    sc = SparkContext(conf=conf)

    # Map function of mapping training data to each learner (1):
    # Randomly partitioning the entire dataset.
    def func_pmap_rndpartition(p_iter):
        rnd.seed()

        for (k, v) in p_iter:
            yval = v[0]
            xvec = v[1]

            kv_pair = (rnd.randint(1, num_learners + 1), (yval, xvec))
            yield kv_pair

    # Map function of mapping training data to each learner (2):
    # Mapping each example to num_learners copies with their labels corrupted
    # with standard Gaussian multiplications.
    def func_pmap_rndlabeling(p_iter):
        rnd.seed()

        for v in p_iter:
            yval = v[0]
            xvec = v[1]

            # Emitting the training set with true labels
            yield (0, (yval, xvec))

            # Emitting the training sets with corrupted labels
            for tid in range(num_learners):
                coin = rnd.random()
                if coin < 0.5:
                    yval_rnd = -yval
                else:
                    yval_rnd = yval

                kv_pair = (tid + 1, (yval_rnd, xvec))
                yield kv_pair

    # Def of mapping function for training each learner
    def func_train_learner((key, data)):
        yvec = []
        xmat = []

        # Emitting the trained learners
        for (yval, xvec) in data:
            # Append label and feature values
            xmat.append(xvec)
            yvec.append(yval)

        # Train learner
        learner = tree.DecisionTreeRegressor( \
                max_depth = max_depth, \
                max_features = max_features, \
                min_samples_leaf = min_samples_leaf, \
                min_samples_split = min_samples_split \
                )
        learner.fit(xmat, yvec)

        return (key, learner)

    # Hypothesis sampling
    train_data_HS = sc.pickleFile(fs + input_path) \
            .repartition(num_parts) \
            .persist(StorageLevel.MEMORY_AND_DISK)

    train_data_map_HS = train_data_HS.mapPartitions(func_pmap_rndlabeling).\
            combineByKey(createCombiner = lambda v : [v], \
            mergeValue = lambda c, v : c + [v], \
            mergeCombiners = lambda c1, c2 : c1 + c2 \
            )

    learner_class = train_data_map_HS.map(func_train_learner).collect()
    learner_class_broadcast = sc.broadcast(learner_class)

    # Map function of generating training data for coefficient fitting
    def func_map_ypredmat(v):
            yval = v[0]
            xvec = v[1]

            values = np.zeros(num_learners + 1)
            for l in learner_class_broadcast.value:
                values[l[0]] = l[1].predict(xvec)

            return LabeledPoint(yval, values)

    # Coefficient fitting
    train_data_CF = train_data_HS \
            .map(func_map_ypredmat) \
            .persist(StorageLevel.MEMORY_AND_DISK)

    coeffs = LinearRegressionWithSGD.train(\
            data = train_data_CF, \
            iterations = niters, \
            step = step_size, \
            regType = regularizer, \
            regParam = reg_weight, \
            miniBatchFraction = batch_frac, \
            intercept = False
            )

    coeffs_list = sorted(list(enumerate(coeffs.weights, start = 1)), \
            key = lambda kv : kv[0])

    learner_class_list = sorted(learner_class, \
            key = lambda kv : kv[0])

    if save_data:
        # Save the raw training data
        train_data_HS.saveAsPickleFile(path = fs + output_path + '/train_data', batchSize = 10240)

        # Save the coeff-fit training data
        train_data_CF.saveAsPickleFile(path = fs + output_path + '/coeff_data',  batchSize = 10240)

    # Save the learner class and fitted coefficients
    file = open(output_path + '/learner_class', 'w')
    pk.dump(learner_class_list, file)
    file.close()

    file = open(output_path + '/fitted_coeffs', 'w')
    pk.dump(coeffs_list, file)
    file.close()

    sc.stop()