Example #1
def main(training_file,n):

    epochs = int(n)
    x,y,tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i],y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0,epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x,1)),
                             (lambda x, y: (x[0] + y, x[1] + 1)),
                             (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()

        for (feat, (a,b)) in feat_vec_list:
            v[feat] = float(a)/float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    #trainer.perceptron_algorithm(5)
    print "iteration %d in %f seconds" %(iterations, time.time()-t0)
    # Write out the final weight vector
    write_weight_vector(v)
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 10 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)   # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
def geneSpark(input_filename, output_filename,
                upstream_bp=2000, downstream_bp=500):
    '''
    Performs geneSpark extensions given an `input_filename`
    and stores the output in `output_filename`

    Parameters
    ----------

    input_filename : string
        path to the GTF file

    output_filename : string
        path to the output extended GTF file

    upstream_bp : int (default=2000):
        Extend upstream of first exon of each gene

    downstream_bp : int (default=500):
        Extend downstream of last exon of each gene
    '''
    # create spark context
    sc = SparkContext(appName="geneSpark")

    # set up broadcasting variables
    upstream_bp_var = sc.broadcast(upstream_bp)
    downstream_bp_var = sc.broadcast(downstream_bp)

    # create temporary folder where to store the output chunks
    tempFile = NamedTemporaryFile(delete=True)
    tempFile.close()

    # define the spark pipeline
    (sc.textFile(input_filename)
     .map(lambda x: x.split('\t'))
     .filter(lambda x: x[2] == 'exon')
     .map(parse_line)
     .reduceByKey(min_and_max)
     .sortByKey()
     .map(partial(geneSpark,
                  upstream_bp=upstream_bp_var,
                  downstream_bp=downstream_bp_var))
     .saveAsTextFile(tempFile.name))

    # merge output chunks to single output_filename
    with open(output_filename, 'w') as fw:
        for line in input(sorted(glob(tempFile.name + "/part-000*"))):  # `input` here is expected to be fileinput.input
            fw.write(line)

    sc.stop()
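A minimal usage sketch for geneSpark(); the GTF file names below are placeholders, not paths from the original example.

if __name__ == '__main__':
    # extend every gene 2 kb upstream and 500 bp downstream (the defaults)
    geneSpark('annotations.gtf', 'annotations_extended.gtf',
              upstream_bp=2000, downstream_bp=500)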
def main():
    """Process the input file got as a command-line argument."""

    global stop_words, punctuations

    input_file, feature_dimensions, num_clusters, max_iterations, runs = _parse_cmd_line_args()

    sc = SparkContext(conf=_get_conf("CS-838-Assignment3-PartB"))

    # for the _tokenize function to remove stopwords and punctuations
    stop_words = sc.broadcast(set(stopwords.words("english")))
    punctuations = sc.broadcast(set(string.punctuation))

    input_text_rdd, tfidf_vectors_rdd = get_feature_vectors(sc, input_file, feature_dimensions)
    model = build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs)
    top_n_in_each_cluster(sc, input_text_rdd, tfidf_vectors_rdd, model, 5)
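Hedged sketch of the _tokenize helper that the comment above refers to; the real implementation is not shown in this example, so this is only one plausible version built on the broadcast stop_words and punctuations sets.

def _tokenize(line):
    # lowercase, strip punctuation characters, then drop English stopwords
    cleaned = ''.join(c for c in line.lower() if c not in punctuations.value)
    return [w for w in cleaned.split() if w not in stop_words.value]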
def main():

    # Ensure a search term was supplied at the command line
    if len(sys.argv) != 2:
        sys.stderr.write("Usage: {} <search_term>".format(sys.argv[0]))
        sys.exit()

    # Create the SparkContext
    sc = SparkContext(appName="SparkWordCount")

    # Broadcast the requested term
    requested_movie = sc.broadcast(sys.argv[1])

    # Load the input file
    source_file = sc.textFile("/user/hduser/input/movies")

    # Get the movie title from the second fields
    titles = source_file.map(lambda line: line.split("|")[1])

    # Create a map of the normalized title to the raw title
    normalized_title = titles.map(lambda title: (re.sub(r"\s*\(\d{4}\)", "", title).lower(), title))

    # Find all movies matching the requested_movie
    matches = normalized_title.filter(lambda x: requested_movie.value in x[0])

    # Collect all the matching titles
    matching_titles = matches.map(lambda x: x[1]).distinct().collect()

    # Display the result
    print "{} Matching titles found:".format(len(matching_titles))
    for title in matching_titles:
        print title

    sc.stop()
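A quick standalone illustration of the title normalization used above; the sample title is made up and no SparkContext is needed.

import re
print(re.sub(r"\s*\(\d{4}\)", "", "Toy Story (1995)").lower())  # -> "toy story"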
def main(name, divide):

    """
    old_g = pickle.load(open("/net/data/facebook/facebook-ucsb/Facebook_2008/"+name +"/original_pickles/"+name +".pickle", 'r'))
    new_g = networkx.Graph()
    for node, friends in old_g.adj.iteritems():
        if node not in new_g.nodes():
            new_g.add_node(node)
        for friend in friends.iterkeys():
            new_g.add_node(friend)
            new_g.add_edge(node, friend)
            """
    # serialize the networkx graph as text files of edgelist
    # into a text file for workers to read

    #   networkx.write_edgelist(new_g, "edgelist/"+name, data=False)
    #   subprocess.check_call("hdfs dfs -put edgelist/"+name+ " edgelist/", shell=True)

    new_g = networkx.read_adjlist(name + "_list.txt")  # Egypt_list is an edge list
    sc = SparkContext(appName="Sorted_removal")

    dataG = json_graph.node_link_data(new_g)
    stringG = json.dumps(dataG)
    originalG = sc.broadcast(stringG)
    edges = sc.textFile("hdfs://scrapper/user/xiaofeng/edgelist/" + name, 192 * 4 * int(divide))
    costs = edges.map(lambda line: line.split(" ")).map(lambda edge: edge_to_cost(edge, originalG.value))
    costs.saveAsTextFile("hdfs://scrapper/user/xiaofeng/costs_" + name)
    sc.stop()
    subprocess.check_call("hdfs dfs -get costs_" + name + " /home/xiaofeng/facebook/FacebookProject/costs/", shell=True)
    Reformat("/home/xiaofeng/facebook/FacebookProject/costs/costs_" + name + "/", name)
Example #7
def run(date):

    """"
    加载hdfs上 业务提供的规则
    并封装成 FunnelRule对象
    例如:[FunnelRule(funnelId=u'1496', ruleId=u'896', level=u'1', requestRule=u'contains')]
    """""
    sc = SparkContext(appName="readHdfsFile",master=conf.sparkURL)

    rulesList=readFile(sc,conf.dim_model_url_new).flatMap(lambda line:line.split('\r\n')).map(buildBean).collect() #OrderedDict(

    rules_lookup = sc.broadcast(rulesList)

    """
      setp2:加载点击流日志与规则表比对,剔除无效日志, 生成后期数据分析结构(in 1-----> out N+)
      set4:产生新的key
      set5:
    """

    """
>>>rdd2=sc.parallelize([['1\t1',['1','1','2','a']],['1\t1',['1','1','1','b']],['2\t1',['2','1','1','b']]])
>>>rdd2.groupByKey().map(lambda line:list(line[1])).filter(lambda x:x[0][0]=='1').flatMap(lambda x:x).collect()
     [['1', '1', '2', 'a'], ['1', '1', '1', 'b']]
"""

    #conf.click_jr_log_url_dir+"/dt="+date

    clickLogRDD=readFile(sc,"/funnelNew/input/click_log/000000_0").map(rowSplit)

    clickLogRDD1=clickLogRDD.flatMap(lambda line:funnelFilter.getList(line[0],rules_lookup)).groupByKey()\
        .map(lambda line:line[1]).filter(reduceFilter).flatMap(lambda x:x).map(countSessionKey).\
        partitionBy(1).reduceByKey(add)

    clickLogRDD1.saveAsTextFile("/funnelNew/output/dt="+date)
Example #8
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs, n_iter_search):
    # Spark configuration.
    conf = (SparkConf()
             .setMaster("local[" + str(n_jobs) + "]")
             .setAppName("FDD")
             .set("spark.executor.memory", "512mb")
             .set("spark.cores.max", str(n_jobs)))
    sc = SparkContext(conf=conf)
    # Build hyperparameter vectors.
    parameters = cartesian((n_components,
                            n_pc,
                            covar_types))
    # Distribute the hyperparameters vector.
    parameters_rdd = sc.parallelize(parameters, 96)
    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)
    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(lambda param: train_with_parameters(param, data_broadcast))
    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))
    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])
    # The first is the best model.
    best_model = sorted_models.collect()[0][1]
    sc.stop()
    return best_model
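Hedged usage sketch for _train_spark(); the data array and hyperparameter grids below are illustrative placeholders only.

import numpy as np

data = np.random.rand(1000, 10)          # placeholder dataset
best = _train_spark(data,
                    n_components=[1, 2, 4],        # candidate mixture sizes
                    n_pc=[2, 5],                   # candidate numbers of principal components
                    covar_types=['full', 'diag'],  # candidate covariance structures
                    verbose=False, n_jobs=4, n_iter_search=10)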
def count_triangles(data, master="local[2]"):
    """
    @brief: Count triangles using Spark
    @param data: The data location for the input files
    @param master: The master URL as defined at
    https://spark.apache.org/docs/1.1.0/submitting-applications.html#master-urls
    """
    #################  NO EDITS HERE ###################
    assert not os.path.exists("triangles.out"), "File: triangles.out \
    already exists"
    sc = SparkContext(master, "Triangle Count")
    start = time()
    ###############  END NO EDITS HERE  ################
    # TODO: Your code goes here!
    people = sc.textFile(data)
    AdjList = people.map(makepair)
    DriverAdj = dict(AdjList.collect())
    WorkerAdj = sc.broadcast(DriverAdj)
    Edges = AdjList.flatMapValues(lambda x: x)
    TriSet = Edges.map(lambda (k,v): ((k,v), 
             AintersectB(k,v,WorkerAdj.value)))
    Triangle = TriSet.flatMapValues(lambda x: x).map(lambda (k,v): 
             tuple(sorted([int(v),int(k[0]),int(k[1])],reverse=True)))
    output = set(Triangle.collect())
    #################  NO EDITS HERE  ###################
    print "\n\n*****************************************"
    print "\nTotal algorithm time: %.4f sec \n" % (time()-start)
    print "*****************************************\n\n""" 
    ###############  END NO EDITS HERE ################
    with open("triangles.out", "wb") as f:
        for friends in output:
            f.write(str(friends[0])+" "+str(friends[1])+" "+str(friends[2])+"\n") # TODO: Loop with f to write your result to file serially
        pass
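Hedged sketch of the makepair() and AintersectB() helpers used above; the input format (a person followed by a space-separated list of friends) is an assumption, since the helpers are not shown in the example.

def makepair(line):
    parts = line.split()
    return (parts[0], parts[1:])   # (person, [friends])

def AintersectB(k, v, adj):
    # vertices adjacent to both k and v, i.e. the third corners of triangles on edge (k, v)
    return set(adj.get(k, [])) & set(adj.get(v, []))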
def SparkBroadcastAccumulator(n): 
	global broadcast_var
	global accumulator_var
	spcon = SparkContext("local[2]","SparkBroadcastAccumulator")
	broadcast_var=spcon.broadcast("broadcast_message")
	accumulator_var=spcon.accumulator(0)
	spcon.parallelize(xrange(1,n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
def SLAPmi_initialize_spark(fullpath):
    D = io.loadmat(fullpath, struct_as_record=False, squeeze_me=True)

    obs = D['obs']
    opts = D['opts']

    Y = obs.data_in
    P0 = opts.P.T  # transpose

    Sk = D['Sk']
    Su = D['Su']
    if len(Su.shape)<2:
        Su = Su[:,None]

    masks = D['masks']
    #S = Sk
    #S = np.concatenate((Sk,Su), axis=1)

    def P (frame):
        return P0


    def solveOneFrame(frameDataIn):  #framedata has structure [framenumber, y[:,framenumber]]
        Pt = P(frameDataIn[0])
        #PSk = np.zeros((Pt.shape[0], Sk.shape[0]))
        #for Sk_ix in range(len(Sk)):
        #    PSk[:, Sk_ix] = Pt[:,masks[:,Sk_ix].toarray()[:,0]].dot(Sk[Sk_ix])
        #code.interact(local=locals())
        PSk = Pt.dot(Sk_bc.value).toarray()
        PSu = Pt.dot(Su_bc.value)
        PS = np.concatenate((PSk, PSu), axis=1)
        F = optimize.nnls(PS,frameDataIn[1])

        #code.interact(local=locals())

        return F[0]


    #code.interact(local=locals())

    conf = SparkConf().setAppName('SLAPmi_initialize')
    sc = SparkContext(conf=conf)

    Sk_bc = sc.broadcast(Sk)
    Su_bc = sc.broadcast(Su)

    frameData = [(i, Y[:,i]) for i in range(Y.shape[1])]

    F_solved = np.array(sc.parallelize(frameData,len(frameData)).map(solveOneFrame).collect())

    #
    print 'F_solved', F_solved.shape
    print 'Sk', Sk.shape
    print 'Su', Su.shape

    Fk = F_solved[:, 0:Sk.shape[1]].T
    Fu = F_solved[:, Sk.shape[1]:(Sk.shape[1]+Su.shape[1])].T

    return Sk,Su,Fk,Fu, obs, opts, masks, D['ground_truth']
Example #12
class BroadcastTest(unittest.TestCase):

    def tearDown(self):
        if getattr(self, "sc", None) is not None:
            self.sc.stop()
            self.sc = None

    def _test_encryption_helper(self, vs):
        """
        Creates a broadcast variable for each value in vs, and runs a simple job to make sure the
        value is the same when it's read in the executors.  Also makes sure there are no task
        failures.
        """
        bs = [self.sc.broadcast(value=v) for v in vs]
        exec_values = self.sc.parallelize(range(2)).map(lambda x: [b.value for b in bs]).collect()
        for ev in exec_values:
            self.assertEqual(ev, vs)
        # make sure there are no task failures
        status = self.sc.statusTracker()
        for jid in status.getJobIdsForGroup():
            for sid in status.getJobInfo(jid).stageIds:
                stage_info = status.getStageInfo(sid)
                self.assertEqual(0, stage_info.numFailedTasks)

    def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
        and also multiple jobs.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5])
        self._test_encryption_helper([5, 10, 20])

    def test_broadcast_with_encryption(self):
        self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true"))

    def test_broadcast_no_encryption(self):
        self._test_multiple_broadcasts()

    def _test_broadcast_on_driver(self, *extra_confs):
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        bs = self.sc.broadcast(value=5)
        self.assertEqual(5, bs.value)

    def test_broadcast_value_driver_no_encryption(self):
        self._test_broadcast_on_driver()

    def test_broadcast_value_driver_encryption(self):
        self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true"))
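A minimal runner sketch for the test case above, assuming the usual `import unittest` at the top of the file.

if __name__ == "__main__":
    unittest.main()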
Example #13
def process(master, input_container, output_container):
    sc = SparkContext(master, "CDNBilling")

    # load broadcast variables
    countryMapRDD = sc.textFile(input_container + "/country_map.tsv")
    countryMapList = countryMapRDD.collect()
    sc.broadcast(countryMapList)
    countryMapDict.update(createCountryDict(countryMapList))

    # load domainLogs
    domainsRawRDD = sc.textFile(input_container + "/domains_map.tsv")
    domainsRDD = domainsRawRDD.map(formatDomainsLine)

    # load logs
    logsRDD = sc.textFile(input_container + "/raxcdn_*.gz")
    # drop the header
    actual_log_lines = logsRDD.filter(lambda x: x[0] != '#')

    # filter by date
    filteredRDD = actual_log_lines.filter(filterByDate)

    # format the data
    formattedRDD = filteredRDD.map(lambda line: formatLogLine(line, countryMapDict))

    # Zero event domains
    domains_unused = domainsRDD.subtractByKey(formattedRDD)
    domains_unused_formatted = domains_unused.map(formatUnusedDomain)

    # for each domain, calculate bandwidth and request count
    aggregatedLogs = formattedRDD.combineByKey(createCombiner, mergeValue,
                                               mergeCombiners)

    # add type of domain, project-ID, service-ID
    joinedWithDomainDetails = aggregatedLogs.join(domainsRDD)

    # join the usage logs with domains map including zero events
    joinedLogs = joinedWithDomainDetails.union(domains_unused_formatted)

    # save the output
    joinedLogs.saveAsTextFile(output_container + "/output-files")

    sc.stop()
Example #14
def main():
    conf = SparkConf().setAppName("Test2")
    sc = SparkContext(conf=conf)
    # new_dict converts a <tuple, value> pair into a <tuple_1, dict(tuple_2, value)> pair
    def new_dict(line):
        Dict = dict()
        Dict[line[0][1]] = line[1]
        return (line[0][0], Dict)
    # read the raw files into <file, content> pairs
    data_raw = sc.wholeTextFiles("/home/djt/data/proclassified")
    # Doc splits the content of each <file, content> pair by line; each line is the text of one verdict
    def Doc(line):
        s = line[1].split("\n")
        return s[0:len(s) - 1]
    # <file, content> pairs => <verdict path, verdict content> pairs
    data = data_raw.flatMap(Doc)
    # map the verdict path => ID
    def DocID(string):
        s = filter(lambda x: x.isdigit(), string)
        return s[1:len(s)]
    # <verdict path, verdict content> => <verdict ID, verdict content>
    data_wordsplit = data.map(lambda line: (DocID(line.split(",<")[0]), line.split(",<")[1].split(" ")))
    # remove the spaces left by word segmentation so the later regex matching is easier
    def Doc_Integration(line):
        doc = ""
        for k in line[1]:
            doc += k
        return (line[0], doc)
    # <verdict ID, verdict content (with spaces)> => <verdict ID, verdict content>
    data_doc = data_wordsplit.map(Doc_Integration)
    # extract the candidate dimensions from keywords_body.txt and compile them as regular expressions
    keywords_raw = sc.textFile("/home/djt/data/keywords_crime.txt")
    keywords = keywords_raw.map(
        lambda line: re.compile(line)).collect()
    # broadcast the <dimension, set(feature words)> pairs
    keywords = sc.broadcast(keywords)
    # use the regexes to match every corrupt conduct type (i.e. charge) appearing in each verdict
    def keywords_stats(line):
        doc = line[1]
        # match: doc is the verdict text, value[0] is the compiled regular expression
        temp = keywords.value[0].findall(doc)
        crime_set = set(temp)
        crime = ""
        for k in crime_set:
            crime+="\t"+k
        return (line[0],crime)
    # raw: <verdict ID, all conduct types (charges) that appear>
    raw = data_doc.map(keywords_stats)
    after = raw.sortByKey()
    # output
    res = after.map(lambda (k, v): k + "\t" + v)
    res.saveAsTextFile("/home/djt/data/out")
def computeMinHashSig(K, N, rdd):
    """

    :param K: number of random hash functions (i.e., the number of rows of the signature matrix)
    :param N: maximum number of elements in any of the considered sets
    :param rdd: RDD where each record contains one set represented as a sorted list of 32-bit integers from the
                range [1 , . . . , N]
    :return: RDD containing the signature matrix, stored column-wise.
             That is, one record holds the K entries that correspond to the signature of one set
    """
    sc = SparkContext(appName="PythonMinhash")
    # first choose a set of K random hash functions h1,..., hK (described in lecture 5 on slide 33)
    hashParams = sc.broadcast(generateHashParams(K))

    data = sc.parallelize(rdd)
    sig = data.map(lambda x: computeSig(hashParams.value, N, x))
    return sig.collect()
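Hedged sketch of the generateHashParams() and computeSig() helpers the function above relies on; they are not shown in the example, so this is one standard MinHash formulation (it assumes N is smaller than the prime P).

import random

P = 2147483647  # large prime used as the hash modulus

def generateHashParams(K):
    # K random (a, b) pairs defining hash functions h(x) = (a*x + b) mod P
    return [(random.randint(1, P - 1), random.randint(0, P - 1)) for _ in range(K)]

def computeSig(hashParams, N, s):
    # MinHash signature of set s: the minimum hash value under each hash function
    return [min((a * x + b) % P for x in s) for (a, b) in hashParams]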
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                        'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model',
                        help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)
    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url,
                                                 rest_url, som))

    ssc.start()
    ssc.awaitTermination()
Example #17
def spark_batch(sc: SparkContext, feature_names, question_db: str, guess_db: str,
                granularity='sentence'):
    sql_context = SQLContext(sc)
    question_db = QuestionDatabase(question_db)

    log.info("Loading Questions")
    questions = question_db.guess_questions()

    log.info("Loading Guesses")
    guess_list = GuessList(guess_db)
    guess_lookup = guess_list.all_guesses(allow_train=True)

    log.info("Loading tasks")
    tasks = [Task(q, guess_lookup[q.qnum]) for q in questions]
    shuffle(tasks)
    log.info("Number of tasks: {0}".format(len(tasks)))

    features = {name: instantiate_feature(name, question_db) for name in feature_names}

    b_features = sc.broadcast(features)

    def f_eval(x):
        return evaluate_feature_question(x, b_features, granularity)

    log.info("Beginning feature job")
    feature_rdd = sc.parallelize(tasks)\
        .repartition(150 * len(feature_names))\
        .flatMap(f_eval)

    feature_df = sql_context.createDataFrame(feature_rdd, SCHEMA).cache()
    feature_df.count()
    log.info("Beginning write job")
    for fold in FOLDS:
        feature_df_with_fold = feature_df.filter('fold = "{0}"'.format(fold)).cache()
        for name in feature_names:
            filename = 'output/features/{0}/sentence.{1}.parquet'.format(fold, name)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            feature_df_with_fold.filter('feature_name = "{0}"'.format(name))\
                .write.save(filename, mode='overwrite')
        feature_df_with_fold.unpersist()
    log.info("Computation Completed, stopping Spark")
    sc.stop()
def main():
    # master = 'local[2]'
    master = 'spark://192.168.9.164:7077'
    app_name = 'test-broadcast'
    # spark_home = '/data01/app/bigdata/spark'  # local
    spark_home = '/home/hadoop/app/spark'  # test

    pyFiles = ['mysql_utils.py']
    spark_conf = SparkConf()
    spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
    sc = SparkContext(conf=spark_conf)
    for path in (pyFiles or []):
        sc.addPyFile(path)

    external_cache = get_api_deviceinfo()

    deviceinfo_b = sc.broadcast(external_cache)


    sc.stop()
Example #19
def feature_to_fdata(file_name):
    from pyspark import SparkContext
    def handle(x):
        line = x.split("\t")
        return line[0],line[1:]
    sc = SparkContext(appName="feature_to_fdata")
    data = sc.textFile(file_name)
    result = data.map(handle).reduceByKey(lambda x,y:list(x)+list(y))
    transform_set = read_transform("/home/wangzhe/ccf/data/feature/transform.txt")
    transform_broadcast = sc.broadcast(transform_set)

    def handle2(x):
        uid,values = x
        label = '1' if uid in transform_broadcast.value else '0'
        value_map = {}
        for item in values:
            key,value = item.split(":")
            value_map[key] = float(value)
        return uid,label,value_map

    return result.map(handle2)
Example #20
class TFIDF():

	def __init__(self,input_path,output_path):
		self.input = input_path
		self.output = output_path
		self.texts = glob(self.input + '/*.txt')
		self.conf = SparkConf().setAppName('tfidf')\
							   .setMaster('local')\
							   .set('spark.executor.memory','1g')
		self.sc = SparkContext(conf=self.conf)

	def writeToCSVFile(self,rdd):
		with open(self.output + '/tfidf-scores.csv','wb') as csvfile:
			writer = csv.writer(csvfile)
			writer.writerow(['docID','word','score'])
			writer.writerows(rdd)


	def run(self):
		# Job 1: Word Frequency in Documents.
		tfilter = TextFilter().filter
		wcRDD = self.sc.emptyRDD()
		for dkey,textfile in enumerate(self.texts):
			tf = self.sc.textFile(textfile)\
					 .filter(lambda line: len(line.strip()) > 0)\
				     .flatMap(lambda line: tfilter(line))\
				     .map(lambda word: ((word,dkey),1))\
				     .reduceByKey(operator.add)
			N = tf.map(lambda ((w,d),y): y).sum()
			tf = tf.map(lambda ((w,d),y): ((w,d),(y,N)))
			wcRDD = self.sc.union([wcRDD,tf])

		# Job 2: Word Frequency in Corpus & Calculate TF-IDF.
		D = self.sc.broadcast(len(self.texts))
		wcRDD = wcRDD.map(lambda ((w,d),(a,b)): (w,(d,a,b)))
		wfRDD = wcRDD.map(lambda (w,(d,a,b)): (w,1)).reduceByKey(operator.add)
		tfidf = wcRDD.join(wfRDD).map(lambda (w,((d,a,b),c)): ((d,-a/b * np.log(D.value/c),w),1))\
					 .sortByKey(True).map(lambda ((d,z,w),a): (d,w,-z))
		self.writeToCSVFile(tfidf.collect())
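Hedged usage sketch for the TFIDF class above; the input and output directories are placeholders.

if __name__ == '__main__':
	job = TFIDF('input_txt_dir', 'output_dir')
	job.run()
	job.sc.stop()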
Example #21
	def train(self, data, iterations, partitions=12):
		from pyspark import SparkContext
		sc = SparkContext()
		dataRDD = sc.parallelize(data).cache()
		for t in range(iterations):
			sigma = self._decay_func(self.sigma, t, iterations)
			lr = self._decay_func(self.lr, t, iterations)
			codebookBC = sc.broadcast(self.codebook)
			randomizedRDD = dataRDD.repartition(partitions)
			print "iter: %d, sigma: %.2f, lr: %.2f, error: %.4f" % (t, sigma, lr, self.quantization_error(randomizedRDD.collect()))
			def train_partition(partition_data):
				localCodebook = codebookBC.value
				for elem in partition_data:
					(w_h, w_w) = winner(elem, localCodebook, self.w, self.h)
					g = gaussian(self.w, self.h, (w_h, w_w), sigma) * lr
					it = np.nditer(g, flags=['multi_index'])
					while not it.finished:
						localCodebook[it.multi_index] += g[it.multi_index]*(elem - localCodebook[it.multi_index])
						it.iternext()
				return [localCodebook]
			resultCodebookRDD = randomizedRDD.mapPartitions(train_partition)
			sumCodebook = resultCodebookRDD.reduce(lambda a, b: a + b)
			newCodebook = sumCodebook / float(partitions)
			self.codebook = newCodebook
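Hedged sketch of the winner() and gaussian() helpers used in train_partition(); they are not shown in the example, so this is one common SOM formulation that assumes the codebook has shape (w, h, dim).

import numpy as np

def winner(elem, codebook, w, h):
	# best-matching unit: grid coordinates of the codebook vector closest to elem
	dists = np.linalg.norm(codebook - elem, axis=-1)   # shape (w, h)
	return np.unravel_index(np.argmin(dists), (w, h))

def gaussian(w, h, center, sigma):
	# Gaussian neighbourhood centred on the winning unit, over the (w, h) grid
	rows, cols = np.indices((w, h))
	d2 = (rows - center[0]) ** 2 + (cols - center[1]) ** 2
	return np.exp(-d2 / (2.0 * sigma ** 2))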
Example #22
    nPart = 38 * 14 * 4 * 4
    sDir = op.join(sHdfsDir, 'hg38.fa.nb.enc.gzip')
    sPtter = op.join(sHdfsDir, 'ptter')
    codec = "org.apache.hadoop.io.compress.GzipCodec"

    # print default SparkConf
    sf = SparkConf()
    print sf.toDebugString()
    sc = SparkContext(appName=sApp)

    rdd = sc.textFile(sDir, use_unicode=False)
    rdd = rdd.map(split2KV)

    #lPtter = genPtter(rdd, 0.001, nPart)
    #sc.parallelize(lPtter).saveAsTextFile(sPtter)
    ptter = sc.broadcast(sc.textFile(sPtter, use_unicode=False).collect())

    nTime = 4
    nOne = nPart / nTime
    lIndex = [i * nOne for i in xrange(1, nTime)]
    s0 = ptter.value[lIndex[0]]
    s1 = ptter.value[lIndex[1]]
    s2 = ptter.value[lIndex[2]]

    #print ptter.value[lIndex[0]], ptter.value[:lIndex[0]]
    #print ptter.value[lIndex[0]], ptter.value[lIndex[1]], ptter.value[lIndex[0]:lIndex[1]]
    #print ptter.value[lIndex[1]], ptter.value[lIndex[2]], ptter.value[lIndex[1]:lIndex[2]]
    #print ptter.value[lIndex[2]], ptter.value[lIndex[2]:]

    for i in xrange(4):
        sp.call('hdfs dfs -rm -r ' + op.join(sHdfsDir, 'nb.' + str(i)),
Example #23
def artistToId(line):
    # the opening of this snippet was cut off; this header and try-block mirror the
    # artistToAlias() helper below and are only a plausible reconstruction
    tokens = line.split('\t')
    try:
        return [(int(tokens[0]), tokens[1])]
    except:
        return [(0, "x")]


def artistToAlias(line):
    tokens = line.split('\t')
    try:
        return [(int(tokens[0]), int(tokens[1]))]
    except:
        return [(9999, 0)]


def prepareRawUserArtistData(line, bArtistAlias):
    userID, artistId, count = map(int, line.split(' '))
    finalArtistID = bArtistAlias.value.get(artistId, artistId)
    return mlrecom.Rating(userID, finalArtistID, count)


###############################################################################

########################CODE###################################################
artistByID = rawArtistData.flatMap(lambda line: artistToId(line))
artistAlias = rawArtistAlias.flatMap(
    lambda line: artistToAlias(line)).collectAsMap()

bArtistAlias = sc.broadcast(artistAlias)
trainData = rawUserArtistData.map(
    lambda line: prepareRawUserArtistData(line, bArtistAlias)).cache()

model = mlrecom.ALS.trainImplicit(trainData, 10, 5, 0.01, 1.0)
Example #24
    #To get 1-item frequent pattern
    one_item = _trans.flatMap(mineOneItem).reduceByKey(add).filter(
        lambda x: x[1] > SUPPORT_NUM).cache()
    result_buffer = one_item.map(
        lambda x: str(x[0]) + ":" + str(float(x[1]) / TRANS_NUM))
    if args.verbose:
        print "1-item pattern:"
        print result_buffer.collect()
    #result_buffer.saveAsTextFile(args.output+"/1_item.out")

    #To get 2-k item frequent pattern
    frequent_pattern = one_item
    for i in range(2, args.k + 1):
        child_pattern = getChildPattern(
            frequent_pattern.map(lambda x: x[0]).collect(), i)
        #print child_pattern
        if len(child_pattern) == 0:
            break
        broadcast_pattern = sc.broadcast(child_pattern)
        frequent_pattern = _trans.flatMap(mineItem).reduceByKey(add).filter(
            lambda x: x[1] > SUPPORT_NUM).cache()
        result_buffer = frequent_pattern.map(
            lambda x: str(x[0]) + ":" + str(float(x[1]) / TRANS_NUM))
        if args.verbose:
            print str(i) + "-item pattern:"
            print result_buffer.collect()
        #result_buffer.saveAsTextFile(args.output+"/"+str(i)+"_item.out")
        broadcast_pattern.unpersist()
    stop = time.time()
    if args.verbose:
        print "Complete! Time cost: {}".format(stop - start)
        question, context_follwers, context_name, topicFollowers, topicNames,
        question_key, ans, anonymous
    ]


with open("answered_data_10k.in") as f:
    data = f.readlines()

N = int(data[0])
data = data[1:]

print N, len(data), data[0]

sc = SparkContext()
sqlContext = SQLContext(sc)
V = sc.broadcast(punctuations)
r = sc.parallelize(data)

r = r.map(lambda s: s.strip()).map(json.loads).map(getData)
r = r.take(10)
df = sqlContext.createDataFrame(r, [
    "question_text", "context_topic_followers", "context_topic_names",
    "topics_followers", "topics_name", "question_key", "__ans__", "anonymous"
])
df.show()
rdd = df.select("question_text").rdd
print rdd.take(2)
row = Row("cleaned_text")
k = rdd.map(lambda d: d["question_text"].lower()).map(lambda word: " ".join(
    [str(w) for w in word.split() if not w in stopword])).map(
        lambda word: ''.join(char for char in word
Example #26
	try:
		spark=SparkSession.builder.config(conf=conf).getOrCreate()
		logger.debug("Initialized spark session successfully")
	except:
		logger.error("Fail to start spark session")

	# Input the dataset
	try:
		logger.debug("Start to read the input dataset")
		posts_df=spark.read.json(posts_file)
		tags_df=spark.read.csv(tags_file, header=True)
		selected_tags=pd.read_csv(selected_tags_file, header=None)
		local_tags_to_catId=dict(zip(selected_tags[0], list(selected_tags.index)))
		local_catId_to_tags=dict(zip(list(selected_tags.index), selected_tags[0]))
		tags_to_catId=sc.broadcast(local_tags_to_catId)
		catId_to_tags=sc.broadcast(local_catId_to_tags)
		tags_set=sc.broadcast(set(selected_tags[0]))
		logger.debug("Read in dataset successfully")
		
	except:
		logger.error("Can't input dataset")

	# Join posts_df and tags_df together and prepare training dataset
	selected_tags_df=tags_df.filter(tags_df.Tag.isin(tags_set.value)).na.drop(how = 'any')
	tags_questions_df=selected_tags_df.join(posts_df, "Id")
	training_df=tags_questions_df.select(['Tag', 'Body','Id']).na.drop(how = 'any')
	logger.debug("successfully get training_df")

	# tokenize post texts and get term frequency and inverted document frequency
	logger.debug("Start to generate TFIDF features")
artistByID = dict(rawArtistData.flatMap(lambda x: pairsplit(x)).collect())


def aliaslookup(alias):
    splitPair = alias.rsplit('\t')
    if len(splitPair) != 2:
        return []
    else:
        try:
            return [(int(splitPair[0]), int(splitPair[1]))]
        except:
            return []


artistAlias = rawArtistAlias.flatMap(lambda x: aliaslookup(x)).collectAsMap()
bArtistAlias = sc.broadcast(artistAlias)


def ratinglookup(x):
    userID, artistID, count = map(lambda line: int(line), x.split())
    finalArtistID = bArtistAlias.value.get(artistID)
    if finalArtistID is None:
        finalArtistID = artistID
    return Rating(userID, finalArtistID, count)


trainData = rawUserArtistData.map(lambda x: ratinglookup(x))
trainData.cache()
'''build model'''
model = ALS.trainImplicit(trainData, 10, 5)
'''test artist'''
Example #28
    workerList = []
    scenarioList = []
    for i in range(scenarioSize):
        workerList.append(Worker(i))
        scenarioList.append("Scenario " + str(i))

    parallelWorkerList = sc.parallelize(zip(workerList,
                                            scenarioList)).persist()

    for x in range(20):
        var1 = randint(100, 10000)
        var2 = randint(100, 10000)
        print("Initializing iteration " + str(x))

        print("Updating variables: ")
        print("\tvar1: " + str(var1))
        print("\tvar2: " + str(var2))

        broadcast1 = sc.broadcast(var1)
        broadcast2 = sc.broadcast(var2)

        # foreach() returns None; use map().collect() so solved_values can be checked below
        solved_values = parallelWorkerList.map(lambda item: do_iteration(
            item[0], item[1], broadcast1, broadcast2)).collect()

        assert len(solved_values) == scenarioSize

    print("Total time: " + str(time.time() - startTime))
import plotly.tools as tls
tls.set_credentials_file(username='******', api_key='njpjllrdy0')

os.environ['SPARK_HOME'] = "/usr/local/spark"
sys.path.append("/usr/local/spark/python")
sys.path.append("/usr/local/spark/python/lib")
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("My app")
conf.set("spark.executor.memory", "8g")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf = conf)
months = {'Jan': '01', 'Feb': '02', 'Mar':'03', 'Apr':'04', 'May': '05', 'Jun': '06', 'Jul':'07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov':'11', 'Dec': '12'}
months = sc.broadcast(months)

class OSDataAnalysis(object):

	@staticmethod
	def urpd(line):
		# line = line.replace("\"", "", 10)
		if "::1" not in line:
			reqTime = re.search(r"\[([A-Za-z0-9_]+)(.+)\]", line).group()[1:-1]
			if re.search(r'\"([A-Za-z0-9_]+)(.+)\"', line) != None: 
				request = re.search(r'\"([A-Za-z0-9_]+)(.+)\"', line).group().split('\"')[1]
			else:
				return False
			# request = re.search(r'\"([A-Za-z0-9_]+)(.+)\"', line).group().split('\"')[1]
			date = reqTime[:11]
			date = date[7:]+'-'+months.value[date[3:6]]+'-'+date[0:2]
Example #30
#use pred_source_all_com_small training tree
data_pred_source_all = sc.textFile("/data/mllib/pred_source_all_com_small").map(data_p_std)

#data_p_std = data.filter(filter_positive_data).sample(True, 50) #resampling positive sample

#union data and data_p
data_trans_feature = data_ans.union(data_p_std).union(data_ans_0827)
#data_union = data_ans.sample(False, 0.5).union(data_p_std)  #balance data set by reduce negative data
data_union = data_ans.union(data_p_std.sample(True, 1.5))  #balance data set by oversampling positive data

#get the unique features, broadcast the value
#col_na = range(80, 83) + [84] + range(87, 92) + [97]
col_na = range(80, 83) + range(87, 92) + [97]
fe = trans_fun(data_trans_feature, col_na)
class_col = sc.broadcast(col_na)
uni_f = sc.broadcast(fe)    #broadcast uni_feature list


#transform raw data to labeledPoint
parsed_data = data_union.map(feature_char_to_num)
#parsed_data = data_pred_source_all.map(feature_char_to_num)



numFeatures = -1
if numFeatures <= 0:
    parsed_data.cache()
    numFeatures = parsed_data.map(lambda x:-1 if x[1].size==0 else x[1][-1]).reduce(max)+1
labeled_data = parsed_data.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1],x[2])))
Example #31
#longitude=data.map(lambda x:float(x[1])).cache()#extract field2
latitude = data.map(lambda x: np.array(x[0]).astype(float))  #extract field1
longitude = data.map(lambda x: np.array(x[1]).astype(float))  #extract field2

coord1 = latitude.zip(
    longitude)  # Zip latitude with longitude to coord(Spatial Information)
#print type(coord1)
coord = coord1.zipWithIndex(
)  # Index the coordinate data--> 'coord' will be in  format(coordinate,index)

coordData = coord.map(
    lambda (k, v):
    (v, k))  # make the index as key and coordinate data as value

count = coordData.count()  # Count the number of points: count is global
countglobal = sc.broadcast(count)
ab = coord1.collect()
#print(ab[:5])
#print type(ab[0][1])
#print(ab[0][1])
tre = spatial.cKDTree(ab)
b1 = coord1.take(2)
aa = sc.broadcast(tre)
graph = coordData.map(lambda (k, v): (k, tre.query(v, 29)))
gra = graph.map(lambda (k, v): (k, v[1][1:]))
cra = gra.cartesian(gra)  #gra=graph.map(lambda (k,v): k)


def intersectionCount(k1, k2, v1, v2):
    countNum = 0
from pyspark import SparkConf, SparkContext

def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.item") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf = conf)

nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile("/Users/bjhav1/Documents/SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map( lambda (x, y) : (y, x))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda (count, movie) : (nameDict.value[movie], count))

results = sortedMoviesWithNames.collect()

for result in results:
    print result
Example #33
def parse_names(line):
	movie_names = {}
	fields = line.split("|")
	return (int(fields[0]), fields[1])

conf = SparkConf()#.setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf = conf)

id_lines = sc.textFile("hdfs://...ml100k/u.item")
id_lines_rdd = id_lines.map(parse_names)
names_dict = id_lines_rdd.collectAsMap()   # creates key:value dict (id:movie)


# Sends our mapping dictionary we made, one time, to every node in cluster and keeps it there
# so it's available when needed and all nodes will know it as the object names_dict
nameDict = sc.broadcast(names_dict)

lines = sc.textFile("hdfs://.../ml100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map(lambda x : (x[1], x[0]))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda countMovie : (nameDict.value[countMovie[1]], countMovie[0]))

results = sortedMoviesWithNames.collect()

for result in results:
    print (result)
    output_file = sys.argv[3]

    sc = SparkContext(master, job_name)

    # Create an accumulator initialized to 0
    #  This will be used to count the number of empty lines in the file
    blank_lines = sc.accumulator(0)
    file = sc.textFile(input_file)

    call_signs = file.flatMap(extract_call_signs)

    call_signs.count()  # Call an action so blank lines can be displayed
    print('Blank lines: {0}'.format(blank_lines.value))

    # Load call signs lookup table into a broadcast variable
    sign_prefixes = sc.broadcast(load_call_lookup())  # call the loader so the table itself is broadcast
    print(type(sign_prefixes))

    ###################################################
    #### Numeric Stats. Switching to my NeoWs Data ####
    ###################################################
    with open('../data/near_miss_data.csv') as csv_file:
        near_miss_data = csv.DictReader(csv_file)
        near_miss_distance = {(float(row['miss_distance_astronomical']))
                              for row in near_miss_data}
    nm_data = sc.parallelize(near_miss_distance).persist()
    stats = nm_data.stats()
    stdev = stats.stdev()
    mean = stats.mean()

    print('Total number of near misses: {0}'.format(nm_data.count()))
Example #35
# Helper functions for looking up the call signs


def lookupCountry(sign, prefixes):
    pos = bisect.bisect_left(prefixes, sign)
    return prefixes[pos].split(",")[1]


def loadCallSignTable():
    f = open("./files/callsign_tbl_sorted", "r")
    return f.readlines()

# Lookup the locations of the call signs on the
# RDD contactCounts. We load a list of call sign
# prefixes to country code to support this lookup.
signPrefixes = sc.broadcast(loadCallSignTable())


def processSignCount(sign_count, signPrefixes):
    country = lookupCountry(sign_count[0], signPrefixes.value)
    count = sign_count[1]
    return (country, count)

countryContactCounts = (contactCounts
                        .map(lambda signCount: processSignCount(signCount, signPrefixes))
                        .reduceByKey((lambda x, y: x + y)))

countryContactCounts.saveAsTextFile(outputDir + "/countries.txt")

# Query 73s for the call signs in CallLogs and parse the response
Example #36
                L_high = tuple(L_high)

                R_low1 = re.findall(r'\d+', R_LOW_HN)
                R_low = list(map(int, R_low1))
                R_low = tuple(R_low)

                R_high1 = re.findall(r'\d+', R_HIGH_HN)
                R_high = list(map(int, R_high1))
                R_high = tuple(R_high)

                borocode = ('Unknown', 'NY', 'BX', 'K', 'Q', 'R')
                yield (int(PHYSICALID), (L_low, L_high), (R_low, R_high),
                       ST_LABEL, borocode[int(BOROCODE)], FULL_STREE)

    street_line = streets.mapPartitionsWithIndex(lines)
    street_list = sc.broadcast(street_line.collect())

    def findid(borough, street, h_num):
        dd = None
        for i in street_list.value:
            if (i[3] == street or i[5] == street) and (i[4] == borough) and (
                (h_num[-1] >= i[2][0][-1] and h_num[-1] <= i[2][1][-1]) or
                (h_num[-1] >= i[1][0][-1] and h_num[-1] <= i[1][1][-1])):
                dd = i[0]
                break
            else:
                dd = None
                break
        return dd

    def extractScores(partId, records):
    images_buf = images_read_rdd.map(images_to_bytes)

    #images_part = images_buf.repartition(3000)

    images_features = images_buf.flatMap(extract_opencv_features("sift"))

    filtered_features = images_features.filter(lambda x: x[1] is not None)
    features_with_filenames = filtered_features.map(
        lambda x: (Row(fileName=x[0], features=x[1].tolist())))

    features = features_with_filenames.flatMap(lambda x: x['features'])

    mod = buildModel()

    clusterCenters = mod.clusterCenters
    clusterCenters = sc.broadcast(clusterCenters)

    features_bow = features_with_filenames.map(
        functools.partial(assign_pooling,
                          clusterCenters=clusterCenters,
                          pooling='max'))

    features_bow.coalesce(1, shuffle=True).saveAsTextFile(
        "hdfs://discus-p2irc-master:54310/tmp/output_image/")

    processing_end_time = time() - processing_start_time
    print "SUCCESS: Images procesed in {} seconds".format(
        round(processing_end_time, 3))

    sc.stop()
Example #38
    MaxWindowPrecMZ = max(np.array([x[1] for x in res])) + max(
        np.array([x[4] for x in res]))
    MaxOffset = max(np.array([x[4] for x in res]))

    SpectraLibrary = {
        k: SpectraLibrary[k]
        for k in SpectraLibrary
        if SpectraLibrary[k]['PrecursorMZ'] < MaxWindowPrecMZ
    }

    conf = (SparkConf().set("spark.driver.maxResultSize", "25g"))

    sc = SparkContext(conf=conf, appName="Specter", pyFiles=['sparse_nnls.py'])

    #Recast the library as a broadcast variable to improve performance
    BroadcastLibrary = sc.broadcast(SpectraLibrary)

    res = sc.parallelize(res, numPartitions)

    output = res.mapPartitions(
        partial(RegressSpectraOntoLibrary,
                Library=BroadcastLibrary,
                tol=delta * 1e-6,
                maxWindowOffset=MaxOffset)).collect()

    output = [[
        output[i][j][0], output[i][j][1], output[i][j][2], output[i][j][3],
        output[i][j][4], output[i][j][5]
    ] for i in range(len(output)) for j in range(len(output[i]))]

    scPath = os.path.join(outputDir, baseName + '_SpecterCoeffs.csv')
Example #39
radioStation = sys.argv[1]

file = open("output.txt", "w")
print(("Getting audience for themes aired on " + radioStation), file=file)
file.close()

#Obtain the titles emitted by the radio station indicated by argument
titles_radioStation_files = sc.textFile("file_cad*.txt").\
                            map(split_file_cad).\
                              filter(lambda keyValue: keyValue[1]==radioStation).\
                                keys().\
                                  collect()

#Broadcast the lookup dictionary to the cluster
titles_radioStation_files_lookup = sc.broadcast(titles_radioStation_files)

#Obtain the total number of listeners to the titles emitted by the indicated radio station
titles_numListeners_files = sc.textFile("file_num*.txt").\
                              map(split_file_num).\
                                filter(lambda keyValue: keyValue[0] in titles_radioStation_files_lookup.value).\
                                  reduceByKey(add).\
                                    collect()

#Sort the output by titles
output = sorted(titles_numListeners_files)

#Save the output
file = open("output.txt", "a")
for o in output:
    print("%s: %d" % (o[0], o[1]), file=file)
Example #40
    def pack_by_strata(col_group, partition_iter):
        strata = collections.defaultdict(list)
        perm = range(num_workers)
        for _ in range(col_group):
            perm.insert(0, perm.pop())

        for entry in partition_iter:
            _, (u, m, _, _, _) = entry
            row_group = (u - 1) / blk_row_size
            strata[(perm[row_group], row_group, col_group)].append(entry[1])

        for item in strata.items():
            yield item

    # add N_i, N_j for each rating entry
    rating_per_user_b = sc.broadcast(rating_per_user)
    rating_per_movie_b = sc.broadcast(rating_per_movie)
    # map to :(<col-group>, (<u> <m> <r> <N_i> <N_j>))
    ratings = ratings.map(lambda r: ((r[1] - 1) / blk_col_size,
                                     # value is a 5-element tuple
                                     (r[0], r[1], r[2],
                                      rating_per_user_b.value[r[0]],
                                      rating_per_movie_b.value[r[1]]))) \
                     .partitionBy(num_workers) \
                     .mapPartitionsWithIndex(pack_by_strata,
                                             preservesPartitioning=True) \
                     .cache()

    def calculate_loss(pred_rating, true_rating):
        error, n = 0.0, 0
        for _, entries in true_rating:
class SparkFEProcess:
    def __init__(self):

        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark_2") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # initialize the related parameters
        # # bins_dict stores the binning scheme for the relevant columns; it is used when processing the test data
        # self.bins_dict={}

    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return parser

    def init_logger(self):
        '''
        Set the log level.
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path",
                                        "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        sqlContext = SQLContext(self.sc)
        print('start to read data after explore_spark_step1_cross:')
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # compare against the labels to check whether the schema fits
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('uid_author_id_count_bin', typ.IntegerType()),
            ('uid_author_id_count_ratio', typ.DoubleType()),
            ('uid_item_city_count_bin', typ.IntegerType()),
            ('uid_item_city_count_ratio', typ.DoubleType()),
            ('uid_channel_count_bin', typ.IntegerType()),
            ('uid_channel_count_ratio', typ.DoubleType()),
            ('uid_music_id_count_bin', typ.IntegerType()),
            ('uid_music_id_count_ratio', typ.DoubleType()),
            ('uid_device_count_bin', typ.IntegerType()),
            ('uid_device_count_ratio', typ.DoubleType()),
            ('author_id_channel_count_bin', typ.IntegerType()),
            ('author_id_channel_count_ratio', typ.DoubleType()),
            ('author_id_user_city_count_bin', typ.IntegerType()),
            ('author_id_user_city_count_ratio', typ.DoubleType()),
            ('author_id_item_city_count_bin', typ.IntegerType()),
            ('author_id_item_city_count_ratio', typ.DoubleType()),
            ('author_id_music_id_count_bin', typ.IntegerType()),
            ('author_id_music_id_count_ratio', typ.DoubleType()),
            ('uid_channel_device_count_bin',
             typ.IntegerType()),  # renamed to uid_channel_device
            ('uid_channel_device_count_ratio',
             typ.DoubleType()),  # renamed to uid_channel_device
            ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
            ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
        ]
        actionLogSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd,
                                                    actionLogSchema)
        df_actLog_test.show(1, truncate=False)

        print('start to read actLog_train_single_cross')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd,
                                                     actionLogSchema)
        df_actLog_train.show(1, truncate=False)

        return df_actLog_train, df_actLog_test

    def data_explore(self, df_train, df_test):
        sqlContext = SQLContext(self.sc)

        print("对item_pub_hour进行离散化")

        def hourBin(x):
            if x >= 23 or x <= 2:
                return 1
            elif 3 <= x < 8:
                return 2
            elif 8 <= x < 12:
                return 3
            else:
                return 4

        converHourBin = udf(lambda x: hourBin(x), typ.IntegerType())
        df_train = df_train.withColumn("item_pub_hour",
                                       converHourBin(df_train.item_pub_hour))
        df_test = df_test.withColumn("item_pub_hour",
                                     converHourBin(df_test.item_pub_hour))

        print("--------1、针对uid,authorid,musicid等组合的正负样本数量统计特征--------")
        print("交叉特征的正负样本数量统计")
        posneg_feats_list = []
        # posneg_feats_list.append(["duration_time"])
        # posneg_feats_list.append(["time_day"])
        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel',
                   'music_id']  #,'item_pub_hour'

        posneg_feats_list.extend([[u_col, a_col] for u_col in users
                                  for a_col in authors])
        # posneg_feats_list.append(['uid','author_id', 'channel'])
        # posneg_feats_list.append(['uid', 'author_id', 'music_id'])
        # posneg_feats_list.append(['uid','author_id', 'channel','time_day'])
        # posneg_feats_list.append(['uid', 'author_id', 'music_id','time_day'])

        print("计算以下交叉特征的正负样本比例")  #有2、3、4维的交叉特征
        print(posneg_feats_list)

        for i in range(len(posneg_feats_list)):
            group_cols = posneg_feats_list[i]
            new_feature = '_'.join(group_cols)
            # compute the positive/negative sample ratio on df_train; for test we simply join it on,
            # filling nulls with 0 or the mean
            # positive/negative labels are decided by the fields: like, finish
            # step 1: concatenate the columns first
            print(new_feature)
            if len(group_cols) == 2:
                print("开始处理2维交叉变量")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType())))

            if len(group_cols) == 3:

                print("start processing 3-way cross variables")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType()),
                        df_train[group_cols[2]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType()),
                        df_test[group_cols[2]].cast(typ.StringType())))
            # if len(group_cols)==4:
            #
            #     print("开始处理4维交叉变量")
            #     df_train=df_train.withColumn(new_feature, fn.concat_ws('_',df_train[group_cols[0]].cast(typ.StringType()),df_train[group_cols[1]].cast(typ.StringType()))
            #                                                      ,df_train[group_cols[2]].cast(typ.StringType()) ,df_train[group_cols[3]].cast(typ.StringType()))
            #     df_test=df_test.withColumn(new_feature, fn.concat_ws('_',df_test[group_cols[0]].cast(typ.StringType()),df_test[group_cols[1]].cast(typ.StringType()))
            #                                                      ,df_test[group_cols[2]].cast(typ.StringType()) ,df_test[group_cols[3]].cast(typ.StringType()))

            for target in ["like", "finish"]:
                df3 = df_train.select(
                    new_feature,
                    target).groupby(new_feature).count().withColumnRenamed(
                        'count', new_feature + '_count')
                df4 = df_train.select(
                    new_feature, target).where(df_train[target] == 1).groupby(
                        new_feature).count().withColumnRenamed(
                            'count', new_feature + "_count_" + target + "_1")
                df3 = df3.join(df4, new_feature, 'left').na.fill(0)
                del df4
                gc.collect()
                # print("两列相除:得到正样本的比例",target)
                df3 = df3.withColumn(
                    new_feature + "_" + target + "_pos_neg",
                    fn.col(new_feature + "_count_" + target + "_1") /
                    fn.col(new_feature + '_count'))
                df3 = df3.drop(new_feature + "_count_" + target + "_1",
                               new_feature + '_count')
                print("新的df_train", new_feature, target)
                df_train = df_train.join(df3, new_feature, "left")
                df_train.show(1)
                df_test = df_test.join(df3, new_feature,
                                       "left")  #会存在null,缺失值设置为0
                print("新的df_test", new_feature, target)
                df_test.show(1)
                df_test = df_test.na.fill(0)
                del df3
                gc.collect()
            if new_feature not in ["duration_time", "time_day"]:
                df_train = df_train.drop(new_feature)
                df_test = df_test.drop(new_feature)
                df_train.printSchema()
                df_test.printSchema()

        print('Final schema; this schema is the input for the concat step'
              )  # should there also be build_data_for_like / build_data_for_finish?
        df_train.printSchema()
        df_test.printSchema()
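        # A minimal sanity-check sketch (not part of the pipeline, assuming a
        # SQLContext named sqlContext) of the pos/neg ratio logic above on toy data:
        #
        #   toy = sqlContext.createDataFrame(
        #       [("u1_a1", 1), ("u1_a1", 0), ("u1_a1", 0), ("u2_a2", 1)],
        #       ["uid_author_id", "like"])
        #   total = toy.groupby("uid_author_id").count()
        #   pos = (toy.where(toy["like"] == 1).groupby("uid_author_id").count()
        #             .withColumnRenamed("count", "pos"))
        #   (total.join(pos, "uid_author_id", "left").na.fill(0)
        #         .withColumn("uid_author_id_like_pos_neg", fn.col("pos") / fn.col("count"))
        #         .show())   # expected: u1_a1 -> 0.333..., u2_a2 -> 1.0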

        print("查看test缺失值")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in posneg_feats_list]).show()
        print("查看train缺失值")
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c +
                                                                 '_missing')
                       for c in posneg_feats_list]).show()

        print('-------5. Save the preprocessing results-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_step2'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_step2'
        os.system("hadoop fs -rm -r {}".format(
            train_file_path))  # os.system(command): command is the shell command to execute
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
Beispiel #42
0
#             mywriter.writerow(row);

# os.environ["SPARK_HOME"] = "/apps/spark/spark-1.4.1-bin-hadoop2.6/";
 
conf = SparkConf().setAppName("Spark Test").setMaster("spark://spnode01:7077");
sc = SparkContext(conf=conf);
 
features, labels = loadTrainSet("hdfs://spnode01:9000/kaggle/DigitRecognizer/train.csv", sc);
  
m = features.count();
k = 5;
  
features = features.collect();
labels = labels.collect();
  
featuresBC = sc.broadcast(features);
labelsBC = sc.broadcast(labels);

testDatas = loadTestSet("hdfs://spnode01:9000/kaggle/DigitRecognizer/test.csv", sc);

testDatas.cache();

# k-NN prediction: for each test row, compute Euclidean distances to every
# training row, take the k nearest labels and output the most frequent one.
result = testDatas.map(lambda line : ((((np.tile(line, (m, 1)) - featuresBC.value) ** 2).sum(axis=1)) ** 0.5).argsort()) \
          .map(lambda line : [line[i] for i in range(k)]) \
          .map(lambda line : map(lambda x : labelsBC.value[x], line)) \
          .map(lambda line : {key : line.count(key) for key in set(line)}) \
          .map(lambda line : sorted(line.iteritems(), key=operator.itemgetter(1), reverse=True)[0][0]);

# result = result.collect();
# generateResultFile('/home/hadoop/workdatas/kaggle/DigitRecognizer/result_spark.csv', result);

result.repartition(1).saveAsTextFile("hdfs://spnode01:9000/kaggle/DigitRecognizer/result.spark");

sc.stop();
def launch_spark_job():
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import concat, col, lit

    readFile = sys.argv[1]
    k = int(sys.argv[2])
    num_partitions = int(sys.argv[3])
    conf = SparkConf().setAppName("reads Loader" + str(num_partitions))
    sc = SparkContext(conf=conf)
    sc.addPyFile("utils.py")
    sc.setCheckpointDir(
        "hdfs://doop-mng1.haifa.ibm.com:8020/projects/Store_Analytics/SparkCheckPoints"
    )
    import utils
    # from utils import map_read_to_anchors_list, convert_anchors_list_to_seq_edges
    readLines = (
        sc.newAPIHadoopFile(
            readFile,
            'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
            'org.apache.hadoop.io.LongWritable',
            'org.apache.hadoop.io.Text',
            conf={'textinputformat.record.delimiter': '@'})
        .map(lambda delim_lines_tup: delim_lines_tup[1])  # keep just the lines, not the '@' delimiter
        .filter(lambda x: x.startswith("SRR"))  # drop entries caused by '@' appearing in the wrong line
        .map(lambda x: x.split("\n")[:2])  # split the lines, keep only the first two
        .filter(lambda x: len(x) == 2)  # get rid of any cut-off records
        .repartition(num_partitions)
        # .cache()
    )

    print("----------------------there are %i reads" % (readLines.count()))

    # get new RDD including lists of kmers (with no Ns), (k+1)mers
    kmers = (readLines.map(lambda entry: entry[1]).flatMap(
        lambda read: getKmerToNextCharCounts(read, k)))

    print("----------------------there are %i kmers instances" %
          (kmers.count()))

    kmers_with_exts = (kmers.reduceByKey(func=lambda x, y: x + y))

    print("----------------------there are %i distinct kmers" %
          (kmers_with_exts.count()))

    junctions = kmers_with_exts.filter(lambda kmer_tup: my_filter(kmer_tup))

    print("----------------------there are %i junctions" % junctions.count())

    # for i in junctions.take(10):
    #     if sum(i[1])>1:
    #         print i

    generate_juncs = build_partial_junctions_set()
    junctions_set_rdd = (junctions.mapPartitions(generate_juncs).reduceByKey(
        merge_sets).collect())

    juncs_broadcast = sc.broadcast(junctions_set_rdd[0][1])
    print("----------------------there are %i junctions in broadcast" %
          len(juncs_broadcast.value))

    # build edge set rdd, filter out edges including a junction at some end

    def read_line_map_function(read_line):
        return utils.map_read_to_anchors_list(read_line[1], k - 10, 10,
                                              juncs_broadcast.value)

    edges_rdd = (readLines.map(
        lambda read_line: read_line_map_function(read_line)).flatMap(
            lambda anchors: utils.convert_anchors_list_to_seq_edges(anchors),
            preservesPartitioning=True).filter(
                lambda (a, b, c): a not in juncs_broadcast.value and b not in
                juncs_broadcast.value))

    print("----------------------there are %i total edges" % edges_rdd.count())

    # create SQLContext to be able to create dataFrame from rdd
    sqc = SQLContext(sc)
    edges_df = sqc.createDataFrame(edges_rdd, ["src", "dst", "overlap"])
    vertices_df = edges_df.select(
        concat(col("src"), lit(" "), col("dst")).alias('id')).dropDuplicates()
    g = GraphFrame(vertices_df, edges_df)

    # vertices_df.agg(*[count(c).alias(c) for c in vertices_df.columns]).show()

    print("----------------------there are %i total vertices" %
          vertices_df.count())

    # get connected components of remaining graph

    result = g.connectedComponents()
    result.select("id", "component").orderBy("component").show()
Beispiel #44
0
from pyspark import SparkConf, SparkContext


def loadMovieNames():
    movieNames = {}
    with open("ml-1m/movies.dat") as f:
        for line in f:
            fields = line.split('::')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile("ml-1m/ratings.dat")
movies = lines.map(lambda x: (int(x.split("::")[1]), 1))
movieCounts = movies.reduceByKey(lambda accum, current: accum + current)

flipped = movieCounts.map(lambda (movieId, count): (count, movieId))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda (count, movieId):
                                         (nameDict.value[movieId], count))

results = sortedMoviesWithNames.collect()

for result in results:
    print(result)
Beispiel #45
0
        })
    # reason: 1,497 rows
    df2 = sqlContext.read.jdbc(url='jdbc:mysql://cdh5-slave2:3306/laws_doc',
                               table='(select id,name,uid from reason ) tmp2',
                               column='id',
                               lowerBound=1,
                               upperBound=1500,
                               numPartitions=1,
                               properties={
                                   "user": "******",
                                   "password": "******"
                               })

    # acc = sc.accumulator(0)
    # print "df.count()======================" + str(df.count())
    reason_broadcast = sc.broadcast(df2.map(lambda x: (x[1], x[2])).collect())
    uuid_reason = df.map(lambda x: x).map(
        lambda x: get_reason(x))  #title_trial_process
    # (x[1], ("||".join(list(set(name))), reason_uids, casedate, plt_claim, dft_rep, crs_exm))
    # print "uuid_reason.count()======================" + str(uuid_reason.count())
    # uuid_reason.foreach(p)
    # print "uuid_reason=============="+str(uuid_reason.count())
    uuid_court = df.map(lambda x: (x[2], x[1]))  #court,uuid
    # print "uuid_court==============" + str(uuid_court.count())
    court_province_full_uid = df1.map(lambda x:
                                      (x[1],
                                       (x[2], x[3])))  #court,province,full_uid
    uuid_province_full_uid = uuid_court.join(court_province_full_uid).map(
        lambda x: x[1])  # a court in uuid_court may be missing from the court table (it is incomplete), so there will be fewer uuid records after the join.

    # .map(lambda x: (x[0], x[1][0], x[1][1]))  # uuid,province,full_uid
    # return "\001".join([str(valid_jsontxt(i)) for i in result])
def quchong(x, y):
    # "quchong" = deduplicate: keep the record whose last field is largest
    max = 0
    item_list = y
    for ln in item_list:
        if int(ln[-1]) > max:
            max = int(ln[-1])
            y = ln
    result = y
    lv = []
    for ln in result:
        lv.append(str(valid_jsontxt(ln)))
    return "\001".join(lv)

s1 = "/commit/iteminfo/20161110"
s2 = "/commit/iteminfo/20161111"
s3 = "/commit/iteminfo/20161112"
rdd1 = sc.textFile(s1)
rdd2 = sc.textFile(s2)
rdd3 = sc.textFile(s3)
rdd = rdd1.union(rdd2).union(rdd3)
c_dim = "/hive/warehouse/wlbase_dev.db/t_base_ec_dim/ds=20151023/1073988839"
cate_dict = sc.broadcast(sc.textFile(c_dim).map(lambda x: get_cate_dict(x)).filter(lambda x:x!=None).collectAsMap()).value
rdd_c = rdd.map(lambda x:f(x,cate_dict)).filter(lambda x:x!=None)
rdd_c.groupByKey().mapValues(list).map(lambda (x,y): quchong(x,y))\
    .saveAsTextFile("/user/wrt/temp/shuang11_iteminfo")


# hfs -rmr /user/wrt/temp/shuang11_iteminfo
# spark-submit --executor-memory 6G   --driver-memory 8G  --total-executor-cores 80  shuang11_item_info.py
# LOAD DATA  INPATH '/user/wrt/temp/shuang11_iteminfo' OVERWRITE INTO TABLE wlservice.t_wrt_tmp_shuang11_iteminfo_new;
def main():

    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    #sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)

    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    #conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000/ 3000 = avg 50 comments/topic
        onePst = bcURL.value[randint(0, 3000)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    if (smallBatch):
        logFile = 's3a://reddit-comments/2007/RC_2007-10'
        #df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        year = 2007
        month = 12
        users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                             .map(addTitleURL)
        #.repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate user relationship graph
        # (URL, user) tuple
        post2user = users_row.map(lambda x: (x[10], x[0]))
        #graph     = post2user.join(post2user)\                       # self join to find user relationship by posts
        #                     .filter(lambda x: x[1][0] != x[1][1])\  # remove all self linked relationship
        #                     .map(makeAscOrder)\                     # make to asc order by user name
        #                     .distinct()\        # remove duplicated user pairs, because the relationship is mutual
        #                     .map(lambda x: (x[1], 1))\              # ready to count the number of common edges
        #                     .reduceByKey(lambda x, y: x+y)\         # count total number for every edge/relationship
        #                     .map(lambda x: (x[0][0], x[1], x[0][1]))# flatten and ready to write table
        graph     = post2user.join(post2user)\
                             .filter(lambda x: x[1][0] != x[1][1])\
                             .map(makeAscOrder)\
                             .distinct()\
                             .map(lambda x: (x[1], 1))\
                             .reduceByKey(lambda x, y: x+y)\
                             .map(lambda x: (x[0][0], x[1], x[0][1]))
        graph.foreachPartition(insert_graph)

    else:

        for key in bucket.list():
            if '-' not in key.name.encode(
                    'utf-8'):  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET,
                                             key.name.encode('utf-8'))
            year = logFile.split('-')[1][-4:]
            month = logFile.split('-')[2]
            from_year = FROM_YEAR_MONTH.split('_')[0]
            from_month = FROM_YEAR_MONTH.split('_')[1]
            if int(year) < int(from_year) or (int(year) == int(from_year) and
                                              int(month) < int(from_month)):
                continue
            #df = sqlContext.read.json(logFile)
            df = sqlContext.jsonFile(logFile)
            users_rdd = df.filter(df['author'] != '[deleted]')
            #   0                     1                        2                3            4          5          6          7              8           9 (title)   10(url)
            users_row = users_rdd.map(lambda json: (json.author, '{0}_{1}'.format(year, month), json.created_utc, json.subreddit, json.id, json.body, json.score, json.ups, json.controversiality))\
                                 .map(addTitleURL)
            #.repartition(REPARTITION_SIZE)
            users_row.foreachPartition(insert_into_cassandra)

            # calculate user relationship graph
            # (URL, user) tuple
            post2user = users_row.map(lambda x: (x[10], x[0]))
            #graph     = post2user.join(post2user)\                       # self join to find user relationship by posts
            #                     .filter(lambda x: x[1][0] != x[1][1])\  # remove all self linked relationship
            #                     .map(makeAscOrder)\                     # make to asc order by user name
            #                     .distinct()\        # remove duplicated user pairs, because the relationship is mutual
            #                     .map(lambda x: (x[1], 1))\              # ready to count the number of common edges
            #                     .reduceByKey(lambda x, y: x+y)\         # count total number for every edge/relationship
            #                     .map(lambda x: (x[0][0], x[1], x[0][1]))# flatten and ready to write table
            graph     = post2user.join(post2user)\
                                 .filter(lambda x: x[1][0] != x[1][1])\
                                 .map(makeAscOrder)\
                                 .distinct()\
                                 .map(lambda x: (x[1], 1))\
                                 .reduceByKey(lambda x, y: x+y)\
                                 .map(lambda x: (x[0][0], x[1], x[0][1]))
            #.repartition(REPARTITION_SIZE)
            graph.foreachPartition(insert_graph)

    sc.stop()
Beispiel #48
0
def matrix_vector_mult(tuple):
        return (tuple[0], round((V.value[tuple[1]-1] * tuple[2]) * 0.8, 15))


if __name__ == "__main__":

    if len(sys.argv) != 4:
        print("Usage: modified_pagerank.py inputfile outputpath", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Pagerank")

    graph_rdd = sc.textFile(sys.argv[1]).repartition(10).cache()

    outlink_rdd = graph_rdd.map(lambda x: (int(x.split("\t")[0]),[int(x.split("\t")[1])])).reduceByKey(lambda x,y: x + y).cache()

    total_nodes = sc.broadcast(outlink_rdd.count())

    M = outlink_rdd.flatMap(weight_matrix).cache()


    # Modified pagerank

    local_v = []

    for x in range(total_nodes.value):
        local_v.append(round(Decimal(1)/ Decimal(total_nodes.value), 15))
        
    V = sc.broadcast(local_v)

    local_e = []
    for x in range(total_nodes.value):
Beispiel #49
0
#TODO run this in jupyter notebook

from pyspark import SparkContext

sc = SparkContext('local[*]', 'pyspark')

my_dict = {"item1": 1, "item2": 2, "item3": 3, "item4": 4} 
my_list = ["item1", "item2", "item3", "item4"]

my_dict_bc = sc.broadcast(my_dict)

def my_func(letter):
    return my_dict_bc.value[letter] 

my_list_rdd = sc.parallelize(my_list)

result = my_list_rdd.map(lambda x: my_func(x)).collect()

print(result)
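# Expected output, given my_dict and my_list above: [1, 2, 3, 4]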
def loadMovieNames() -> dict:
    movieNames = {}
    # Movie titles include swedish characters which require ISO-8859-1 encoding
    with open(
            "/home/mmanopoli/Udemy/TamingBigDataWithSparkAndPython/data/ml-100k/u.item",
            encoding='iso-8859-1') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames


conf = SparkConf().setMaster("local[4]").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

nameDict = sc.broadcast(loadMovieNames(
))  # Broadcast the python movieNames object to each excecutor as nameDict

lines = sc.textFile(
    "/home/mmanopoli/Udemy/TamingBigDataWithSparkAndPython/data/ml-100k/u.data"
)

movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

# flipped = movieCounts.map( lambda x : (x[1], x[0]))
# sortedMovies = flipped.sortByKey()
sortedMovies = movieCounts.sortBy(lambda x: x[1])

#sortedMoviesWithNames = sortedMovies.map(lambda countMovie : (nameDict.value[countMovie[1]], countMovie[0]))
# countMovie[0] is the Movie ID because I used sortBy - that's what we lookup in nameDict
sortedMoviesWithNames = sortedMovies.map(
    lambda countMovie: (nameDict.value[countMovie[0]], countMovie[1]))
        (user1,user2) ->    (similarity,co_raters_count)
    '''
    user_sims = pairwise_users.map(
        lambda p: calcSim(p[0],p[1])).map(
        lambda p: keyOnFirstUser(p[0],p[1])).groupByKey().map(
        lambda p: nearestNeighbors(p[0],p[1],50))

    ''' 
        Reorganize each user's rating records into the form
        user_id -> [(item_id_1, rating_1),
                    (item_id_2, rating_2),
                    ...]
    '''

    user_item_hist = lines.map(parseVectorOnUser).groupByKey().collect()

    ui_dict = {}
    for (user,items) in user_item_hist: 
        ui_dict[user] = items

    uib = sc.broadcast(ui_dict)

    '''
        Compute the top-N recommendations for each user
        user_id -> [item1,item2,item3,...]
    '''
    user_item_recs = user_sims.map(
        lambda p: topNRecommendations(p[0],p[1],uib.value,100)).collect()


Beispiel #52
0
from __future__ import print_function
from pyspark import SparkConf
from pyspark import SparkContext

sparkconfig = SparkConf()
sparkconfig.setMaster("local[*]")
sparkconfig.setAppName("SparkCSVJOB")


def compute_each_line(eachLine):

    # Fetching the broadcast
    date_code = date_code_broadcast.value

    data_split = eachLine.split(",")

    if data_split[0] in date_code:
        print(eachLine)

    return


sparkcontext = SparkContext(conf=sparkconfig)

date_code_broadcast = sparkcontext.broadcast(["20170104", "20170102"])

textFileRDD = sparkcontext.textFile(
    "/home/dharshekthvel/Downloads/query_result.csv")

textFileRDD.map(compute_each_line).collect()
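
# A sketch (not in the original): compute_each_line always returns None, so the
# collect() above yields a list of Nones and the matching rows are only printed
# on the executors. To bring the matching lines back to the driver, a filter
# expresses the intent more directly:
#
#   matches = textFileRDD.filter(
#       lambda line: line.split(",")[0] in date_code_broadcast.value).collect()
#   for line in matches:
#       print(line)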

    sc.stop()

if __name__ == '__main__':
    main()


import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
sys.path.append('/home/hadoop/app/spark/python')
sys.path.append('/home/hadoop/app/spark/python/lib/py4j-0.8.2.1-src.zip')
from pyspark import SparkContext, SparkConf
from mysql_utils import MySQLUtils
master = 'local[2]'
app_name = 'test-broadcast'
# spark_home = '/data01/app/bigdata/spark'  # local
spark_home = '/home/hadoop/app/spark'  # test

pyFiles = ['mysql_utils.py']
spark_conf = SparkConf()
spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
sc = SparkContext(conf=spark_conf)
for path in (pyFiles or []):
    sc.addPyFile(path)

external_cache = get_api_deviceinfo()

deviceinfo_b = sc.broadcast(external_cache)

                         scheduled_departure_time=t[1].scheduled_departure_time,
                         actual_departure_time=t[1].actual_departure_time,
                         departure_delay_minutes=t[1].departure_delay_minutes,
                         scheduled_arrival_time=t[1].scheduled_arrival_time,
                         actual_arrival_time=t[1].actual_arrival_time,
                         arrival_delay_minutes=t[1].arrival_delay_minutes,
                         crs_elapsed_flight_minutes=t[1].crs_elapsed_flight_minutes,
                         distance=t[1].distance)


if __name__ == "__main__":
    sc = SparkContext(appName="InsightEdge Python API Demo: prediction job")
    ssc = StreamingContext(sc, 3)
    sqlc = SQLContext(sc)

    zkQuorum = "localhost:2181"
    topic = "flights"

    model = DecisionTreeModel(Utils.load_model_from_grid("DecisionTreeFlightModel", sc))

    carrier_mapping = sc.broadcast(load_mapping("CarrierMap", sqlc))
    origin_mapping = sc.broadcast(load_mapping("OriginMap", sqlc))
    destination_mapping = sc.broadcast(load_mapping("DestinationMap", sqlc))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.foreachRDD(predict_and_save)

    ssc.start()
    ssc.awaitTermination()
Beispiel #55
0
#map data to a binary matrix
#1. get the dictionary of the data
#The dictionary of each document is a list of UNIQUE(set) words 
lists=dataRDD.map(lambda x:list(set(x.strip().split(' ')))).collect()
all=[]
#combine all dictionaries together (fastest solution for Python)
for l in lists:
	all.extend(l)
dict=set(all)
print len(dict)
#it is faster to know the position of the word if we put it as values in a dictionary
dictionary={}
for i,word in enumerate(dict):
	dictionary[word]=i
#we need the dictionary to be available AS A WHOLE throughout the cluster
dict_broad=sc.broadcast(dictionary)
#build labelled Points from data
data_class=zip(data,Y)#if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2, 'b'), (3, 'c')]
dcRDD=sc.parallelize(data_class,numSlices=16)
#get the labelled points
labeledRDD=dcRDD.map(partial(createBinaryLabeledPoint,dictionary=dict_broad.value))
#Train NaiveBayes
model=NaiveBayes.train(labeledRDD)
#broadcast the model
mb=sc.broadcast(model)

test,names=lf.loadUknown('./data/test')
name_text=zip(names,test)
#for each doc :(name,text):
#apply the model on the vector representation of the text
#return the name and the class
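
# A minimal sketch (NOT part of the original snippet) of applying the broadcast
# model to the test documents. text_to_vector is a hypothetical helper and must
# mirror the binary bag-of-words encoding used by createBinaryLabeledPoint.
from pyspark.mllib.linalg import SparseVector

def text_to_vector(text, dictionary):
	# binary bag-of-words over the broadcast dictionary
	idx = sorted({dictionary[w] for w in set(text.strip().split(' ')) if w in dictionary})
	return SparseVector(len(dictionary), idx, [1.0] * len(idx))

predictions = sc.parallelize(name_text, numSlices=16) \
	.map(lambda nt: (nt[0], mb.value.predict(text_to_vector(nt[1], dict_broad.value))))
print predictions.take(5)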
from pyspark import SparkContext, SparkConf

sc = SparkContext(conf=SparkConf().setAppName("Airlines App"))
mainRdd = sc.textFile("airports_mod.dat")
l = [
    'Airport_Id', 'Name', 'City', 'Country', 'IATA', 'ICAO', 'Latitude',
    'Longitude', 'Altitude', 'Timezone', 'DST', 'Tz'
]
l = sc.broadcast(l)


def stringtodict(s):
    i = 0
    d = {}
    k = s.split(',')
    for key in l.value:
        d[key] = k[i]
        i += 1
    return d


mainRddDict = mainRdd.map(stringtodict)
mainRddDict.saveAsPickleFile("airports_mod.pickle")
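
# A sketch (not in the original): the same mapping can be written more compactly
# with zip, assuming every line has exactly len(l.value) comma-separated fields:
#
#   mainRddDict = mainRdd.map(lambda s: dict(zip(l.value, s.split(','))))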
Beispiel #57
0
    sc = SparkContext(appName="SparkLda")
    text = sc.textFile(sys.argv[1]).repartition(200)
    print "caching file ..."
    text.cache()
    print "counting file ..."
    NumDoc = text.count()
    likelihood, likelihood_old = 0, 0
    print "initialing beta ... "
    beta = rand_init_beta(NumTerm,  K)
    
    print "initialing beta success!  "
    
    # E_M iterate for beta
    for i in range(20):
        print "starting iteration {0} ...".format(i)
        print sys.getsizeof(beta)
        beta_global = sc.broadcast(beta)
        print "broadcast success"
        #new_beta = text.flatMap(lambda line, beta=beta : Expectation(line, beta , Alpha, K)).reduceByKey(add)
        new_beta = text.flatMap(lambda line :  Expectation(line, Alpha, K) ).reduceByKey(add)
        #output = new_beta.collect()
        output = new_beta.count()
        print "output", output
        #print >> open('beta.'+str(i), 'w'), beta
        (beta, likelihood) = update_beta(output, K, NumTerm)
        #print beta
        print "likelihood {0} is {1}".format(i, likelihood)
    print >> open('beta.final', 'w'), beta
    
Beispiel #58
0
    for v in vs:
        tn = v[0]
        v3 = v[1][1]
        if tn == 'e1':
            t1.append(v[1][0])
        else:
            for v1 in t1:
                if v1 < v3:
                    if lu.get((v3, v1), False):
                        count += 1

    return count


if __name__ == '__main__':
    fn = sys.argv[1]  # filename of input
    p = int(sys.argv[2])  # parallelism

    sc = SparkContext(master="local[{}]".format(p), appName="Triangle Count")

    text_file = sc.textFile(fn).filter(maxFilter)
    lookup = sc.broadcast(text_file.flatMap(toLU).collectAsMap())
    count = text_file.flatMap(mymap) \
                    .groupByKey(p) \
                    .mapValues(lambda vs: sorted(vs, key=lambda x: x[0])) \
                    .map(lambda x: checkTriangles(lookup, x[1])) \
                    .reduce(lambda a,b: a + b)

    print(count)
Beispiel #59
0
    starpairs = data.map(extract_user_repo)
    starpairs.cache()

    users = starpairs.map(lambda t: t[0]).distinct()

    # get the 1% most popular repos
    repos = starpairs.map(lambda t: t[1]).distinct()
    sample = int(0.01 * repos.count())
    top_repos = starpairs\
        .groupBy(lambda t: t[1])\
        .sortBy(lambda t: len(t[1]), False)\
        .map(lambda t: t[0])\
        .take(sample)
    top_repos_rdd = sc.parallelize(top_repos)
    top_repos_rdd.cache()
    top_repos_bc = sc.broadcast(top_repos)
    pprint(top_repos[:5])

    starpairs_filtered = starpairs.filter(lambda t: t[1] in top_repos_bc.value)
    starpairs_filtered.cache()

    # train recommendation model using alternating least squares
    stars_with_rating = starpairs_filtered.map(lambda t: array([t[0], t[1], 1]))
    model = ALS.trainImplicit(stars_with_rating, rank=1)

    # get all user->repo pairs without stars
    users_repos = users.cartesian(top_repos_rdd).groupByKey()
    stars_grouped = starpairs_filtered.groupByKey()
    unstarred = users_repos.join(stars_grouped)\
        .map(lambda i: (i[0], set(i[1][0]) - set(i[1][1]) ))\
        .flatMap(lambda i: [ (i[0], repo) for repo in i[1] ] )
Beispiel #60
0
class SparkFEProcess:

    def __init__(self):

        self.parser = self.init_config()

        sparkConf = SparkConf().setAppName("feature engineering on spark of explore_spark") \
            .set("spark.ui.showConsoleProgress", "false")
        self.sc = SparkContext(conf=sparkConf)
        self.sc.broadcast(self.parser)
        self.init_logger()
        # # initialize related parameters
        # # bins_dict stores the binning scheme for the relevant columns, used when processing the test data
        # self.bins_dict={}


    def init_config(self):
        current_path = os.path.dirname(os.path.realpath(__file__))
        workspace_path = current_path.split('featureEngineering')[0]
        config_file = workspace_path + 'resource/config.ini'
        parser = configparser.ConfigParser()
        parser.read(config_file)
        return  parser

    def init_logger(self):
        '''
        Set the log level
        :param sc:
        :return:
        '''
        logger = self.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
        logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)


    def read_rdd(self, fileName):
        try:
            file_path = self.parser.get("hdfs_path", "hdfs_data_path") + fileName
            data_rdd = self.sc.textFile(file_path)
            return data_rdd
        except Exception as e:
            print(e)

    def data_describe(self):
        print('start to read data into rdds:')
        rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line : line.split('\t'))
        rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line : line.split('\t'))
        print('finished reading rdds, start to init action log rdds:')
        actionLogRdd_train = rawRdd_train.map(
            lambda x :(int(x[0]), int(x[1]), int(x[2]), int(x[3]), int(x[4]), int(x[5]),
                       int(x[6]), int(x[7]), int(x[8]), int(x[9]), int(x[10]), int(x[11])))
        # total = actionLogRdd_train.count()
        # print('total: ' + str(total))

        actionLogRdd_test = rawRdd_test.map(
            lambda x :(int(x[0]), int(x[1]), int(x[2]), int(x[3]), int(x[4]), int(x[5]),
                       int(x[6]), int(x[7]), int(x[8]), int(x[9]), int(x[10]), int(x[11])))

        # convert to DataFrame
        sqlContext = SQLContext(self.sc)
        labels=[('uid',typ.IntegerType()),
            ('user_city',typ.IntegerType()),
            ('item_id',typ.IntegerType()),
            ('author_id',typ.IntegerType()),
            ('item_city',typ.IntegerType()),
            ('channel',typ.IntegerType()),
            ('finish',typ.IntegerType()),
            ('like',typ.IntegerType()),
            ('music_id',typ.IntegerType()),
            ('device',typ.IntegerType()),
            ('time',typ.LongType()),
            ('duration_time',typ.IntegerType())]
        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
        dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

        dfactionLog_train=dfactionLog_train.filter(dfactionLog_train['duration_time']<=300)
        dfactionLog_test=dfactionLog_test.filter(dfactionLog_test['duration_time']<=300)
        # Merge train and test and keep the train row count so they can be split later (union may change row order)
        # df=dfactionLog_train.union(dfactionLog_test)
        # train_count=dfactionLog_train.count()
        # print("number of training rows: "+str(train_count))
        # test_count=dfactionLog_test.count()
        # print("number of test rows: "+str(test_count))

        # print('-------2. number of distinct values under finish/like-------------')
        # df.agg( fn.countDistinct('finish').alias('finish_distinct'), \
        #         fn.countDistinct('like').alias('like_distinct')
        #         ).show()
        # print("max and min of each feature")
        # df.describe().show()


        return dfactionLog_train, dfactionLog_test

    def bining(self,sqlContext,df,col,percent_list):
        '''
        :param sqlContext:
        :param df:
        :param col:  the column to bin
        :return:
        '''
        pandas_df = df.toPandas()
        bins=[]
        for percent in percent_list:
            bins.append(np.percentile(pandas_df.loc[:,col],percent))  # cutoff: at least `percent`% of the values are <= this value
        print(col+' bin edges')
        print(bins)
        pandas_df.loc[:,col]=np.digitize(pandas_df.loc[:,col],bins,right=True)
        # print(pandas_df)

        # rename the column in the pandas DataFrame
        pandas_df.rename(columns={col:col+'_bin'}, inplace = True)
        df_spark= sqlContext.createDataFrame(pandas_df)
        # df_spark.show()
        return  df_spark
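
    # The method above collects the whole DataFrame to pandas on the driver. Below
    # is a minimal sketch (not in the original) of a variant that stays distributed,
    # using DataFrame.approxQuantile and pyspark.ml.feature.Bucketizer
    # (assumes Spark >= 2.0 and a numeric input column).
    def bining_distributed(self, df, col, percent_list):
        from pyspark.ml.feature import Bucketizer
        probs = [p / 100.0 for p in percent_list]
        # approximate percentile cutoffs with a small relative error
        cuts = df.approxQuantile(col, probs, 0.001)
        # Bucketizer needs strictly increasing splits covering the full range
        splits = sorted(set([float('-inf')] + cuts + [float('inf')]))
        bucketizer = Bucketizer(splits=splits, inputCol=col, outputCol=col + '_bin')
        # drop the raw column to mimic the rename done by bining()
        return bucketizer.transform(df).drop(col)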


    def city_col_deal(self,df,col):
        df_city_score=df.groupBy(col).avg('finish', 'like') \
            .withColumnRenamed("avg(finish)","avg_finish").withColumnRenamed("avg(like)","avg_like")
        df_city_score=df_city_score.withColumn(col+'_score', df_city_score.avg_finish*0.7+df_city_score.avg_like*0.3)\
                              .select(col,fn.bround(col+'_score', scale=4).alias(col+'_score'))
        return df_city_score

    def dropUnuseCols(self,df,unuse_col):
        '''
        # Drop columns that are not needed:
        #   device|time|author_id|music_id|uid|item_id
        # Keep the following columns:
        #   user_city|item_city|channel|finish|like|duration_time
        #   device_Cnt_bin|item_pub_hour|authorid_Cnt_bin|musicid_Cnt_bin|uid_playCnt_bin|itemid_playCnt_bin
        '''
        # unuse_col=['device','time','author_id','music_id','uid','item_id']
        for col in unuse_col:
            df=df.drop(col)
        return df


    def data_explore(self,df_train,df_test):

        sqlContext = SQLContext(self.sc)
        print("duration_time应该根据喜欢和不喜欢来分箱")
        print("查看duration_time的分布")
        print()
        print("------------1、通过时间戳获取年月日时分,(没有工作日特征,月日交叉表示节日特征,年份转化有问题)-----------------")


        # item publish time minus the earliest publish time, converted to days
        time_min = df_train.select(fn.min(df_train['time'])).collect()
        df_train=df_train.withColumn('time_day', ((df_train.time-fn.lit(time_min[0][0])) /fn.lit(3600 * 24)).cast(typ.IntegerType()))
        # df_train=df_train.withColumn('time_strDate',fn.from_unixtime(df_train.time , "yyyy-MM-dd HH:mm:ss"))
        # convert the unix timestamp to a formatted date and extract month/day/hour/minute
        df_train=df_train.withColumn('item_pub_month',fn.from_unixtime(df_train.time , "M").cast(typ.IntegerType()))
        df_train=df_train.withColumn('item_pub_day',fn.from_unixtime(df_train.time , "d").cast(typ.IntegerType()))
        df_train=df_train.withColumn('item_pub_hour',fn.from_unixtime(df_train.time , "k").cast(typ.IntegerType()))
        df_train=df_train.withColumn('item_pub_minute',fn.from_unixtime(df_train.time , "m").cast(typ.IntegerType()))
        print("查看month,day,hour,minute的提取是否正确")
        df_train.show(truncate=False)
        df_train=df_train.drop('time')
        # Binning the counts of these time-derived fields is not very useful; treat them directly as categorical variables, and add the pos_neg_ratio feature separately


        df_test=df_test.withColumn('time_day', ((df_test.time-fn.lit(time_min[0][0])) /fn.lit(3600 * 24)).cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_month',fn.from_unixtime(df_test.time , "M").cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_day',fn.from_unixtime(df_test.time , "d").cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_hour',fn.from_unixtime(df_test.time , "k").cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_minute',fn.from_unixtime(df_test.time , "m").cast(typ.IntegerType()))
        df_test=df_test.drop('time')

        print('--------2. Statistical features: count, ratio, nunique, ctr-related features')
        print("Compute count for base and cross features, and the category-preference ratio")
        count_feats_list = []

        print('single feature count')
        count_feats_list.extend([[c] for c in df_train.columns if c not in ['time', 'channel', 'like', 'finish','duration_time',"time_day","item_pub_month","item_pub_day","item_pub_hour","item_pub_minute"]])
        print(count_feats_list)

        print('cross count')
        users = ['uid']
        authors = ['item_id', 'user_city', 'author_id', 'item_city', 'channel', 'music_id', 'device','item_pub_hour']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        users = ['author_id']
        authors = ['channel', 'user_city', 'item_city', 'music_id',  'item_pub_hour']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        count_feats_list.append(['uid', 'user_city', 'channel', 'device'])
        count_feats_list.append(['author_id', 'item_city', 'music_id','item_pub_hour'])
        print("计算count的字段有以下这些")
        print(count_feats_list)

        for i in range(len(count_feats_list)):
           group_cols=count_feats_list[i]
           new_feature = '_'.join(group_cols)
           # Determine the dimensionality of the cross feature, concatenate the columns, then count each value and join the result back
           if len(group_cols)==1:
              if new_feature in ["music_id"] :
                  df1 = df_train.where(df_train[new_feature]!=-1).groupby(new_feature).count()\
                          .withColumnRenamed('count',new_feature+'_count')
              else:
                  df1 = df_train.groupby(new_feature).count()\
                          .withColumnRenamed('count',new_feature+'_count')
              # category-preference ratio
              count_min = df1.select(fn.min(df1[new_feature+'_count'])).collect()[0][0]
              count_max = df1.select(fn.max(df1[new_feature+'_count'])).collect()[0][0]
              # F.bround("Rank", scale=4)
              df1=df1.withColumn(new_feature+'_count_ratio', fn.bround(((df1[new_feature+'_count']-fn.lit(count_min)) /((fn.lit(count_max)-fn.lit(count_min)).cast(typ.IntegerType()))),scale=3))
              # print("查看df1_1")
              # df1.show(5,truncate=False)
              if new_feature=="device":   #[1.0, 16.0, 46.0, 102.0, 204.0, 410.0, 10389.0] 修改
                 percent_list=[0,10,20,30,40,50,60,70,80,90,100]
              elif new_feature=="author_id":  #[1.0, 2.0, 7.0, 32.0, 78.0, 276186.0]
                  percent_list=[0,50,75,90,95,100]
              elif new_feature=="music_id":   #[1.0, 3.0, 13.0, 73.0, 211.0, 193640.0]
                 percent_list=[0,50,75,90,95,100]   # each percent_list is different
              elif new_feature=="uid":       #分箱[1.0, 104.0, 329.0, 741.0, 1131.0, 10389.0]
                  percent_list=[0,50,75,90,95,100]
              elif new_feature=="item_id":   #[1.0, 1.0, 2.0, 7.0, 14.0, 6911.0]  分箱修改
                  percent_list=[0,75,90,95,100]
              elif new_feature=="user_city":  #[1.0, 21935.5, 54519.5, 110179.0, 146319.75, 3789087.0] 修改
                  percent_list=[0,10,20,30,40,50,60,70,80,90,100]
              elif new_feature=="item_city":  #[1.0, 14725.0, 48576.0, 122887.0, 206845.5, 744265.0]  修改
                  percent_list=[0,10,20,30,40,50,60,70,80,90,100]
              else:
                  percent_list=[0,10,20,30,40,50,60,70,80,90,100]

              df1=self.bining(sqlContext,df1,new_feature+'_count',percent_list)
              # print(df1.show(5,truncate=False))
              df_train=df_train.join(df1,new_feature,'left')
              # print("train")
              # df_train.show(5,truncate=False)   # ratio is a continuous variable in [0, 1]
              df_test=df_test.join(df1,new_feature,'left')
              # print("test")
              # df_test.show(5,truncate=False)   # ratio is a continuous variable in [0, 1]
              del df1
              gc.collect()
           print("输出所有一维特征处理后的结果")
           df_train.show(1,truncate=False)
           df_train.printSchema()
           df_test.show(1,truncate=False)
           df_test.printSchema()

           if len(group_cols)==2:
              print("开始处理2维交叉变量")
              df_train=df_train.withColumn(new_feature, fn.concat_ws('_',df_train[group_cols[0]].cast(typ.StringType()),df_train[group_cols[1]].cast(typ.StringType()))
                                                             )
              df_test=df_test.withColumn(new_feature, fn.concat_ws('_',df_test[group_cols[0]].cast(typ.StringType()),df_test[group_cols[1]].cast(typ.StringType()))
                                                             )
              df2 = df_train.groupby(new_feature).count()\
                     .withColumnRenamed('count',new_feature+'_count')
              # category-preference ratio
              count_min = df2.select(fn.min(df2[new_feature+'_count'])).collect()[0][0]
              count_max = df2.select(fn.max(df2[new_feature+'_count'])).collect()[0][0]
              # F.bround("Rank", scale=4)
              df2=df2.withColumn(new_feature+'_count_ratio', fn.bround(((df2[new_feature+'_count']-fn.lit(count_min)) /((fn.lit(count_max)-fn.lit(count_min)).cast(typ.IntegerType()))),scale=3))
              # print("查看df1_1")
              # df2.show(5,truncate=False)
              if new_feature=="uid_item_id":
                 percent_list=[0,20,35,50,65,85,100]   # each percent_list is different
              else:
                 percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_user_city":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_author_id":
              #    percent_list=[0,50,75,90,95,100]   #每个percent_list不相同
              # elif new_feature=="uid_item_city":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_channel":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_music_id":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_device":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_time_pub_hour":
              #     percent_list=[0,50,75,90,95,100]

              # ['uid', 'item_id'], ['uid', 'user_city'], ['uid', 'author_id'], ['uid', 'item_city'], ['uid', 'channel'], ['uid', 'music_id'],
              #  ['uid', 'device'], ['uid', 'time_pub_hour']
              #['author_id', 'channel'], ['author_id', 'user_city'], ['author_id', 'item_city'], ['author_id', 'music_id'], ['author_id', 'time_pub_hour']

              df2=self.bining(sqlContext,df2,new_feature+'_count',percent_list)
              print("查看df2_2")
              df2.show(5,truncate=False)
              df_train=df_train.join(df2,new_feature,'left')
              # print("train")
              # df_train.show(5,truncate=False)   # ratio is a continuous variable in [0, 1]
              df_test=df_test.join(df2,new_feature,'left')
              # print("test")
              # df_test.show(5,truncate=False)


           if len(group_cols)==4:
              print("开始处理4维交叉变量")
              df_train=df_train.withColumn(new_feature, fn.concat_ws('_',df_train[group_cols[0]].cast(typ.StringType()),df_train[group_cols[1]].cast(typ.StringType()),
                                                             df_train[group_cols[2]].cast(typ.StringType()),df_train[group_cols[3]].cast(typ.StringType()))
                                                           )
              df_test=df_test.withColumn(new_feature, fn.concat_ws('_',df_test[group_cols[0]].cast(typ.StringType()),df_test[group_cols[1]].cast(typ.StringType()),
                                                             df_test[group_cols[2]].cast(typ.StringType()),df_test[group_cols[3]].cast(typ.StringType()))
                                                           )

              df3 = df_train.groupby(new_feature).count()\
                     .withColumnRenamed('count',new_feature+'_count')

              # category-preference ratio
              count_min = df3.select(fn.min(df3[new_feature+'_count'])).collect()[0][0]
              count_max = df3.select(fn.max(df3[new_feature+'_count'])).collect()[0][0]
              # F.bround("Rank", scale=4)
              df3=df3.withColumn(new_feature+'_count_ratio', fn.bround(((df3[new_feature+'_count']-fn.lit(count_min)) /((fn.lit(count_max)-fn.lit(count_min)).cast(typ.IntegerType()))),scale=3))
              # print("查看df3_1")
              # df3.show(5,truncate=False)
              percent_list=[0,50,75,90,95,100]
              df3=self.bining(sqlContext,df3,new_feature+'_count',percent_list)
              print("查看df3_2")
              df3.show(5,truncate=False)
              df_train=df_train.join(df3,new_feature,'left')
              # print("train")
              # df_train.show(5,truncate=False)
              # ['uid', 'user_city', 'channel', 'device'], ['author_id', 'item_city', 'music_id', 'time_pub_hour']
              df_test=df_test.join(df3,new_feature,'left')
              # print("test")
              # df_test.show(5,truncate=False)
        # df.show(5,truncate=False)
        print("删除没有必要的列")
        unuse_col=['item_city','user_city','device','author_id','music_id',]  #'uid','item_id'这两列不能删除,后面提交结果的时候应该要用到
        df_train=self.dropUnuseCols(df_train,unuse_col)
        df_test=self.dropUnuseCols(df_test,unuse_col)

        print("表中含有为null的字段,主要产生在leftjoin的时候")
        print("这一步先不做,三表联合的时候会填充")
        # df_train=df_train.na.fill(-1)
        # df_test=df_test.na.fill(-1)

        print("查看train的统计信息")
        desc = df_train.describe()
        desc.show()
        print("查看test的统计信息")
        desc = df_test.describe()
        desc.show()


        print('-------5. Save the preprocessing results-------')
        test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_new'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_new'
        os.system("hadoop fs -rm -r {}".format(train_file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)




        '''