def main(inputfolderpath, outputfolderpath, jobname):
    #inputfolderpath = "hdfs://santa-fe:47001/Source-Recommendation-System/FakeNewsCorpus/news_cleaned_2018_02_13.csv"
    #inputfolderpath = "hdfs://santa-fe:47001/FakeNewsCorpus/news_cleaned_2018_02_13.csv"
    #inputfolderpath = "hdfs://santa-fe:47001/FakeNewsCorpus-Outputs/news_cleaned_partitioned/news_cleaned_2018_02_1300000"
    #inputfolderpath = "hdfs://santa-fe:47001/Source-Recommendation-System/FakeNewsCorpus/news_sample.csv"
    #outputfolderpath = "hdfs://santa-fe:47001/Source-Recommendation-System/FakeNewsCorpus-Outputs"
    #outputfolderpath = "hdfs://santa-fe:47001/FakeNewsCorpus-Outputs/KeywordsFromPartitions/news_cleaned_partitioned/news_cleaned_2018_02_1300000temp"

    # Score attached to a keyword depending on which column it was extracted from
    title_score = 10
    keywords_score = 13
    meta_keywords_score = 13
    meta_description_score = 13
    tags_score = 13
    summary_score = 10

    #spark = SparkSession.builder.appName(jobname).getOrCreate()
    sc = SparkContext(master="spark://santa-fe.cs.colostate.edu:47002", appName=jobname)
    delete_path(sc, outputfolderpath)
    sqlContext = SQLContext(sc)

    inputfile_rdd = sqlContext.read.csv(inputfolderpath, header=True, sep=",", multiLine=True, quote='"', escape='"')\
        .rdd.repartition(29)

    keywords_from_content = inputfile_rdd\
        .filter(lambda row: row["content"] is not None and row["content"] != "null")\
        .map(lambda row: extract_with_row_id(row["id"], row["content"]))\
        .flatMap(lambda xs: [x for x in xs])

    keywords_from_title = inputfile_rdd\
        .filter(lambda row: row["title"] is not None and row["title"] != "null")\
        .map(lambda row: [(x, "(" + str(row["id"]) + "," + str(title_score) + ")") for x in get_processed_words(row["title"])])\
        .flatMap(lambda xs: [x for x in xs])

    keywords_from_keywords_col = inputfile_rdd\
        .filter(lambda row: row["keywords"] is not None and row["keywords"] != "null")\
        .map(lambda row: [(x.lower(), "(" + str(row["id"]) + "," + str(keywords_score) + ")") for x in get_keywords_from_keywords_col(row["keywords"])])\
        .flatMap(lambda xs: [x for x in xs])

    keywords_from_meta_keywords = inputfile_rdd\
        .filter(lambda row: row["meta_keywords"] is not None and row["meta_keywords"] != "null")\
        .map(lambda row: [(x.lower(), "(" + str(row["id"]) + "," + str(meta_keywords_score) + ")") for x in parse_meta_keywords(row["meta_keywords"]) if len(x) > 1])\
        .flatMap(lambda xs: [x for x in xs])

    keywords_from_meta_description = inputfile_rdd\
        .filter(lambda row: row["meta_description"] is not None and row["meta_description"] != "null")\
        .map(lambda row: [(x, "(" + str(row["id"]) + "," + str(meta_description_score) + ")") for x in get_processed_words(row["meta_description"])])\
        .flatMap(lambda xs: [x for x in xs])

    keywords_from_tags = inputfile_rdd\
        .filter(lambda row: row["tags"] is not None and row["tags"] != "null")\
        .map(lambda row: [(x.lower(), "(" + str(row["id"]) + "," + str(tags_score) + ")") for x in str(row["tags"].encode('ascii', "ignore")).split(",")])\
        .flatMap(lambda xs: [x for x in xs])

    keywords_from_summary = inputfile_rdd\
        .filter(lambda row: row["summary"] is not None and row["summary"] != "null")\
        .map(lambda row: extract_with_row_id(row["id"], row["summary"]))\
        .flatMap(lambda xs: [x for x in xs])

    # Union the per-column keyword RDDs, drop very short keywords, and merge
    # the "(rowId,score)" strings collected for each keyword
    all_keywords_list = [keywords_from_content, keywords_from_title, keywords_from_keywords_col,
                         keywords_from_meta_keywords, keywords_from_meta_description,
                         keywords_from_tags, keywords_from_summary]
    all_keywords_rdd = sc.union(all_keywords_list)
    all_keywords_rdd = all_keywords_rdd\
        .filter(lambda row: len(row[0]) > 2)\
        .reduceByKey(concat)

    all_keywords_df = all_keywords_rdd.toDF(["Keyword", "RowId & Score"])
    all_keywords_df.write.csv(outputfolderpath, header=True, quote='"', escape='"')
    sc.stop()
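# --- Hypothetical helper sketches (not part of the original job) ---
# The keyword-extraction job above calls several helpers defined elsewhere in
# the project (extract_with_row_id, get_processed_words, parse_meta_keywords,
# get_keywords_from_keywords_col, delete_path, concat).  The two sketches
# below are assumptions about what the simplest of them might look like, not
# the project's actual code.

def delete_path(sc, path):
    # Recursively remove an existing HDFS output directory through the JVM
    # Hadoop FileSystem API, so DataFrame.write.csv() does not fail with
    # "path already exists".
    hadoop = sc._jvm.org.apache.hadoop
    fs = hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    target = hadoop.fs.Path(path)
    if fs.exists(target):
        fs.delete(target, True)

def concat(a, b):
    # Associative merge passed to reduceByKey above: joins the "(rowId,score)"
    # strings collected for the same keyword.
    return a + ";" + b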
# Excerpt from a TensorFlowOnSpark-style MNIST driver; the snippet starts
# inside the TFRecord-parsing helper (toNumpy) of the "tfr" input branch.
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":
        images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    sc.union()  # note: SparkContext.union expects a list of RDDs; this bare call would raise a TypeError
    print("zipping images and labels")
    dataRDD = images.zip(labels)

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps,
                        args.tensorboard, TFCluster.InputMode.SPARK, log_dir=args.model)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
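# --- Standalone SparkContext.union sketch (not part of the MNIST driver above) ---
# SparkContext.union takes a list of RDDs and simply concatenates them without
# deduplicating; a minimal self-contained example:
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "union-demo")
    a = sc.parallelize([1, 2, 3])
    b = sc.parallelize([3, 4, 5])
    print(sc.union([a, b]).collect())  # [1, 2, 3, 3, 4, 5]
    sc.stop()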
output_file_com = sys.argv[4]

# Column 0 is the user id (the header row is dropped); group user ids by the
# value in column 1, then turn user pairs that co-occur in at least
# `threshold` groups into edges
data = input_file.map(lambda x: x.split(',')).filter(lambda x: x[0] != "user_id")
bu = data.map(lambda x: (x[1], [x[0]])).reduceByKey(lambda x, y: x + y).map(lambda x: (x[0], (x[1])))
up = bu.flatMap(lambda x: list(itertools.combinations(x[1], 2))).map(lambda x: (x, 1)).\
    reduceByKey(lambda x, y: x + y).filter(lambda x: x[1] >= threshold).\
    map(lambda x: x[0]).persist()
vertices = up.flatMap(lambda x: x).distinct().persist()
N = vertices.count()

# Build the adjacency lists in both edge directions and merge them with sc.union
up_r = up.map(lambda x: (x[1], x[0]))
adj = up.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
adj_r = up_r.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
adjacents = sc.union([adj, adj_r]).reduceByKey(lambda x, y: x + y).persist()
d_adjacents = adjacents.collectAsMap()
A = adjacents.collectAsMap()

# Edge betweenness: sum the per-root contributions, halve (each edge is
# credited from both endpoints), then sort by score descending with ties
# broken by vertex names
betweennesses = vertices.flatMap(lambda x: calculate_betweenness(x, d_adjacents)).\
    reduceByKey(lambda x, y: x + y).\
    map(lambda x: (x[0], x[1] / 2.0)).\
    sortBy(lambda x: x[0][1]).\
    sortBy(lambda x: x[0][0]).\
    sortBy(lambda x: x[1], False).\
    persist()

bet_file = open(output_file_bet, 'w')
for i in betweennesses.collect():
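# --- Hypothetical calculate_betweenness sketch (not part of the original job) ---
# calculate_betweenness is not shown above.  Given how it is used (flatMap over
# every vertex as a root, reduceByKey, then divide by 2), a Girvan-Newman style
# single-root credit assignment along these lines would fit; the real
# implementation may differ.
from collections import defaultdict, deque

def calculate_betweenness(root, adj):
    # BFS from `root` over the adjacency map {vertex: [neighbors]}, tracking
    # levels, shortest-path counts, and parents in the BFS DAG.
    level = {root: 0}
    num_paths = defaultdict(float)
    num_paths[root] = 1.0
    parents = defaultdict(list)
    order = []
    queue = deque([root])
    while queue:
        v = queue.popleft()
        order.append(v)
        for w in adj.get(v, []):
            if w not in level:
                level[w] = level[v] + 1
                queue.append(w)
            if level[w] == level[v] + 1:
                parents[w].append(v)
                num_paths[w] += num_paths[v]

    # Walk the BFS DAG bottom-up, splitting each node's credit (1 plus the
    # credit of the edges below it) among its parent edges in proportion to
    # shortest-path counts.  Edge keys are sorted tuples so contributions from
    # different roots reduce onto the same key.
    node_credit = defaultdict(float)
    edge_contribs = []
    for w in reversed(order):
        total = 1.0 + node_credit[w]
        for v in parents[w]:
            share = total * num_paths[v] / num_paths[w]
            edge_contribs.append((tuple(sorted((v, w))), share))
            node_credit[v] += share
    return edge_contribs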
# This one is for the server
#base = "/wikistats/{0}.txt"
# This one is for local testing
base = "~/HW3Data/{0}.txt"

rdds = []
for i in range(6, 24):
    f = base.format(i)
    rdd = sc.textFile(f)
    # We use our function-returning function to evade Spark's lazy evaluation
    rdd = rdd.map(parse(i))
    rdds.append(rdd)

# Combine all of our rdds
rdd = sc.union(rdds)

# We use our vector function from above
rdd = rdd.map(to_vector)

# We add all of the hours together, which is effectively adding a bunch of
# zeros and one page view count per column
rdd = rdd.reduceByKey(np.add)

# Set the bias term to 1
rdd = rdd.map(set_bias)

# Split the project code and project name out of the tuple we used earlier
rdd = rdd.map(split_code_name)

xxt = rdd.map(x_xtranspose)
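# --- Hypothetical parse(i) sketch (the real helper is defined elsewhere) ---
# parse(hour) returns a closure so each hour's file is tagged with the hour
# known at loop time; a plain lambda written inside the loop would capture the
# loop variable by reference and see only its final value once Spark's lazy
# evaluation actually runs the maps.  The record layout below is an assumption.
def parse(hour):
    def _parse(line):
        # Wikistats lines look like: "<project> <page_title> <view_count> <bytes>"
        fields = line.split(' ')
        project, page, views = fields[0], fields[1], int(fields[2])
        return (page, (project, hour, views))
    return _parse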
# This one is for the server
base = "/wikistats/{0}.txt"
# This one is for local testing
# base = "/home/bsprague/Downloads/HW3Data/{0}.txt"

rdds = []
for i in range(6, 24):
    f = base.format(i)
    rdd = sc.textFile(f)
    # We use our function-returning function to evade Spark's lazy evaluation
    rdd = rdd.map(parse(i))
    rdds.append(rdd)

# Combine all of our rdds, keeping only records from the "en" project
rdd = sc.union(rdds).filter(lambda r: r[1][0] == "en")

# We use our vector function from above
rdd = rdd.map(to_vector)

# We add all of the hours together, which is effectively adding a bunch of
# zeros and one page view count per column
rdd = rdd.reduceByKey(np.add)

# Set the bias term to 1
rdd = rdd.map(set_bias)

# Split the project code and project name out of the tuple we used earlier
rdd = rdd.map(split_code_name)

train = rdd.filter(even)
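# --- Hypothetical sketches of other helpers in the two wikistats jobs above ---
# to_vector, set_bias, split_code_name, x_xtranspose, and even are not shown.
# The sketches below assume (key, feature_vector) records with the first slot
# of the vector reserved for the bias term; the actual implementations may
# differ.
import zlib
import numpy as np

def set_bias(record):
    # Force the bias slot of the feature vector to 1.
    key, vec = record
    vec[0] = 1.0
    return (key, vec)

def even(record):
    # Deterministic 50/50 train/test split on the record key.  Python's hash()
    # is not stable across worker processes, so use crc32 instead.
    return zlib.crc32(str(record[0]).encode("utf-8")) % 2 == 0

def x_xtranspose(record):
    # Outer product x x^T of one page's feature vector; summing these across
    # pages gives the X^T X term of the least-squares normal equations.
    vec = record[-1]
    return np.outer(vec, vec)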