Code Example #1
def main(inputfolderpath, outputfolderpath, jobname):
    #inputfolderpath = "hdfs://santa-fe:47001/Source-Recommendation-System/FakeNewsCorpus/news_cleaned_2018_02_13.csv"
    #inputfolderpath = "hdfs://santa-fe:47001/FakeNewsCorpus/news_cleaned_2018_02_13.csv"
    #inputfolderpath = "hdfs://santa-fe:47001/FakeNewsCorpus-Outputs/news_cleaned_partitioned/news_cleaned_2018_02_1300000"
    #inputfolderpath = "hdfs://santa-fe:47001/Source-Recommendation-System/FakeNewsCorpus/news_sample.csv"
    #outputfolderpath = "hdfs://santa-fe:47001/Source-Recommendation-System/FakeNewsCorpus-Outputs"
    #outputfolderpath = "hdfs://santa-fe:47001/FakeNewsCorpus-Outputs/KeywordsFromPartitions/news_cleaned_partitioned/news_cleaned_2018_02_1300000temp"
    title_score = 10
    keywords_score = 13
    meta_keywords_score = 13
    meta_description_score = 13
    tags_score = 13
    summary_score = 10
    #spark = SparkSession.builder.appName(jobname).getOrCreate()
    sc = SparkContext(master="spark://santa-fe.cs.colostate.edu:47002", appName=jobname)
    delete_path(sc, outputfolderpath)
    sqlContext = SQLContext(sc)
    inputfile_rdd = sqlContext.read.csv(inputfolderpath, header=True,sep=",", multiLine = True, quote='"', escape='"')\
        .rdd.repartition(29)
    keywords_from_content = inputfile_rdd\
        .filter(lambda row : row["content"] is not None and row["content"] != "null")\
        .map(lambda  row : extract_with_row_id(row["id"], row["content"]))\
        .flatMap(lambda xs: [(x) for x in xs])
    keywords_from_title = inputfile_rdd\
        .filter(lambda row : row["title"] is not None and row["title"] != "null")\
        .map(lambda row : [(x,"(" + str(row["id"]) + "," + str(title_score) + ")") for x in get_processed_words(row["title"])])\
        .flatMap(lambda xs: [(x) for x in xs])
    keywords_from_keywords_col = inputfile_rdd\
        .filter(lambda row : row["keywords"] is not None and row["keywords"] != "null")\
        .map(lambda row : [(x.lower(),"(" + str(row["id"]) + "," + str(keywords_score) + ")") for x in get_keywords_from_keywords_col(row["keywords"])])\
        .flatMap(lambda xs: [(x) for x in xs])
    keywords_from_meta_keywords = inputfile_rdd\
        .filter(lambda row : row["meta_keywords"] is not None and row["meta_keywords"] != "null")\
        .map(lambda row : [(x.lower(),"(" + str(row["id"]) + "," + str(meta_keywords_score) + ")") for x in parse_meta_keywords(row["meta_keywords"]) if len(x) > 1 ])\
        .flatMap(lambda xs: [(x) for x in xs])
    keywords_from_meta_description = inputfile_rdd\
        .filter(lambda row : row["meta_description"] is not None and row["meta_description"] != "null")\
        .map(lambda row : [(x, "(" + str(row["id"]) + "," + str(meta_description_score) + ")") for x in get_processed_words(row["meta_description"])])\
        .flatMap(lambda xs: [(x) for x in xs])
    keywords_from_tags = inputfile_rdd\
        .filter(lambda row : row["tags"] is not None and row["tags"] != "null")\
        .map(lambda row : [(x.lower(), "(" + str(row["id"]) + "," + str(tags_score) + ")") for x in str(row["tags"].encode('ascii', "ignore")).split(",") ])\
        .flatMap(lambda xs: [(x) for x in xs])
    keywords_from_summary = inputfile_rdd\
        .filter(lambda row : row["summary"] is not None and row["summary"] != "null")\
        .map(lambda  row : extract_with_row_id(row["id"], row["summary"]))\
        .flatMap(lambda xs: [(x) for x in xs])
    all_keywords_list = [keywords_from_content, keywords_from_title, keywords_from_keywords_col, keywords_from_meta_keywords,
        keywords_from_meta_description, keywords_from_tags, keywords_from_summary]
    all_keywords_rdd = sc.union(all_keywords_list)
    all_keywords_rdd = all_keywords_rdd\
        .filter(lambda row: len(row[0]) > 2)\
        .reduceByKey(concat)
    all_keywords_df = all_keywords_rdd.toDF(["Keyword", "RowId & Score"])
    all_keywords_df.write.csv(outputfolderpath, header=True, quote='"', escape='"')
    sc.stop()
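
Each column RDD above maps to (keyword, "(rowId,score)") pairs, so sc.union followed by reduceByKey(concat) collapses all the sources into one entry per keyword. The project's helpers (delete_path, extract_with_row_id, get_processed_words, concat, ...) are defined outside this snippet; a minimal standalone sketch of the union-then-reduce step, with a made-up concat and toy data, might look like:

from pyspark import SparkContext

# Minimal standalone sketch of the union + reduceByKey(concat) pattern above.
# The toy data and this concat helper are illustrative, not project code.
sc = SparkContext("local[*]", "union-reduce-sketch")

def concat(a, b):
    # Hypothetical reducer: gather every "(rowId,score)" string for a keyword.
    return a + ";" + b

title_kw = sc.parallelize([("election", "(1,10)"), ("economy", "(2,10)")])
tag_kw = sc.parallelize([("election", "(3,13)")])

merged = sc.union([title_kw, tag_kw]).reduceByKey(concat)
print(merged.collect())   # e.g. [('economy', '(2,10)'), ('election', '(1,10);(3,13)')]
sc.stop()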
Code Example #2
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":
        images = sc.textFile(
            args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(
            args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    print("zipping images and labels")
    dataRDD = images.zip(labels)

cluster = TFCluster.run(sc,
                        mnist_dist.map_fun,
                        args,
                        args.cluster_size,
                        num_ps,
                        args.tensorboard,
                        TFCluster.InputMode.SPARK,
                        log_dir=args.model)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
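
The CSV branch relies on images.zip(labels), which pairs the two RDDs element by element and requires that both have the same number of partitions and the same number of elements per partition. A tiny standalone sketch of that step (the data is made up):

from pyspark import SparkContext

# Standalone sketch of the images.zip(labels) step: zip pairs the RDDs
# element-wise and needs matching partition counts and sizes.
sc = SparkContext("local[*]", "zip-sketch")
images = sc.parallelize([[0, 1, 2], [3, 4, 5]], 2)   # made-up pixel rows
labels = sc.parallelize([[1.0], [0.0]], 2)           # made-up labels
dataRDD = images.zip(labels)
print(dataRDD.collect())   # [([0, 1, 2], [1.0]), ([3, 4, 5], [0.0])]
sc.stop()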
Code Example #3
File: task2.py  Project: anjana1909/USC_Data_Mining
    output_file_com = sys.argv[4]

    data = input_file.map(lambda x: x.split(',')).filter(lambda x: x[0] != "user_id")

    bu = data.map(lambda x: (x[1], [x[0]])).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], (x[1])))
    up = bu.flatMap(lambda x: list(itertools.combinations(x[1], 2))).map(lambda x: (x, 1)).\
        reduceByKey(lambda x,y: x+y).filter(lambda x: x[1]>=threshold).\
        map(lambda x: x[0]).persist()

    vertices = up.flatMap(lambda x: x).distinct().persist()
    N = vertices.count()

    up_r = up.map(lambda x: (x[1], x[0]))
    adj = up.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
    adj_r = up_r.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
    adjacents = sc.union([adj, adj_r]).reduceByKey(lambda x,y: x+y).persist()

    d_adjacents = adjacents.collectAsMap()

    A = adjacents.collectAsMap()

    betweennesses = vertices.flatMap(lambda x: calculate_betweenness(x, d_adjacents)).\
        reduceByKey(lambda x,y: x+y).\
        map(lambda x: (x[0], x[1]/2.0)). \
        sortBy(lambda x: x[0][1]). \
        sortBy(lambda x: x[0][0]). \
        sortBy(lambda x: x[1], False). \
        persist()

    bet_file = open(output_file_bet, 'w')
    for i in betweennesses.collect():
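
The adjacency list is built by unioning the user-pair RDD with its reversed copy and reducing by key, so every vertex collects neighbours from both directions of each edge. A small standalone sketch of that pattern (the edges are made up):

from pyspark import SparkContext

# Standalone sketch: make the graph undirected by unioning the pair RDD
# with its reversed copy, then merging the neighbour lists per vertex.
sc = SparkContext("local[*]", "adjacency-sketch")
up = sc.parallelize([("a", "b"), ("a", "c"), ("b", "c")])   # made-up edges
up_r = up.map(lambda x: (x[1], x[0]))
adj = up.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
adj_r = up_r.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
adjacents = sc.union([adj, adj_r]).reduceByKey(lambda x, y: x + y)
print(adjacents.collectAsMap())   # e.g. {'a': ['b', 'c'], 'b': ['c', 'a'], 'c': ['a', 'b']}
sc.stop()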
Code Example #4
# This one is for the server
#base = "/wikistats/{0}.txt"

# This one is for local testing
base = "~/HW3Data/{0}.txt"

rdds = []
for i in range(6, 24):
    f = base.format(i)
    rdd = sc.textFile(f)
    # We use our function-returning function so each RDD captures its own hour
    # despite Spark's lazy evaluation
    rdd = rdd.map(parse(i))
    rdds.append(rdd)

# Combine all of our rdds
rdd = sc.union(rdds)

# We use our vector function from above
rdd = rdd.map(to_vector)

# We add all of the hours together, which is effectively adding a bunch of
# zeros and one page view count per column
rdd = rdd.reduceByKey(np.add)

# Set the bias term to 1
rdd = rdd.map(set_bias)

# Split the project code and project name out of the tuple we used earlier
rdd = rdd.map(split_code_name)

xxt = rdd.map(x_xtranspose)
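
The comment about a "function-returning function" refers to capturing the hour i at the moment parse(i) is called: a lambda that referenced the loop variable directly would see i's final value by the time Spark lazily evaluates the map. The real parse is defined elsewhere in this file; a hypothetical version of the pattern could look like:

def parse(hour):
    # Hypothetical sketch of a function-returning parse(i): the returned
    # closure captures `hour` at definition time, so the value is correct
    # even though Spark evaluates the map lazily, long after the loop ends.
    def parse_line(line):
        # Illustrative wikistats layout: "project page count bytes";
        # the record shape here is an assumption, not the project's code.
        project, page, count, _ = line.split(" ")
        return (page, (project, hour, int(count)))
    return parse_line

# usage in the loop above: rdd = sc.textFile(f).map(parse(i))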
Code Example #5
# This one is for the server
base = "/wikistats/{0}.txt"

# This one is for local testing
# base = "/home/bsprague/Downloads/HW3Data/{0}.txt"

rdds = []
for i in range(6, 24):
    f = base.format(i)
    rdd = sc.textFile(f)
    # We use our function-returning function so each RDD captures its own hour
    # despite Spark's lazy evaluation
    rdd = rdd.map(parse(i))
    rdds.append(rdd)

# Combine all of our rdds
rdd = sc.union(rdds).filter(lambda r: r[1][0] == "en")

# We use our vector function from above
rdd = rdd.map(to_vector)

# We add all of the hours together, which is effectively adding a bunch of
# zeros and one page view count per column
rdd = rdd.reduceByKey(np.add)

# Set the bias term to 1
rdd = rdd.map(set_bias)

# Split the project code and project name out of the tuple we used earlier
rdd = rdd.map(split_code_name)

train = rdd.filter(even)
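
This variant keeps only English-project records (r[1][0] == "en") before the same aggregation, then splits the result into halves with even (and presumably an odd counterpart for the test set). The real predicate is defined elsewhere; one hypothetical, deterministic way to write such a split:

import zlib

# Hypothetical parity split; the project's real `even` predicate is defined
# elsewhere. A stable checksum keeps the split reproducible across Spark
# workers (Python's built-in hash() of strings is randomized per process).
def even(row):
    key = str(row[0])          # assumption: the first field identifies the record
    return zlib.crc32(key.encode("utf-8")) % 2 == 0

def odd(row):
    return not even(row)

# train = rdd.filter(even)
# test  = rdd.filter(odd)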