Example #1
class BroadcastTest(unittest.TestCase):
    def tearDown(self):
        if getattr(self, "sc", None) is not None:
            self.sc.stop()
            self.sc = None

    def _test_encryption_helper(self, vs):
        """
        Creates a broadcast variable for each value in vs, and runs a simple job to make sure the
        value is the same when it's read in the executors.  Also makes sure there are no task
        failures.
        """
        bs = [self.sc.broadcast(value=v) for v in vs]
        exec_values = self.sc.parallelize(
            range(2)).map(lambda x: [b.value for b in bs]).collect()
        for ev in exec_values:
            self.assertEqual(ev, vs)
        # make sure there are no task failures
        status = self.sc.statusTracker()
        for jid in status.getJobIdsForGroup():
            for sid in status.getJobInfo(jid).stageIds:
                stage_info = status.getStageInfo(sid)
                self.assertEqual(0, stage_info.numFailedTasks)

    def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test that broadcast variables make it to the executors intact.  Tests multiple broadcast variables,
        and also multiple jobs.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5])
        self._test_encryption_helper([5, 10, 20])

    def test_broadcast_with_encryption(self):
        self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true"))

    def test_broadcast_no_encryption(self):
        self._test_multiple_broadcasts()

    def _test_broadcast_on_driver(self, *extra_confs):
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        bs = self.sc.broadcast(value=5)
        self.assertEqual(5, bs.value)

    def test_broadcast_value_driver_no_encryption(self):
        self._test_broadcast_on_driver()

    def test_broadcast_value_driver_encryption(self):
        self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true"))
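
The tests above create a SparkContext on a local-cluster master so that broadcast values really cross process boundaries, with and without spark.io.encryption.enabled. A minimal standalone sketch of the same pattern, using a plain local[2] master so it runs without a separate Spark distribution (names and values are illustrative):

# Sketch: broadcast a value with I/O encryption enabled and read it back in tasks.
from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.set("spark.io.encryption.enabled", "true")
conf.setMaster("local[2]")
conf.setAppName("broadcast-encryption-sketch")
sc = SparkContext(conf=conf)

b = sc.broadcast([5, 10, 20])
# Each task reads the broadcast value; collect() brings the copies back to the driver.
seen = sc.parallelize(range(2)).map(lambda _: b.value).collect()
assert all(v == [5, 10, 20] for v in seen)
sc.stop()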
Example #2
def execute(self):
    print("execute ", self.__class__)
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession
    sc = SparkContext(appName='test PySparkTask')
    b = sc.broadcast([1, 2, 3, 4, 5])
    sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
    spark = SparkSession.builder \
        .master("local") \
        .appName("Word Count") \
        .getOrCreate()
    data = [('Alice', 1), ('Monica', 2)]
    spark.createDataFrame(data).collect()
    spark.createDataFrame(data, ['name', 'age']).collect()
Example #3
######### Global variables ######### (gross)
# The following variables are broadcast to the spark
# cluster and can be used in the functions below
songTable = 'song_data'
sc = SparkContext('local[*]', 'lastfm_recommender')
sqlContext = SQLContext(sc)

### Set up database connections for metadata and similar artists
### This is starting to get really ugly.
### broadcasting this data is probably not a good idea
artist_engine = create_engine('sqlite:///' + sys.argv[1])
sims = pd.read_sql_query('SELECT * FROM similarity', artist_engine)
# broadcasting these variables is probably a bad idea since
# they are quite big
similars = sc.broadcast(sims.similar)
similar_groups = sc.broadcast(sims.groupby('target').groups)

tagFile = open('lastfm_unique_tags.txt', 'r')
# make tag dictionary available across the cluster.
tags = [
    tagstr[0] for tagstr in map(lambda ts: ts.split('\t'),
                                [next(tagFile) for x in xrange(500)])
]
tagDictionary = sc.broadcast(tags)
tagFile.close()

######## Functions for feature extraction #########


# make a "vector" with indices corresponding to values in
# parse raw user artist data
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda (k, v) : k != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda (k, v) : k != -1).collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)


def processTrainData(line):
    (userId, artistId, count) = map(int, line.split(' '))
    
    artistAliasId = bArtistAlias.value.get(artistId)
    if artistAliasId is None:
        artistAliasId = artistId
    return Rating(userId, artistAliasId, count)

trainData = rawUserArtistData.map(processTrainData).cache()

model = ALS.trainImplicit(trainData, 10)
print model.productFeatures()
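
Once ALS.trainImplicit returns, the MatrixFactorizationModel can be queried directly; a minimal sketch (recommendProducts is available in Spark 1.4+, and the user id is taken from the first training record rather than hard-coded):

# Sketch: ask the trained implicit-feedback model for top-10 products for one user.
some_user = trainData.first().user
for rec in model.recommendProducts(some_user, 10):
    print rec  # Rating(user=..., product=..., rating=score)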
    'JOB_NAME', 'source_bucket', 'report_folder_prefix',
    'index_name_prefix_template', 'index_pattern_prefix', 'es_domain_url'
])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Parameter init
source_bucket = args['source_bucket']
report_folder_prefix = args['report_folder_prefix']
index_name_prefix_template = args['index_name_prefix_template']
index_pattern_prefix = args['index_pattern_prefix']
es_domain_url = args['es_domain_url']
es_domain_url_shared = sc.broadcast(es_domain_url)
succeed = sc.accumulator(0)
failed = sc.accumulator(0)
now = datetime.datetime.now()
index_name_base = index_name_prefix_template.format(str(now.year),
                                                    str(now.month))
index_name = index_name_base + "-" + str(now.day)
index_name_shared = sc.broadcast(index_name)


def doc_generator(source):
    for row in source:
        updated_row = row.asDict()
        index_name = index_name_shared.value
        new_row = {
            '_index': index_name,
Example #6
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)

    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])

    def test_zip_with_different_number_of_items(self):
        a = self.sc.parallelize(range(5), 2)
        # different number of partitions
        b = self.sc.parallelize(range(100, 106), 3)
        self.assertRaises(ValueError, lambda: a.zip(b))
        # different number of batched items in JVM
        b = self.sc.parallelize(range(100, 104), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # different number of items in one pair
        b = self.sc.parallelize(range(100, 106), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # same total number of items, but different distributions
        a = self.sc.parallelize([2, 3], 2).flatMap(range)
        b = self.sc.parallelize([3, 2], 2).flatMap(range)
        self.assertEquals(a.count(), b.count())
        self.assertRaises(Exception, lambda: a.zip(b).count())

    def test_histogram(self):
        # empty
        rdd = self.sc.parallelize([])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])
        self.assertRaises(ValueError, lambda: rdd.histogram(1))

        # out of range
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1])

        # in range with one bucket
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals([4], rdd.histogram([0, 10])[1])
        self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1])

        # in range with one bucket exact match
        self.assertEquals([4], rdd.histogram([1, 4])[1])

        # out of range with two buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1])

        # out of range with two uneven buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])

        # in range with two buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two bucket and None
        rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two uneven buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1])

        # mixed range with two uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01])
        self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1])

        # mixed range with four uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # mixed range with uneven buckets and NaN
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0,
                                   199.0, 200.0, 200.1, None, float('nan')])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # out of range with infinite buckets
        rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")])
        self.assertEquals([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1])

        # invalid buckets
        self.assertRaises(ValueError, lambda: rdd.histogram([]))
        self.assertRaises(ValueError, lambda: rdd.histogram([1]))
        self.assertRaises(ValueError, lambda: rdd.histogram(0))
        self.assertRaises(TypeError, lambda: rdd.histogram({}))

        # without buckets
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 4], [4]), rdd.histogram(1))

        # without buckets single element
        rdd = self.sc.parallelize([1])
        self.assertEquals(([1, 1], [1]), rdd.histogram(1))

        # without bucket no range
        rdd = self.sc.parallelize([1] * 4)
        self.assertEquals(([1, 1], [4]), rdd.histogram(1))

        # without buckets basic two
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2))

        # without buckets with more requested than elements
        rdd = self.sc.parallelize([1, 2])
        buckets = [1 + 0.2 * i for i in range(6)]
        hist = [1, 0, 0, 0, 1]
        self.assertEquals((buckets, hist), rdd.histogram(5))

        # invalid RDDs
        rdd = self.sc.parallelize([1, float('inf')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))
        rdd = self.sc.parallelize([float('nan')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))

        # string
        rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2)
        self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))

        # mixed RDD
        rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2)
        self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1])
        self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))
    "mapred.bq.input.dataset.id": "picfeed",
    "mapred.bq.input.table.id": "data_set_term",
    "spark.sql.shuffle.partitions": "1000",
    "spark.default.parallelism": "1000",
}

# Read the data from BigQuery into Spark as an RDD.
table_data_data_combine_uinfo = spark.sparkContext.newAPIHadoopRDD(
    "com.google.cloud.hadoop.io.bigquery.JsonTextBigQueryInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "com.google.gson.JsonObject",
    conf=conf2)

print "start create broadcase value"

b_uinfo = sc.broadcast(table_data_data_combine_uinfo)

print "finish create broadcase value"


def process_uinfo(line):
    line = (line.uid, line.ts, line.label, b_uinfo.get(line.urlid))
    return line


print "start map"
out_rdd = table_data_train_set.map(process_uinfo)

print "finish map"

print out_rdd.take(2)
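
Note that the snippet above broadcasts an RDD handle and then calls .get on the Broadcast object itself; Spark can only broadcast plain driver-side values, and their contents are read through .value. A hedged rework of the same lookup, assuming the rows can first be reduced to (urlid, info) pairs (that mapping is an assumption, not taken from the original job):

# Hypothetical rework: collect a small lookup dict on the driver,
# broadcast the plain dict, and read it through .value inside tasks.
uinfo_pairs = table_data_data_combine_uinfo.map(lambda kv: (kv[0], kv[1]))  # assumed row shape
b_uinfo = sc.broadcast(uinfo_pairs.collectAsMap())

def process_uinfo(line):
    # b_uinfo.value is the broadcast dict; .get returns None for unknown ids
    return (line.uid, line.ts, line.label, b_uinfo.value.get(line.urlid))

out_rdd = table_data_train_set.map(process_uinfo)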
Example #8
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
Example #9
        return []
    else:
        try:
            return [(int(k[0]), int(k[1]))]
        except:
            return []


# In[12]:

artistAlias = alias.flatMap(lambda x: aliases(x)).collectAsMap()
artist_id = dict(artist_data.flatMap(lambda k: artist(k)).collect())

# In[13]:

lookup = sc.broadcast(artistAlias)


def mapper(x):
    userID, artistID, count = map(lambda lineItem: int(lineItem), x.split())
    finalArtistID = lookup.value.get(artistID)
    if finalArtistID is None:
        finalArtistID = artistID
    return Rating(userID, finalArtistID, count)


Data = user_data.map(lambda x: mapper(x))
Data.cache()

# In[14]:
Example #10
test_data = "hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Test-label-28x28.csv"
test_df = spark.read.csv(test_data, header=False, inferSchema="true")
assembler_test = VectorAssembler(inputCols=test_df.columns[1:784], outputCol="features")
test_vectors = assembler_test.transform(test_df).select(test_df.columns[0],"features")

pca_test = train_model.transform(test_vectors).select(test_vectors.columns[0],'pca')
p_number = pca_test.rdd.count()




train_np1 = np.array(pca_train.select('pca').collect())
train_label = np.array(pca_train.select('_c0').collect())
a,b,c = train_np1.shape
train_set = train_np1.reshape(a,c)
train_np = sc.broadcast(train_np1)
label = sc.broadcast(train_label)
result = pca_test.rdd.map(knn)


predict_rdd = result.map(lambda x: x[0])
predict_rdd.saveAsTextFile("hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/user/czho9311/123")
acc = result.filter(lambda x:x[0]==x[1]).count() / float(p_number)

metrics = MulticlassMetrics(result)
label_metrics = ['0.0','1.0','2.0','3.0','4.0','5.0','6.0','7.0','8.0','9.0']
pre_dict = {}
recall_dict = {}
f_score = {}
for i in label_metrics:
    pre_dict[i] = metrics.precision(i)
# values()
m = sc.parallelize([(1, 2), (3, 4)]).values()
m.collect()

# variance()
sc.parallelize([1, 2, 3]).variance()

# zip(other)
x = sc.parallelize(range(0,5))
y = sc.parallelize(range(1000, 1005))
x.zip(y).collect()

# zipWithIndex()
sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()

# zipWithUniqueId()
sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()


### BROADCAST
from pyspark.context import SparkContext
sc = SparkContext('local', 'test')
b = sc.broadcast([1, 2, 3, 4, 5])
b.value
sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
b.unpersist()

large_broadcast = sc.broadcast(range(10000))
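
Broadcast variables keep a copy on every executor until they are released; a short sketch of the lifecycle calls on the handles created above:

# Sketch: read the large broadcast inside a job, then release it.
total = sc.parallelize(range(4)).map(lambda _: len(large_broadcast.value)).sum()
large_broadcast.unpersist()   # drop executor-side copies; rebroadcast on next use
large_broadcast.destroy()     # release everything; the variable is unusable afterwards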


# Getting relevant columns - last two contain review and rating - also remove punctuation
word_counted =  Amazon.map(lambda x: (x[-1], x[-2].translate({ord(char): None for char in string.punctuation})))


# In[17]:


#combining all comments for the same key
word_counted_1  = word_counted.reduceByKey(lambda x,y : x+y)


# In[18]:


#### count of reviews per rating so that it can be used to get the average
word_freq_1  = sc.broadcast(word_counted.countByKey())


# In[19]:


word_counted.countByKey()


# In[24]:


# Getting the average - total word count divided by the review count
word_avg = word_counted_1.map(lambda x: (x[0], len(x[1].split())/word_freq_1.value[x[0]])).sortByKey()
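
The countByKey()/broadcast combination above pulls all key counts to the driver; an alternative sketch that computes a per-rating average of review lengths in one pass with aggregateByKey (word counts are summed per review, so results can differ slightly from the string-concatenation approach above):

# Sketch: (rating, review_text) pairs -> average words per review for each rating.
sum_count = word_counted.aggregateByKey(
    (0, 0),
    lambda acc, text: (acc[0] + len(text.split()), acc[1] + 1),
    lambda a, b: (a[0] + b[0], a[1] + b[1]))
word_avg_alt = sum_count.mapValues(lambda p: p[0] / float(p[1])).sortByKey()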

    training_rdd_cleaned = training_rdd.subtractByKey(validation_rdd)
    training_rdd_cleaned = training_rdd_cleaned.map(lambda x:
                                                    (x[0][0], (x[0][1], x[1])))
    test_rdd = test_rdd_with_res.map(lambda x: (x[0][0], x[0][1])).sortByKey()

    users_rdd1 = training_rdd_cleaned.groupByKey().sortByKey().mapValues(
        list).collectAsMap()
    user_business_rdd1 = training_rdd_cleaned.map(
        lambda x: ((x[0], x[1][0]), x[1][1])).sortByKey().collectAsMap()
    business_rdd1 = training_rdd_cleaned.map(lambda x: (x[1][0], (x[0], x[1][
        1]))).groupByKey().sortByKey().mapValues(list).collectAsMap()
    business_rdd2 = training_rdd_cleaned.map(lambda x: (x[1][0], x[
        0])).groupByKey().sortByKey().mapValues(list).collectAsMap()

    users_rdd = sc.broadcast(users_rdd1)
    user_business_rdd = sc.broadcast(user_business_rdd1)
    business_rdd = sc.broadcast(business_rdd1)

    all_user_train = training_rdd.map(lambda x: x[0][0]).distinct().collect()
    all_user_val = test_rdd_with_res.map(
        lambda x: x[0][0]).distinct().collect()

    all_business_train = training_rdd.map(
        lambda x: x[0][1]).distinct().collect()
    all_business_val = test_rdd_with_res.map(
        lambda x: x[0][1]).distinct().collect()

    all_business_dic = dict()
    all_user_dic = dict()
Example #14
class TestRDDFunctions(PySparkTestCase):
    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception,
                          lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1),
                                1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)

    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(
            a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103),
                                 (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(
            a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103),
                                 (4, 104)])

    def test_zip_with_different_number_of_items(self):
        a = self.sc.parallelize(range(5), 2)
        # different number of partitions
        b = self.sc.parallelize(range(100, 106), 3)
        self.assertRaises(ValueError, lambda: a.zip(b))
        # different number of batched items in JVM
        b = self.sc.parallelize(range(100, 104), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # different number of items in one pair
        b = self.sc.parallelize(range(100, 106), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # same total number of items, but different distributions
        a = self.sc.parallelize([2, 3], 2).flatMap(range)
        b = self.sc.parallelize([3, 2], 2).flatMap(range)
        self.assertEquals(a.count(), b.count())
        self.assertRaises(Exception, lambda: a.zip(b).count())

    def test_histogram(self):
        # empty
        rdd = self.sc.parallelize([])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])
        self.assertRaises(ValueError, lambda: rdd.histogram(1))

        # out of range
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1])

        # in range with one bucket
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals([4], rdd.histogram([0, 10])[1])
        self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1])

        # in range with one bucket exact match
        self.assertEquals([4], rdd.histogram([1, 4])[1])

        # out of range with two buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1])

        # out of range with two uneven buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])

        # in range with two buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two bucket and None
        rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two uneven buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1])

        # mixed range with two uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01])
        self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1])

        # mixed range with four uneven buckets
        rdd = self.sc.parallelize(
            [-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1])
        self.assertEquals([4, 2, 1, 3],
                          rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # mixed range with uneven buckets and NaN
        rdd = self.sc.parallelize([
            -0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, None,
            float('nan')
        ])
        self.assertEquals([4, 2, 1, 3],
                          rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # out of range with infinite buckets
        rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")])
        self.assertEquals([1, 2],
                          rdd.histogram([float('-inf'), 0,
                                         float('inf')])[1])

        # invalid buckets
        self.assertRaises(ValueError, lambda: rdd.histogram([]))
        self.assertRaises(ValueError, lambda: rdd.histogram([1]))
        self.assertRaises(ValueError, lambda: rdd.histogram(0))
        self.assertRaises(TypeError, lambda: rdd.histogram({}))

        # without buckets
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 4], [4]), rdd.histogram(1))

        # without buckets single element
        rdd = self.sc.parallelize([1])
        self.assertEquals(([1, 1], [1]), rdd.histogram(1))

        # without bucket no range
        rdd = self.sc.parallelize([1] * 4)
        self.assertEquals(([1, 1], [4]), rdd.histogram(1))

        # without buckets basic two
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2))

        # without buckets with more requested than elements
        rdd = self.sc.parallelize([1, 2])
        buckets = [1 + 0.2 * i for i in range(6)]
        hist = [1, 0, 0, 0, 1]
        self.assertEquals((buckets, hist), rdd.histogram(5))

        # invalid RDDs
        rdd = self.sc.parallelize([1, float('inf')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))
        rdd = self.sc.parallelize([float('nan')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))

        # string
        rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2)
        self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))

        # mixed RDD
        rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2)
        self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1])
        self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))
    if len(tmp_ind) > 0 and itr < prune_stop_iter:  # run at window @6
        _tmp_c = np.array(len(crates_list) * [-1.])
        for t_name in tmp_ind:
            _tmp_c[layer_inds[t_name]] = crates[t_name]
        apply_prune(solver.net, _tmp_c)
    #if len(tmp_ind)>1 and itr < prune_stop_iter:
    if itr % 1000 == 0 and len(
            tmp_ind) > 1 and itr < prune_stop_iter:  # run at window @3

        accuracy_ = test_net(solver.net, _count=1, _start="ip1")
        es = {}
        #reference_model = sc.broadcast(solver.net)  ## does not work, cannot be pickled
        solver.net.save(parallel_file_name)  # share the model with the parallel individuals via a file
        #print(solver.net.blobs['data'].data.shape)
        the_input_batch = sc.broadcast(solver.net.blobs['data'].data)
        if es_method == 'ncs':
            __C = edict()
            __C.parameters = {
                'reset_xl_to_pop': False,
                'init_value': tmp_crates,
                'stepsize': ncs_stepsize,
                'bounds': [0.0, 10.],
                'ftarget': 0,
                'tmax': 1600,
                'popsize': 10,
                'best_k': 1
            }
            es = ncs.NCS(__C.parameters)
            print('***************NCS initialization***************')
            tmp_x_ = np.array(crates_list)
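
As the commented-out line above notes, sc.broadcast(solver.net) fails because a Caffe Net cannot be pickled, which is why the model is shared through a file and only the input batch (a numpy array) is broadcast. A hypothetical alternative is to broadcast just the weights as numpy arrays:

# Hypothetical: extract picklable numpy weights from the (unpicklable) Net
# and broadcast those instead of the Net object itself.
weights = {name: [blob.data.copy() for blob in blobs]
           for name, blobs in solver.net.params.items()}
b_weights = sc.broadcast(weights)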
Example #16
class TestRDDFunctions(PySparkTestCase):
    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception,
                          lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1),
                                1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)
Example #17
def main():
    start = time.time()
    #Step 2: handle input parameters
    parser = argparse.ArgumentParser(description='Process some integers.')
    #filenames, K and N
    parser.add_argument('-filenames',
                        type=str,
                        nargs='+',
                        help='the list of fasta files',
                        required=True)
    parser.add_argument('-K',
                        type=int,
                        nargs=1,
                        help='value of K in k-mer',
                        required=True)
    parser.add_argument('-N',
                        type=int,
                        nargs=1,
                        help='value of N in top-n',
                        required=True)
    args = parser.parse_args()

    #Step 3: create a Spark context object (ctx)
    ctx = SparkContext(appName="Kmer count")

    #Step 4: broadcast K and N as global shared objects
    files = ctx.broadcast(args.filenames)
    k = ctx.broadcast(args.K)
    n = ctx.broadcast(args.N)
    print(files.value)
    print(k.value)
    print(n.value)

    #Step 5: read FASTQ file from HDFS and create the first RDD
    records = ctx.textFile(files.value[0])
    #remove file if exists
    # try:
    #     shutil.rmtree("kmers/output/1")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 1 did not exist, creating now")
    # records.saveAsTextFile("kmers/output/1")

    #Step 6: filter redundant records
    #specChar = re.compile('[A-Za-z]')
    pattern = re.compile('^[ACGTNacgn]+$')
    records = records.filter(lambda x: re.match(pattern, x) is not None)

    # for i in filterRDD.collect():
    #     print(i)
    # try:
    #     shutil.rmtree("kmers/output/1.5")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 1.5 did not exist, creating now")
    # filterRDD.saveAsTextFile("kmers/output/1.5")

    # Step 7: generate K-mers
    kVal = k.value[0]
    kmers = records.map(lambda x: (x[0:kVal], 1))
    # for k in kmers.collect():
    #     print(k)
    # try:
    #     shutil.rmtree("kmers/output/2")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 2 did not exist, creating now")
    # kmers.saveAsTextFile("kmers/output/2")

    # Step 8: Combine/reduce frequent K-mers
    grouped = kmers.reduceByKey(lambda x, y: x + y)
    # try:
    #     shutil.rmtree("kmers/output/2.5")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 2.5 did not exist, creating now")
    # grouped.saveAsTextFile("kmers/output/2.5")

    # Step 9: create a local top N for all partitions
    sortedKmers = grouped.map(lambda x: (int(-x[1]), x[0])).sortByKey().map(
        lambda x: (x[1], -1 * int(x[0])))
    try:
        shutil.rmtree("kmers/output/3")
        print("removed old output")
    except OSError:
        print("kmers / output / 3 did not exist, creating now")
    sortedKmers.saveAsTextFile("kmers/output/3")

    #Step 10: get top N
    print("Top N={} {}-mers:".format(n.value[0], k.value[0]))
    for val in sortedKmers.take(n.value[0]):
        print(val)
    # print("Bottom N={} {}-mers:".format(n.value[0], k.value[0]))
    # for val in sortedKmers.takeOrdered(n.value[0], key=lambda x:-1*int(x[1])):
    #     print(val)
    end = time.time()
    print(end - start)
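
One caveat in Step 7 above: records.map(lambda x: (x[0:kVal], 1)) emits only the first K characters of each read, so at most one K-mer per line is counted. A sketch that counts every K-mer with a sliding window (records and kVal are the names from main()):

# Sketch: emit every K-mer of each read rather than just the first one.
def all_kmers(seq, k):
    return [(seq[i:i + k], 1) for i in range(len(seq) - k + 1)]

kmers = records.flatMap(lambda seq: all_kmers(seq, kVal))
grouped = kmers.reduceByKey(lambda x, y: x + y)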
               "double"), ("clm15_days_supply", "long", "daysSupply", "long"),
              ("clm27_unit_dose_indicator", "long", "dosageUnit", "long"),
              ("drg13_route_description", "string", "route", "string"),
              ("drg14_strength", "string", "dosageStrength", "string"),
              ("drg25_dosage_form", "string", "doseForm", "string")],
    transformation_ctx="ds2")

# Convert to spark DF
df = ds.toDF()

# Create MemberNumber -> patientId Map
temp_dict = {}
for row in df.select("memberNumber").distinct().collect():
    temp_dict[str(row.memberNumber)] = str(uuid.uuid4())
# Share this dictionary with all workers
patient_id_dict = sc.broadcast(temp_dict)


def get_patient_id(member_number):
    return patient_id_dict.value[str(member_number)]


# Add new column patientId based on memberNumber
get_patient_id_udf = udf(get_patient_id, StringType())
df = df.withColumn("patientId", get_patient_id_udf(df["memberNumber"]))
# Convert date string to a datetime
convert_date_udf = udf(
    lambda date_str: str(datetime.strptime(date_str, "%Y-%m-%d")),
    StringType())
df = df.withColumn("fillDate", convert_date_udf(df["fillDate"]))
# Add new column ndc9 based on substring of product_service_identification
Example #19
######### Global variables ######### (gross)
# The following variables are broadcast to the spark
# cluster and can be used in the functions below
songTable = 'song_data'
sc = SparkContext('local[*]', 'lastfm_recommender')
sqlContext = SQLContext(sc)

### Set up database connections for metadata and similar artists
### This is starting to get really ugly.
### broadcasting this data is probably not a good idea
artist_engine = create_engine('sqlite:///'+sys.argv[1])
sims = pd.read_sql_query(
    'SELECT * FROM similarity', artist_engine)
# broadcasting these variables is probably a bad idea since
# they are quite big
similars = sc.broadcast(sims.similar)
similar_groups = sc.broadcast(sims.groupby('target').groups)

tagFile = open('lastfm_unique_tags.txt', 'r')
# make tag dictionary available across the cluster.
tags = [tagstr[0] for tagstr in map(lambda ts: ts.split('\t'),
                                    [next(tagFile) for x in xrange(500)])]
tagDictionary = sc.broadcast(tags)
tagFile.close()

######## Functions for feature extraction #########

# make a "vector" with indices corresponding to values in
# tagDictionary
def getTagVector(track):
    return {tagDictionary.value[tag]:1 for [tag, f] in track.tags
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ[
            "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
        ).cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
        ).cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql(
            "SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql(
            "SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")

        self.df_unique_users = self.sqlContext.sql(
            "SELECT DISTINCT user_id FROM userBusiness where user_id = \"SIfJLNMv7vBwo-fSipxNgg\""
        )
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)

        schema = StructType([
            StructField("latitude", FloatType()),
            StructField("longitude", FloatType())
        ])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)

        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations",
            self.get_locations(self.df_unique_users["user_id"]))
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("user.json", "json",
                                                       "overwrite")

        print(
            getCentersOfUser(
                self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers",
            self.get_centers(self.df_unique_users["user_locations"]))
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("center.json", "json",
                                                       "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
        ).cache()
        print(
            len(
                getCentersOfUser(self.df_unique_users.toPandas().iloc[0]
                                 ["user_locations"])))
Example #21
print "finish to map input"

def process_uinfo(line):
    if len(line) != 2:
        return Row(urlid=line, urlinfo="")
    return Row(urlid=line[0], urlinfo=line[1])

#combine_uinfo_dict = combine_uinfo.map(lambda p: Row(urlid=p[0], urlinfo=p[1])).collect()

print "begin to map and collect uinfo"
combine_uinfo_dict = combine_uinfo.map(process_uinfo).collect()

print "finish map"

combine_uinfo_b = sc.broadcast(combine_uinfo_dict)

print "finish broadcast"

def update(line, uinfo):
    line = (line, uinfo.filter(uinfo.urlid==line[1]).urlinfo)
    return line

print "begin update"
train_set_url = train_set.map(lambda x: update(x, combine_uinfo_b))

print "finish update"

accum.add(1)

print "finish mapping"
Example #22
all_business.sort()

all_business_dic = dict()
all_user_dic = dict()

i = 0
for item in all_business:
    all_business_dic[item] = i
    i += 1

j = 0
for item in all_user:
    all_user_dic[item] = j
    j += 1

vu = sc.broadcast(all_user_dic)

hashes_value = [[421, 167, 1610612741], [491, 397, 100663319],
                [659, 257, 3145739], [479, 193,
                                      201326611], [167, 167, 402653189],
                [619, 139, 393241], [929, 137, 402653189], [389, 211, 393241],
                [443, 431, 805306457], [983, 211, 100663319],
                [109, 211, 805306457], [761, 389,
                                        1572869], [661, 131, 1610612741],
                [241, 373, 25165843], [491, 163, 12582917], [257, 293, 786433],
                [317, 191, 402653189], [127, 389,
                                        12582917], [467, 347, 3145739],
                [827, 191, 393241], [617, 211, 3145739], [127, 241, 25165843],
                [757, 233, 805306457], [641, 337, 196613],
                [547, 233, 1610612741], [233, 307, 1610612741],
                [457, 271, 100663319], [937, 173, 805306457],
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector, DenseVector

sc = SparkContext(appName='sparking_your_interest')
SQLContext = HiveContext(sc)

speech_stopwords_list = list([line.strip() for line in open('speech_stopwords.txt', 'r')])
speech_stopwords_broadcasted = sc.broadcast(speech_stopwords_list)
nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords_broadcasted = sc.broadcast(nltk_stopwords)
more_stopwords = set([line.strip() for line in open('more_stopwords.txt', 'r')])
more_stopwords_broadcasted = sc.broadcast(more_stopwords)

def clean_up(s):
    text_removing_brackets = re.sub("[\(\[].*?[\)\]]", "", s)
    text_removing_double_quotes = re.sub('"',"",text_removing_brackets)
    speech_stopwords = speech_stopwords_broadcasted.value
    text_removing_stopwords = text_removing_double_quotes
    for token in speech_stopwords:
        text_removing_stopwords = re.sub(token,'',text_removing_stopwords)
    return text_removing_stopwords
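
A hypothetical way to apply clean_up at scale is to wrap it as a Spark SQL UDF; speeches_df and its speech_text column below are assumptions, not part of the original job:

# Hypothetical usage: broadcast-backed clean_up applied to a DataFrame column.
clean_up_udf = udf(clean_up, StringType())
speeches_clean = speeches_df.withColumn('clean_text', clean_up_udf(col('speech_text')))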

def unicode_encode(s):
getfrnd = udf(get_f_list, StringType())

original_df = sqlc.read \
      .format("jdbc") \
      .option("url", "jdbc:oracle:thin:@150.136.138.197:1521/BIASDB_PDB1.subnet12011439.vcn12011439.oraclevcn.com") \
      .option("dbtable", "POC.CUSTOMER_DETAILS") \
      .option("user", "poc") \
      .option("password", "WElcome##123") \
      .option("driver", "oracle.jdbc.driver.OracleDriver") \
      .load()

sm_id_df = original_df.select(original_df['social_media_id'])

sm_id_df_rdd = sm_id_df.rdd.flatMap(lambda x: x).collect()

b_rdd = sc.broadcast(sm_id_df_rdd)

b1 = original_df.select(
    original_df['CUSTOMER_NAME'],
    original_df['social_media_id'].alias('SOCIAL_MEDIA_ID'),
    original_df['customer_email'].alias('EMAIL'),
    original_df['customer_address'].alias('ADDRESS'),
    original_df['customer_number'].alias('CONTACT'),
    getfrnd(original_df['is_friends_with']).alias('FRIENDS_LIST'))

b1.write.format('jdbc').options(
    url=
    'jdbc:oracle:thin:@150.136.138.197:1521/BIASDB_PDB1.subnet12011439.vcn12011439.oraclevcn.com',
    driver='oracle.jdbc.driver.OracleDriver',
    dbtable='POC.SOCIAL_MEDIA_DWH',
    user='******',
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
        ).cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
        ).cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")

        self.df_unique_users = self.sqlContext.sql(
            'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"'
        )
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)

        schema = StructType([StructField("latitude", FloatType()), StructField("longitude", FloatType())])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)

        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations", self.get_locations(self.df_unique_users["user_id"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")

        print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers", self.get_centers(self.df_unique_users["user_locations"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
        ).cache()
        print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))