Example #1
import queue
import threading
import time

from pyspark import SparkConf, SparkContext

def delayed(seconds):
    # Sleep before passing each element through, so jobs stay visible to the status tracker.
    def f(x):
        time.sleep(seconds)
        return x
    return f

def call_in_background(f, *args):
    # Run f(*args) on a daemon thread; its result arrives on a one-slot queue.
    result = queue.Queue(1)
    t = threading.Thread(target=lambda: result.put(f(*args)))
    t.daemon = True
    t.start()
    return result

def main():
    conf = (SparkConf().set("spark.ui.showConsoleProgress", "false")
            .setAppName("PythonStatusAPIDemo").setMaster("local[*]"))
    sc = SparkContext(conf=conf)

    def run():
        rdd = sc.parallelize(range(10), 10).map(delayed(2))
        reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
        return reduced.map(delayed(2)).collect()

    result = call_in_background(run)
    status = sc.statusTracker()
    while result.empty():
        ids = status.getJobIdsForGroup()
        for id in ids:
            job = status.getJobInfo(id)
            print("Job", id, "status: ", job.status)
            for sid in job.stageIds:
                info = status.getStageInfo(sid)
                if info:
                    print("Stage %d: %d tasks total (%d active, %d complete)" %
                          (sid, info.numTasks, info.numActiveTasks, info.numCompletedTasks))
        time.sleep(1)

    print("Job results are:", result.get())
    sc.stop()

if __name__ == "__main__":
    main()
Example #2
# Constructor of a transform-job class: stores the MySQL and SMTP settings,
# opens a MySQLdb connection/cursor, and creates a local SparkContext/SQLContext.
def __init__(self):
    self.sparkname = 'transform'
    self.hostname = "localhost"
    self.dbname = 'bigdata'
    self.jdbcPort = '3306'
    self.properties = {
        "user": '******',
        "password": '******',
        "driver": 'com.mysql.jdbc.Driver'
    }
    self.mailto_list = ["*****@*****.**"]
    self.mail_host = "smtp.qq.com"
    self.mail_user = "******"
    self.mail_pass = "******"
    self.mail_postfix = "qq.com"
    self.database = MySQLdb.connect(host=self.hostname,
                                    user=self.properties.get('user'),
                                    passwd=self.properties.get('password'),
                                    db=self.dbname,
                                    charset="utf8")
    self.cursor = self.database.cursor()
    self.conf = SparkConf().setAppName(
        self.sparkname).setMaster("local[*]")
    self.sc = SparkContext(conf=self.conf)
    self.sqlContext = SQLContext(self.sc)
Example #3
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Load the given MySQL table through the JDBC data source and display it.
def save(tablename):
    conf = SparkConf().setAppName("python model").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    hostname = "10.23.73.118"
    dbname = 'ouye'
    jdbcPort = '3306'
    properties = {
        "user": '******',
        "password": '******',
        "driver": 'com.mysql.jdbc.Driver'
    }
    jdbcUrl = "jdbc:mysql://{0}:{1}/{2}?characterEncoding=utf8".format(
        hostname, jdbcPort, dbname)

    df = sqlContext.read.jdbc(url=jdbcUrl,
                              table=tablename,
                              properties=properties)

    df.show()
Example #4
from pyspark import SparkConf, SparkContext

def loadMovieNames():
    # Build a dict mapping movieID -> movie title from the u.item file.
    movieNames = {}
    with open("ml-100k/u.item") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

# Take each line of u.data and convert it to (movieID, (rating, 1.0))
# This way we can then add up all the ratings for each movie, and
# the total number of ratings for each movie (which lets us compute the average)
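# A hypothetical worked example of that reduction: three ratings for movieID 50,
# (50, (3.0, 1.0)), (50, (5.0, 1.0)) and (50, (1.0, 1.0)), are summed element-wise
# by reduceByKey into (50, (9.0, 3.0)), so the average is 9.0 / 3.0 = 3.0.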
def parseInput(line):
    fields = line.split()
    return (int(fields[1]), (float(fields[2]), 1.0))

if __name__ == "__main__":
    # The main script - create our SparkContext
    conf = SparkConf().setAppName("WorstMovies")
    sc = SparkContext(conf = conf)

    # Load up our movie ID -> movie name lookup table
    movieNames = loadMovieNames()

    # Load up the raw u.data file
    lines = sc.textFile("hdfs:///user/maria_dev/ml-100k/u.data")

    # Convert to (movieID, (rating, 1.0))
    movieRatings = lines.map(parseInput)

    # Reduce to (movieID, (sumOfRatings, totalRatings))
    ratingTotalsAndCount = movieRatings.reduceByKey(lambda movie1, movie2: ( movie1[0] + movie2[0], movie1[1] + movie2[1] ) )

    # Filter out movies rated 10 or fewer times (the rating count sits in x[1][1])
    popularTotalsAndCount = ratingTotalsAndCount.filter(lambda x: x[1][1] > 10)
Example #5
from random import Random

from pyspark import SparkConf, SparkContext

numEdges = 200
numVertices = 100
rand = Random(42)

def generateGraph():
    # Build a random set of directed edges (src, dst) between numVertices vertices, skipping self-loops.
    edges = set()
    while len(edges) < numEdges:
        src = rand.randrange(0, numVertices)
        dst = rand.randrange(0, numVertices)
        if src != dst:
            edges.add((src, dst))
    return edges


if __name__ == "__main__":
    """
    Usage: transitive_closure [partitions]
    """
    conf = SparkConf().setAppName("PythonTransitiveClosure").setMaster(
        "local[*]")
    sc = SparkContext(conf=conf)
    # partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    tc = sc.parallelize(generateGraph(), 2).cache()

    # Linear transitive closure: each round grows paths by one edge,
    # by joining the graph's edges with the already-discovered paths.
    # e.g. join the path (y, z) from the TC with the edge (x, y) from
    # the graph to obtain the path (x, z).

    # Because join() joins on keys, the edges are stored in reversed order.
    edges = tc.map(lambda x_y: (x_y[1], x_y[0]))
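    # Hypothetical worked example: if tc holds the path (2, 3) and the graph has
    # the edge (1, 2), then edges holds (2, 1); joining on the shared key 2 gives
    # (2, (3, 1)), which is projected to the new path (1, 3).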

    oldCount = 0
    nextCount = tc.count()
    while True: