def main():
    """Run the belief propagation algorithm for an example problem."""
    # setup context
    conf = SparkConf().setAppName("BeliefPropagation example")
    sc = SparkContext.getOrCreate(conf)
    sql = SQLContext.getOrCreate(sc)

    with SuppressSparkLogs(sc):

        # create graphical model g of size 3 x 3
        g = graphframes.examples.Graphs(sql).gridIsingModel(3)
        print("Original Ising model:")
        g.vertices.show()
        g.edges.show()

        # run BP for 5 iterations
        numIter = 5
        results = BeliefPropagation.runBPwithGraphFrames(g, numIter)

        # display beliefs
        beliefs = results.vertices.select('id', 'belief')
        print("Done with BP. Final beliefs after {} iterations:".format(numIter))
        beliefs.show()

    sc.stop()
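# Hedged usage note: the belief propagation example above is typically invoked
# as a script entry point.
if __name__ == "__main__":
    main()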
def _getScaleHintList():
    featurizer = SparkContext.getOrCreate()._jvm.com.databricks.sparkdl.DeepImageFeaturizer
    if isinstance(featurizer, py4j.java_gateway.JavaPackage):
        # DeepImageFeaturizer is not visible (possibly running without the Spark JVM),
        # so return an empty list instead of failing
        return []
    return dict(featurizer.scaleHintsJava()).keys()
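# Hedged usage sketch: list the supported scale hints when the sparkdl JVM
# classes are on the classpath; with the empty-list fallback this prints nothing.
for hint in _getScaleHintList():
    print(hint)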
Example #3
 def optimize(self):
     """
     Do an optimization. 
     """
     jmodel = callJavaFunc(SparkContext.getOrCreate(), self.value.optimize)
     from nn.layer import Model
     return Model.of(jmodel)
Example #4
def parse_raw_wikidata(output):
    spark_conf = SparkConf().setAppName('QB Wikidata').setMaster(QB_SPARK_MASTER)
    sc = SparkContext.getOrCreate(spark_conf)  # type: SparkContext

    wikidata = sc.textFile('s3a://entilzha-us-west-2/wikidata/wikidata-20170306-all.json')

    def parse_line(line):
        if len(line) == 0:
            return []
        if line[0] == '[' or line[0] == ']':
            return []
        elif line.endswith(','):
            return [json.loads(line[:-1])]
        else:
            return [json.loads(line)]

    parsed_wikidata = wikidata.flatMap(parse_line).cache()
    property_map = extract_property_map(parsed_wikidata)
    b_property_map = sc.broadcast(property_map)

    wikidata_items = parsed_wikidata.filter(lambda d: d['type'] == 'item').cache()
    parsed_wikidata.unpersist()
    item_page_map = extract_item_page_map(wikidata_items)
    b_item_page_map = sc.broadcast(item_page_map)

    parsed_item_map = extract_items(wikidata_items, b_property_map, b_item_page_map)

    with open(output, 'wb') as f:
        pickle.dump({
            'parsed_item_map': parsed_item_map,
            'item_page_map': item_page_map,
            'property_map': property_map
        }, f)

    sc.stop()
Example #5
def readImages(imageDirectory, numPartition=None):
    """
    Read a directory of images (or a single image) into a DataFrame.

    :param imageDirectory: str, file path.
    :param numPartition: int, number or partitions to use for reading files.
    :return: DataFrame, with columns: (filepath: str, image: imageSchema).
    """
    return _readImages(imageDirectory, numPartition, SparkContext.getOrCreate())
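# Hedged usage sketch ("/tmp/images" is a placeholder directory): read an image
# folder into a DataFrame with the (filepath, image) columns described above.
images_df = readImages("/tmp/images", numPartition=4)
images_df.select("filepath").show(5, truncate=False)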
def readImagesWithCustomFn(path, decode_f, numPartition=None):
    """
    Read a directory of images (or a single image) into a DataFrame using a custom library to
    decode the images.

    :param path: str, file path.
    :param decode_f: function to decode the raw bytes into an array compatible with one of the
        supported OpenCv modes. see @imageIO.PIL_decode for an example.
    :param numPartition: [optional] int, number or partitions to use for reading files.
    :return: DataFrame with schema == ImageSchema.imageSchema.
    """
    return _readImagesWithCustomFn(path, decode_f, numPartition, sc=SparkContext.getOrCreate())
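# Hedged usage sketch, assuming sparkdl's imageIO.PIL_decode is importable as
# referenced in the docstring above; the path is a placeholder.
from sparkdl.image import imageIO
custom_df = readImagesWithCustomFn("/tmp/images", decode_f=imageIO.PIL_decode)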
Example #7
def load_spark_context(application_name=None):
    if application_name is None:
        application_name = __name__

    conf = SparkConf().setAppName(application_name)
    sc = SparkContext.getOrCreate(conf=conf)
    sql_context = SQLContext(sc)

    # Close logger
    # logger = sc._jvm.org.apache.log4j
    # logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    # logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    return sc, sql_context
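# Hedged usage sketch: obtain a SparkContext / SQLContext pair for an ad-hoc job.
sc, sql_context = load_spark_context("example-app")
print(sc.appName)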
    def __init__(self, layers, bias=1.0, act_func=None, act_func_prime=None):
        if act_func is None:
            self.act_func = sigmoid
            self.act_func_prime = sigmoid_prime
        else:
            self.act_func = act_func
            self.act_func_prime = act_func_prime
        self.layers = layers

        self.bias = bias
        self.spark_context = SparkContext.getOrCreate()

        log4jLogger = self.spark_context._jvm.org.apache.log4j
        self.logger = log4jLogger.LogManager.getLogger(__name__)
Example #9
def callBigDlFunc(bigdl_type, name, *args):
    """ Call API in PythonBigDL """
    sc = SparkContext.getOrCreate()
    if bigdl_type == "float":
        api = getattr(
            sc._jvm.com.intel.analytics.bigdl.python.api.PythonBigDL.ofFloat(),
            name)
    elif bigdl_type == "double":
        api = getattr(
            sc._jvm.com.intel.analytics.bigdl.python.api.PythonBigDL.ofDouble(),
            name)
    else:
        raise Exception("Not supported bigdl_type: %s" % bigdl_type)
    return callJavaFunc(sc, api, *args)
Example #10
 def installPackage(self, artifact, base=None, sc=None):
     artifact = self._toArtifact(artifact)
     #Test if we already have a version installed
     res=self.fetchArtifact(artifact)
     fileLoc=None
     if res:
         fileLoc=res[1]
         print("Package already installed: {0}".format(str(artifact)))
     else:
         #download package
         art=[artifact]
         def _doDownload(d):
             artifact=art[0]
             if not artifact.version or artifact.version=='0':
                 artifact.version = d.resolver._find_latest_version_available(artifact)
             fileLoc = artifact.get_filename(self.DOWNLOAD_DIR)
             if os.path.isfile(fileLoc):
                 os.remove(fileLoc)
             results = d.download(artifact,filename=self.DOWNLOAD_DIR)
             if not results[1]:
                 raise Exception("Error downloading package {0}".format(str(artifact)))
             else:
                 artifact=results[0]
                 print("Artifact downloaded successfully {0}".format(str(artifact)))
                 printEx("Please restart Kernel to complete installation of the new package",PrintColors.RED)
             fileLoc=self.storeArtifact(artifact,base)
             return fileLoc
         
         try:
             fileLoc=_doDownload(downloader.Downloader(base) if base is not None else downloader.Downloader())
         except RequestException as e:
             #try another base
             try:
                 fileLoc=_doDownload(downloader.Downloader("http://dl.bintray.com/spark-packages/maven"))
             except RequestException as e:
                 print("Unable to install artifact {0}".format(e.msg))
                 raise
         except:
             print(str(sys.exc_info()[1]))
             raise
     if sc is None:
         sc = SparkContext.getOrCreate()
         
     if sc:
         #convert to file uri for windows platform
         if platform.system()=='Windows':
             fileLoc="file://" + urllib.pathname2url(fileLoc)
         sc.addPyFile(fileLoc)
         
     return artifact
Example #11
def isImage(df, column):
    """
    Returns True if the column contains images

    Args:
        df (DataFrame): The DataFrame to be processed
        column  (str): The name of the column being inspected

    Returns:
        bool: True if the colum is an image column
    """

    jvm = SparkContext.getOrCreate()._jvm
    schema = jvm.com.microsoft.ml.spark.schema.ImageSchema
    return schema.isImage(df._jdf, column)
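# Hedged usage sketch ("images_df" and the "image" column name are placeholders):
# skip image-specific processing when the column does not hold images.
# if isImage(images_df, "image"):
#     print("column 'image' contains images")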
Example #12
        def toPython(entity):
            from py4j.java_gateway import JavaObject
            if entity is None or not isinstance(entity, JavaObject):
                return entity

            clazz = entity.getClass().getName()
            if clazz == "org.apache.spark.sql.Dataset":
                entity = entity.toDF()
                clazz = "org.apache.spark.sql.DataFrame"

            if clazz == "org.apache.spark.sql.DataFrame":
                from pyspark.sql import DataFrame, SQLContext
                from pyspark import SparkContext
                entity = DataFrame(entity, SQLContext(SparkContext.getOrCreate(), entity.sqlContext()))

            return entity
Example #13
def readImages(sparkSession, path, recursive = False, sampleRatio = 1.0, inspectZip = True):
    """
    Reads the directory of images from the local or remote (WASB) source.
    This function is attached to SparkSession class.
    Example: spark.readImages(path, recursive, ...)

    Args:
        sparkSession (SparkSession): Existing sparkSession
        path (str): Path to the image directory
        recursive (bool): Recursive search flag
        sampleRatio (double): Fraction of the images loaded

    Returns:
        DataFrame: DataFrame with a single column of "images", see imageSchema for details
    """
    ctx = SparkContext.getOrCreate()
    reader = ctx._jvm.com.microsoft.ml.spark.ImageReader
    sql_ctx = pyspark.SQLContext.getOrCreate(ctx)
    jsession = sql_ctx.sparkSession._jsparkSession
    jresult = reader.read(path, recursive, jsession, float(sampleRatio), inspectZip)
    return DataFrame(jresult, sql_ctx)
Example #14
def train(filename):
    global model
    sc = SparkContext.getOrCreate()
    user = sc.textFile(filename)

    ratings = user.map(lambda l: l.split("\t")).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    #split into training & test
    (training, test) = ratings.randomSplit([0.8, 0.2])
    testdata = test.map(lambda p: (p[0], p[1]))
    rank = 10
    numIterations = 10

    #training the model
    model = ALS.train(training, rank, numIterations)

    #validating the model
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    rmse = sqrt(MSE)
    print(rmse)
    print("training done")
    return "OK"
Example #15
 def test_active_session_with_None_and_not_None_context(self):
     from pyspark.context import SparkContext
     from pyspark.conf import SparkConf
     sc = None
     session = None
     try:
         sc = SparkContext._active_spark_context
         self.assertEqual(sc, None)
         activeSession = SparkSession.getActiveSession()
         self.assertEqual(activeSession, None)
         sparkConf = SparkConf()
         sc = SparkContext.getOrCreate(sparkConf)
         activeSession = sc._jvm.SparkSession.getActiveSession()
         self.assertFalse(activeSession.isDefined())
         session = SparkSession(sc)
         activeSession = sc._jvm.SparkSession.getActiveSession()
         self.assertTrue(activeSession.isDefined())
         activeSession2 = SparkSession.getActiveSession()
         self.assertNotEqual(activeSession2, None)
     finally:
         if session is not None:
             session.stop()
         if sc is not None:
             sc.stop()
Example #16
    def get(self, name: str, tag: str, parties: typing.List[Party],
            gc: GarbageCollectionABC) -> typing.List:
        log_str = f"[rabbitmq.get](name={name}, tag={tag}, parties={parties})"
        LOGGER.debug(f"[{log_str}]start to get")

        _name_dtype_keys = [
            _SPLIT_.join([party.role, party.party_id, name])
            for party in parties
        ]

        if _name_dtype_keys[0] not in self._name_dtype_map:
            mq_names = self._get_mq_names(parties, dtype=NAME_DTYPE_TAG)
            channel_infos = self._get_channels(mq_names=mq_names)
            rtn_dtype = []
            for i, info in enumerate(channel_infos):
                obj = self._receive_obj(info, name, tag=NAME_DTYPE_TAG)
                rtn_dtype.append(obj)
                LOGGER.debug(f"[rabbitmq.get] name: {name}, dtype: {obj}")

            for k in _name_dtype_keys:
                if k not in self._name_dtype_map:
                    self._name_dtype_map[k] = rtn_dtype[0]

        rtn_dtype = self._name_dtype_map[_name_dtype_keys[0]]

        rtn = []
        dtype = rtn_dtype.get("dtype", None)
        partitions = rtn_dtype.get("partitions", None)

        if dtype == FederationDataType.TABLE:
            mq_names = self._get_mq_names(parties, name, partitions=partitions)
            for i in range(len(mq_names)):
                party = parties[i]
                role = party.role
                party_id = party.party_id
                party_mq_names = mq_names[i]
                receive_func = self._get_partition_receive_func(
                    name,
                    tag,
                    party_id,
                    role,
                    party_mq_names,
                    mq=self._mq,
                    connection_conf=self._rabbit_manager.runtime_config.get(
                        'connection', {}))

                sc = SparkContext.getOrCreate()
                rdd = sc.parallelize(range(partitions), partitions)
                rdd = rdd.mapPartitionsWithIndex(receive_func)
                rdd = materialize(rdd)
                table = Table(rdd)
                rtn.append(table)
                # add gc
                gc.add_gc_action(tag, table, '__del__', {})

                LOGGER.debug(
                    f"[{log_str}]received rdd({i + 1}/{len(parties)}), party: {parties[i]} "
                )
        else:
            mq_names = self._get_mq_names(parties, name)
            channel_infos = self._get_channels(mq_names=mq_names)
            for i, info in enumerate(channel_infos):
                obj = self._receive_obj(info, name, tag)
                LOGGER.debug(
                    f"[{log_str}]received obj({i + 1}/{len(parties)}), party: {parties[i]} "
                )
                rtn.append(obj)

        LOGGER.debug(f"[{log_str}]finish to get")
        return rtn
Example #17
        print(id, retMapStockMoney[id])
    
    return retMapStockMoney

def reduceFunc(retMapStockMoney1, retMapStockMoney2):
    retMap = retMapStockMoney1.copy()
    retMap.update(retMapStockMoney2)
    return retMap


conf = SparkConf().setAppName("miniProject").setMaster("spark://192.168.32.46:7077")
# conf=SparkConf().setAppName("miniProject").setMaster("local[*]")
# conf.set('spark.scheduler.mode', 'FAIR')
# conf.set("spark.scheduler.pool", None)

sc=sc.getOrCreate(conf)
sc.setLogLevel("ERROR")

# partitions = 32
partitions = 16
# partitions = 8


retMapIdDataOrg = GetStockPrice(checkStartDate, checkEndDate, minCheckDaysData=3)
print( "allCheckStock start:", len(retMapIdDataOrg), np.array(retMapIdDataOrg)[:, 0])

retMapStockMoney = sc.parallelize(retMapIdDataOrg, partitions).map(mapFunc).filter(lambda v: len(list(v.values())[0]) > 0).reduce(reduceFunc)
print( "allCheckStock end:", len(retMapStockMoney), retMapStockMoney.keys())

print("----------------------------------------------------------------------------------------------------------------")
for id in retMapStockMoney:
Example #18
import sys,csv,decimal
from pyspark import SparkContext, SparkConf
def movies_mapformat(line): 
  return(line[0],(line[1],line[2].split("|")))
if __name__ == "__main__":
  driver_conf = SparkConf().setAppName("Q3_findMovieStats").setMaster("local")
  sparkcont = SparkContext.getOrCreate(conf = driver_conf)
  input_ratings = sparkcont.textFile("/FileStore/tables/ratings.csv")
  lines_split_includesHeader = input_ratings.map(lambda x: x.split(",")).filter(lambda x:len(x)==4)
  just_header = lines_split_includesHeader.first()
  lines_split_vals= lines_split_includesHeader.filter(lambda x: x!= just_header )
  
  calc_data_avg = lines_split_vals.map(lambda x: [int(x[1]),float(x[2])]).filter(lambda x:len(x)==2)
  Count_sum = calc_data_avg.combineByKey(lambda value: (value, 1),lambda x, value: (x[0] + value, x[1] + 1),lambda x, y: (x[0] + y[0], x[1] + y[1])).filter(lambda x: len(x)==2)
  averageByKey_res = Count_sum.map(lambda x: (x[0], x[1][0] / x[1][1]))

  averageByKey_ressort_ = averageByKey_res.sortBy(lambda x: x[1])
  find_ten_pairs_last = averageByKey_ressort_.take(10)
  ten_pairs_last = sparkcont.parallelize(find_ten_pairs_last)
  ten_pairs_movies = ten_pairs_last.map(lambda x:[str(x[0]),x[1]])
  input_movies = sparkcont.textFile("/FileStore/tables/movies.csv")
  movies_csvRead = input_movies.mapPartitions(lambda x: csv.reader(x))
  movies_RDD= movies_csvRead.map(movies_mapformat)
  splitbyLines_movies = movies_RDD.filter(lambda x: x[0] != 'movieId')
  list_ofpair_movies = splitbyLines_movies.map(lambda x: [x[0],x[1][0]])
  Movies_names_ratings = ten_pairs_movies.join(list_ofpair_movies)
  Movie_names_ratings = Movies_names_ratings.map(lambda x:[x[0],x[1][1]])
  ten_Movies_Names_Ratings = Movie_names_ratings.take(10)
  ten_Movies_Names_Ratings = sparkcont.parallelize(ten_Movies_Names_Ratings)
  
  input_tags = sparkcont.textFile("/FileStore/tables/tags.csv")
Example #19
def InitSpark():
    return SparkContext.getOrCreate()
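# Hedged usage sketch: grab the shared context and check its default parallelism.
sc = InitSpark()
print(sc.defaultParallelism)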
Example #20
        return math.sqrt(sum / n)


#The recommender system is based on user-user mapping
if __name__ == '__main__':
    arguments = len(sys.argv)
    #if the arguments are not in the expected format, report it and return
    if arguments != 1:
        print("Improper arguments knn.py")
    else:
        conf = (SparkConf().setMaster('local').setAppName('KNN').set(
            'spark.executor.memory',
            '6g').set('spark.driver.memory',
                      '6g').set('spark.cores.max',
                                '6').set('spark.driver.host', '127.0.0.1'))
        sc = SparkContext.getOrCreate(conf=conf)  # creating the spark context
        sql_sc = SQLContext(sc)

        usebookcolumns = ['book_id', 'authors', 'title']

        books_df = pd.read_csv('./books.csv', usecols=usebookcolumns)
        books_df = sql_sc.createDataFrame(books_df)
        booksRDD = books_df.rdd  # use this RDD to keep track of all the list you need

        #rating count
        useratingcols = ['user_id', 'book_id', 'rating']
        rating_df = pd.read_csv('./smallratings.csv', usecols=useratingcols)
        rating_df = sql_sc.createDataFrame(rating_df)
        ratingRDD = rating_df.rdd
        N = ratingRDD.count()
        print(N)
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()
#================== Q4 ====================--

#start_pyspark_shell -e 8 -c 4 -w 4 -m 4
#start_pyspark_shell -e 32 -c 2 -w 4 -m 4

#schema_Daily
schema_Daily = StructType([
    StructField('ID', StringType()),
    StructField('DATE', StringType()),
    StructField('ELEMENT', StringType()),
    StructField('VALUE', IntegerType()),
    StructField('MEASUREMENT_FLAG', StringType()),
    StructField('QUALITY_FLAG', StringType()),
    StructField('SOURCE_FLAG', StringType()),
    StructField('OBSERVATION_TIME', StringType()),
])

path = "hdfs:///data/ghcnd/daily/*.csv.gz"
daily = (spark.read.format("com.databricks.spark.csv").option(
    "header", "false").option("inferSchema",
                              "false").schema(schema_Daily).load(path))
stations = spark.read.orc(
    "hdfs:////user/xzh216/Assign1/output/stations_enriched.orc")
Example #22
def writeToPowerBI(df, url, options=dict()):
    jvm = SparkContext.getOrCreate()._jvm
    writer = jvm.com.microsoft.ml.spark.PowerBIWriter
    writer.write(df._jdf, url, options)
Example #23
 def _build_spark_context():
     from pyspark import SparkContext
     sc = SparkContext.getOrCreate()
     return sc
Example #24
 def test_get_or_create(self):
     with SparkContext.getOrCreate() as sc:
         self.assertTrue(SparkContext.getOrCreate() is sc)
Example #25
def getStreamingContext():
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, BATCH_DURATION)
    return ssc
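# Hedged usage sketch: create the streaming context, attach input streams and
# output operations elsewhere, then start it and wait for termination.
# ssc = getStreamingContext()
# ssc.start()
# ssc.awaitTermination()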
Example #26
    def _conf(cls):
        from pyspark import SparkContext

        sc = SparkContext.getOrCreate()
        return sc._jsc.hadoopConfiguration()
Example #27
 def __init__(self):
     self.data = None
     self.metadata = None
     self.joined = None
     self.avg = None
     self.sc = SparkContext.getOrCreate()
def index():

    # extract data needed for visuals
    # get spark context
    sc = SparkContext.getOrCreate()

    # create spark dataframe to predict customer churn using the model
    #[gender, level, days_active, location, avgSongs, avgEvents, thumbsup, thumbsdown, add_friend]
    gender = ''
    level = 0
    days_active = 0
    location = 0
    avgSongs = 0
    avgEvents = 0
    thumbsup = 0
    thumbsdown = 0
    add_friend = 0
    df = sc.parallelize([[gender, level, days_active, location, avgSongs, avgEvents, thumbsup, thumbsdown, add_friend]]).\
    toDF(["gender", "last_level", "days_active", "last_state", "avg_songs", "avg_events" , "thumbs_up", "thumbs_down", "addfriend"])

    # df = sc.toDF(["gender", "last_level", "days_active", "last_state", "avg_songs", "avg_events" , "thumbs_up", "thumbs_down", "addfriend"])

    #Basic analysis for visualisations
    df.show(5)
    # male = df.select('last_level', 'gender').where(df.gender == 'M').groupBy('last_level').count().agg(count("count"))
    # female = df.select('last_level', 'gender').where(df.gender == 'F').groupBy('last_level').count().agg(count("count"))

    # df_pd = male.join(female, "gender", "last_level").drop("count").fillna(0).toPandas()
    # df_pd.show()
    # TODO: Below is an example - modify to extract data for your own visuals
    # git

    # category extractions
    # category = list(df)[4:]
    # category_counts = [np.sum(df[column]) for column in category]

    # categories = df.iloc[:,4:]
    # categories_mean = categories.mean().sort_values(ascending=False)[1:6]
    # categories_names = list(categories_mean.index)

    # create visuals
    # TODO: Below is an example - modify to create your own visuals
    # graphs = [
    #     {
    #         'data': [
    #             Pie(
    #                 labels=gender_names,
    #                 values=gender_counts
    #             )
    #         ],
    #         'layout': {
    #             'title': 'Satistics by Gender',
    #             'height': 450,
    #             'width': 1000
    #         },
    #     },
    #     # {
    #     #     'data': [
    #     #         Bar(
    #     #             x=words,
    #     #             y=count_props
    #     #         )
    #     #     ],

    #     #     'layout': {
    #     #         'title': 'Top 10 words representation(%)',
    #     #         'yaxis': {
    #     #             'title': '% Occurrence',
    #     #             'automargin': True
    #     #         },
    #     #         'xaxis': {
    #     #             'title': 'Words',
    #     #             'automargin': True
    #     #         }
    #     #     }
    #     # },
    #     # {
    #     #     'data': [
    #     #             Bar(
    #     #                 x=category,
    #     #                 y=category_counts
    #     #                 )
    #     #             ],
    #     #       'layout': {
    #     #       'title': 'Message by categories',
    #     #       'yaxis': {
    #     #       'title': "Count"
    #     #       },
    #     #       'xaxis': {
    #     #       'title': "Category"
    #     #       }
    #     #       }
    #     # },
    #     # {
    #     #       'data': [
    #     #                Bar(
    #     #                    x=categories_names,
    #     #                    y=categories_mean
    #     #                    )
    #     #                ],
    #     #       'layout': {
    #     #       'title': 'Top 5 categories',
    #     #       'yaxis': {
    #     #       'title': "Count"
    #     #       },
    #     #       'xaxis': {
    #     #       'title': "Categories"
    #     #       }
    #     #     }
    #     # }
    # ]

    # encode plotly graphs in JSON
    # ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    # graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)

    # render web page with plotly graphs
    #return render_template('master.html', ids=ids, graphJSON=graphJSON)
    return render_template('master.html')
Example #29
                else:
                    value_get = (str(Candidate)+","+str(eachFriend),set(ListofCandidateFriends))
                
                Final_List_ofFriends.append(value_get) 
        return(Final_List_ofFriends)

def Map_final(line):
    
    _key = line[0]
    _value = list(line[1])
    s_string = ",".join(_value)
    return("{0}\t {1}".format(_key,s_string))

if __name__ == "__main__":
    config = SparkConf().setAppName("mutualfriends").setMaster("local[2]")
    sparkcont = SparkContext.getOrCreate(conf = config)
    
    input_mutualfriends = sparkcont.textFile("/FileStore/tables/q2_assign/soc_LiveJournal1Adj_txt-b8957.txt")
    lines_split = input_mutualfriends.map(lambda x : x.split("\t")).filter(lambda x : len(x) == 2).map(lambda x: [x[0],x[1].split(",")])
    
    split_mutualFriends = lines_split.flatMap(create_mutual_friends)
    
    Reducer_RDD = split_mutualFriends.reduceByKey(lambda x,y: x.intersection(y))
    #print(Reducer_RDD.first())
    Len_listofFriends = Reducer_RDD.mapValues(lambda x: len(x))
   # print(Len_listofFriends.first())
    List_OfsortedFriends = Len_listofFriends.sortBy(lambda x: -x[1])
    top_teninList = List_OfsortedFriends.take(10)
   # print(top_teninList)
    top_list = sparkcont.parallelize(top_teninList)
    pairs_data = top_list.map(split_top)
Example #30
 def __init__(self, values, seed=0):
     ctx = SparkContext.getOrCreate()
     self.jvm = ctx.getOrCreate()._jvm
     self.hyperParam = self.jvm.com.microsoft.azure.synapse.ml.automl.HyperParamUtils.getDiscreteHyperParam(
         values, seed)
def cluster_by_period(start, end):

    sc = SparkContext.getOrCreate()
    D = sc.parallelize(extract_data(start, end))

    # Compute mean temperature by station
    map = D.map(lambda data: (
        (data['station'], data['latitude'], data['longitude']),
        np.array([1, data['temperature'], data['dew_point'], data['feel']])))
    sum_by_station = map.reduceByKey(lambda a, b: a + b)

    mean_by_station = sum_by_station.map(calc_moy_key)
    if (len(mean_by_station.collect()) < 2):
        logging.warning(
            'Only data for one station during this period, so there is one cluster of one station !'
        )
        return

    # KMeans
    # Fist we determine the optimal k with elbow method
    logging.info('Clustering...')
    X = mean_by_station.map(lambda data: [data[1], data[2]])
    X = np.array(X.collect())
    sum_of_squared_distances = []
    K = range(1, 10)
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(X)
        sum_of_squared_distances.append(km.inertia_)

    x = list(range(1, 10))
    y = sum_of_squared_distances
    kn = KneeLocator(x,
                     y,
                     S=1.0,
                     curve='convex',
                     direction='decreasing',
                     interp_method='polynomial')
    plt.xlabel('k')
    plt.ylabel('sum_of_squared_distances')
    plt.title('Elbow method for optimal k between {} and {}'.format(
        start, end))
    plt.xticks(range(1, 9))
    plt.plot(x, y, 'bx-')
    plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')

    filename = 'results/cluster_by_period/{}_{}_elbow.png'.format(start, end)
    if os.path.isfile(filename):
        os.remove(filename)
    plt.savefig(filename)
    plt.clf()

    # Then we cluster with the computed optimal k
    km = KMeans(n_clusters=kn.knee)
    km = km.fit(X)

    # Build map
    logging.info('Building map...')

    lons = []
    lats = []
    vals = km.labels_

    for val in mean_by_station.collect():
        lats.append(val[0][1])
        lons.append(val[0][2])

    map = Basemap(projection='merc',
                  llcrnrlon=19.08,
                  llcrnrlat=59.45,
                  urcrnrlon=31.59,
                  urcrnrlat=70.09,
                  resolution='i')

    map.drawmapboundary(fill_color='aqua')
    map.fillcontinents(color='#cc9955', lake_color='aqua', zorder=1)
    map.drawcoastlines()
    map.drawcountries()

    x, y = map(lons, lats)

    map.scatter(x,
                y,
                c=vals,
                cmap=plt.cm.get_cmap('gist_rainbow', kn.knee),
                zorder=2)

    plt.title('Clustering des stations entre {} et {}'.format(start, end),
              fontsize=10)

    filename = 'results/cluster_by_period/{}_{}.png'.format(start, end)
    if os.path.isfile(filename):
        os.remove(filename)
    plt.savefig(filename)

    logging.info(
        'Success : update your files and check the result in \'results/cluster_by_period\' !'
    )
    logging.info(
        'NB : you can also check \'elbow.png\' to see how we chose the clusters number.'
    )
    plt.clf()
Example #32
 def __init__(self):
     ctx = SparkContext.getOrCreate()
     self.jvm = ctx.getOrCreate()._jvm
     self.hyperparams = {}
Example #33
def RDD(filePath):
    sc = SparkContext.getOrCreate(SparkConf())
    path = filePath
    RDD = sc.textFile(name=path)
    return RDD
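# Hedged usage sketch (the path is a placeholder): load a text file as an RDD
# of lines and count them.
lines = RDD("/tmp/input.txt")
print(lines.count())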
Example #34
 def test_get_or_create(self):
     with SparkContext.getOrCreate() as sc:
         self.assertTrue(SparkContext.getOrCreate() is sc)
Example #35
def main():
    conf = SparkConf().setAppName("YeJoo_Park_task2_ModelBasedCF")\
     .setMaster("local")

    sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("ERROR")

    ratingsFilePath = sys.argv[1]
    testFilePath = sys.argv[2]

    data = sc.textFile(testFilePath)
    dataHeader = data.first()

    testingSet = set(data\
     .filter(lambda row: row != dataHeader)\
     .map(lambda r: r.split(","))\
     .map(lambda r: (int(r[USER_INDEX]), int(r[MOVIE_INDEX])))\
     .collect())

    # Load and parse the data
    data = sc.textFile(ratingsFilePath)
    dataHeader = data.first()

    trainRatings = data\
     .filter(lambda row: row != dataHeader)\
     .map(lambda r: r.split(","))\
     .map(lambda r: Rating(int(r[USER_INDEX]), int(r[MOVIE_INDEX]), float(r[RATING_INDEX])))

    print "ratings.count() before filter=" + str(trainRatings.count())

    testRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) in testingSet)
    trainRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) not in testingSet)

    print "testingSetRatings.count()=" + str(testRatings.count())
    print "ratings.count() after filter=" + str(trainRatings.count())

    rank = 10
    numIterations = 12
    lamb = 0.1
    model = ALS.train(trainRatings, rank, numIterations, lamb)

    print "Training complete"

    userProducts = testRatings.map(lambda rating:
                                   (rating.user, rating.product))
    predictions = model.predictAll(userProducts).map(lambda r:
                                                     ((r[0], r[1]), r[2]))
    ratesAndPreds = testRatings.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
    absDiffBuckets = ratesAndPreds.map(lambda r: int(abs(r[1][0] - r[1][1]))) \
     .map(lambda d: min(d, 4)).cache()
    RMSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

    # Write predictions to file
    outputFileName = "YeJoo_Park_ModelBasedCF.txt"
    printWriter = open(outputFileName, "a")

    outputPreds = ratesAndPreds.map(lambda r:
                                    (r[0][0], r[0][1], r[1][1])).collect()
    outputPreds.sort()

    for pred in outputPreds:
        printWriter.write(
            str(pred[0]) + ", " + str(pred[1]) + ", " + str(pred[2]))
        printWriter.write("\n")

    printWriter.close()

    print ">=0 and <1: " + str(absDiffBuckets.filter(lambda d: d == 0).count())
    print ">=1 and <2: " + str(absDiffBuckets.filter(lambda d: d == 1).count())
    print ">=2 and <3: " + str(absDiffBuckets.filter(lambda d: d == 2).count())
    print ">=3 and <4: " + str(absDiffBuckets.filter(lambda d: d == 3).count())
    print ">=4: " + str(absDiffBuckets.filter(lambda d: d == 4).count())

    print "RMSE=" + str(RMSE)
Example #36
from pyspark import SparkConf
from pyspark import SparkContext
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

import random
num_samples = 100000000


def inside(p):
    x, y = random.random(), random.random()
    return x * x + y * y < 1


count = sc.parallelize(range(0, num_samples)).filter(inside).count()

pi = 4 * count / num_samples
print(pi)

sc.stop()
Example #37
 def _conf(cls):
     sc = SparkContext.getOrCreate()
     return sc._jsc.hadoopConfiguration()
Example #38
    hash_table = {}
    # running through the tokens
    for token in tokens:
        # if the token is indeed among those we want to keep
        if token in reference_table.keys():
            # updating the frequency table
            hash_table[reference_table[token]] = hash_table.get(reference_table[token], 0) + 1
    # returning a Sparse vector object
    sparse_vector = SparseVector(len(reference_table), hash_table)
    return sparse_vector


if __name__ == '__main__':

    # create a spark context
    spark_context = SparkContext.getOrCreate()
    sql_context = SQLContext(sparkContext=spark_context)

    # defining the schema of the data
    schema = StructType([
        StructField('label', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('date', StringType(), True),
        StructField('query', StringType(), True),
        StructField('user', StringType(), True),
        StructField('text', StringType(), True)
    ])
    useless_columns = ['id', 'date', 'query', 'user']

    # load data
    df = sql_context.read.csv(path=file_path, schema=schema)
Example #39
import pyspark
from pyspark import SparkContext as sc
from pyspark import SparkConf
import os
os.environ['JAVA_HOME'] = r'D:\software\jdk1.8'
conf = SparkConf().setAppName('test').setMaster('local[*]')
sc = sc.getOrCreate(conf)
print(sc)
nums = sc.parallelize([1, 2, 3, 4])
# nums.reduce(lambda x,y: x+y)
print(nums)
print(nums.collect())
print(type(nums))
Example #40
from pyspark import SparkContext  #SC
sc = SparkContext.getOrCreate()  #place SparkContext into a Variable
from pyspark.sql import SparkSession  #place SparkSession into a Variable
spark = SparkSession.builder.appName("joint_RDD").getOrCreate()

from operator import add

sum = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])
adding = sum.reduce(add)

print("adding all the elements in RDD : %i" % (adding))

words = sc.parallelize([
    "python", "data", "big data", "spark", "apache spark", "hadoop",
    "data science"
])

words_map = words.map(lambda x: (x, 1))
mapping = words_map.collect()

print("key value pair -> %s" % (mapping))

#counts = words.count()
#print ("number of elements present in RDD -> %i" % (counts))
#words_filter = words.filter(lambda x: 'spark' in x)
#filtered = words_filter.collect()
from pyspark import SparkContext  #SC
sc = SparkContext.getOrCreate()  #place SparkContext into a Variable
from pyspark.sql import SparkSession  #place SparkSession into a Variable
spark = SparkSession.builder.appName("dataframe").getOrCreate()
def getHDFSRdd():
    sc = SparkContext.getOrCreate()
    postrdd = sc.textFile("./*.xml")
    commentrdd = sc.textFile("./*.xml")
    return sc, postrdd, commentrdd
Example #42
def create_spark_context() -> SparkContext:
    spark_conf = SparkConf()\
        .set('spark.rpc.message.maxSize', 300)\
        .setAppName("JMLR")
    return SparkContext.getOrCreate(spark_conf)
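# Hedged usage sketch: build (or reuse) the configured context and inspect it.
sc = create_spark_context()
print(sc.appName, sc.version)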
)

# conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider')
# conf.set('spark.hadoop.fs.s3a.access.key', 'AWS_ACCESS_KEY')
# conf.set('spark.hadoop.fs.s3a.secret.key', 'AWS_SECRET_KEY')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider',
         'com.amazonaws.auth.profile.ProfileCredentialsProvider')
conf.set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
conf.set('spark.hadoop.fs.s3a.impl.disable.cache', 'true')
conf.set('com.amazonaws.services.s3.enableV4', 'true')
print('Conf :===')
pprint(conf)

# Create spark context
from pyspark import SparkContext
sc: SparkContext = SparkContext.getOrCreate(conf=conf)
print('Spark version=' + sc.version)
print('Spark Context (i.e. sc) :===')
pprint(sc)

# Create spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print('Spark Session (i.e. spark) :===')
pprint(spark)

# imports
from pyspark.sql.functions import col, lit, count
from pyspark.sql.functions import to_timestamp, to_date

PARQUET_SUFFIX: str = '.parquet'
Example #44
    def fit(self, X, y=None, sample_weight=None):
        """X is a dataframe."""
        if self.method not in ("dbscan", "hdbscan", "spark"):
            raise ValueError("Unsupported method '%s'" % self.method)
        if not self.dbscan_params:
            self.dbscan_params = dict(
                min_samples=20, n_jobs=-1, algorithm='brute',
                metric=partial(distance_dataframe, X, **dict(
                    junction_dist=StringDistance(),
                    correct=False, tol=0)))
        if not self.hdbscan_params and self.method == 'hdbscan':
            self.hdbscan_params = dict(
                min_samples=20, n_jobs=-1,
                metric=partial(distance_dataframe, X, **dict(
                    junction_dist=StringDistance(),
                    correct=False, tol=0)))

        self.dbscan_params['eps'] = self.eps
        # new part: group by junction and v genes
        if self.method == 'hdbscan' and False:
            # no grouping; unsupported sample_weight
            groups_values = [[x] for x in np.arange(X.shape[0])]
        else:
            # list of lists
            groups_values = X.groupby(
                ["v_gene_set_str", self.model + "junc"]).groups.values()

        idxs = np.array([elem[0] for elem in groups_values])  # take one of them
        sample_weight = np.array([len(elem) for elem in groups_values])
        X_all = idxs.reshape(-1, 1)

        if self.kmeans_params.get('n_clusters', True):
            # ensure the number of clusters does not exceed the number of points
            self.kmeans_params['n_clusters'] = min(
                self.kmeans_params['n_clusters'], X_all.shape[0])
        kmeans = MiniBatchKMeans(**self.kmeans_params)

        lengths = X[self.model + 'junction_length'].values
        kmeans.fit(lengths[idxs].reshape(-1, 1))
        dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

        if self.method == 'hdbscan':
            from hdbscan import HDBSCAN
            from hdbscan.prediction import all_points_membership_vectors
            dbscan_sk = HDBSCAN(**self.hdbscan_params)
        else:
            dbscan_sk = DBSCAN(**self.dbscan_params)
        if self.method == 'spark':
            from pyspark import SparkContext
            from icing.externals.pypardis import dbscan as dbpard
            sc = SparkContext.getOrCreate()
            sample_weight_map = dict(zip(idxs, sample_weight))
            # self.dbscan_params.pop('n_jobs', None)
            dbscan = dbpard.DBSCAN(
                dbscan_params=self.dbscan_params,
                **self.dbspark_params)
        # else:

        for i, label in enumerate(np.unique(kmeans.labels_)):
            idx_row = np.where(kmeans.labels_ == label)[0]

            if self.verbose:
                print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                      "(%d seqs)" % idx_row.size, end='\r')

            X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
            weights = sample_weight[idx_row]

            if idx_row.size == 1:
                db_labels = np.array([0])
            elif self.method == 'spark' and idx_row.size > 5000:
                test_data = sc.parallelize(enumerate(X_idx))
                dbscan.train(test_data, sample_weight=sample_weight_map)
                db_labels = np.array(dbscan.assignments())[:, 1]
            elif self.method == 'hdbscan':
                db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
                # avoid noise samples
                soft_clusters = all_points_membership_vectors(dbscan_sk)
                db_labels = np.array([np.argmax(x) for x in soft_clusters])
            else:
                db_labels = dbscan_sk.fit_predict(
                    X_idx, sample_weight=weights)

            if len(dbscan_sk.core_sample_indices_) < 1:
                db_labels[:] = 0
            if -1 in db_labels:
                balltree = BallTree(
                    X_idx[dbscan_sk.core_sample_indices_],
                    metric=dbscan_sk.metric)
                noise_labels = balltree.query(
                    X_idx[db_labels == -1], k=1, return_distance=False).ravel()
                # get labels for core points, then assign to noise points based
                # on balltree
                dbscan_noise_labels = db_labels[
                    dbscan_sk.core_sample_indices_][noise_labels]
                db_labels[db_labels == -1] = dbscan_noise_labels

            # hopefully, there are no noisy samples at this time
            db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(dbscan_labels) + 1
            dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

        if self.method == 'spark':
            sc.stop()
        labels = dbscan_labels

        # new part: put together the labels
        labels_ext = np.zeros(X.shape[0], dtype=int)
        labels_ext[idxs] = labels
        for i, list_ in enumerate(groups_values):
            labels_ext[list_] = labels[i]
        self.labels_ = labels_ext
Example #45
 def __init__(self, java_model):
     self._sc = SparkContext.getOrCreate()
     self._java_model = java_model
Example #46
 def __init__(self, min, max, seed=0):
     ctx = SparkContext.getOrCreate()
     self.jvm = ctx.getOrCreate()._jvm
     self.rangeParam = self.jvm.com.microsoft.azure.synapse.ml.automl.HyperParamUtils.getRangeHyperParam(
         min, max, seed)
Example #47
"""Script para configurar contexto en Spark."""

__author__ = 'leferrad'

from pyspark import SparkContext, SparkConf

import os

if 'sc' not in globals():
    appName = 'learninspy-app'
    if 'SPARK_MASTER_IP' not in os.environ.keys() and 'SPARK_MASTER_PORT' not in os.environ.keys():
        master = 'local[*]'  # default: local mode
    else:
        master = 'spark://'+os.environ['SPARK_MASTER_IP']+':'+os.environ['SPARK_MASTER_PORT']  # master defined
    extraJavaOptions = '-XX:+UseG1GC'
    conf = (SparkConf().setAppName(appName)
            .setMaster(master)
            .set('spark.ui.showConsoleProgress', False)  # So that Stage progress is not shown in the console (comment out otherwise)
            .set('spark.driver.extraJavaOptions', '-XX:+UseG1GC')
            .set('spark.executor.extraJavaOptions', '-XX:+UseG1GC')
            .set('spark.executor.extraJavaOptions', '-XX:+UseCompressedOops')  # With less than 32GB of RAM, use 4-byte pointers instead of 8-byte ones
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            )
    sc = SparkContext.getOrCreate(conf=conf)

    from learninspy.utils.fileio import get_logger
    logger = get_logger(name=__name__)

    logger.info("Contexto de Spark inicializado.")

def getLocalRdd():
    sc = SparkContext.getOrCreate()
    postrdd = sc.textFile("./example.xml")
    commentrdd = sc.textFile("./examplecomment.xml")
    return sc, postrdd, commentrdd
Example #49
def streamToPowerBI(df, url, options=dict()):
    jvm = SparkContext.getOrCreate()._jvm
    writer = jvm.com.microsoft.ml.spark.PowerBIWriter
    return writer.stream(df.drop("label")._jdf, url, options)
Example #50
def callMLlibFunc(name: str, *args: Any) -> Any:
    """Call API in PythonMLLibAPI"""
    sc = SparkContext.getOrCreate()
    assert sc._jvm is not None
    api = getattr(sc._jvm.PythonMLLibAPI(), name)
    return callJavaFunc(sc, api, *args)
Example #51
 def __init__(self, java_model: JavaObject):
     self._sc = SparkContext.getOrCreate()
     self._java_model = java_model
Example #52
def callMLlibFunc(name, *args):
    """ Call API in PythonMLLibAPI """
    sc = SparkContext.getOrCreate()
    api = getattr(sc._jvm.PythonMLLibAPI(), name)
    return callJavaFunc(sc, api, *args)
Example #53
    with open(stopword_file, "r") as fp:
        lines = fp.readlines()

    stopword_list = set()

    for line in lines:
        stopword_list.add(line.strip())

    exclude_set = {'(', '[', ',', '.', '!', '?', ':', ';', ']', ')'}

    sc = SparkContext(appName="DSCI553Task1", master="local[*]")

    scf = SparkConf().setAppName("DSCI553").setMaster("local[*]")

    sc = SparkContext.getOrCreate(conf=scf)

    result = dict()

    json_input_content = sc.textFile(input_file).map(
        lambda row: json.loads(row))

    review_ids = json_input_content.map(lambda kv: kv['review_id'])
    result['A'] = total_num_reviews(review_ids)

    review_years = json_input_content.map(lambda kv: kv['date'])
    result['B'] = number_of_reviews_y(review_years, year)

    business_ids_rdd = json_input_content.map(lambda kv: kv['business_id'])
    result['C'] = number_of_distinct_business(business_ids_rdd)
Example #54
from pyspark import SparkContext
import json
sc=SparkContext.getOrCreate()

# Read data from HDFS storage
rdd = sc.textFile("hdfs://192.168.43.154:8020/xxxx.json")


# Check a time range
# 2 parameters, e.g.: check May 17 - May 20
# 3 parameters, e.g.: status 'ancaman' between May 17 - May 20
def cekRentang(waktux, waktuy, status=None, mean=False):
    def _cekRentang(data):
        dataJson = json.loads(data)
        temp = []
        count = 0
        avg = 0
        waktuxSplit = waktux.split("/")
        waktuySplit = waktuy.split("/")
        for i in dataJson:
            if status == "bahaya":
                if dataJson[str(i)]["status"] == status:
                    temp.append(dataJson[i])
                    #count += 1
            else:
                temp.append(dataJson[i])

        if mean == True:
            avg = float(len(temp)) / len(dataJson)
            return avg
		
def reponseQuestion1():
    # Create spark environment
    conf = SparkConf().setAppName("PySparkShell").setMaster("local[*]")
    sc = SparkContext.getOrCreate(conf)
    # Create connexion
    cluster = Cluster(["localhost"])
    session = cluster.connect(KEYSPACE)

    # User input
    choix = input("""
Voulez-vous donner
     1 : un nom de station
     2 : les coordonnees d une station ?

Tapez 1 ou 2: """)

    if choix == '1':
        station = input("\nDonnez la station : ")
        query = "SELECT lat, lon FROM stations where station='{}' ALLOW FILTERING;".format(
            station)
        result = session.execute(query)
        if result.one():
            latitude = result.one()[0]
            longitude = result.one()[1]
        else:
            print("\n*** La station n'existe pas ! ***\n")
            return
    elif choix == '2':
        print("\nDonnez une localisation avec latitude et longitude.\n")
        latitude_input = float(input("Donnez la latitude : "))
        longitude_input = float(input("Donnez la longitude : "))

        # Find station from key (latitude,longitude)
        query = "SELECT lat, lon FROM stations;"
        result = session.execute(query)
        D = sc.parallelize(result)
        station = np.array(
            D.map(lambda data: (round((data[0] - latitude_input)**2 +
                                      (data[1] - longitude_input)**2, 4), data[
                                          0], data[1])).distinct().collect())
        station_proche = station[np.where(station == min(station[:, 0]))[0],
                                 1:3]
        latitude = station_proche[0, 0]
        longitude = station_proche[0, 1]
        station = session.execute(
            "SELECT station FROM stations WHERE lat={} AND lon={};".format(
                latitude, longitude)).one()[0]
    else:
        print("\n*** Mauvais choix ! ***\n")
        return

    year = int(input("\nDonnez l annee entre 2011 et 2014 : "))
    while (year < 2011 or year > 2014):
        print("\nMauvaise annee !\n")
        year = int(input("\nDonnez l annee entre 2011 et 2014 : "))
    # Max temperature per day
    query = "SELECT year,month,day,tmp FROM asos1 WHERE lat = {} and lon = {} AND year = {} ORDER BY year,month,day ALLOW FILTERING;".format(
        latitude, longitude, year)
    result = session.execute(query)
    D = sc.parallelize(result)
    daily_max_min_temp = D.map(lambda data: (
        (toYMD(data[0], data[1], data[2])), [data[3], data[3]])).reduceByKey(
            lambda a, b: [max(a[0], b[0]), min(a[1], b[1])]).map(
                lambda r: [r[0], round(r[1][0], 2),
                           round(r[1][1], 2)]).collect()
    daily_max_min_temp = sorted(daily_max_min_temp, key=lambda x: x[0])
    daily_max_min_temp = np.array(daily_max_min_temp)

    # Average temperature per quarter
    query = "SELECT year,month,tmp FROM asos1 WHERE  lat = {} AND lon = {} ORDER BY year,month;".format(
        latitude, longitude)
    result = session.execute(query)
    D = sc.parallelize(result)
    moyen_temperature = D.map(lambda data: [(data[0], math.ceil(data[
        1] / 3)), np.array([1, data[2]])]).reduceByKey(lambda a, b: a + b).map(
            lambda r:
            (r[0][0], r[0][1], round(r[1][1] / r[1][0], 2))).collect()
    moyen_temperature = sorted(moyen_temperature, key=lambda moy: moy[0:2])
    moyen_temperature = np.array(moyen_temperature)

    # Max-min temperature per month
    max_min_temp = D.map(
        lambda data: ((data[0], data[1]), [data[2], data[2]])).reduceByKey(
            lambda a, b: [max(a[0], b[0]), min(a[1], b[1])]).map(
                lambda r: [r[0][0], r[0][1], r[1][0], r[1][1]]).collect()
    max_min_temp = sorted(max_min_temp, key=lambda x: x[0:2])
    max_min_temp = np.array(max_min_temp)

    # Wind rose
    query = "SELECT month, drct FROM asos1 WHERE lat = {} AND lon = {};".format(
        latitude, longitude)
    result = session.execute(query)
    D = sc.parallelize(result)
    #wind_direction_frequency = D.map(lambda data:[math.ceil(data[0]/45),1]).reduceByKey(lambda a,b:a+b).map(lambda r:[r[0],r[1]]).collect()
    wind_direction_frequency = D.map(lambda data: [(math.ceil(data[
        0] / 3), 8 if data[1] == 0 else math.ceil(data[
            1] / 45)), 1]).reduceByKey(lambda a, b: a + b).map(
                lambda r: [r[0][0], r[0][1], r[1]]).collect()
    wind_direction_frequency = sorted(wind_direction_frequency,
                                      key=lambda x: x[0:2])
    wind_direction_frequency = np.array(wind_direction_frequency)

    # Temperature boxplot
    '''
    boxdata = []
    for i in range(1,13):
        if i not in temperature[:,0]:
            continue
        else:
            boxdata.append(temperature[temperature[:,0]==i,1])
    labels = range(1,13)
    bplot = plt.boxplot(boxdata, patch_artist=True, labels=labels)
    plt.title('Température box plot')
    colors = ['dodgerblue', 'dodgerblue', 'dodgerblue', 'orange','orange','orange','orangered','orangered','orangered','deepskyblue','deepskyblue','deepskyblue']
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)  # fill each box with a different color
    plt.xlabel('Mois')
    plt.ylabel('Température')
    plt.savefig("images/Temperature_box_plot.png")
    '''
    # Plot for max temperature per day
    plotDailyMaxtemp(daily_max_min_temp, station, year)

    # Plot for average temperature per quarter
    plotTemperatureMoyenneMensuel(moyen_temperature, station)

    # Plot for max-min temperature per month
    plotTemperatureMaxMinTri(max_min_temp, station)

    # Plot for wind rose
    plotWindRose(wind_direction_frequency, station)

    print("""
     *** Courbes creees avec succes ! ***\n
             ==================\n""")
Example #56
glue_db = args['glue_database']
s3_bkt = args['s3_bucket']
rawweather_tbl = args['rawweather_table']
cleanweather_tbl = args['cleanweather_table']
output_s3_path = "s3://{}/{}".format(s3_bkt, cleanweather_tbl)

logger.info({
  'glue_database': glue_db,
  's3_bucket': s3_bkt,
  'rawweather_table': rawweather_tbl,
  'output_s3_path': output_s3_path
})


spark = SparkSession(SparkContext.getOrCreate())
glue_ctx = GlueContext(SparkContext.getOrCreate())

raw_dyf = glue_ctx.create_dynamic_frame.from_catalog(database=glue_db, table_name=rawweather_tbl)


def process_hourly(hours, key, fn):
    nums = []
    for hr in hours:
        if hr[key]:
            try:
                num = float(hr[key])
                if pd.notnull(num):
                    nums.append(num)
            except Exception as e:
                logger.error({
import pandas as pd
from databricks import koalas as ks
from pyspark.sql import SparkSession
from pyspark import SparkContext
import time
import logging

#hush Spark chatter
logger = SparkContext.getOrCreate()._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

print("Starting Script")

start_time = time.time()
path_to_file = '/usr/local/bin/breast_cancer_data.csv'
# Pandas
df = pd.read_csv(path_to_file)
# perform expensive operations
df = df.sample(frac=1)

execution_time = time.time() - start_time
print("Dataframe with Pandas:")
print(df)
print(f"Execution time was: {execution_time}")

start_time = time.time()
# Koalas on top of Spark df
df = ks.read_csv(path_to_file)
# perform expensive operations
df = df.sample(frac=float(1))