def main():
    """Run the belief propagation algorithm for an example problem."""
    # setup context
    conf = SparkConf().setAppName("BeliefPropagation example")
    sc = SparkContext.getOrCreate(conf)
    sql = SQLContext.getOrCreate(sc)
    with SuppressSparkLogs(sc):
        # create graphical model g of size 3 x 3
        g = graphframes.examples.Graphs(sql).gridIsingModel(3)
        print("Original Ising model:")
        g.vertices.show()
        g.edges.show()
        # run BP for 5 iterations
        numIter = 5
        results = BeliefPropagation.runBPwithGraphFrames(g, numIter)
        # display beliefs
        beliefs = results.vertices.select('id', 'belief')
        print("Done with BP. Final beliefs after {} iterations:".format(numIter))
        beliefs.show()
    sc.stop()
def _getScaleHintList():
    featurizer = SparkContext.getOrCreate()._jvm.com.databricks.sparkdl.DeepImageFeaturizer
    if isinstance(featurizer, py4j.java_gateway.JavaPackage):
        # do not see DeepImageFeaturizer, possibly running without spark
        # instead of failing return empty list
        return []
    return dict(featurizer.scaleHintsJava()).keys()
def optimize(self):
    """Do an optimization."""
    jmodel = callJavaFunc(SparkContext.getOrCreate(), self.value.optimize)
    from nn.layer import Model
    return Model.of(jmodel)
def parse_raw_wikidata(output):
    spark_conf = SparkConf().setAppName('QB Wikidata').setMaster(QB_SPARK_MASTER)
    sc = SparkContext.getOrCreate(spark_conf)  # type: SparkContext
    wikidata = sc.textFile('s3a://entilzha-us-west-2/wikidata/wikidata-20170306-all.json')

    def parse_line(line):
        if len(line) == 0:
            return []
        if line[0] == '[' or line[0] == ']':
            return []
        elif line.endswith(','):
            return [json.loads(line[:-1])]
        else:
            return [json.loads(line)]

    parsed_wikidata = wikidata.flatMap(parse_line).cache()
    property_map = extract_property_map(parsed_wikidata)
    b_property_map = sc.broadcast(property_map)

    wikidata_items = parsed_wikidata.filter(lambda d: d['type'] == 'item').cache()
    parsed_wikidata.unpersist()
    item_page_map = extract_item_page_map(wikidata_items)
    b_item_page_map = sc.broadcast(item_page_map)

    parsed_item_map = extract_items(wikidata_items, b_property_map, b_item_page_map)

    with open(output, 'wb') as f:
        pickle.dump({
            'parsed_item_map': parsed_item_map,
            'item_page_map': item_page_map,
            'property_map': property_map
        }, f)

    sc.stop()
def readImages(imageDirectory, numPartition=None):
    """
    Read a directory of images (or a single image) into a DataFrame.

    :param imageDirectory: str, file path.
    :param numPartition: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filepath: str, image: imageSchema).
    """
    return _readImages(imageDirectory, numPartition, SparkContext.getOrCreate())
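# A minimal usage sketch for readImages above: the directory path and partition
# count are hypothetical placeholders, and an active SparkContext is assumed.
image_df = readImages('/data/images', numPartition=8)
image_df.select('filepath').show(5)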
def readImagesWithCustomFn(path, decode_f, numPartition=None):
    """
    Read a directory of images (or a single image) into a DataFrame using a
    custom library to decode the images.

    :param path: str, file path.
    :param decode_f: function to decode the raw bytes into an array compatible
                     with one of the supported OpenCV modes. See
                     @imageIO.PIL_decode for an example.
    :param numPartition: [optional] int, number of partitions to use for reading files.
    :return: DataFrame with schema == ImageSchema.imageSchema.
    """
    return _readImagesWithCustomFn(path, decode_f, numPartition, sc=SparkContext.getOrCreate())
def load_spark_context(application_name=None):
    if application_name is None:
        application_name = __name__
    conf = SparkConf().setAppName(application_name)
    sc = SparkContext.getOrCreate(conf=conf)
    sql_context = SQLContext(sc)
    # Silence the logger
    # logger = sc._jvm.org.apache.log4j
    # logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    # logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    return sc, sql_context
def __init__(self, layers, bias=1.0, act_func=None, act_func_prime=None):
    if act_func is None:
        self.act_func = sigmoid
        self.act_func_prime = sigmoid_prime
    else:
        self.act_func = act_func
        self.act_func_prime = act_func_prime
    self.layers = layers
    self.bias = bias
    self.spark_context = SparkContext.getOrCreate()
    log4jLogger = self.spark_context._jvm.org.apache.log4j
    self.logger = log4jLogger.LogManager.getLogger(__name__)
def callBigDlFunc(bigdl_type, name, *args):
    """Call API in PythonBigDL"""
    sc = SparkContext.getOrCreate()
    if bigdl_type == "float":
        api = getattr(sc._jvm.com.intel.analytics.bigdl.python.api.PythonBigDL.ofFloat(), name)
    elif bigdl_type == "double":
        api = getattr(sc._jvm.com.intel.analytics.bigdl.python.api.PythonBigDL.ofDouble(), name)
    else:
        raise Exception("Not supported bigdl_type: %s" % bigdl_type)
    return callJavaFunc(sc, api, *args)
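# A hedged usage sketch for callBigDlFunc: it dispatches to a JVM-side
# PythonBigDL method by name, so "createSequential" below is an assumed method
# name used only to illustrate the dispatch-by-name pattern.
jlayer = callBigDlFunc("float", "createSequential")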
def installPackage(self, artifact, base=None, sc=None):
    artifact = self._toArtifact(artifact)
    # Test if we already have a version installed
    res = self.fetchArtifact(artifact)
    fileLoc = None
    if res:
        fileLoc = res[1]
        print("Package already installed: {0}".format(str(artifact)))
    else:
        # download package
        art = [artifact]

        def _doDownload(d):
            artifact = art[0]
            if not artifact.version or artifact.version == '0':
                artifact.version = d.resolver._find_latest_version_available(artifact)
            fileLoc = artifact.get_filename(self.DOWNLOAD_DIR)
            if os.path.isfile(fileLoc):
                os.remove(fileLoc)
            results = d.download(artifact, filename=self.DOWNLOAD_DIR)
            if not results[1]:
                raise Exception("Error downloading package {0}".format(str(artifact)))
            else:
                artifact = results[0]
            print("Artifact downloaded successfully {0}".format(str(artifact)))
            printEx("Please restart Kernel to complete installation of the new package", PrintColors.RED)
            fileLoc = self.storeArtifact(artifact, base)
            return fileLoc

        try:
            fileLoc = _doDownload(downloader.Downloader(base) if base is not None else downloader.Downloader())
        except RequestException as e:
            # try another base
            try:
                fileLoc = _doDownload(downloader.Downloader("http://dl.bintray.com/spark-packages/maven"))
            except RequestException as e:
                print("Unable to install artifact {0}".format(e.msg))
                raise
        except:
            print(str(sys.exc_info()[1]))
            raise

    if sc is None:
        sc = SparkContext.getOrCreate()
    if sc:
        # convert to file uri for windows platform
        if platform.system() == 'Windows':
            fileLoc = "file://" + urllib.pathname2url(fileLoc)
        sc.addPyFile(fileLoc)
    return artifact
def isImage(df, column):
    """
    Returns True if the column contains images.

    Args:
        df (DataFrame): The DataFrame to be processed
        column (str): The name of the column being inspected

    Returns:
        bool: True if the column is an image column
    """
    jvm = SparkContext.getOrCreate()._jvm
    schema = jvm.com.microsoft.ml.spark.schema.ImageSchema
    return schema.isImage(df._jdf, column)
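# A minimal usage sketch for isImage; image_df and its "image" column are
# assumed to come from one of the readImages helpers in this collection.
if isImage(image_df, "image"):
    print("column 'image' holds image data")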
def toPython(entity):
    from py4j.java_gateway import JavaObject
    if entity is None or not isinstance(entity, JavaObject):
        return entity
    clazz = entity.getClass().getName()
    if clazz == "org.apache.spark.sql.Dataset":
        entity = entity.toDF()
        clazz = "org.apache.spark.sql.DataFrame"
    if clazz == "org.apache.spark.sql.DataFrame":
        from pyspark.sql import DataFrame, SQLContext
        from pyspark import SparkContext
        entity = DataFrame(entity, SQLContext(SparkContext.getOrCreate(), entity.sqlContext()))
    return entity
def readImages(sparkSession, path, recursive=False, sampleRatio=1.0, inspectZip=True):
    """
    Reads the directory of images from the local or remote (WASB) source.
    This function is attached to SparkSession class.
    Example: spark.readImages(path, recursive, ...)

    Args:
        sparkSession (SparkSession): Existing sparkSession
        path (str): Path to the image directory
        recursive (bool): Recursive search flag
        sampleRatio (double): Fraction of the images loaded

    Returns:
        DataFrame: DataFrame with a single column of "images", see imageSchema for details
    """
    ctx = SparkContext.getOrCreate()
    reader = ctx._jvm.com.microsoft.ml.spark.ImageReader
    sql_ctx = pyspark.SQLContext.getOrCreate(ctx)
    jsession = sql_ctx.sparkSession._jsparkSession
    jresult = reader.read(path, recursive, jsession, float(sampleRatio), inspectZip)
    return DataFrame(jresult, sql_ctx)
def train(filename):
    global model
    sc = SparkContext.getOrCreate()
    user = sc.textFile(filename)
    ratings = user.map(lambda l: l.split("\t")).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    # split into training & test
    (training, test) = ratings.randomSplit([0.8, 0.2])
    testdata = test.map(lambda p: (p[0], p[1]))
    rank = 10
    numIterations = 10
    # training the model
    model = ALS.train(training, rank, numIterations)
    # validating the model
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    rmse = sqrt(MSE)
    print(rmse)
    print("training done")
    return "OK"
def test_active_session_with_None_and_not_None_context(self):
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    sc = None
    session = None
    try:
        sc = SparkContext._active_spark_context
        self.assertEqual(sc, None)
        activeSession = SparkSession.getActiveSession()
        self.assertEqual(activeSession, None)
        sparkConf = SparkConf()
        sc = SparkContext.getOrCreate(sparkConf)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertFalse(activeSession.isDefined())
        session = SparkSession(sc)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertTrue(activeSession.isDefined())
        activeSession2 = SparkSession.getActiveSession()
        self.assertNotEqual(activeSession2, None)
    finally:
        if session is not None:
            session.stop()
        if sc is not None:
            sc.stop()
def get(self, name: str, tag: str, parties: typing.List[Party], gc: GarbageCollectionABC) -> typing.List:
    log_str = f"[rabbitmq.get](name={name}, tag={tag}, parties={parties})"
    LOGGER.debug(f"[{log_str}]start to get")

    _name_dtype_keys = [
        _SPLIT_.join([party.role, party.party_id, name]) for party in parties
    ]

    if _name_dtype_keys[0] not in self._name_dtype_map:
        mq_names = self._get_mq_names(parties, dtype=NAME_DTYPE_TAG)
        channel_infos = self._get_channels(mq_names=mq_names)
        rtn_dtype = []
        for i, info in enumerate(channel_infos):
            obj = self._receive_obj(info, name, tag=NAME_DTYPE_TAG)
            rtn_dtype.append(obj)
            LOGGER.debug(f"[rabbitmq.get] name: {name}, dtype: {obj}")

        for k in _name_dtype_keys:
            if k not in self._name_dtype_map:
                self._name_dtype_map[k] = rtn_dtype[0]

    rtn_dtype = self._name_dtype_map[_name_dtype_keys[0]]

    rtn = []
    dtype = rtn_dtype.get("dtype", None)
    partitions = rtn_dtype.get("partitions", None)

    if dtype == FederationDataType.TABLE:
        mq_names = self._get_mq_names(parties, name, partitions=partitions)
        for i in range(len(mq_names)):
            party = parties[i]
            role = party.role
            party_id = party.party_id
            party_mq_names = mq_names[i]
            receive_func = self._get_partition_receive_func(
                name, tag, party_id, role, party_mq_names, mq=self._mq,
                connection_conf=self._rabbit_manager.runtime_config.get('connection', {}))
            sc = SparkContext.getOrCreate()
            rdd = sc.parallelize(range(partitions), partitions)
            rdd = rdd.mapPartitionsWithIndex(receive_func)
            rdd = materialize(rdd)
            table = Table(rdd)
            rtn.append(table)
            # add gc
            gc.add_gc_action(tag, table, '__del__', {})
            LOGGER.debug(
                f"[{log_str}]received rdd({i + 1}/{len(parties)}), party: {parties[i]} ")
    else:
        mq_names = self._get_mq_names(parties, name)
        channel_infos = self._get_channels(mq_names=mq_names)
        for i, info in enumerate(channel_infos):
            obj = self._receive_obj(info, name, tag)
            LOGGER.debug(
                f"[{log_str}]received obj({i + 1}/{len(parties)}), party: {parties[i]} ")
            rtn.append(obj)

    LOGGER.debug(f"[{log_str}]finish to get")
    return rtn
    print(id, retMapStockMoney[id])
    return retMapStockMoney

def reduceFunc(retMapStockMoney1, retMapStockMoney2):
    retMap = retMapStockMoney1.copy()
    retMap.update(retMapStockMoney2)
    return retMap

conf = SparkConf().setAppName("miniProject").setMaster("spark://192.168.32.46:7077")
# conf = SparkConf().setAppName("miniProject").setMaster("local[*]")
# conf.set('spark.scheduler.mode', 'FAIR')
# conf.set("spark.scheduler.pool", None)
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")
# partitions = 32
partitions = 16
# partitions = 8
retMapIdDataOrg = GetStockPrice(checkStartDate, checkEndDate, minCheckDaysData=3)
print("allCheckStock start:", len(retMapIdDataOrg), np.array(retMapIdDataOrg)[:, 0])
retMapStockMoney = sc.parallelize(retMapIdDataOrg, partitions).map(mapFunc).filter(
    lambda v: len(list(v.values())[0]) > 0).reduce(reduceFunc)
print("allCheckStock end:", len(retMapStockMoney), retMapStockMoney.keys())
print("----------------------------------------------------------------------------------------------------------------")
for id in retMapStockMoney:
import sys, csv, decimal
from pyspark import SparkContext, SparkConf

def movies_mapformat(line):
    return (line[0], (line[1], line[2].split("|")))

if __name__ == "__main__":
    driver_conf = SparkConf().setAppName("Q3_findMovieStats").setMaster("local")
    sparkcont = SparkContext.getOrCreate(conf=driver_conf)
    input_ratings = sparkcont.textFile("/FileStore/tables/ratings.csv")
    lines_split_includesHeader = input_ratings.map(lambda x: x.split(",")).filter(lambda x: len(x) == 4)
    just_header = lines_split_includesHeader.first()
    lines_split_vals = lines_split_includesHeader.filter(lambda x: x != just_header)
    calc_data_avg = lines_split_vals.map(lambda x: [int(x[1]), float(x[2])]).filter(lambda x: len(x) == 2)
    Count_sum = calc_data_avg.combineByKey(
        lambda value: (value, 1),
        lambda x, value: (x[0] + value, x[1] + 1),
        lambda x, y: (x[0] + y[0], x[1] + y[1])).filter(lambda x: len(x) == 2)
    averageByKey_res = Count_sum.map(lambda x: (x[0], x[1][0] / x[1][1]))
    averageByKey_ressort_ = averageByKey_res.sortBy(lambda x: x[1])
    find_ten_pairs_last = averageByKey_ressort_.take(10)
    ten_pairs_last = sparkcont.parallelize(find_ten_pairs_last)
    ten_pairs_movies = ten_pairs_last.map(lambda x: [str(x[0]), x[1]])
    input_movies = sparkcont.textFile("/FileStore/tables/movies.csv")
    movies_csvRead = input_movies.mapPartitions(lambda x: csv.reader(x))
    movies_RDD = movies_csvRead.map(movies_mapformat)
    splitbyLines_movies = movies_RDD.filter(lambda x: x[0] != 'movieId')
    list_ofpair_movies = splitbyLines_movies.map(lambda x: [x[0], x[1][0]])
    Movies_names_ratings = ten_pairs_movies.join(list_ofpair_movies)
    Movie_names_ratings = Movies_names_ratings.map(lambda x: [x[0], x[1][1]])
    ten_Movies_Names_Ratings = Movie_names_ratings.take(10)
    ten_Movies_Names_Ratings = sparkcont.parallelize(ten_Movies_Names_Ratings)
    input_tags = sparkcont.textFile("/FileStore/tables/tags.csv")
def InitSpark():
    return SparkContext.getOrCreate()
    return math.sqrt(sum / n)

# The recommender system is done based on user-user mapping
if __name__ == '__main__':
    arguments = len(sys.argv)
    # report improper arguments
    if arguments != 1:
        print("Improper arguments knn.py")
    else:
        conf = (SparkConf().setMaster('local').setAppName('KNN')
                .set('spark.executor.memory', '6g')
                .set('spark.driver.memory', '6g')
                .set('spark.cores.max', '6')
                .set('spark.driver.host', '127.0.0.1'))
        sc = SparkContext.getOrCreate(conf=conf)  # creating the spark context
        sql_sc = SQLContext(sc)
        usebookcolumns = ['book_id', 'authors', 'title']
        books_df = pd.read_csv('./books.csv', usecols=usebookcolumns)
        books_df = sql_sc.createDataFrame(books_df)
        booksRDD = books_df.rdd
        # use this RDD to keep track of all the lists you need
        # rating count
        useratingcols = ['user_id', 'book_id', 'rating']
        rating_df = pd.read_csv('./smallratings.csv', usecols=useratingcols)
        rating_df = sql_sc.createDataFrame(rating_df)
        ratingRDD = rating_df.rdd
        N = ratingRDD.count()
        print(N)
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()

# ================== Q4 ====================
# start_pyspark_shell -e 8 -c 4 -w 4 -m 4
# start_pyspark_shell -e 32 -c 2 -w 4 -m 4

# schema_Daily
schema_Daily = StructType([
    StructField('ID', StringType()),
    StructField('DATE', StringType()),
    StructField('ELEMENT', StringType()),
    StructField('VALUE', IntegerType()),
    StructField('MEASUREMENT_FLAG', StringType()),
    StructField('QUALITY_FLAG', StringType()),
    StructField('SOURCE_FLAG', StringType()),
    StructField('OBSERVATION_TIME', StringType()),
])

path = "hdfs:///data/ghcnd/daily/*.csv.gz"
daily = (spark.read.format("com.databricks.spark.csv")
         .option("header", "false")
         .option("inferSchema", "false")
         .schema(schema_Daily)
         .load(path))

stations = spark.read.orc("hdfs:////user/xzh216/Assign1/output/stations_enriched.orc")
def writeToPowerBI(df, url, options=dict()):
    jvm = SparkContext.getOrCreate()._jvm
    writer = jvm.com.microsoft.ml.spark.PowerBIWriter
    writer.write(df._jdf, url, options)
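# A hedged usage sketch for writeToPowerBI: the push URL below is a
# placeholder, and df is assumed to be an existing Spark DataFrame matching
# the Power BI dataset's schema.
writeToPowerBI(df, "https://api.powerbi.com/beta/<tenant>/datasets/<id>/rows?key=<key>")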
def _build_spark_context():
    from pyspark import SparkContext
    sc = SparkContext.getOrCreate()
    return sc
def test_get_or_create(self):
    with SparkContext.getOrCreate() as sc:
        self.assertTrue(SparkContext.getOrCreate() is sc)
def getStreamingContext():
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, BATCH_DURATION)
    return ssc
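# A minimal usage sketch for getStreamingContext, assuming BATCH_DURATION is
# defined; the socket host and port are placeholder values.
ssc = getStreamingContext()
lines = ssc.socketTextStream("localhost", 9999)
lines.pprint()
ssc.start()
ssc.awaitTermination()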
def _conf(cls):
    from pyspark import SparkContext
    sc = SparkContext.getOrCreate()
    return sc._jsc.hadoopConfiguration()
def __init__(self):
    self.data = None
    self.metadata = None
    self.joined = None
    self.avg = None
    self.sc = SparkContext.getOrCreate()
def index():
    # extract data needed for visuals
    # get spark context
    sc = SparkContext.getOrCreate()

    # create spark dataframe to predict customer churn using the model
    # [gender, level, days_active, location, avgSongs, avgEvents, thumbsup, thumbsdown, add_friend]
    gender = ''
    level = 0
    days_active = 0
    location = 0
    avgSongs = 0
    avgEvents = 0
    thumbsup = 0
    thumbsdown = 0
    add_friend = 0
    df = sc.parallelize([[gender, level, days_active, location, avgSongs,
                          avgEvents, thumbsup, thumbsdown, add_friend]]).\
        toDF(["gender", "last_level", "days_active", "last_state", "avg_songs",
              "avg_events", "thumbs_up", "thumbs_down", "addfriend"])

    # Basic analysis for visualisations
    df.show(5)

    # male = df.select('last_level', 'gender').where(df.gender == 'M').groupBy('last_level').count().agg(count("count"))
    # female = df.select('last_level', 'gender').where(df.gender == 'F').groupBy('last_level').count().agg(count("count"))
    # df_pd = male.join(female, "gender", "last_level").drop("count").fillna(0).toPandas()

    # TODO: extract data and create plotly visuals, e.g.:
    # graphs = [
    #     {
    #         'data': [Pie(labels=gender_names, values=gender_counts)],
    #         'layout': {'title': 'Statistics by Gender', 'height': 450, 'width': 1000},
    #     },
    #     # further Bar charts: top 10 words, messages by category, top 5 categories
    # ]
    # encode plotly graphs in JSON
    # ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    # graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)
    # render web page with plotly graphs
    # return render_template('master.html', ids=ids, graphJSON=graphJSON)

    return render_template('master.html')
    else:
        value_get = (str(Candidate) + "," + str(eachFriend), set(ListofCandidateFriends))
        Final_List_ofFriends.append(value_get)
    return Final_List_ofFriends

def Map_final(line):
    _key = line[0]
    _value = list(line[1])
    s_string = ",".join(_value)
    return "{0}\t {1}".format(_key, s_string)

if __name__ == "__main__":
    config = SparkConf().setAppName("mutualfriends").setMaster("local[2]")
    sparkcont = SparkContext.getOrCreate(conf=config)
    input_mutualfriends = sparkcont.textFile("/FileStore/tables/q2_assign/soc_LiveJournal1Adj_txt-b8957.txt")
    lines_split = input_mutualfriends.map(lambda x: x.split("\t")).filter(
        lambda x: len(x) == 2).map(lambda x: [x[0], x[1].split(",")])
    split_mutualFriends = lines_split.flatMap(create_mutual_friends)
    Reducer_RDD = split_mutualFriends.reduceByKey(lambda x, y: x.intersection(y))
    # print(Reducer_RDD.first())
    Len_listofFriends = Reducer_RDD.mapValues(lambda x: len(x))
    # print(Len_listofFriends.first())
    List_OfsortedFriends = Len_listofFriends.sortBy(lambda x: -x[1])
    top_teninList = List_OfsortedFriends.take(10)
    # print(top_teninList)
    top_list = sparkcont.parallelize(top_teninList)
    pairs_data = top_list.map(split_top)
def __init__(self, values, seed=0):
    ctx = SparkContext.getOrCreate()
    self.jvm = ctx._jvm
    self.hyperParam = self.jvm.com.microsoft.azure.synapse.ml.automl.HyperParamUtils.getDiscreteHyperParam(values, seed)
def cluster_by_period(start, end):
    sc = SparkContext.getOrCreate()
    D = sc.parallelize(extract_data(start, end))

    # Compute mean temperature by station
    map = D.map(lambda data: (
        (data['station'], data['latitude'], data['longitude']),
        np.array([1, data['temperature'], data['dew_point'], data['feel']])))
    sum_by_station = map.reduceByKey(lambda a, b: a + b)
    mean_by_station = sum_by_station.map(calc_moy_key)
    if len(mean_by_station.collect()) < 2:
        logging.warning(
            'Only data for one station during this period, so there is one cluster of one station!')
        return

    # KMeans
    # First we determine the optimal k with the elbow method
    logging.info('Clustering...')
    X = mean_by_station.map(lambda data: [data[1], data[2]])
    X = np.array(X.collect())
    sum_of_squared_distances = []
    K = range(1, 10)
    for k in K:
        km = KMeans(n_clusters=k)
        km = km.fit(X)
        sum_of_squared_distances.append(km.inertia_)
    x = list(range(1, 10))
    y = sum_of_squared_distances
    kn = KneeLocator(x, y, S=1.0, curve='convex', direction='decreasing',
                     interp_method='polynomial')
    plt.xlabel('k')
    plt.ylabel('sum_of_squared_distances')
    plt.title('Elbow method for optimal k between {} and {}'.format(start, end))
    plt.xticks(range(1, 9))
    plt.plot(x, y, 'bx-')
    plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
    filename = 'results/cluster_by_period/{}_{}_elbow.png'.format(start, end)
    if os.path.isfile(filename):
        os.remove(filename)
    plt.savefig(filename)
    plt.clf()

    # Then we cluster with the computed optimal k
    km = KMeans(n_clusters=kn.knee)
    km = km.fit(X)

    # Build map
    logging.info('Building map...')
    lons = []
    lats = []
    vals = km.labels_
    for val in mean_by_station.collect():
        lats.append(val[0][1])
        lons.append(val[0][2])
    map = Basemap(projection='merc', llcrnrlon=19.08, llcrnrlat=59.45,
                  urcrnrlon=31.59, urcrnrlat=70.09, resolution='i')
    map.drawmapboundary(fill_color='aqua')
    map.fillcontinents(color='#cc9955', lake_color='aqua', zorder=1)
    map.drawcoastlines()
    map.drawcountries()
    x, y = map(lons, lats)
    map.scatter(x, y, c=vals, cmap=plt.cm.get_cmap('gist_rainbow', kn.knee), zorder=2)
    plt.title('Clustering des stations entre {} et {}'.format(start, end), fontsize=10)
    filename = 'results/cluster_by_period/{}_{}.png'.format(start, end)
    if os.path.isfile(filename):
        os.remove(filename)
    plt.savefig(filename)
    logging.info(
        "Success: update your files and check the result in 'results/cluster_by_period'!")
    logging.info(
        "NB: you can also check 'elbow.png' to see how we chose the clusters number.")
    plt.clf()
def __init__(self):
    ctx = SparkContext.getOrCreate()
    self.jvm = ctx._jvm
    self.hyperparams = {}
def RDD(filePath):
    sc = SparkContext.getOrCreate(SparkConf())
    path = filePath
    RDD = sc.textFile(name=path)
    return RDD
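# A minimal usage sketch for RDD(); the input path is a hypothetical
# placeholder.
lines = RDD('/tmp/input.txt')
print(lines.count())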
def main():
    conf = SparkConf().setAppName("YeJoo_Park_task2_ModelBasedCF") \
        .setMaster("local")
    sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("ERROR")

    ratingsFilePath = sys.argv[1]
    testFilePath = sys.argv[2]

    data = sc.textFile(testFilePath)
    dataHeader = data.first()
    testingSet = set(data
                     .filter(lambda row: row != dataHeader)
                     .map(lambda r: r.split(","))
                     .map(lambda r: (int(r[USER_INDEX]), int(r[MOVIE_INDEX])))
                     .collect())

    # Load and parse the data
    data = sc.textFile(ratingsFilePath)
    dataHeader = data.first()
    trainRatings = data \
        .filter(lambda row: row != dataHeader) \
        .map(lambda r: r.split(",")) \
        .map(lambda r: Rating(int(r[USER_INDEX]), int(r[MOVIE_INDEX]), float(r[RATING_INDEX])))

    print("ratings.count() before filter=" + str(trainRatings.count()))
    testRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) in testingSet)
    trainRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) not in testingSet)
    print("testingSetRatings.count()=" + str(testRatings.count()))
    print("ratings.count() after filter=" + str(trainRatings.count()))

    rank = 10
    numIterations = 12
    lamb = 0.1
    model = ALS.train(trainRatings, rank, numIterations, lamb)
    print("Training complete")

    userProducts = testRatings.map(lambda rating: (rating.user, rating.product))
    predictions = model.predictAll(userProducts).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = testRatings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

    absDiffBuckets = ratesAndPreds.map(lambda r: int(abs(r[1][0] - r[1][1]))) \
        .map(lambda d: min(d, 4)).cache()
    RMSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

    # Write predictions to file
    outputFileName = "YeJoo_Park_ModelBasedCF.txt"
    printWriter = open(outputFileName, "a")
    outputPreds = ratesAndPreds.map(lambda r: (r[0][0], r[0][1], r[1][1])).collect()
    outputPreds.sort()
    for pred in outputPreds:
        printWriter.write(str(pred[0]) + ", " + str(pred[1]) + ", " + str(pred[2]))
        printWriter.write("\n")
    printWriter.close()

    print(">=0 and <1: " + str(absDiffBuckets.filter(lambda d: d == 0).count()))
    print(">=1 and <2: " + str(absDiffBuckets.filter(lambda d: d == 1).count()))
    print(">=2 and <3: " + str(absDiffBuckets.filter(lambda d: d == 2).count()))
    print(">=3 and <4: " + str(absDiffBuckets.filter(lambda d: d == 3).count()))
    print(">=4: " + str(absDiffBuckets.filter(lambda d: d == 4).count()))
    print("RMSE=" + str(RMSE))
from pyspark import SparkConf
from pyspark import SparkContext

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

import random

num_samples = 100000000

def inside(p):
    x, y = random.random(), random.random()
    return x * x + y * y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
sc.stop()
def _conf(cls):
    sc = SparkContext.getOrCreate()
    return sc._jsc.hadoopConfiguration()
    hash_table = {}
    # running through the tokens
    for token in tokens:
        # if the token is indeed among those we want to keep
        if token in reference_table.keys():
            # updating the frequency table
            hash_table[reference_table[token]] = hash_table.get(reference_table[token], 0) + 1
    # returning a SparseVector object
    sparse_vector = SparseVector(len(reference_table), hash_table)
    return sparse_vector

if __name__ == '__main__':
    # create a spark context
    spark_context = SparkContext.getOrCreate()
    sql_context = SQLContext(sparkContext=spark_context)

    # defining the schema of the data
    schema = StructType([
        StructField('label', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('date', StringType(), True),
        StructField('query', StringType(), True),
        StructField('user', StringType(), True),
        StructField('text', StringType(), True)
    ])
    useless_columns = ['id', 'date', 'query', 'user']

    # load data
    df = sql_context.read.csv(path=file_path, schema=schema)
import os

import pyspark
from pyspark import SparkConf, SparkContext

os.environ['JAVA_HOME'] = r'D:\software\jdk1.8'
conf = SparkConf().setAppName('test').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)
print(sc)

nums = sc.parallelize([1, 2, 3, 4])
# nums.reduce(lambda x, y: x + y)
print(nums)
print(nums.collect())
print(type(nums))
from operator import add

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext.getOrCreate()  # place SparkContext into a variable
spark = SparkSession.builder.appName("joint_RDD").getOrCreate()  # place SparkSession into a variable

sum = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])
adding = sum.reduce(add)
print("adding all the elements in RDD : %i" % (adding))

words = sc.parallelize([
    "python", "data", "big data", "spark", "apache spark", "hadoop", "data science"
])
words_map = words.map(lambda x: (x, 1))
mapping = words_map.collect()
print("key value pair -> %s" % (mapping))

# counts = words.count()
# print("number of elements present in RDD -> %i" % (counts))
# words_filter = words.filter(lambda x: 'spark' in x)
# filtered = words_filter.collect()

spark = SparkSession.builder.appName("dataframe").getOrCreate()
def getHDFSRdd():
    sc = SparkContext.getOrCreate()
    postrdd = sc.textFile("./*.xml")
    commentrdd = sc.textFile("./*.xml")
    return sc, postrdd, commentrdd
def create_spark_context() -> SparkContext:
    spark_conf = SparkConf() \
        .set('spark.rpc.message.maxSize', 300) \
        .setAppName("JMLR")
    return SparkContext.getOrCreate(spark_conf)
)
# conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider')
# conf.set('spark.hadoop.fs.s3a.access.key', 'AWS_ACCESS_KEY')
# conf.set('spark.hadoop.fs.s3a.secret.key', 'AWS_SECRET_KEY')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.profile.ProfileCredentialsProvider')
conf.set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
conf.set('spark.hadoop.fs.s3a.impl.disable.cache', 'true')
conf.set('com.amazonaws.services.s3.enableV4', 'true')

print('Conf :===')
pprint(conf)

# Create spark context
from pyspark import SparkContext
sc: SparkContext = SparkContext.getOrCreate(conf=conf)
print('Spark version=' + sc.version)
print('Spark Context (i.e. sc) :===')
pprint(sc)

# Create spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print('Spark Session (i.e. spark) :===')
pprint(spark)

# imports
from pyspark.sql.functions import col, lit, count
from pyspark.sql.functions import to_timestamp, to_date

PARQUET_SUFFIX: str = '.parquet'
def fit(self, X, y=None, sample_weight=None):
    """X is a dataframe."""
    if self.method not in ("dbscan", "hdbscan", "spark"):
        raise ValueError("Unsupported method '%s'" % self.method)
    if not self.dbscan_params:
        self.dbscan_params = dict(
            min_samples=20, n_jobs=-1, algorithm='brute',
            metric=partial(distance_dataframe, X, **dict(
                junction_dist=StringDistance(), correct=False, tol=0)))
    if not self.hdbscan_params and self.method == 'hdbscan':
        self.hdbscan_params = dict(
            min_samples=20, n_jobs=-1,
            metric=partial(distance_dataframe, X, **dict(
                junction_dist=StringDistance(), correct=False, tol=0)))

    self.dbscan_params['eps'] = self.eps
    # new part: group by junction and v genes
    if self.method == 'hdbscan' and False:
        # no grouping; unsupported sample_weight
        groups_values = [[x] for x in np.arange(X.shape[0])]
    else:
        # list of lists
        groups_values = X.groupby(
            ["v_gene_set_str", self.model + "junc"]).groups.values()

    idxs = np.array([elem[0] for elem in groups_values])  # take one of them
    sample_weight = np.array([len(elem) for elem in groups_values])
    X_all = idxs.reshape(-1, 1)

    if self.kmeans_params.get('n_clusters', True):
        # ensure the number of clusters is not higher than the number of points
        self.kmeans_params['n_clusters'] = min(
            self.kmeans_params['n_clusters'], X_all.shape[0])
    kmeans = MiniBatchKMeans(**self.kmeans_params)

    lengths = X[self.model + 'junction_length'].values
    kmeans.fit(lengths[idxs].reshape(-1, 1))
    dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

    if self.method == 'hdbscan':
        from hdbscan import HDBSCAN
        from hdbscan.prediction import all_points_membership_vectors
        dbscan_sk = HDBSCAN(**self.hdbscan_params)
    else:
        dbscan_sk = DBSCAN(**self.dbscan_params)
    if self.method == 'spark':
        from pyspark import SparkContext
        from icing.externals.pypardis import dbscan as dbpard
        sc = SparkContext.getOrCreate()
        sample_weight_map = dict(zip(idxs, sample_weight))
        # self.dbscan_params.pop('n_jobs', None)
        dbscan = dbpard.DBSCAN(
            dbscan_params=self.dbscan_params,
            **self.dbspark_params)
    # else:

    for i, label in enumerate(np.unique(kmeans.labels_)):
        idx_row = np.where(kmeans.labels_ == label)[0]
        if self.verbose:
            print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                  "(%d seqs)" % idx_row.size, end='\r')

        X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
        weights = sample_weight[idx_row]

        if idx_row.size == 1:
            db_labels = np.array([0])
        elif self.method == 'spark' and idx_row.size > 5000:
            test_data = sc.parallelize(enumerate(X_idx))
            dbscan.train(test_data, sample_weight=sample_weight_map)
            db_labels = np.array(dbscan.assignments())[:, 1]
        elif self.method == 'hdbscan':
            db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
            # avoid noise samples
            soft_clusters = all_points_membership_vectors(dbscan_sk)
            db_labels = np.array([np.argmax(x) for x in soft_clusters])
        else:
            db_labels = dbscan_sk.fit_predict(X_idx, sample_weight=weights)
            if len(dbscan_sk.core_sample_indices_) < 1:
                db_labels[:] = 0
            if -1 in db_labels:
                balltree = BallTree(
                    X_idx[dbscan_sk.core_sample_indices_],
                    metric=dbscan_sk.metric)
                noise_labels = balltree.query(
                    X_idx[db_labels == -1], k=1, return_distance=False).ravel()
                # get labels for core points, then assign to noise points based
                # on balltree
                dbscan_noise_labels = db_labels[
                    dbscan_sk.core_sample_indices_][noise_labels]
                db_labels[db_labels == -1] = dbscan_noise_labels

        # hopefully, there are no noisy samples at this time
        db_labels[db_labels > -1] = db_labels[db_labels > -1] + np.max(dbscan_labels) + 1
        dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

    if self.method == 'spark':
        sc.stop()
    labels = dbscan_labels

    # new part: put together the labels
    labels_ext = np.zeros(X.shape[0], dtype=int)
    labels_ext[idxs] = labels
    for i, list_ in enumerate(groups_values):
        labels_ext[list_] = labels[i]
    self.labels_ = labels_ext
def __init__(self, java_model):
    self._sc = SparkContext.getOrCreate()
    self._java_model = java_model
def __init__(self, min, max, seed=0):
    ctx = SparkContext.getOrCreate()
    self.jvm = ctx._jvm
    self.rangeParam = self.jvm.com.microsoft.azure.synapse.ml.automl.HyperParamUtils.getRangeHyperParam(min, max, seed)
"""Script para configurar contexto en Spark.""" __author__ = 'leferrad' from pyspark import SparkContext, SparkConf import os if 'sc' not in globals(): appName = 'learninspy-app' if 'SPARK_MASTER_IP' not in os.environ.keys() and 'SPARK_MASTER_PORT' not in os.environ.keys(): master = 'local[*]' # default: local mode else: master = 'spark://'+os.environ['SPARK_MASTER_IP']+':'+os.environ['SPARK_MASTER_PORT'] # master defined extraJavaOptions = '-XX:+UseG1GC' conf = (SparkConf().setAppName(appName) .setMaster(master) .set('spark.ui.showConsoleProgress', False) # Para que no muestre el progreso de los Stages (comentar sino) .set('spark.driver.extraJavaOptions', '-XX:+UseG1GC') .set('spark.executor.extraJavaOptions', '-XX:+UseG1GC') .set('spark.executor.extraJavaOptions', '-XX:+UseCompressedOops') # Cuando se tiene menos de 32GB de RAM, punteros de 4 bytes en vez de 8 bytes .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") ) sc = SparkContext.getOrCreate(conf=conf) from learninspy.utils.fileio import get_logger logger = get_logger(name=__name__) logger.info("Contexto de Spark inicializado.")
def getLocalRdd():
    sc = SparkContext.getOrCreate()
    postrdd = sc.textFile("./example.xml")
    commentrdd = sc.textFile("./examplecomment.xml")
    return sc, postrdd, commentrdd
def streamToPowerBI(df, url, options=dict()):
    jvm = SparkContext.getOrCreate()._jvm
    writer = jvm.com.microsoft.ml.spark.PowerBIWriter
    return writer.stream(df.drop("label")._jdf, url, options)
def callMLlibFunc(name: str, *args: Any) -> Any:
    """Call API in PythonMLLibAPI"""
    sc = SparkContext.getOrCreate()
    assert sc._jvm is not None
    api = getattr(sc._jvm.PythonMLLibAPI(), name)
    return callJavaFunc(sc, api, *args)
def __init__(self, java_model: JavaObject):
    self._sc = SparkContext.getOrCreate()
    self._java_model = java_model
def callMLlibFunc(name, *args):
    """Call API in PythonMLLibAPI"""
    sc = SparkContext.getOrCreate()
    api = getattr(sc._jvm.PythonMLLibAPI(), name)
    return callJavaFunc(sc, api, *args)
with open(stopword_file, "r") as fp:
    lines = fp.readlines()
stopword_list = set()
for line in lines:
    stopword_list.add(line.strip())

exclude_set = {'(', '[', ',', '.', '!', '?', ':', ';', ']', ')'}

scf = SparkConf().setAppName("DSCI553").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=scf)

result = dict()
json_input_content = sc.textFile(input_file).map(lambda row: json.loads(row))
review_ids = json_input_content.map(lambda kv: kv['review_id'])
result['A'] = total_num_reviews(review_ids)
review_years = json_input_content.map(lambda kv: kv['date'])
result['B'] = number_of_reviews_y(review_years, year)
business_ids_rdd = json_input_content.map(lambda kv: kv['business_id'])
result['C'] = number_of_distinct_business(business_ids_rdd)
from pyspark import SparkContext
import json

sc = SparkContext.getOrCreate()

# Read data from HDFS storage
rdd = sc.textFile("hdfs://192.168.43.154:8020/xxxx.json")

# Check a time range
# 2 parameters, e.g.: check May 17 - May 20
# 3 parameters, e.g.: 'ancaman' (threat) status between May 17 - May 20
def cekRentang(waktux, waktuy, status=None, mean=False):
    def _cekRentang(data):
        dataJson = json.loads(data)
        temp = []
        count = 0
        avg = 0
        waktuxSplit = waktux.split("/")
        waktuySplit = waktuy.split("/")
        for i in dataJson:
            if status == "bahaya":
                if dataJson[str(i)]["status"] == status:
                    temp.append(dataJson[i])
                    # count += 1
            else:
                temp.append(dataJson[i])
        if mean == True:
            avg = float(len(temp)) / len(dataJson)
            return avg
def reponseQuestion1():
    # Create spark environment
    conf = SparkConf().setAppName("PySparkShell").setMaster("local[*]")
    sc = SparkContext.getOrCreate(conf)

    # Create connection
    cluster = Cluster(["localhost"])
    session = cluster.connect(KEYSPACE)

    # User input
    choix = input("""
    Voulez-vous donner
    1 : un nom de station
    2 : les coordonnees d une station ?
    Tapez 1 ou 2: """)

    if choix == '1':
        station = input("\nDonnez la station : ")
        query = "SELECT lat, lon FROM stations where station='{}' ALLOW FILTERING;".format(station)
        result = session.execute(query)
        if result.one():
            latitude = result.one()[0]
            longitude = result.one()[1]
        else:
            print("\n*** La station n'existe pas ! ***\n")
            return
    elif choix == '2':
        print("\nDonnez une localisation avec latitude et longitude.\n")
        latitude_input = float(input("Donnez la latitude : "))
        longitude_input = float(input("Donnez la longitude : "))
        # Find station from key (latitude, longitude)
        query = "SELECT lat, lon FROM stations;"
        result = session.execute(query)
        D = sc.parallelize(result)
        station = np.array(
            D.map(lambda data: (round((data[0] - latitude_input)**2 +
                                      (data[1] - longitude_input)**2, 4),
                                data[0], data[1])).distinct().collect())
        station_proche = station[np.where(station == min(station[:, 0]))[0], 1:3]
        latitude = station_proche[0, 0]
        longitude = station_proche[0, 1]
        station = session.execute(
            "SELECT station FROM stations WHERE lat={} AND lon={};".format(
                latitude, longitude)).one()[0]
    else:
        print("\n*** Mauvais choix ! ***\n")
        return

    year = int(input("\nDonnez l annee entre 2011 et 2014 : "))
    while year < 2011 or year > 2014:
        print("\nMauvaise annee !\n")
        year = int(input("\nDonnez l annee entre 2011 et 2014 : "))

    # Max temperature per day
    query = ("SELECT year,month,day,tmp FROM asos1 WHERE lat = {} and lon = {} "
             "AND year = {} ORDER BY year,month,day ALLOW FILTERING;").format(latitude, longitude, year)
    result = session.execute(query)
    D = sc.parallelize(result)
    daily_max_min_temp = D.map(lambda data: (
        (toYMD(data[0], data[1], data[2])), [data[3], data[3]])).reduceByKey(
        lambda a, b: [max(a[0], b[0]), min(a[1], b[1])]).map(
        lambda r: [r[0], round(r[1][0], 2), round(r[1][1], 2)]).collect()
    daily_max_min_temp = sorted(daily_max_min_temp, key=lambda x: x[0])
    daily_max_min_temp = np.array(daily_max_min_temp)

    # Average temperature per quarter
    query = "SELECT year,month,tmp FROM asos1 WHERE lat = {} AND lon = {} ORDER BY year,month;".format(latitude, longitude)
    result = session.execute(query)
    D = sc.parallelize(result)
    moyen_temperature = D.map(lambda data: [(data[0], math.ceil(data[1] / 3)),
                                            np.array([1, data[2]])]).reduceByKey(
        lambda a, b: a + b).map(
        lambda r: (r[0][0], r[0][1], round(r[1][1] / r[1][0], 2))).collect()
    moyen_temperature = sorted(moyen_temperature, key=lambda moy: moy[0:2])
    moyen_temperature = np.array(moyen_temperature)

    # Max-min temperature per month
    max_min_temp = D.map(
        lambda data: ((data[0], data[1]), [data[2], data[2]])).reduceByKey(
        lambda a, b: [max(a[0], b[0]), min(a[1], b[1])]).map(
        lambda r: [r[0][0], r[0][1], r[1][0], r[1][1]]).collect()
    max_min_temp = sorted(max_min_temp, key=lambda x: x[0:2])
    max_min_temp = np.array(max_min_temp)

    # Wind rose
    query = "SELECT month, drct FROM asos1 WHERE lat = {} AND lon = {};".format(latitude, longitude)
    result = session.execute(query)
    D = sc.parallelize(result)
    # wind_direction_frequency = D.map(lambda data: [math.ceil(data[0] / 45), 1]).reduceByKey(lambda a, b: a + b).map(lambda r: [r[0], r[1]]).collect()
    wind_direction_frequency = D.map(
        lambda data: [(math.ceil(data[0] / 3),
                       8 if data[1] == 0 else math.ceil(data[1] / 45)), 1]).reduceByKey(
        lambda a, b: a + b).map(lambda r: [r[0][0], r[0][1], r[1]]).collect()
    wind_direction_frequency = sorted(wind_direction_frequency, key=lambda x: x[0:2])
    wind_direction_frequency = np.array(wind_direction_frequency)

    # Temperature boxplot
    '''
    boxdata = []
    for i in range(1, 13):
        if i not in temperature[:, 0]:
            continue
        else:
            boxdata.append(temperature[temperature[:, 0] == i, 1])
    labels = range(1, 13)
    bplot = plt.boxplot(boxdata, patch_artist=True, labels=labels)
    plt.title('Température box plot')
    colors = ['dodgerblue', 'dodgerblue', 'dodgerblue', 'orange', 'orange', 'orange',
              'orangered', 'orangered', 'orangered', 'deepskyblue', 'deepskyblue', 'deepskyblue']
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)  # fill each boxplot with a different color
    plt.xlabel('Mois')
    plt.ylabel('Température')
    plt.savefig("images/Temperature_box_plot.png")
    '''

    # Plot for max temperature per day
    plotDailyMaxtemp(daily_max_min_temp, station, year)
    # Plot for average temperature per quarter
    plotTemperatureMoyenneMensuel(moyen_temperature, station)
    # Plot for max-min temperature per month
    plotTemperatureMaxMinTri(max_min_temp, station)
    # Plot for wind rose
    plotWindRose(wind_direction_frequency, station)

    print("""
    *** Courbes creees avec succes ! ***\n
    ==================\n""")
glue_db = args['glue_database']
s3_bkt = args['s3_bucket']
rawweather_tbl = args['rawweather_table']
cleanweather_tbl = args['cleanweather_table']
output_s3_path = "s3://{}/{}".format(s3_bkt, cleanweather_tbl)
logger.info({
    'glue_database': glue_db,
    's3_bucket': s3_bkt,
    'rawweather_table': rawweather_tbl,
    'output_s3_path': output_s3_path
})

spark = SparkSession(SparkContext.getOrCreate())
glue_ctx = GlueContext(SparkContext.getOrCreate())
raw_dyf = glue_ctx.create_dynamic_frame.from_catalog(database=glue_db,
                                                     table_name=rawweather_tbl)

def process_hourly(hours, key, fn):
    nums = []
    for hr in hours:
        if hr[key]:
            try:
                num = float(hr[key])
                if pd.notnull(num):
                    nums.append(num)
            except Exception as e:
                logger.error({
import pandas as pd
from databricks import koalas as ks
from pyspark.sql import SparkSession
from pyspark import SparkContext
import time
import logging

# hush Spark chatter
logger = SparkContext.getOrCreate()._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

print("Starting Script")
start_time = time.time()
path_to_file = '/usr/local/bin/breast_cancer_data.csv'

# Pandas
df = pd.read_csv(path_to_file)
# perform expensive operations
df = df.sample(frac=1)
execution_time = time.time() - start_time
print("Dataframe with Pandas:")
print(df)
print(f"Execution time was: {execution_time}")

start_time = time.time()
# Koalas on top of Spark df
df = ks.read_csv(path_to_file)
# perform expensive operations
df = df.sample(frac=float(1))