def create_VP_tables(self):
    print("Beginning the creation of VP tables.")
    total_properties = len(self.properties)
    i = 0
    # For each distinct property, create a vertical-partition table
    for p in self.properties:
        i += 1
        prop_df = self.sqlContext.sql(
            "SELECT s AS s, o AS o FROM tripletable WHERE p='" + p + "'")
        df_writer = DataFrameWriter(prop_df)
        df_writer.saveAsTable("VP_" + valid_string(p))
        sys.stdout.write("\rTables created: %d / %d " % (i, total_properties))
    # If statistics are enabled, compute them
    if self.statsEnabled:
        i = 0
        stat = Stats()
        for p in self.properties:
            i += 1
            tableDF = self.sqlContext.sql("SELECT * FROM VP_" + valid_string(p))
            stat.addTableStat(p, tableDF)
            sys.stdout.write("\rStatistics created: %d / %d " % (i, total_properties))
        with open(self.statsFile, "w") as f:
            f.write(stat.getSerializedStats())
        print("Statistics created: %d / %d " % (i, total_properties))
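# Hypothetical sketch of the valid_string helper used above (its real definition
# lives elsewhere in the project): replace every character that is not legal in a
# Hive/Spark table name with an underscore, so a predicate URI can serve as a
# table-name suffix.
import re

def valid_string(uri):
    # e.g. "http://xmlns.com/foaf/0.1/name" -> "http___xmlns_com_foaf_0_1_name"
    return re.sub(r'[^0-9a-zA-Z_]', '_', uri)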
def download_file_from_one_drive(URL, destination):
    import requests
    session = requests.Session()
    response = session.get(URL, stream=True)
    token = None
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            token = value
    if token:
        params = {'confirm': token}
        response = session.get(URL, params=params, stream=True)
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
def download_file_from_google_drive(id, destination):
    import requests
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = None
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            token = value
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
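# Minimal usage sketch for the helper above; the file id and destination path
# are placeholders, not values taken from the original code.
if __name__ == "__main__":
    download_file_from_google_drive("FILE_ID_PLACEHOLDER", "downloaded_file.bin")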
def get_game_list(user_id):
    base_url = 'http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/'
    keys = [key1, key2]
    curr_keyid = 0
    params = {
        'key': keys[curr_keyid],
        'steamid': user_id.strip(),
        'format': 'json'
    }
    games = None
    # Retry up to 3 times, switching to the other API key on failure
    for i in range(3):
        try:
            games = requests.get(base_url, params=params).json().get('response').get('games')
            break
        except:
            try:
                curr_keyid = 1 - curr_keyid
                params.update({'key': keys[curr_keyid]})
                games = requests.get(base_url, params=params).json().get('response').get('games')
                break
            except:
                time.sleep(5)
    if games and len(games) > 0:
        gamelist = []
        with open('userinfo', 'a') as f:
            for g in games:
                gamelist.append(g)
                g.update({'userid': int(user_id.strip())})
                f.write(json.dumps(g))
                f.write('\n')
                userid = int(user_id.strip())
                appid = g.get('appid')
                playtime_forever = g.get('playtime_forever')
                spark.sql("INSERT INTO userinfo VALUES (%s, %s, %s)" % (userid, appid, playtime_forever))
        return gamelist
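# Hedged alternative to the hand-built INSERT above: write the rows through a
# DataFrame instead of string-formatted SQL. Assumes the existing userinfo table
# has the columns (userid, appid, playtime_forever) in that order; the helper
# name is illustrative only.
def insert_games_df(spark, user_id, games):
    rows = [(int(user_id.strip()), g.get('appid'), g.get('playtime_forever'))
            for g in games]
    spark.createDataFrame(rows, ["userid", "appid", "playtime_forever"]) \
         .write.insertInto("userinfo")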
)

display(dfOutput)

# COMMAND ----------

import pyspark.sql.functions as f

# Concatenate all text belonging to the same file, separated by a space " "
dfOutputPerFile = dfOutput \
    .groupby(dfOutput.filename) \
    .agg(f.concat_ws(" ", f.collect_list(dfOutput.text)).alias("text"))

display(dfOutputPerFile)

# COMMAND ----------

import pathlib

outputDir = f"/dbfs/mnt/{dbfs_mount_name}/output"
pathlib.Path(outputDir).mkdir(parents=True, exist_ok=True)

# Write each file's concatenated text to the mounted output directory
for row in dfOutputPerFile.collect():
    with open(f"{outputDir}/{row.filename}", "w") as f:
        f.write(row.text)

# COMMAND ----------
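# Self-contained toy illustration of the groupby / collect_list / concat_ws
# pattern used above; the session and sample rows are assumptions for the demo,
# not the notebook's actual data.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("a.txt", "page one"), ("a.txt", "page two"), ("b.txt", "only page")],
    ["filename", "text"])

# Collapses to one row per filename; note that collect_list does not
# guarantee the order of the collected values.
toy.groupby("filename") \
   .agg(f.concat_ws(" ", f.collect_list("text")).alias("text")) \
   .show(truncate=False)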
def training(sparkSession, arguments, logger):
    # Get the input file path
    inputPath = arguments['--input']
    logger.info("...Starting training...")
    logger.info("Loading data from: {0}".format(inputPath))
    # Read the input dataset
    trainDF = sparkSession.read.parquet(inputPath)
    # Preprocess the data
    processData = preprocess(trainDF, logger)
    # Select the final columns used to train the algorithms
    processData = processData.select(processData.tdur.cast(FloatType()),
                                     processData.sport.cast(IntegerType()),
                                     processData.dport.cast(IntegerType()),
                                     processData.flag_onehot0,
                                     processData.flag_onehot1,
                                     processData.flag_onehot2,
                                     processData.flag_onehot3,
                                     processData.flag_onehot4,
                                     processData.flag_onehot5,
                                     processData.proto_onehot0,
                                     processData.proto_onehot1,
                                     processData.proto_onehot2,
                                     processData.proto_onehot3,
                                     processData.proto_onehot4,
                                     processData.ipkt.cast(FloatType()),
                                     processData.ibyt.cast(FloatType()),
                                     processData.opkt.cast(FloatType()),
                                     processData.obyt.cast(FloatType()),
                                     processData.nconnections.cast(IntegerType()))

    ## Normalize the data
    # Initialize a dictionary for the minimum and maximum values of each normalized feature
    min_max = {}
    min_max['inputFeatures'] = []
    [dataNor, min_max] = NormalizeValues(processData, arguments, min_max, logger)

    # Save the minimum-maximum JSON file in the local filesystem
    with open(minMaxFile, "w") as f:
        f.write(json.dumps(min_max))

    # Transform the Spark DataFrame into a pandas DataFrame
    dataNor = dataNor.toPandas()
    # Get the values to train the network
    dataTrain = dataNor.values

    # Select the training algorithm
    if arguments['OneClassSVM'] == True:
        nu = arguments['<nu>']
        kernel = arguments['<kernel>']
        alg = svm.OneClassSVM(kernel=kernel, nu=nu)
        algorithm = "OneClassSVM"
        logger.info("One Class SVM model")
    elif arguments['IsolationForest'] == True:
        estimators = arguments['<estimator>']
        contamination = arguments['<contam>']
        logger.info("Estimators: {0}; Contamination: {1}".format(estimators, contamination))
        alg = IsolationForest(n_estimators=estimators, contamination=contamination)
        algorithm = "IsolationForest"
        logger.info("Isolation Forest model")
    elif arguments['LocalOutlier'] == True:
        neighbors = arguments['<neigh>']
        contamination = arguments['<contam>']
        alg = LocalOutlierFactor(n_neighbors=neighbors, contamination=contamination)
        algorithm = "LocalOutlier"
        logger.info("Local Outlier Factor model")

    logger.info("Fitting the network")
    # Train the network with the data
    alg.fit(dataTrain)
    logger.info("Algorithm has been trained.")

    algFile = algorithm + "_network.plk"
    # Serialize the trained network to a local file
    joblib.dump(alg, algFile)

    # Get the path to save the results
    hdfsTrainDir = arguments['--output']
    logger.info("Saving results to {0}".format(hdfsTrainDir))

    # Get HDFS structures through the JVM gateway
    path = sparkSession.sparkContext._gateway.jvm.org.apache.hadoop.fs.Path
    fileSystem = sparkSession.sparkContext._gateway.jvm.org.apache.hadoop.fs.FileSystem
    hadoopConfiguration = sparkSession.sparkContext._gateway.jvm.org.apache.hadoop.conf.Configuration
    fs = fileSystem.get(hadoopConfiguration())

    hdfsAlgPath = hdfsTrainDir + "/algorithms/" + algorithm + "_network.plk"
    hdfsMinMaxPath = hdfsTrainDir + "/" + minMaxFile

    if fs.exists(path(hdfsAlgPath)):
        logger.warn("A trained network for the {0} algorithm already exists in {1}".format(algorithm, hdfsTrainDir))
        logger.warn("The file is going to be overwritten.")
        try:
            fs.delete(path(hdfsAlgPath), False)
        except:
            logger.error("Couldn't delete the network file")

    if fs.exists(path(hdfsMinMaxPath)):
        logger.warn("A file with the minimum and maximum values already exists in {0}".format(hdfsTrainDir))
        logger.warn("The file is going to be overwritten.")
        try:
            fs.delete(path(hdfsMinMaxPath), False)
        except:
            logger.error("Couldn't delete the minimum and maximum file")

    try:
        srcAlgFile = path(algFile)
        dstAlgFile = path(hdfsAlgPath)
        fs.moveFromLocalFile(srcAlgFile, dstAlgFile)
        logger.info("Training model exported correctly.")
    except:
        logger.error("Couldn't save the network in the file system.")

    try:
        srcMinMaxFile = path(minMaxFile)
        dstMinMaxFile = path(hdfsMinMaxPath)
        fs.moveFromLocalFile(srcMinMaxFile, dstMinMaxFile)
        logger.info("Minimum and maximum features file exported correctly.")
    except:
        logger.error("Couldn't save the minimum and maximum file in the file system.")

    logger.info("..Training has finished..")
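# Hedged companion sketch (not part of the original job): loading the artifacts
# that training() exports, i.e. the joblib-serialized estimator and the JSON
# file with the per-feature minimum/maximum values used for normalization.
import json
import joblib

def load_trained_model(alg_path, min_max_path):
    alg = joblib.load(alg_path)
    with open(min_max_path) as f:
        min_max = json.load(f)
    return alg, min_max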
g = GraphFrame(vertices_df, edges_df)
result = g.labelPropagation(maxIter=5)

# Get result and sort
result_rdd = result.select("id", "label").rdd
ans = result_rdd \
    .map(lambda line: (line.label, line.id)) \
    .groupByKey() \
    .map(lambda line: (len(line[1]), [str(i) for i in line[1]])) \
    .groupByKey() \
    .flatMap(lambda line: sorted([sorted(i) for i in line[1]], key=lambda x: x[0])) \
    .collect()

# Output as txt file
with open(community_output_file_path, 'w') as f:
    for line in ans:
        for i in range(len(line)):
            user = line[i]
            if i != len(line) - 1:
                f.write('\'' + str(user) + '\', ')
            else:
                f.write('\'' + str(user) + '\'')
        f.write('\n')

# Finish time
timer(start)

# spark-submit --packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 task1.py 7 ub_sample_data.csv task1_ans
# spark-submit --packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 task1.py 7 $ASNLIB/publicdata/ub_sample_data.csv task1_ans
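# The snippet above assumes vertices_df and edges_df already exist. For reference,
# GraphFrames expects a vertex DataFrame with an "id" column and an edge DataFrame
# with "src"/"dst" columns; the toy rows below are illustrative only.
from pyspark.sql import SparkSession
from graphframes import GraphFrame

spark = SparkSession.builder.getOrCreate()
vertices_df = spark.createDataFrame([("u1",), ("u2",), ("u3",)], ["id"])
edges_df = spark.createDataFrame([("u1", "u2"), ("u2", "u3")], ["src", "dst"])
g = GraphFrame(vertices_df, edges_df)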
userdf = spark.createDataFrame(nodes, StringType()).selectExpr("value as id")
# print(filterPhaseA.count())
# print(nodes.count())

g = GraphFrame(userdf, edgedf)
result = g.labelPropagation(maxIter=5)

community = result.groupby("label").agg(f.collect_list("id").alias("id"))
fin = community.select("id").rdd.flatMap(lambda x: x).collect()

li = []
for i in fin:
    li.append(sorted(i))
# print(li)

# Distinct community sizes in ascending order
reslen = sorted(list(set(len(x) for x in li)))

# Write the communities grouped by size, each group sorted lexicographically
with open(sys.argv[3], "w") as f:
    for i in reslen:
        can = []
        for j in li:
            if len(j) == i:
                can.append(j)
        can = sorted(can)
        for m in can:
            f.write(str(m).replace("[", "").replace("]", ""))
            f.write("\n")

end = time.time()
print("Duration", end - start)
# Lat and Lon for Las Vegas city
lat = 36.127430
lon = -115.138460

# Limit the Lat and Lon for easy visualization of results
lon_min, lon_max = lon - 0.3, lon + 0.5
lat_min, lat_max = lat - 0.4, lat + 0.5

# Filter only the Las Vegas records
yelp_LV = yelp_b.select(
    "city",
    yelp_b.latitude.cast("double"),
    yelp_b.longitude.cast("double"),
    yelp_b.stars.cast("double")).filter(yelp_b.city == "Las Vegas")

# Keep only records that fall within the configured Lat and Lon limits
yelp_LV_PlotInfo = yelp_LV.withColumn(
    'plotOnMap',
    (yelp_LV.latitude > lat_min) & (yelp_LV.latitude < lat_max) &
    (yelp_LV.longitude > lon_min) & (yelp_LV.longitude < lon_max))
yelp_LV_PlotInfo_True = yelp_LV_PlotInfo.filter(
    yelp_LV_PlotInfo.plotOnMap == "true")

# Generate the output in the Folium mapping input format:
# one list of [lat, lon] pairs per distinct star rating
distinct_stars = yelp_b.select(yelp_b.stars).distinct()
distinct_stars_list = [x.stars for x in distinct_stars.select(distinct_stars.stars).collect()]

data = []
for star in distinct_stars_list:
    subset = yelp_LV_PlotInfo_True.filter(yelp_LV_PlotInfo_True.stars == star)
    data.append([[x.latitude, x.longitude]
                 for x in subset.select(subset.latitude, subset.longitude).collect()])

# Write the data to the output file
with open("VegasHeatMapData.txt", "w") as f:
    f.write(str(data))
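# Hedged follow-up (folium is an assumption here, not used by the job above):
# render the serialized list-of-lists as a heat map layer.
import ast
import folium
from folium.plugins import HeatMap

with open("VegasHeatMapData.txt") as f:
    data = ast.literal_eval(f.read())

m = folium.Map(location=[36.127430, -115.138460], zoom_start=11)
# Flatten the per-star-rating lists into a single layer of [lat, lon] points
HeatMap([point for star_group in data for point in star_group]).add_to(m)
m.save("VegasHeatMap.html")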