from pyspark import StorageLevel
from pyspark.shell import sc
from pyspark.sql import Row
from pyspark.sql.functions import lit, split


def iqiyi2tag(data_path, iqiyi_tags):
    # Each input line is comma-separated; the first field is the movie name.
    iqiyi2tag_df = sc.textFile(data_path).map(lambda x: Row(iqiyi=x)).toDF()
    iqiyi2tag_df = iqiyi2tag_df.withColumn(
        'movie', split(iqiyi2tag_df['iqiyi'], ',').getItem(0))
    # One column per caller-supplied tag, taken from the remaining fields.
    for i in range(len(iqiyi_tags)):
        iqiyi2tag_df = iqiyi2tag_df.withColumn(
            iqiyi_tags[i], split(iqiyi2tag_df['iqiyi'], ',').getItem(i + 1))
    iqiyi2tag_df = iqiyi2tag_df.drop('iqiyi')
    # Collect just the movie names into a plain Python list.
    iqy_movies = sc.textFile(data_path).map(
        lambda x: Row(iqiyi=x.split(",")[0])).collect()
    iqy = [item[0] for item in iqy_movies]
    # Fixed category columns (beauty, parenting, movies, comedy, health,
    # education, music, news), initialized to 0 and persisted to disk only.
    tag_list = ['美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯']
    for tag in tag_list:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(tag, lit(0)).persist(
            StorageLevel.DISK_ONLY)
    return iqiyi2tag_df, iqy

def iqiyi2tag(data_path, iqiyi_tags):
    '''
    Read iqiyi movies and their tags.
    :param data_path: path of the comma-separated movie/tag file
    :param iqiyi_tags: names for the tag columns
    :return: (DataFrame of movie + tag columns, list of movie names)
    '''
    iqiyi2tag_df = sc.textFile(data_path).map(lambda x: Row(iqiyi=x)).toDF()
    iqiyi2tag_df = iqiyi2tag_df.withColumn(
        'iqytag', split(iqiyi2tag_df['iqiyi'], ',').getItem(0))
    for i in range(len(iqiyi_tags)):
        iqiyi2tag_df = iqiyi2tag_df.withColumn(
            iqiyi_tags[i], split(iqiyi2tag_df['iqiyi'], ',').getItem(i + 1))
    iqiyi2tag_df = iqiyi2tag_df.drop('iqiyi')
    iqy_movies = sc.textFile(data_path).map(
        lambda x: Row(iqiyi=x.split(",")[0])).collect()
    iqy = [item[0] for item in iqy_movies]
    return iqiyi2tag_df, iqy

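# A minimal, hypothetical invocation of iqiyi2tag above; the HDFS path and
# the tag names below are placeholders, not values from the original project.
iqiyi_tags = ['tagA', 'tagB', 'tagC']
tag_df, movies = iqiyi2tag('hdfs:///test_data/iqiyi.csv', iqiyi_tags)
tag_df.show(5)
print(movies[:10])
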
from pyspark.shell import sc

from Spark.settings import HDFS_HOST_ADDR, HDFS_HOST_PORT

if __name__ == '__main__':
    text_file = sc.textFile('hdfs://{}:{}{}'.format(
        HDFS_HOST_ADDR, HDFS_HOST_PORT, '/test_data/sparktest.txt'))
    filter_RDD = text_file.filter(lambda line: 'spark' in line)
    filter_RDD.cache()
    counter = filter_RDD.count()
    print('\n\n\n\n\ncounter: ', counter)
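    # Illustrative follow-up (not in the original script): because filter_RDD
    # is cached, a second action like this one reuses the in-memory filtered
    # lines instead of re-reading the file from HDFS.
    print(filter_RDD.take(5))
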
import time

from pyspark.shell import spark, sc

testFile = "/mnt/sda/Spark/spark-3.0.1-bin-hadoop3.2/RDDTESTFILE.TXT"
print("Test input file = ", testFile)


def printfunc(x):
    print('Word {} occurs {} '.format(x[0], x[1]))


# Initial RDD (rdd_0) creation using API: pyspark.SparkContext.textFile
rdd_00 = sc.textFile(testFile)
rdd_0 = rdd_00.repartition(8)

# pyspark.RDD.flatMap: return a new RDD (rdd_1) by applying a function to
# all elements of rdd_0 and then flattening the results
rdd_1 = rdd_0.flatMap(lambda x: x.split())

# pyspark.RDD.map: return a new RDD (rdd_2) by applying a function to each
# element of rdd_1
rdd_2 = rdd_1.map(lambda x: (x, 1))

# pyspark.RDD.reduceByKey: merge the values for each key using an
# associative and commutative reduce function
rdd_3 = rdd_2.reduceByKey(lambda x, y: x + y)

# pyspark.RDD.toDebugString: a description of this RDD and its recursive
# dependencies, for debugging
print(rdd_3.toDebugString())

# pyspark.RDD.foreach: applies a function to all elements of this RDD
rdd_3.foreach(printfunc)

# Keep the driver alive (presumably so the Spark web UI stays inspectable).
time.sleep(1000)
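# An illustrative extension (not in the original file): sort the counts in
# rdd_3 descending and take the ten most frequent words.
top_ten = rdd_3.sortBy(lambda kv: kv[1], ascending=False).take(10)
print(top_ten)
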
import sys

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.shell import sc

if __name__ == "__main__":
    input_file = sys.argv[1]
    num_features = int(sys.argv[2])

    # load input file; lines are "<label>\t<body>", label 1 = spam, 0 = ham
    print("Loading input file %s ..." % input_file)
    emails = sc.textFile(input_file)
    print("\tTotal number of emails: %i" % emails.count())
    spam_rdd = emails.filter(lambda x: int(x.split("\t")[0]) == 1)
    spam = spam_rdd.map(lambda x: x.split("\t")[1])
    ham_rdd = emails.filter(lambda x: int(x.split("\t")[0]) == 0)
    ham = ham_rdd.map(lambda x: x.split("\t")[1])
    print("\tTotal number of spam emails: %i" % spam.count())
    print("\tTotal number of ham emails: %i" % ham.count())

    # hash words
    print("Hashing words into features ...")
    tf = HashingTF(numFeatures=num_features)
    spam_features = spam.map(lambda email: tf.transform(email.split(" ")))
    ham_features = ham.map(lambda email: tf.transform(email.split(" ")))

    # label the features
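    # A hedged sketch of the steps the snippet stops short of (not from the
    # original file): attach labels to the hashed features and train an
    # MLlib NaiveBayes model, following the standard spam/ham pattern.
    positive = spam_features.map(lambda features: LabeledPoint(1, features))
    negative = ham_features.map(lambda features: LabeledPoint(0, features))
    training_data = positive.union(negative).cache()
    model = NaiveBayes.train(training_data, 1.0)
    # Illustrative prediction on a hypothetical message.
    print(model.predict(tf.transform("free money now".split(" "))))
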
import datetime
import json
import os
import re
import time

from requests_oauthlib import OAuth1Session

# settings, SEARCH_KEYWORD, SEARCH_URL, getLimitStatus, Trends, and sc are
# defined elsewhere in the original project.


def tweet_search_job():
    session = OAuth1Session(settings.TWITTER['CONSUMER_KEY'],
                            settings.TWITTER['CONSUMER_SECRET'],
                            settings.TWITTER['ACCESS_TOKEN'],
                            settings.TWITTER['ACCESS_TOKEN_SECRET'])
    yesterday = datetime.date.today() - datetime.timedelta(1)
    params = {
        'q': SEARCH_KEYWORD,
        'count': 200,
        'lang': 'ja',
        'until': yesterday.strftime("%Y-%m-%d")
    }
    if os.path.isfile('tweets.txt'):
        os.remove('tweets.txt')
    f = open('tweets.txt', 'w')
    try:
        error_count = 0
        too_many_requests_count = 0
        while True:
            response = session.get(SEARCH_URL, params=params)
            if response.status_code == 503:
                if error_count > 5:
                    raise Exception("Terminating: error limit reached")
                error_count += 1
                time.sleep(30)
                continue
            if response.status_code == 429:
                if too_many_requests_count >= 7:
                    break
                too_many_requests_count += 1
                sec = int(
                    response.headers['X-Rate-Limit-Reset']) - time.mktime(
                        datetime.datetime.now().timetuple())
                print("{0}---{1} sec sleep".format(
                    datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
                    sec))
                time.sleep(sec + 5)
                continue
            if response.status_code != 200:
                print("Twitter API Error: %d" % response.status_code)
                print("Twitter API Error: {}".format(response.text))
                time.sleep(30)
                continue
            reset = 0
            limit = response.headers.get('X-Rate-Limit-Remaining', None)
            if limit is None:
                limit, reset = getLimitStatus(session)
            # The header value is a string, so coerce before comparing.
            if int(limit) == 0:
                sec = int(response.headers.get(
                    'X-Rate-Limit-Reset', reset)) - time.mktime(
                        datetime.datetime.now().timetuple())
                time.sleep(sec + 5)
                continue
            error_count = 0
            res_text = json.loads(response.text)
            # Stop paging when the search returns no more tweets.
            if len(res_text['statuses']) == 0:
                break
            max_id = ""
            for tweet in res_text['statuses']:
                # Skip giveaway/bot/retweet boilerplate tweets.
                match = re.search(r'(全員|ふぁぼ|ファボ|定期|相互|RT)',
                                  tweet['text'], re.MULTILINE)
                if match is None:
                    f.write(tweet['text'] + '\n')
                max_id = tweet['id']
            params['max_id'] = max_id
    finally:
        f.close()
    try:
        textfile = sc.textFile("tweets.txt")
        print(textfile.count())
        words = textfile.flatMap(lambda line: line.split())
        words_filter = words.filter(lambda x: SEARCH_KEYWORD not in x)
        words_filter = words_filter.filter(lambda x: "#" not in x)
        words_filter = words_filter.filter(lambda x: len(x) >= 2)
        words_tuple = words_filter.map(lambda word: (word, 1))
        words_count = words_tuple.reduceByKey(lambda a, b: a + b)
        words_count_sorted = words_count.sortBy(lambda t: t[1], False)
        top10 = words_count_sorted.collect()[:10]
        print(top10)
        for ranking in top10:
            trends = Trends(target_date=yesterday.strftime("%Y-%m-%d"),
                            word=ranking[0],
                            count=ranking[1])
            trends.save()
    finally:
        os.remove('tweets.txt')
from pyspark.shell import sc

txt_file = sc.textFile("README.md")
# Unpack the first line into individual characters.
print(*txt_file.collect()[0])
s = txt_file.collect()
print(type(s))
# Broadcast the collected lines: the driver ships small data (e.g., a user
# list) to every executor once, and executors read it only.
broadcastVar = sc.broadcast(s)
print(broadcastVar.value)
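# A minimal sketch of the usual broadcast pattern (not in the original
# snippet): ship a small lookup structure to every executor once, then
# reference it read-only inside tasks. The user set below is hypothetical.
user_set = sc.broadcast({'alice', 'bob'})
matches = txt_file.filter(lambda line: any(u in line for u in user_set.value))
print(matches.count())
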
import sys

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("PythonWordCount")\
    .getOrCreate()
sc = spark.sparkContext

# input part
importPath = sys.argv[1]
supportRate = float(sys.argv[2])
outputPath = sys.argv[3]
chunk = 2

# lines = spark.read.text(importPath).rdd.map(lambda r: r)
lines = sc.textFile(importPath).map(lambda r: r)
totalNum = len(lines.collect())
originallines = lines
ratio = supportRate

# Stage 1: aprioriF and agg are defined elsewhere in the original script;
# presumably each partition mines its local candidate itemsets and the
# results are then merged with reduce.
lines = lines.mapPartitions(aprioriF, 2)
lines = lines.reduce(agg)
sys.path.append("/home/spark/python/lib") sys.path.append("/opt/spark/python/lib/py4j-0.10.8.1-src") path = "/home/hpinto/Desktop/mySpark-data/DimCurrency.csv" read_file = open(path, 'r') days = read_file.read() print(days) pathw = "/home/hpinto/Desktop/mySpark-data/DimnewCurrency.csv" wr_file = open(pathw, 'a+') wr = wr_file.write(days) print(wr) #Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set. pairs = map(lambda s: (s, 1), days) counts = reduce(lambda a, b: a + b, days) print(counts) lines = sc.textFile("/home/hpinto/Desktop/mySpark-data/DimAccounts.csv" ) # Distribute the data - Create a RDD countX = ( lines.flatMap(lambda x: x.split(' ')) # Create a list with all words .map(lambda x: (x, 1)) # Create tuple (word,1) .reduceByKey(lambda x, y: x + y)) # reduce by key i.e. the word #output = countX.take(100) # get the output on local x1 = countX.take(100) for (word, count) in x1: # print output print("%s: %i" % (word, count))
# -*- coding:utf-8 -*-
"""
2019/4/15 11:04 by young
"""
from pyspark.shell import sc

lines = sc.textFile('./data.txt')
s = lines.count()
print(s)
from pyspark.shell import sc
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
data = sc.textFile("data/mllib/sample_lda_data.txt")
parsedData = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count
# vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model. The original snippet is cut off mid-call; the target
# path below is a placeholder, not the original value.
ldaModel.save(sc, "myLDAModel")
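# The matching load step from the standard MLlib pattern, assuming the same
# placeholder path used above.
sameModel = LDAModel.load(sc, "myLDAModel")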