from pyspark import StorageLevel
from pyspark.shell import sc
from pyspark.sql import Row
from pyspark.sql.functions import lit, split


def iqiyi2tag(data_path, iqiyi_tags):
    # Each input line is comma-separated; the first field is the movie name.
    iqiyi2tag_df = sc.textFile(data_path).map(lambda x: Row(iqiyi=x)).toDF()
    iqiyi2tag_df = iqiyi2tag_df.withColumn(
        'movie', split(iqiyi2tag_df['iqiyi'], ',').getItem(0))
    # One column per caller-supplied tag, taken from the remaining fields.
    for i in range(len(iqiyi_tags)):
        iqiyi2tag_df = iqiyi2tag_df.withColumn(
            iqiyi_tags[i], split(iqiyi2tag_df['iqiyi'], ',').getItem(i + 1))
    iqiyi2tag_df = iqiyi2tag_df.drop('iqiyi')
    # Collect just the movie names into a plain Python list.
    iqy_movies = sc.textFile(data_path).map(
        lambda x: Row(iqiyi=x.split(",")[0])).collect()
    iqy = [item[0] for item in iqy_movies]
    # Fixed category columns (beauty, parenting, movies, comedy, health,
    # education, music, news), initialized to 0 and persisted to disk only.
    tag_list = ['美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯']
    for tag in tag_list:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(tag, lit(0)).persist(
            StorageLevel.DISK_ONLY)
    return iqiyi2tag_df, iqy

def iqiyi2tag(data_path, iqiyi_tags):
    '''
    Read iqiyi movies and their tags.
    :param data_path: path of the comma-separated movie/tag file
    :param iqiyi_tags: names for the tag columns
    :return: (DataFrame of movie + tag columns, list of movie names)
    '''
    iqiyi2tag_df = sc.textFile(data_path).map(lambda x: Row(iqiyi=x)).toDF()
    iqiyi2tag_df = iqiyi2tag_df.withColumn(
        'iqytag', split(iqiyi2tag_df['iqiyi'], ',').getItem(0))
    for i in range(len(iqiyi_tags)):
        iqiyi2tag_df = iqiyi2tag_df.withColumn(
            iqiyi_tags[i], split(iqiyi2tag_df['iqiyi'], ',').getItem(i + 1))
    iqiyi2tag_df = iqiyi2tag_df.drop('iqiyi')
    iqy_movies = sc.textFile(data_path).map(
        lambda x: Row(iqiyi=x.split(",")[0])).collect()
    iqy = [item[0] for item in iqy_movies]
    return iqiyi2tag_df, iqy

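# A minimal, hypothetical invocation of iqiyi2tag above; the HDFS path and
# the tag names below are placeholders, not values from the original project.
iqiyi_tags = ['tagA', 'tagB', 'tagC']
tag_df, movies = iqiyi2tag('hdfs:///test_data/iqiyi.csv', iqiyi_tags)
tag_df.show(5)
print(movies[:10])
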
from pyspark.shell import sc

from Spark.settings import HDFS_HOST_ADDR, HDFS_HOST_PORT

if __name__ == '__main__':
    text_file = sc.textFile('hdfs://{}:{}{}'.format(
        HDFS_HOST_ADDR, HDFS_HOST_PORT, '/test_data/sparktest.txt'))
    filter_RDD = text_file.filter(lambda line: 'spark' in line)
    filter_RDD.cache()
    counter = filter_RDD.count()
    print('\n\n\n\n\ncounter: ', counter)
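    # Illustrative follow-up (not in the original script): because filter_RDD
    # is cached, a second action like this one reuses the in-memory filtered
    # lines instead of re-reading the file from HDFS.
    print(filter_RDD.take(5))
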
import time

from pyspark.shell import spark, sc

testFile = "/mnt/sda/Spark/spark-3.0.1-bin-hadoop3.2/RDDTESTFILE.TXT"
print("Test input file = ", testFile)


def printfunc(x):
    print('Word {} occurs {} '.format(x[0], x[1]))


# Initial RDD (rdd_0) creation using API: pyspark.SparkContext.textFile
rdd_00 = sc.textFile(testFile)
rdd_0 = rdd_00.repartition(8)

# pyspark.RDD.flatMap: return a new RDD (rdd_1) by applying a function to
# all elements of rdd_0 and then flattening the results
rdd_1 = rdd_0.flatMap(lambda x: x.split())

# pyspark.RDD.map: return a new RDD (rdd_2) by applying a function to each
# element of rdd_1
rdd_2 = rdd_1.map(lambda x: (x, 1))

# pyspark.RDD.reduceByKey: merge the values for each key using an
# associative and commutative reduce function
rdd_3 = rdd_2.reduceByKey(lambda x, y: x + y)

# pyspark.RDD.toDebugString: a description of this RDD and its recursive
# dependencies, for debugging
print(rdd_3.toDebugString())

# pyspark.RDD.foreach: applies a function to all elements of this RDD
rdd_3.foreach(printfunc)

# Keep the driver alive (presumably so the Spark web UI stays inspectable).
time.sleep(1000)
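# An illustrative extension (not in the original file): sort the counts in
# rdd_3 descending and take the ten most frequent words.
top_ten = rdd_3.sortBy(lambda kv: kv[1], ascending=False).take(10)
print(top_ten)
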
import sys

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.shell import sc

if __name__ == "__main__":
    input_file = sys.argv[1]
    num_features = int(sys.argv[2])

    # load input file; lines are "<label>\t<body>", label 1 = spam, 0 = ham
    print("Loading input file %s ..." % input_file)
    emails = sc.textFile(input_file)
    print("\tTotal number of emails: %i" % emails.count())
    spam_rdd = emails.filter(lambda x: int(x.split("\t")[0]) == 1)
    spam = spam_rdd.map(lambda x: x.split("\t")[1])
    ham_rdd = emails.filter(lambda x: int(x.split("\t")[0]) == 0)
    ham = ham_rdd.map(lambda x: x.split("\t")[1])
    print("\tTotal number of spam emails: %i" % spam.count())
    print("\tTotal number of ham emails: %i" % ham.count())

    # hash words
    print("Hashing words into features ...")
    tf = HashingTF(numFeatures=num_features)
    spam_features = spam.map(lambda email: tf.transform(email.split(" ")))
    ham_features = ham.map(lambda email: tf.transform(email.split(" ")))

    # label the features
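    # A hedged sketch of the steps the snippet stops short of (not from the
    # original file): attach labels to the hashed features and train an
    # MLlib NaiveBayes model, following the standard spam/ham pattern.
    positive = spam_features.map(lambda features: LabeledPoint(1, features))
    negative = ham_features.map(lambda features: LabeledPoint(0, features))
    training_data = positive.union(negative).cache()
    model = NaiveBayes.train(training_data, 1.0)
    # Illustrative prediction on a hypothetical message.
    print(model.predict(tf.transform("free money now".split(" "))))
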
import datetime
import json
import os
import re
import time

from requests_oauthlib import OAuth1Session

# settings, SEARCH_KEYWORD, SEARCH_URL, getLimitStatus, Trends, and sc are
# defined elsewhere in the original project.


def tweet_search_job():
    session = OAuth1Session(settings.TWITTER['CONSUMER_KEY'],
                            settings.TWITTER['CONSUMER_SECRET'],
                            settings.TWITTER['ACCESS_TOKEN'],
                            settings.TWITTER['ACCESS_TOKEN_SECRET'])
    yesterday = datetime.date.today() - datetime.timedelta(1)
    params = {
        'q': SEARCH_KEYWORD,
        'count': 200,
        'lang': 'ja',
        'until': yesterday.strftime("%Y-%m-%d")
    }
    if os.path.isfile('tweets.txt'):
        os.remove('tweets.txt')
    f = open('tweets.txt', 'w')
    try:
        error_count = 0
        too_many_requests_count = 0
        while True:
            response = session.get(SEARCH_URL, params=params)
            if response.status_code == 503:
                if error_count > 5:
                    raise Exception("Terminating: error limit reached")
                error_count += 1
                time.sleep(30)
                continue
            if response.status_code == 429:
                if too_many_requests_count >= 7:
                    break
                too_many_requests_count += 1
                sec = int(
                    response.headers['X-Rate-Limit-Reset']) - time.mktime(
                        datetime.datetime.now().timetuple())
                print("{0}---{1} sec sleep".format(
                    datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
                    sec))
                time.sleep(sec + 5)
                continue
            if response.status_code != 200:
                print("Twitter API Error: %d" % response.status_code)
                print("Twitter API Error: {}".format(response.text))
                time.sleep(30)
                continue
            reset = 0
            limit = response.headers.get('X-Rate-Limit-Remaining', None)
            if limit is None:
                limit, reset = getLimitStatus(session)
            # The header value is a string, so coerce before comparing.
            if int(limit) == 0:
                sec = int(response.headers.get(
                    'X-Rate-Limit-Reset', reset)) - time.mktime(
                        datetime.datetime.now().timetuple())
                time.sleep(sec + 5)
                continue
            error_count = 0
            res_text = json.loads(response.text)
            # Stop paging when the search returns no more tweets.
            if len(res_text['statuses']) == 0:
                break
            max_id = ""
            for tweet in res_text['statuses']:
                # Skip giveaway/bot/retweet boilerplate tweets.
                match = re.search(r'(全員|ふぁぼ|ファボ|定期|相互|RT)',
                                  tweet['text'], re.MULTILINE)
                if match is None:
                    f.write(tweet['text'] + '\n')
                max_id = tweet['id']
            params['max_id'] = max_id
    finally:
        f.close()
    try:
        textfile = sc.textFile("tweets.txt")
        print(textfile.count())
        words = textfile.flatMap(lambda line: line.split())
        words_filter = words.filter(lambda x: SEARCH_KEYWORD not in x)
        words_filter = words_filter.filter(lambda x: "#" not in x)
        words_filter = words_filter.filter(lambda x: len(x) >= 2)
        words_tuple = words_filter.map(lambda word: (word, 1))
        words_count = words_tuple.reduceByKey(lambda a, b: a + b)
        words_count_sorted = words_count.sortBy(lambda t: t[1], False)
        top10 = words_count_sorted.collect()[:10]
        print(top10)
        for ranking in top10:
            trends = Trends(target_date=yesterday.strftime("%Y-%m-%d"),
                            word=ranking[0],
                            count=ranking[1])
            trends.save()
    finally:
        os.remove('tweets.txt')
from pyspark.shell import sc

txt_file = sc.textFile("README.md")
# Unpack the first line into individual characters.
print(*txt_file.collect()[0])
s = txt_file.collect()
print(type(s))
# Broadcast the collected lines: the driver ships small data (e.g., a user
# list) to every executor once, and executors read it only.
broadcastVar = sc.broadcast(s)
print(broadcastVar.value)
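# A minimal sketch of the usual broadcast pattern (not in the original
# snippet): ship a small lookup structure to every executor once, then
# reference it read-only inside tasks. The user set below is hypothetical.
user_set = sc.broadcast({'alice', 'bob'})
matches = txt_file.filter(lambda line: any(u in line for u in user_set.value))
print(matches.count())
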
import sys

from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("PythonWordCount")\
    .getOrCreate()
sc = spark.sparkContext

# input part
importPath = sys.argv[1]
supportRate = float(sys.argv[2])
outputPath = sys.argv[3]
chunk = 2

# lines = spark.read.text(importPath).rdd.map(lambda r: r)
lines = sc.textFile(importPath).map(lambda r: r)
totalNum = len(lines.collect())
originallines = lines
ratio = supportRate

# Stage 1: aprioriF and agg are defined elsewhere in the original script;
# presumably each partition mines its local candidate itemsets and the
# results are then merged with reduce.
lines = lines.mapPartitions(aprioriF, 2)
lines = lines.reduce(agg)
sys.path.append("/home/spark/python/lib") sys.path.append("/opt/spark/python/lib/py4j-0.10.8.1-src") path = "/home/hpinto/Desktop/mySpark-data/DimCurrency.csv" read_file = open(path, 'r') days = read_file.read() print(days) pathw = "/home/hpinto/Desktop/mySpark-data/DimnewCurrency.csv" wr_file = open(pathw, 'a+') wr = wr_file.write(days) print(wr) #Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set. pairs = map(lambda s: (s, 1), days) counts = reduce(lambda a, b: a + b, days) print(counts) lines = sc.textFile("/home/hpinto/Desktop/mySpark-data/DimAccounts.csv" ) # Distribute the data - Create a RDD countX = ( lines.flatMap(lambda x: x.split(' ')) # Create a list with all words .map(lambda x: (x, 1)) # Create tuple (word,1) .reduceByKey(lambda x, y: x + y)) # reduce by key i.e. the word #output = countX.take(100) # get the output on local x1 = countX.take(100) for (word, count) in x1: # print output print("%s: %i" % (word, count))
# -*- coding:utf-8 -*-
"""
2019/4/15 11:04 by young
"""
from pyspark.shell import sc

lines = sc.textFile('./data.txt')
s = lines.count()
print(s)
from pyspark.shell import sc
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
data = sc.textFile("data/mllib/sample_lda_data.txt")
parsedData = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count
# vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model. The original snippet is cut off mid-call; the target
# path below is a placeholder, not the original value.
ldaModel.save(sc, "myLDAModel")
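# The matching load step from the standard MLlib pattern, assuming the same
# placeholder path used above.
sameModel = LDAModel.load(sc, "myLDAModel")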