Example #1
def main(*argv):

    #Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift1")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId",aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey",aws_secret_access_key)

    #Read from s3
    for i in range(1,13):

        # TextInputFormat yields (LongWritable byte offset, Text line) pairs
        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',
                                 'org.apache.hadoop.mapred.TextInputFormat',
                                 'org.apache.hadoop.io.LongWritable',
                                 'org.apache.hadoop.io.Text')

        #Call user defined function to map raw data into key-value pairs 
        new_data = raw_data.map(lambda x:splitit(x[1]))

        #Combine data from multiple csv files
        if i<2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

        #Create total list of rides/shifts for each driver
        total_data = total_data.reduceByKey(lambda x,y:x+y)\
                               .mapValues(lambda x:mergeit(x))
    #Create key-value for each shift, and set shifts greater than 10 hours to 10 hours (We want to know exactly when they go over)
    ungrouped_data = total_data.flatMap(lambda x: [(x[0],r) for r in x[1]])\
                               .mapValues(lambda x: x if x[1]-x[0] < 36000 else [x[0],x[0]+36000]) 

    #Extract only 10 hour shift offenders
    offenders = ungrouped_data.filter(lambda x:x[1][1] - x[1][0] >= 36000)\
                              .map(lambda x:(x[1][1]))\
                              .collect()

    #Plot number of offenders for every 30 minutes
    offender_hist = plt.hist(offenders, bins=range(int(min(offenders)), int(max(offenders)) + 1800, 1800))

    #plt.hist returns (counts, bin_edges, patches); start from the first bin edge
    #and offset by 900s so each row is labelled with its bin midpoint
    hist_min = int(offender_hist[1][0])
    hist_min += 900

    #Save to file
    hist_csv  = open('/home/ubuntu/offenders.csv','w')

    for i in offender_hist[0]:
        hist_csv.write(str(hist_min)+','+str(i)+'\n')
        hist_min+=1800
    hist_csv.close()

    return
Example #2
def main(*argv):

    #Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift3")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey",
                                      aws_secret_access_key)

    #Read from s3
    for i in range(1, 13):

        # TextInputFormat yields (LongWritable byte offset, Text line) pairs
        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',
                                 'org.apache.hadoop.mapred.TextInputFormat',
                                 'org.apache.hadoop.io.LongWritable',
                                 'org.apache.hadoop.io.Text')

        #Call user defined function to map raw data into key-value pairs
        new_data = raw_data.map(lambda x: splitit(x[1]))

        #Combine data from multiple csv files
        if i < 2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

    #Create total list of pickup chains and pickups for each coordinate
    #If there are more than three times as many pickups as pickup chains, this might be a taxistand
    total_data = total_data.reduceByKey(lambda x,y:x+y)\
                           .mapValues(lambda x:mergeit(x))\
                           .filter(lambda x:x[1][1]/x[1][0] > 3)\
                           .collect()

    #Print results
    total_data.sort()
    for i in total_data:
        print(i)

    return
Example #3
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_rcfile")

sc = SparkContext(conf=conf)

rowRDD = sc.hadoopFile(path="hdfs://dip.cdh5.dev:8020/user/yurun/rcfile",
                       inputFormatClass="org.apache.hadoop.hive.ql.io.RCFileInputFormat",
                       keyClass="org.apache.hadoop.io.LongWritable",
                       valueClass="org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable",
                       valueConverter="com.sina.dip.spark.converter.BytesRefArrayWritableToObjectArrayConverter")

pairs = rowRDD.collect()

for pair in pairs:
    print(pair[0], pair[1])

sc.stop()
Example #4
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # get job steps. don't pass --steps, which is deprecated
    job = job_class(job_args)
    steps = job.steps()

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_job(mrc, step_num):
        j = job_class(job_args + ['--%s' % mrc, '--step-num=%d' % step_num])
        j.sandbox()  # so Spark doesn't try to serialize stdin

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if job.hadoop_input_format() is not None:
            rdd = sc.hadoopFile(args.input_path,
                                inputFormatClass=job.hadoop_input_format(),
                                keyClass='org.apache.hadoop.io.Text',
                                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the contents
            # of the line are the key and the value is an empty string. Convert to
            # an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_job, args.num_reducers)

        # write the results
        if job.hadoop_output_format() is not None:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to that
            # format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))
            rdd.saveAsHadoopFile(args.output_path,
                                 outputFormatClass=job.hadoop_output_format(),
                                 compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(args.output_path,
                               compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            sc.parallelize([json.dumps(counters)],
                           numSlices=1).saveAsTextFile(args.counter_output_dir)
Example #5
if __name__ == '__main__':

    if len(sys.argv) != 5:
        print("Usage: spark_streaming.py <master> <begin> <end> <input>", file=sys.stderr)
        exit(-1)

    master, time_begin, time_end, input = sys.argv[1:]
    input_path = input + '/' + time_begin + '.csv'
    logger.info("--->" + master + " " + input_path)

    sc = SparkContext(master, 'wxcity_userlogin_repeat_app')
    sql_context = SQLContext(sc)

    lines = sc.hadoopFile(input,
                          'org.apache.hadoop.mapred.TextInputFormat',
                          'org.apache.hadoop.io.LongWritable',
                          'org.apache.hadoop.io.Text'
                          )

    rs_tuples = MysqlDao().findWithQuery(ConfigPortalSql.select_mysql_hos_gw_sup)
    gwid_hosid_dict = {}
    for r in rs_tuples:
        hos_id = str(r[0])
        gw_id = r[1]
        gwid_hosid_dict[gw_id] = hos_id
    logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))
    users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
        .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(), \
                        p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
                        p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
                        p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
Example #6
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [
            step.description(step_num)
            for step_num, step in enumerate(job.steps())
        ]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, counter, amount=1):
            counter_accumulator.add({group: {counter: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + ['--%s' % mrc, '--step-num=%d' % step_num])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        # if skip_internal_protocol is true, patch internal_protocol() to
        # return an object whose *read* and *write* attributes are ``None``
        if args.skip_internal_protocol:
            j.internal_protocol = lambda: _NO_INTERNAL_PROTOCOL

        return j

    # --emulate-map-input-file doesn't work with hadoop_input_format
    emulate_map_input_file = (args.emulate_map_input_file
                              and not hadoop_input_format)

    try:
        if emulate_map_input_file:
            # load an rdd with pairs of (input_path, line). *path* here
            # has to be a single path, not a comma-separated list
            rdd = sc.union([
                _text_file_with_path(sc, path)
                for path in args.input_path.split(',')
            ])

        elif hadoop_input_format:
            rdd = sc.hadoopFile(args.input_path,
                                inputFormatClass=hadoop_input_format,
                                keyClass='org.apache.hadoop.io.Text',
                                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))

        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_mrc_job,
                            args.num_reducers, sort_values,
                            emulate_map_input_file,
                            args.skip_internal_protocol)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to
            # that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))
            rdd.saveAsHadoopFile(args.output_path,
                                 outputFormatClass=hadoop_output_format,
                                 compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(args.output_path,
                               compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            # If the given path is an s3 path, use s3.parallelize,
            # otherwise just write them directly to the local dir
            if is_uri(args.counter_output_dir):
                sc.parallelize([json.dumps(counters)],
                               numSlices=1).saveAsTextFile(
                                   args.counter_output_dir)
            else:
                # Use regular python built-in file writer if the part-* file
                # is not created
                path = os.path.join(args.counter_output_dir, "part-00000")
                if not os.path.exists(args.counter_output_dir):
                    os.mkdir(args.counter_output_dir)
                with open(path, 'w') as wb:
                    wb.write(str(json.dumps(counters)))
Example #7
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_seqfile")

sc = SparkContext(conf=conf)

lineRDD = sc.hadoopFile(path="hdfs://dip.cdh5.dev:8020/user/yurun/seqfile",
                        inputFormatClass="org.apache.hadoop.mapred.SequenceFileInputFormat",
                        keyClass="com.sina.dip.spark.converter.IntArrayWritable",
                        valueClass="org.apache.hadoop.io.NullWritable",
                        keyConverter="com.sina.dip.spark.converter.IntArrayWritableToObjectArrayConverter").map(lambda pair: pair[0])

lines = lineRDD.collect()

for line in lines:
    print(line)

sc.stop()
Example #8
def main():
    conf = SparkConf().setAppName("first")
    sc = SparkContext(conf=conf)
    #print 40 * '-'
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", aws_key_id)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", aws_key)
    config_dict = {
        "fs.s3n.awsAccessKeyId": aws_key_id,
        "fs.s3n.awsSecretAccessKey": aws_key
    }
    bucket = "project4capstones3"
    prefix = "/2017/07/*/*/*"
    filename = "s3n://{}/Trump/{}".format(bucket, prefix)

    # TextInputFormat yields (LongWritable byte offset, Text line) pairs
    rdd = sc.hadoopFile(filename,
                        'org.apache.hadoop.mapred.TextInputFormat',
                        'org.apache.hadoop.io.LongWritable',
                        'org.apache.hadoop.io.Text',
                        conf=config_dict)
    spark = SparkSession.builder.appName("PythonWordCount").config(
        "spark.files.overwrite", "true").getOrCreate()

    df = spark.read.json(rdd.map(lambda x: x[1]))
    data_rm_na = df.filter(df["status_id"] != 'None')
    features_of_interest = ["rt_status_user_followers_count",\
                        'rt_status_user_friends_count',\
                        'rt_status_user_statuses_count',\
                        'rt_status_retweet_count',\
                        'rt_status_user_listed_count',\
                        'rt_status_user_id',\
                        'rt_status_created_at',\
                        'status_created_at',\
                        'rt_status_user_name',\
                        'rt_status_num_user_mentions',\
                        'searched_names',\
                        'rt_status_sentMag',\
                        'rt_status_sentScore',\
                        'rt_status_favorite_count',\
                        'status_id']

    df_reduce = data_rm_na.select(features_of_interest)
    # Cast the count columns from string to integer
    int_columns = ["rt_status_user_followers_count",
                   "rt_status_user_friends_count",
                   "rt_status_user_statuses_count",
                   "rt_status_retweet_count",
                   "rt_status_user_listed_count",
                   "rt_status_favorite_count",
                   "rt_status_num_user_mentions"]
    for col_name in int_columns:
        df_reduce = df_reduce.withColumn(col_name,
                                         df_reduce[col_name].cast(IntegerType()))

    url_ = "jdbc:mysql://twittertalker1.csjkhjjygutf.us-east-1.rds.amazonaws.com:3306/innodb"
    table_name_ = "retweet"
    mode_ = "overwrite"

    df_reduce.write.format("jdbc").option("url", url_)\
    .option("dbtable", table_name_)\
    .option("driver", "com.mysql.jdbc.Driver")\
    .option("user", "XXXXXX")\
    .option("password", "XXXXXXXX")\
    .mode(mode_)\
    .save()
Example #9
""""""
# Read a SequenceFile
data = sc.sequenceFile(inputFile, "org.apache.hadoop.io.Text",
                       'org.apache.hadoop.io.IntWritable')
# Save as a SequenceFile
data2 = sc.parallelize([('Panda', 3), ('Kay', 6), ('Snail', 2)])
data2.saveAsSequenceFile(outputFile)

# 5.2.5 Object files
"""Object files cannot be used from Python, but Python RDDs and the SparkContext
provide saveAsPickleFile() and pickleFile() as a substitute, built on Python's
pickle serialization library. The caveats for object files apply to pickle files
as well: pickle can be slow, and data files that have already been written may
become unreadable if the class definitions change."""
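# Illustrative sketch (not in the original notes): a pickle-file round trip using
# the RDD created above; the '_pickle' suffix on the path is an arbitrary choice.
data2.saveAsPickleFile(outputFile + '_pickle')
pickled = sc.pickleFile(outputFile + '_pickle')
print(pickled.collect())  # e.g. [('Panda', 3), ('Kay', 6), ('Snail', 2)] (order may vary)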
# 5.2.6 Hadoop input/output formats
# 1. Reading other Hadoop input formats
# (hadoopFile also takes inputFormatClass/keyClass/valueClass arguments, which the
# original note abbreviates away; RDD.map receives each (key, value) pair as one tuple)
input2 = sc.hadoopFile(inputFile).map(lambda kv: (str(kv[0]), str(kv[1])))
# 2. Saving Hadoop output formats
input2.saveAsNewAPIHadoopFile(outputFile)
# 3. TODO
# 5.3 File systems
# 5.3.1 Local file system
rdd = sc.textFile(inputFile)
# 5.3.2 Amazon S3
# 5.3.3 HDFS: just point the input/output path at hdfs://master:port/path
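# Illustrative sketch (paths below are placeholders, not from the original notes):
# S3 and HDFS sources differ from the local case above only in the URI scheme.
rdd_s3 = sc.textFile("s3a://my-bucket/path/to/input")
rdd_hdfs = sc.textFile("hdfs://master:9000/path/to/input")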
# 5.4 Structured data with Spark SQL (see Chapter 9 for details)
# 5.4.1 Apache Hive
"""To connect Spark SQL to an existing Hive installation, you need to provide
Hive's configuration: copy the hive-site.xml file into Spark's ./conf/ directory."""
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql('SELECT name, age FROM users')
firstRow = rows.first()
Example #10
# sc.cancelJobGroup(groupId)
# sc.setJobGroup(groupId,"")

# Dump profiling information into the given directory path
# sc.dump_profiles(path)

rdd = sc.emptyRDD()
# Create an RDD with no partitions or elements.

print(sc.getConf())  # returns the SparkConf object

# getLocalProperty(key)
# Get a local property set in this thread, or null if it is missing. See setLocalProperty().

# classmethod getOrCreate(conf=None)
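# Illustrative sketch (not part of the original notes): pairing setLocalProperty()
# with getLocalProperty(), and reusing a running context via getOrCreate();
# the scheduler pool name is an arbitrary placeholder.
sc.setLocalProperty("spark.scheduler.pool", "demo_pool")
print(sc.getLocalProperty("spark.scheduler.pool"))  # -> 'demo_pool'
# getOrCreate() returns the already-running context rather than creating a second
# one (assumes SparkContext has been imported from pyspark).
same_sc = SparkContext.getOrCreate()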
"""
sc.hadoopFile()
sc.hadoopRDD()
sheet = sc.newAPIHadoopFile(
    '/user/me/sample.txt',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'Time\tMHist'}
)

parquet_rdd = sc.newAPIHadoopFile(
        path,
        'org.apache.parquet.avro.AvroParquetInputFormat',
        'java.lang.Void',
        'org.apache.avro.generic.IndexedRecord',
        ...)
"""
Example #11
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd,
                            make_mrc_job, args.num_reducers, sort_values)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to
            # that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))
            rdd.saveAsHadoopFile(
                args.output_path,
                outputFormatClass=hadoop_output_format,
                compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(
                args.output_path, compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            sc.parallelize(
                [json.dumps(counters)],
                numSlices=1
            ).saveAsTextFile(
                args.counter_output_dir
            )
Example #12
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_seqfile")

sc = SparkContext(conf=conf)

lineRDD = sc.hadoopFile(
    path="hdfs://dip.cdh5.dev:8020/user/yurun/seqfile",
    inputFormatClass="org.apache.hadoop.mapred.SequenceFileInputFormat",
    keyClass="com.sina.dip.spark.converter.IntArrayWritable",
    valueClass="org.apache.hadoop.io.NullWritable",
    keyConverter="com.sina.dip.spark.converter.IntArrayWritableToObjectArrayConverter",
).map(lambda pair: pair[0])

lines = lineRDD.collect()

for line in lines:
    print(line)

sc.stop()
Example #13
from pyspark import SparkContext, SparkConf
"""
org.apache.hadoop.mapred
    KeyValueTextInputFormat
    SequenceFileAsBinaryInputFormat
    SequenceFileAsTextInputFormat
    SequenceFileInputFormat<K,V>
    TextInputFormat
"""

sparkconf = SparkConf().setAppName('hadoop io read').setMaster('local')
sc = SparkContext(conf=sparkconf)

# TextInputFormat yields (LongWritable byte offset, Text line) pairs
f = sc.hadoopFile('hdfs://172.19.0.2/newtextfile/part-00000',
                  'org.apache.hadoop.mapred.TextInputFormat',
                  'org.apache.hadoop.io.LongWritable',
                  'org.apache.hadoop.io.Text')
print(f.collect())
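# Illustrative sketch (not part of the original example): the same call pattern
# covers the other mapred formats listed in the docstring above, e.g.
# KeyValueTextInputFormat, which splits each line on the first tab into a
# (Text, Text) pair; the HDFS path below is a placeholder.
kv = sc.hadoopFile('hdfs://172.19.0.2/some/tab/separated/file',
                   'org.apache.hadoop.mapred.KeyValueTextInputFormat',
                   'org.apache.hadoop.io.Text',
                   'org.apache.hadoop.io.Text')
print(kv.take(5))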