Code Example #1
from typing import List
from pympler import asizeof
from pyspark.sql import DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.domain_objects import WarcRecord

if __name__ == "__main__":
    # Measure the in-memory size of a plain Python list of one million ints
    million_ints: List[int] = []
    for number in range(0, 1000000):
        million_ints.append(-1)
    print(asizeof.asizeof(million_ints))

    # Measure the in-memory size of one million dummy WarcRecord objects
    million_warcs = list()
    for index in range(0, 1000000):
        million_warcs.append(WarcRecord.create_dummy())
    print(asizeof.asizeof(million_warcs))

    ##########################################################################
    spark = create_session(2, "Collection Sizes")

    # Cache one million ints as an RDD; count() materializes the cache
    million_ints_rdd = spark.sparkContext.range(0, 1000000)
    million_ints_rdd.cache()
    million_ints_rdd.count()

    # Drop the int cache and cache one million dummy WarcRecord objects instead
    million_ints_rdd.unpersist()
    million_warcs_rdd = million_ints_rdd.map(lambda number: WarcRecord.create_dummy())
    million_warcs_rdd.cache()
    million_warcs_rdd.count()

    # Cache the same million-row range as a DataFrame for comparison
    million_ints_df: DataFrame = spark.range(0, 1000000)
    million_ints_df.cache()
    million_ints_df.count()
Code Example #2
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_wet_loc, extract_raw_records, parse_raw_wet

if __name__ == "__main__":
    session = create_session(3, 'WET Parser')
    session.sparkContext.setLogLevel('ERROR')  # suppress INFO log messages

    raw_records = extract_raw_records(sample_wet_loc, session)
    wet_records = raw_records.flatMap(lambda record: parse_raw_wet(record))

    wet_records.toDF().printSchema()
    print('Total # of records: ' + str(wet_records.count()))
Code Example #3
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    warc_records: RDD = extract_raw_records(
        input_loc_warc, session).flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(
        input_loc_wet, session).flatMap(lambda record: parse_raw_wet(record))

    from pyspark.sql.functions import col
    warc_records_df: DataFrame = warc_records.toDF().select(
        col('target_uri'), col('language'))
    wet_records_df: DataFrame = wet_records.toDF().select(
        col('target_uri'), col('plain_text'))

    joined_df = warc_records_df.join(wet_records_df, ['target_uri'])
    spanish_records = joined_df.filter(col('language') == 'es')

    print(spanish_records.count())  # trigger the join so the job shows up in the WebUI
    time.sleep(10 * 60)  # For exploring WebUI
Code Example #4
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import count, sum, col
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc

if __name__ == "__main__":
    session: SparkSession = create_session(3, "Query Plans")
    session.sparkContext.setLogLevel("TRACE")
    lang_tag_mapping = [('en', 'english'), ('pt-pt', 'portuguese'),
                        ('cs', 'czech'), ('de', 'german'), ('es', 'spanish'),
                        ('eu', 'basque'), ('it', 'italian'),
                        ('hu', 'hungarian'), ('pt-br', 'portuguese'),
                        ('fr', 'french'), ('en-US', 'english'),
                        ('zh-TW', 'chinese')]
    lang_tag_df = session.createDataFrame(lang_tag_mapping,
                                          ['tag', 'language'])
    session.createDataFrame(lang_tag_mapping).show()
    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records_rdd: RDD = raw_records.flatMap(parse_raw_warc)
    warc_records_df: DataFrame = warc_records_rdd.toDF()\
        .select(col('target_uri'), col('language'))\
        .filter(col('language') != '')

    aggregated = warc_records_df\
        .groupBy(col('language'))\
        .agg(count(col('target_uri')))\
        .withColumnRenamed('language', 'tag')

    joined_df = aggregated.join(lang_tag_df, ['tag'])
Code Example #5
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

if __name__ == "__main__":
    session = create_session(3, 'Spark-Submit')
    session.sparkContext.setLogLevel('ERROR')  # suppress INFO log messages

    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records = raw_records.flatMap(lambda record: parse_raw_warc(record))

    warc_records.toDF().printSchema()
    print('Total # of records: ' + str(warc_records.count()))
Code Example #6
from pyspark.sql import SparkSession
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session

#  ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Exercise4_06/Exercise4_06.py
if __name__ == "__main__":
    session: SparkSession = create_session(2, "PySpark <> JVM")
    session.sparkContext.setLogLevel('ERROR')
    python_rdd = session.sparkContext.range(0, 5)

    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(
        python_rdd._jrdd, True)
    mapped_java_rdd = session.sparkContext._jvm.Exercise4_06.ScalaObject.executeInScala(
        java_rdd)
    mapped_python_rdd = _java2py(session.sparkContext, mapped_java_rdd)
    print(mapped_python_rdd.collect())
Code Example #7
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 RDD')

    warc_records: RDD = extract_raw_records(
        input_loc_warc, session).flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(
        input_loc_wet, session).flatMap(lambda record: parse_raw_wet(record))

    pair_warc: RDD = warc_records.map(lambda warc:
                                      (warc.target_uri, warc.language))
    pair_wet: RDD = wet_records.map(lambda wet:
                                    (wet.target_uri, wet.plain_text))

    joined = pair_warc.join(pair_wet)
    spanish_records = joined.filter(lambda triple: triple[1][0] == 'es')

    print(spanish_records.count())  # 133
    time.sleep(10 * 60)  # For exploring WebUI
Code Example #8
from sys import argv
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

#  ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit  --driver-class-path  ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar:/Users/a/.m2/repository/com/google/guava/guava/28.2-jre/guava-28.2-jre.jar:/Users/a/.m2/repository/org/apache/commons/commons-compress/1.20/commons-compress-1.20.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Activity4_03/Activity4_03.py ~/Output_Act4_3
if __name__ == "__main__":
    output_dir = argv[1]
    session = create_session(3, 'WARC Parser')

    warc_records = extract_raw_records(sample_warc_loc, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')

    plaintexts_rdd = warc_records.map(lambda record: record.html_source)
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(
        plaintexts_rdd._jrdd, True)
    tagged_java_rdd = session.sparkContext._jvm.Activity4_03.Activity4_03.tagJavaRDD(
        java_rdd)
    tagged_python_rdd = _java2py(session.sparkContext, tagged_java_rdd)

    tagged_python_rdd.saveAsTextFile(output_dir)
Code Example #9
from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc
import time
import sys

if __name__ == "__main__":
    threads = int(sys.argv[1])
    warc_loc = sample_warc_loc
    session: SparkSession = create_session(threads, "PySpark Design")

    raw_records = extract_raw_records(warc_loc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))
    print(warc_records.getNumPartitions())
    warc_records.cache()
    print(warc_records.count())
    time.sleep(60 * 10)
Code Example #10
import time
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session = create_session(3, 'Activity 3.1')

    raw_records_warc = extract_raw_records(input_loc_warc, session)
    warc_records = raw_records_warc.flatMap(
        lambda record: parse_raw_warc(record))
    raw_records_wet = extract_raw_records(input_loc_wet, session)
    wet_records = raw_records_wet.flatMap(lambda record: parse_raw_wet(record))
    pair_warc = warc_records.map(lambda warc: (warc.target_uri, (
        warc.warc_type, warc.record_id, warc.content_type, warc.block_digest,
        warc.date_s, warc.content_length, warc.info_id, warc.concurrent_to,
        warc.ip, warc.payload_digest, warc.payload_type,
        warc.html_content_type, warc.language, warc.html_length,
        warc.html_source)))
    pair_wet = wet_records.map(lambda wet: (wet.target_uri, wet.plain_text))

    joined = pair_warc.join(pair_wet, numPartitions=7)

    print(joined.count())
    time.sleep(10 * 60)  # For exploring WebUI
Code Example #11
from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from pyspark.ml.common import _java2py

if __name__ == "__main__":
    threads = 2
    session: SparkSession = create_session(threads, "PySpark <> JVM")
    python_rdd = session.sparkContext.range(0, 5)

    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(
        python_rdd._jrdd, True)
    mapped_java_rdd = session.sparkContext._jvm.Exercise4_06.ScalaObject.executeInScala(
        java_rdd)
    mapped_python_rdd = _java2py(session.sparkContext, mapped_java_rdd)
    print(mapped_python_rdd.collect())
Code Example #12
from datetime import datetime
from os import getpid
from time import sleep
from typing import List, Tuple
from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc


def fall_asleep(warc) -> (int, str):
    # Log which worker process picks up this record, pause so the wave of
    # concurrently running tasks becomes observable, then pass the pair on.
    process_id = getpid()
    current_uri = warc.target_uri
    timepoint = str(datetime.now())
    print('1@ map1 in process {} at {} processing {}'.format(process_id, timepoint, current_uri))
    sleep(5)  # pause duration is an assumption; only the return statement survives in this snippet
    return process_id, current_uri


def trivial_filter(processid_uri: (int, str)) -> bool:
    new_process_id = str(getpid())
    timepoint = str(datetime.now())
    print('3@ filter in process {} at {} processing {}'.format(new_process_id, timepoint, processid_uri[1]))
    return True


def quick_print(processid_uri: (int, str)) -> (int, int):
    new_process_id = str(getpid())
    timepoint = str(datetime.now())
    print('4@ map2 in process {} at {} processing {}'.format(new_process_id, timepoint, processid_uri[1]))
    return processid_uri[0], new_process_id


if __name__ == "__main__":
    session: SparkSession = create_session(3, "Wave exploration")
    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records = raw_records \
        .flatMap(parse_raw_warc)

    process_ids_rdd = warc_records\
        .map(fall_asleep)\
        .filter(trivial_filter)\
        .map(quick_print)

    distinct_process_ids: List[Tuple[int, int]] = process_ids_rdd.distinct().collect()
    print(distinct_process_ids)