Example #1
0
    def test_perrecord_vs_perpartition(self):
        raw_records = extract_raw_records(sample_warc_loc, self.spark)
        warc_records = raw_records \
            .flatMap(lambda record: parse_raw_warc(record))

        def map_function(_):
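            # map() invokes this function once per record, so a new
            # HeavyObject is instantiated for every single record.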
            new_heavy_object = HeavyObject('map')
            object_id = new_heavy_object.get_id()
            return object_id

        ids_after_map: RDD = warc_records.map(map_function)

        def partition_function(partition):
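            # mapPartitions() invokes this function once per partition, so a
            # single HeavyObject is shared by all records in that partition.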
            new_heavy_object = HeavyObject('mapPartition')
            object_id = new_heavy_object.get_id()
            for _ in partition:
                yield object_id

        ids_after_mappartitions: RDD = warc_records.mapPartitions(
            partition_function)

        unique_ids_map: List[int] = ids_after_map.distinct().collect()
        unique_ids_mappartitions: List[int] = ids_after_mappartitions.distinct().collect()

        print('@' * 50)
        number_of_records: int = warc_records.count()
        number_of_partitions: int = warc_records.getNumPartitions()
        print('@@ Number of records: {}'.format(number_of_records))
        print('@@ Number of partitions: {}'.format(number_of_partitions))
        self.assertGreater(len(unique_ids_map), len(unique_ids_mappartitions))
        self.assertGreaterEqual(number_of_partitions,
                                len(unique_ids_mappartitions))
        print(unique_ids_map)
        print(unique_ids_mappartitions)
Example #2
0
from pyspark import RDD
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet
import time

if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .master('local[{}]'.format(3)) \
        .appName('Caching & Eviction') \
        .getOrCreate()
    session.sparkContext.setLogLevel('DEBUG')

    input_loc_warc = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc'
    input_loc_wet = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'

    raw_records_warc: RDD = extract_raw_records(input_loc_warc, session)
    warc_records: RDD = raw_records_warc \
        .flatMap(lambda record: parse_raw_warc(record))

    raw_records_wet: RDD = extract_raw_records(input_loc_wet, session)
    wet_records: RDD = raw_records_wet \
        .flatMap(lambda record: parse_raw_wet(record))

    warc_records.cache()
    wet_records.cache()
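    # Note that cache() is lazy: neither RDD is materialized in memory until
    # an action is executed on it.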

    uri_keyed_warc = warc_records.map(lambda record:
                                      (record.target_uri, record))
    uri_keyed_wet = wet_records.map(lambda record: (record.target_uri, record))
    joined = uri_keyed_warc.join(uri_keyed_wet)
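    # join() pairs WARC and WET records that share the same target_uri; since
    # no action is called here, nothing has actually been computed yet.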
Example #3
0
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    warc_records: RDD = extract_raw_records(
        input_loc_warc, session).flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(
        input_loc_wet, session).flatMap(lambda record: parse_raw_wet(record))

    from pyspark.sql.functions import col
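    # toDF() infers the DataFrame schema from the fields of the parsed records.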
    warc_records_df: DataFrame = warc_records.toDF().select(
        col('target_uri'), col('language'))
    wet_records_df: DataFrame = wet_records.toDF().select(
        col('target_uri'), col('plain_text'))

    joined_df = warc_records_df.join(wet_records_df, ['target_uri'])
    spanish_records = joined_df.filter(col('language') == 'es')

    time.sleep(10 * 60)  # For exploring WebUI
Example #4
0
from sys import argv
from re import sub
from string import printable

import pycld2
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc


def tag_records(partition):
    for warc_record in partition:
        parser = BeautifulSoup(warc_record.html_source, 'html.parser')
        plaintext = ' '.join(parser.stripped_strings)
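        # Collapse runs of whitespace into single spaces.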
        plaintext_stripped = sub('\\s+', ' ', plaintext)
        if plaintext_stripped is None or plaintext_stripped == '':
            yield ()  # empty tuple
        else:
            cleaned_text = ''.join(x for x in plaintext_stripped if x in printable)
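            # pycld2.detect returns (isReliable, textBytesFound, details);
            # details[0] is the best guess as (name, code, percent, score).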
            _, _, details = pycld2.detect(cleaned_text)
            (languageName, languageCode, percent, score) = details[0]
            yield warc_record.target_uri, languageCode, str(score)


if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .appName('Improved Crawl Tagger') \
        .getOrCreate()
    input_file = argv[1]
    output_dir = argv[2]
    warc_records = extract_raw_records(input_file, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')

    tagged_texts_rdd = warc_records \
        .mapPartitions(tag_records) \
        .filter(lambda record: record != ())

    tagged_texts_rdd.saveAsTextFile(output_dir)
Example #5
0
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_wet_loc, extract_raw_records, parse_raw_wet

if __name__ == "__main__":
    session = create_session(3, 'WET Parser')
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of info messages

    raw_records = extract_raw_records(sample_wet_loc, session)
    wet_records = raw_records.flatMap(lambda record: parse_raw_wet(record))

    wet_records.toDF().printSchema()
    print('Total # of records: ' + str(wet_records.count()))

Example #6
0
from typing import Tuple

from pyspark.sql import SparkSession
from py4j.java_gateway import JavaGateway, GatewayParameters, CallbackServerParameters, launch_gateway
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc
# Note: WarcRecord and RetryListener come from the book's accompanying helper
# modules; their import paths are not shown in this listing.


def parse_method(text: str) -> Tuple[WarcRecord]:
    parsed_raw_warc = parse_raw_warc(text)
    # crasher = 5 / 0  # ToDo: Uncomment
    # print(crasher)  # ToDo: Uncomment
    return parsed_raw_warc


if __name__ == "__main__":
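    # 'local[3, 3]' runs Spark with 3 worker threads and maxFailures set to 3,
    # so a failed task can be retried (the local[N, F] master format).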
    session: SparkSession = SparkSession.builder \
        .master('local[3, 3]') \
        .appName('Failure Exploration') \
        .getOrCreate()
    session.sparkContext.setLogLevel('ERROR')

    session.sparkContext._gateway.start_callback_server()
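    # A Py4J callback server lets the JVM invoke methods on Python objects,
    # which is what allows the Python RetryListener registered below to
    # receive Spark scheduler events. launch_gateway() starts a separate JVM
    # and returns the port it listens on.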
    java_process = launch_gateway()
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=java_process),
        callback_server_parameters=CallbackServerParameters(port=0))
    listener = RetryListener()
    session.sparkContext._jsc.sc().addSparkListener(listener)

    input_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Change path
    raw_records = extract_raw_records(input_warc, session)
    warc_records = raw_records.flatMap(parse_method)
    print(warc_records.count())

    gateway.shutdown_callback_server()
Example #7
0
from sys import argv
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

#  ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit  --driver-class-path  ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar:/Users/a/.m2/repository/com/google/guava/guava/28.2-jre/guava-28.2-jre.jar:/Users/a/.m2/repository/org/apache/commons/commons-compress/1.20/commons-compress-1.20.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Activity4_03/Activity4_03.py ~/Output_Act4_3
if __name__ == "__main__":
    output_dir = argv[1]
    session = create_session(3, 'WARC Parser')

    warc_records = extract_raw_records(sample_warc_loc, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')

    plaintexts_rdd = warc_records.map(lambda record: record.html_source)
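    # Hand the pickled Python RDD over to the JVM, run the Scala/Java tagger
    # on it, then convert the resulting Java RDD back into a Python RDD.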
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(
        plaintexts_rdd._jrdd, True)
    tagged_java_rdd = session.sparkContext._jvm.Activity4_03.Activity4_03.tagJavaRDD(
        java_rdd)
    tagged_python_rdd = _java2py(session.sparkContext, tagged_java_rdd)

    tagged_python_rdd.saveAsTextFile(output_dir)
Example #8
0
from pyspark.sql import SparkSession
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc
import time
import sys

if __name__ == "__main__":
    threads = int(sys.argv[1])
    warc_loc = sample_warc_loc
    session: SparkSession = create_session(threads, "PySpark Design")

    raw_records = extract_raw_records(warc_loc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))
    print(warc_records.getNumPartitions())
    warc_records.cache()
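    # count() is the first action, so it triggers reading and parsing of the
    # WARC file and populates the cache requested above.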
    print(warc_records.count())
    time.sleep(60 * 10)
Example #9
0
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # input = sample_warc_loc
    spark: SparkSession = SparkSession.builder \
        .appName('Activity 2.1') \
        .getOrCreate()
    spark.sparkContext.setLogLevel('ERROR')  # avoids printing of info messages

    from operator import add
    from collections import defaultdict
    from typing import Dict
    from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc

    input = "/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc"
    warc_records = extract_raw_records(
        input, spark).flatMap(lambda record: parse_raw_warc(record))

    # print(warc_records.count())

    keyed_by_language = warc_records.filter(
        lambda rec: rec.language != '').map(lambda rec: (rec.language, 1))
    language_map: Dict[str, int] = keyed_by_language.reduceByKey(
        add).collectAsMap()
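    # reduceByKey(add) sums the per-record 1s for each language on the
    # executors; collectAsMap() returns the totals to the driver as a dict.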
    ## language_list = keyed_by_language.reduceByKey(add).collect()
    ## language_map: Dict[str, int] = defaultdict(int)
    ## for key, value in language_list:
    ## ...     language_map[key] += value
    ## language_map
    # warc_records.filter(lambda rec: rec.language != '').map(lambda rec: rec.language).countByValue()

    sorted_language_list = [
Example #10
0
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc
from Chapter02.Exercise2_06.Exercise2_06 import heavy_computation

spark: SparkSession = SparkSession.builder \
    .appName('SubmitWithMaster') \
    .getOrCreate()

raw_records = extract_raw_records(sample_warc_loc, spark)
warc_records = raw_records.flatMap(parse_raw_warc)
invoked_heavy_rdd = warc_records.map(lambda record: heavy_computation())
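# heavy_computation() is invoked once per WARC record on the executors;
# collect() then brings every result back to the driver.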
print(invoked_heavy_rdd.collect())