Example no. 1
    def test_perrecord_vs_perpartition(self):
        raw_records = extract_raw_records(sample_warc_loc, self.spark)
        warc_records = raw_records \
            .flatMap(lambda record: parse_raw_warc(record))

        def map_function(_):
            # Instantiates one HeavyObject for every single record
            new_heavy_object = HeavyObject('map')
            object_id = new_heavy_object.get_id()
            return object_id

        ids_after_map: RDD = warc_records.map(map_function)

        def partition_function(partition):
            # Instantiates one HeavyObject per partition and yields its id
            # once for every record in that partition
            new_heavy_object = HeavyObject('mapPartition')
            object_id = new_heavy_object.get_id()
            for _ in partition:
                yield object_id

        ids_after_mappartitions: RDD = warc_records.mapPartitions(
            partition_function)

        unique_ids_map: List[int] = ids_after_map.distinct().collect()
        unique_ids_mappartitions: List[int] = \
            ids_after_mappartitions.distinct().collect()

        print('@' * 50)
        number_of_records: int = warc_records.count()
        number_of_partitions: int = warc_records.getNumPartitions()
        print('@@ Number of records: {}'.format(number_of_records))
        print('@@ Number of partitions: {}'.format(number_of_partitions))
        self.assertGreater(len(unique_ids_map), len(unique_ids_mappartitions))
        self.assertGreaterEqual(number_of_partitions,
                                len(unique_ids_mappartitions))
        print(unique_ids_map)
        print(unique_ids_mappartitions)
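The test above relies on a small helper class, HeavyObject, that stands in for something expensive to construct (a database connection, a parser, and so on) and exposes an id per instance. The real class lives in the book's utilities; a minimal sketch that matches how the test uses it (a constructor label, a get_id method, one unique id per constructed instance) could look like this:

import uuid


class HeavyObject:
    """Stand-in for an object that is costly to construct (e.g. a DB connection)."""

    def __init__(self, label: str):
        self._label = label
        self._id = uuid.uuid4().int  # unique per constructed instance

    def get_id(self) -> int:
        return self._id

With such a class, map() constructs one instance per record while mapPartitions() constructs one per partition, which is exactly what the distinct-id counts in the assertions measure.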
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    warc_records: RDD = extract_raw_records(
        input_loc_warc, session).flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(
        input_loc_wet, session).flatMap(lambda record: parse_raw_wet(record))

    from pyspark.sql.functions import col
    warc_records_df: DataFrame = warc_records.toDF().select(
        col('target_uri'), col('language'))
    wet_records_df: DataFrame = wet_records.toDF().select(
        col('target_uri'), col('plain_text'))

    joined_df = warc_records_df.join(wet_records_df, ['target_uri'])
    spanish_records = joined_df.filter(col('language') == 'es')
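    # Note: nothing above triggers execution; an action such as this count
    # (an assumption about the intended usage, not part of the original
    # listing) materializes the join so its jobs show up in the Web UI.
    print(spanish_records.count())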

    time.sleep(10 * 60)  # For exploring WebUI
Example no. 3
import time

from pyspark import RDD
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .master('local[{}]'.format(3)) \
        .appName('Caching & Eviction') \
        .getOrCreate()
    session.sparkContext.setLogLevel('DEBUG')

    input_loc_warc = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc'
    input_loc_wet = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'

    raw_records_warc: RDD = extract_raw_records(input_loc_warc, session)
    warc_records: RDD = raw_records_warc \
        .flatMap(lambda record: parse_raw_warc(record))

    raw_records_wet: RDD = extract_raw_records(input_loc_wet, session)
    wet_records: RDD = raw_records_wet \
        .flatMap(lambda record: parse_raw_wet(record))

    warc_records.cache()
    wet_records.cache()

    uri_keyed_warc = warc_records.map(lambda record:
                                      (record.target_uri, record))
    uri_keyed_wet = wet_records.map(lambda record: (record.target_uri, record))
    joined = uri_keyed_warc.join(uri_keyed_wet)

    print(joined.count())
    time.sleep(60 * 10)
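    # (Sketch, not part of the original listing.) cache() on an RDD is
    # shorthand for persist(StorageLevel.MEMORY_ONLY); explicit eviction with
    # unpersist() removes the cached blocks again, which can be watched
    # disappearing from the Storage tab of the Web UI.
    warc_records.unpersist()
    wet_records.unpersist()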
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

if __name__ == "__main__":
    session = create_session(3, 'Spark-Submit')
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of info messages

    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records = raw_records.flatMap(lambda record: parse_raw_warc(record))

    warc_records.toDF().printSchema()
    print('Total # of records: ' + str(warc_records.count()))
from typing import Tuple

from Chapter02.utilities02_py.helper_python import parse_raw_warc
# WarcRecord is the parsed-record type returned by parse_raw_warc; it is
# assumed to be importable from the Chapter02 utilities as well.


def parse_method(text: str) -> Tuple[WarcRecord]:
    parsed_raw_warc = parse_raw_warc(text)
    # crasher = 5 / 0  # ToDo: Uncomment
    # print(crasher)  # ToDo: Uncomment
    return parsed_raw_warc
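With the two ToDo lines uncommented, every task that runs this wrapper raises a ZeroDivisionError, which makes it a convenient way to watch task failures propagate in the Web UI. A minimal driver sketch (session parameters, app name and path are illustrative, mirroring the other listings):

from pyspark import RDD
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records

if __name__ == "__main__":
    session = create_session(3, 'Failing tasks')  # illustrative app name
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    raw_records: RDD = extract_raw_records(input_loc_warc, session)
    warc_records: RDD = raw_records.flatMap(lambda record: parse_method(record))
    print(warc_records.count())  # triggers the tasks (and the failure, if uncommented)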
Example no. 6
import time
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session = create_session(3, 'Activity 3.1')

    raw_records_warc = extract_raw_records(input_loc_warc, session)
    warc_records = raw_records_warc.flatMap(
        lambda record: parse_raw_warc(record))
    raw_records_wet = extract_raw_records(input_loc_wet, session)
    wet_records = raw_records_wet.flatMap(lambda record: parse_raw_wet(record))
    pair_warc = warc_records.map(lambda warc: (warc.target_uri, (
        warc.warc_type, warc.record_id, warc.content_type, warc.block_digest,
        warc.date_s, warc.content_length, warc.info_id, warc.concurrent_to,
        warc.ip, warc.payload_digest, warc.payload_type,
        warc.html_content_type, warc.language, warc.html_length,
        warc.html_source)))
    pair_wet = wet_records.map(lambda wet: (wet.target_uri, wet.plain_text))

    joined = pair_warc.join(pair_wet, numPartitions=7)

    print(joined.count())
    time.sleep(10 * 60)  # For exploring WebUI
if __name__ == "__main__":
    # input = sample_warc_loc
    spark: SparkSession = SparkSession.builder \
        .appName('Activity 2.1') \
        .getOrCreate()
    spark.sparkContext.setLogLevel('ERROR')  # avoids printing of info messages

    from operator import add
    from collections import defaultdict
    from typing import Dict
    from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc

    input = "/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc"
    warc_records = extract_raw_records(
        input, spark).flatMap(lambda record: parse_raw_warc(record))

    # print(warc_records.count())

    keyed_by_language = warc_records.filter(
        lambda rec: rec.language != '').map(lambda rec: (rec.language, 1))
    language_map: Dict[str, int] = keyed_by_language.reduceByKey(
        add).collectAsMap()
    ## language_list = keyed_by_language.reduceByKey(add).collect()
    ## language_map: Dict[str, int] = defaultdict(int)
    ## for key, value in language_list:
    ## ...     language_map[key] += value
    ## language_map
    # warc_records.filter(lambda rec: rec.language != '').map(lambda rec: rec.language).countByValue()

    sorted_language_list = [