# Test method from a unittest.TestCase; self.spark is assumed to be a SparkSession
# created in setUp, and HeavyObject is the chapter's expensive-to-construct helper.
def test_perrecord_vs_perpartition(self):
    raw_records = extract_raw_records(sample_warc_loc, self.spark)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))

    def map_function(_):
        # One HeavyObject is constructed for every single record
        new_heavy_object = HeavyObject('map')
        object_id = new_heavy_object.get_id()
        return object_id

    ids_after_map: RDD = warc_records.map(map_function)

    def partition_function(partition):
        # One HeavyObject is constructed per partition and shared by all of its records
        new_heavy_object = HeavyObject('mapPartition')
        object_id = new_heavy_object.get_id()
        for _ in partition:
            yield object_id

    ids_after_mappartitions: RDD = warc_records.mapPartitions(partition_function)

    unique_ids_map: List[int] = ids_after_map.distinct().collect()
    unique_ids_mappartitions: List[int] = ids_after_mappartitions.distinct().collect()

    print('@' * 50)
    number_of_records: int = warc_records.count()
    number_of_partitions: int = warc_records.getNumPartitions()
    print('@@ Number of records: {}'.format(number_of_records))
    print('@@ Number of partitions: {}'.format(number_of_partitions))

    # map() creates roughly one object per record, while mapPartitions() is
    # bounded by the number of partitions
    self.assertGreater(len(unique_ids_map), len(unique_ids_mappartitions))
    self.assertGreaterEqual(number_of_partitions, len(unique_ids_mappartitions))
    print(unique_ids_map)
    print(unique_ids_mappartitions)
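The test relies on a HeavyObject helper that is not part of this listing. A minimal sketch of what it could look like, assuming it only needs to simulate a costly construction and hand out an integer id that is unique per instance; the class name, the constructor argument, and the get_id method are taken from the test above, while the body is an assumption:

import itertools
import os
import time


class HeavyObject:
    """Hypothetical stand-in for an expensive-to-build resource (e.g. a parser or a DB connection)."""

    _instance_counter = itertools.count()  # increments once per construction within a process

    def __init__(self, label: str):
        time.sleep(0.1)  # simulate a costly initialization step
        # Combine the worker PID with a per-process counter so every construction,
        # in every executor process, yields a distinct integer id
        self._id = os.getpid() * 1_000_000 + next(HeavyObject._instance_counter)
        print('@@ Creating HeavyObject({}) in PID {}'.format(label, os.getpid()))

    def get_id(self) -> int:
        return self._id

With map, distinct() then collects roughly one id per record; with mapPartitions, at most one id per partition, which is exactly what the two assertions verify.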
import time
from pyspark import RDD
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session: SparkSession = create_session(3, 'Activity 4.2 DataFrame')

    warc_records: RDD = extract_raw_records(input_loc_warc, session) \
        .flatMap(lambda record: parse_raw_warc(record))
    wet_records: RDD = extract_raw_records(input_loc_wet, session) \
        .flatMap(lambda record: parse_raw_wet(record))

    warc_records_df: DataFrame = warc_records.toDF().select(col('target_uri'), col('language'))
    wet_records_df: DataFrame = wet_records.toDF().select(col('target_uri'), col('plain_text'))
    joined_df = warc_records_df.join(wet_records_df, ['target_uri'])
    spanish_records = joined_df.filter(col('language') == 'es')

    time.sleep(10 * 60)  # For exploring WebUI
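One thing worth noting about this listing: transformations are lazy, so nothing in it forces the join or the Spanish-language filter to actually execute before the sleep begins. To make those stages visible in the WebUI, an action has to fire first; a small addition that could go right before the time.sleep call (explain() and count() are standard DataFrame methods):

# Print the physical plan Catalyst chose for the join + filter (no job runs yet)
spanish_records.explain()
# Trigger execution so the join's stages show up in the WebUI
print(spanish_records.count())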
import time
from pyspark import RDD
from pyspark.sql import SparkSession
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    session: SparkSession = SparkSession.builder \
        .master('local[{}]'.format(3)) \
        .appName('Caching & Eviction') \
        .getOrCreate()
    session.sparkContext.setLogLevel('DEBUG')

    input_loc_warc = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc'
    input_loc_wet = '/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'

    raw_records_warc: RDD = extract_raw_records(input_loc_warc, session)
    warc_records: RDD = raw_records_warc \
        .flatMap(lambda record: parse_raw_warc(record))
    raw_records_wet: RDD = extract_raw_records(input_loc_wet, session)
    wet_records: RDD = raw_records_wet \
        .flatMap(lambda record: parse_raw_wet(record))

    # Cache both parsed RDDs so the join does not re-read and re-parse the input files
    warc_records.cache()
    wet_records.cache()

    uri_keyed_warc = warc_records.map(lambda record: (record.target_uri, record))
    uri_keyed_wet = wet_records.map(lambda record: (record.target_uri, record))
    joined = uri_keyed_warc.join(uri_keyed_wet)
    print(joined.count())

    time.sleep(60 * 10)  # For exploring WebUI
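To verify that the two parsed RDDs really are held in memory (and to release them once done), the persistence state can be inspected programmatically; is_cached, getStorageLevel() and unpersist() are standard RDD methods:

# cache() on an RDD is shorthand for persist(StorageLevel.MEMORY_ONLY)
print(warc_records.is_cached)          # True once cache() has been called
print(warc_records.getStorageLevel())  # e.g. "Memory Serialized 1x Replicated"

# Evict the cached blocks and free executor memory when they are no longer needed
# warc_records.unpersist()
# wet_records.unpersist()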
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

if __name__ == "__main__":
    session = create_session(3, 'Spark-Submit')
    session.sparkContext.setLogLevel('ERROR')  # avoids printing INFO messages

    raw_records = extract_raw_records(sample_warc_loc, session)
    warc_records = raw_records.flatMap(lambda record: parse_raw_warc(record))
    warc_records.toDF().printSchema()
    print('Total # of records: ' + str(warc_records.count()))
from typing import Tuple
# parse_raw_warc and the WarcRecord NamedTuple are assumed to come from the chapter helpers
from Chapter02.utilities02_py.helper_python import parse_raw_warc, WarcRecord


def parse_method(text: str) -> Tuple[WarcRecord]:
    parsed_raw_warc = parse_raw_warc(text)
    # crasher = 5 / 0  # ToDo: Uncomment to provoke a deliberate executor-side failure
    # print(crasher)   # ToDo: Uncomment
    return parsed_raw_warc
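For context, this is how the wrapper would typically replace the bare parse_raw_warc call; the session and extraction helpers are the same ones used in the other listings, and the app name is made up. With the two crasher lines uncommented, every task raises a ZeroDivisionError, and Spark retries each task up to spark.task.maxFailures times before failing the job, which makes the executor-side stack trace easy to study in the logs:

from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records

if __name__ == "__main__":
    session = create_session(3, 'Crash Investigation')  # hypothetical app name
    raw_records = extract_raw_records(sample_warc_loc, session)
    # Route parsing through parse_method so the deliberate crash happens on the executors
    warc_records = raw_records.flatMap(parse_method)
    print(warc_records.count())  # the action triggers execution (and, if enabled, the crash)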
import time
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc, parse_raw_wet

if __name__ == "__main__":
    input_loc_warc = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc'  # ToDo: Modify path
    input_loc_wet = '/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc.wet'  # ToDo: Modify path
    session = create_session(3, 'Activity 3.1')

    raw_records_warc = extract_raw_records(input_loc_warc, session)
    warc_records = raw_records_warc.flatMap(lambda record: parse_raw_warc(record))
    raw_records_wet = extract_raw_records(input_loc_wet, session)
    wet_records = raw_records_wet.flatMap(lambda record: parse_raw_wet(record))

    # Key both RDDs by target_uri; keep the remaining WARC fields as the value
    pair_warc = warc_records.map(lambda warc: (warc.target_uri, (
        warc.warc_type, warc.record_id, warc.content_type, warc.block_digest,
        warc.date_s, warc.content_length, warc.info_id, warc.concurrent_to,
        warc.ip, warc.payload_digest, warc.payload_type, warc.html_content_type,
        warc.language, warc.html_length, warc.html_source)))
    pair_wet = wet_records.map(lambda wet: (wet.target_uri, wet.plain_text))

    joined = pair_warc.join(pair_wet, numPartitions=7)
    print(joined.count())

    time.sleep(10 * 60)  # For exploring WebUI
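As a quick sanity check on the numPartitions=7 argument: the join shuffles both pair RDDs into exactly that many output partitions, which getNumPartitions() (a standard RDD method) can confirm before the count runs:

# The shuffle produced by the join repartitions the result as requested
print(joined.getNumPartitions())  # expected output: 7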
if __name__ == "__main__": # input = sample_warc_loc spark: SparkSession = SparkSession.builder \ .appName('Activity 2.1') \ .getOrCreate() spark.sparkContext.setLogLevel('ERROR') # avoids printing of info messages from operator import add from collections import defaultdict from typing import Dict from Chapter02.utilities02_py.helper_python import extract_raw_records, parse_raw_warc input = "/Users/a/CC-MAIN-20191013195541-20191013222541-00000.warc" warc_records = extract_raw_records( input, spark).flatMap(lambda record: parse_raw_warc(record)) # print(warc_records.count()) keyed_by_language = warc_records.filter( lambda rec: rec.language != '').map(lambda rec: (rec.language, 1)) language_map: Dict[str, int] = keyed_by_language.reduceByKey( add).collectAsMap() ## language_list = keyed_by_language.reduceByKey(add).collect() ## language_map: Dict[str, int] = defaultdict(int) ## for key, value in language_list: ## ... language_map[key] += value ## language_map # warc_records.filter(lambda rec: rec.language != '').map(lambda rec: rec.language).countByValue() sorted_language_list = [