from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc
from pyspark.sql import SparkSession
import time
from time import gmtime, strftime


def heavy_computation(record):
    time.sleep(0.2)  # simulate an expensive per-record operation
    return record


if __name__ == "__main__":
    session: SparkSession = create_session(2, "Activity 2")

    raw_records = extract_raw_records(sample_warc_loc, session)
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))  # timestamp before the job starts
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))

    warc_records \
        .map(lambda record: heavy_computation(record)) \
        .foreach(lambda _: None)  # foreach forces evaluation without collecting results
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))  # timestamp after the job finishes
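The listings in this section call a create_session(cores, app_name) helper from utilities01_py.helper_python whose body is not shown here. A minimal sketch, assuming it only builds a local SparkSession with the requested number of worker threads:

from pyspark.sql import SparkSession


def create_session(num_threads: int, app_name: str) -> SparkSession:
    # Assumed implementation: a local session whose parallelism matches num_threads.
    return SparkSession.builder \
        .master('local[{}]'.format(num_threads)) \
        .appName(app_name) \
        .getOrCreate()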
from pyspark.rdd import RDD
from pyspark.sql import SparkSession
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import sample_wet_loc

if __name__ == "__main__":
    session: SparkSession = create_session(2, "Default parsing")
    session.sparkContext.setLogLevel('ERROR')  # skips INFO messages

    records: RDD = session.sparkContext.textFile(sample_wet_loc)
    for record in records.take(50):
        print(record)
        print('-' * 20)
    print('#' * 40)
    print('Total # of records: ' + str(records.count()))
from pyspark.sql import SparkSession
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc
# HeavyObject is expected to come from the chapter's utilities; it is not defined in this
# listing (an illustrative sketch follows after it).


def map_function(_):
    # map(): one HeavyObject is constructed for every single record
    new_heavy_object = HeavyObject('MapRecord')
    object_id = new_heavy_object.get_id()
    return object_id


def partition_function(partition):
    # mapPartitions(): one HeavyObject is constructed per partition and reused for all its records
    new_heavy_object = HeavyObject('MapPartition')
    object_id = new_heavy_object.get_id()
    return iter([object_id for _ in partition])


if __name__ == "__main__":
    warc_loc = sample_warc_loc
    session: SparkSession = create_session(2, "PerRecordVsPerPartition")

    raw_records = extract_raw_records(warc_loc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))
    input_partitions = warc_records.getNumPartitions()
    number_of_records = warc_records.count()

    ids_of_map = warc_records \
        .map(map_function) \
        .count()
    print('@' * 50)
    ids_of_mappartition = warc_records \
        .mapPartitions(partition_function) \
        .count()  # assumed terminal action, mirroring the map() branch above
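The HeavyObject class used above is not shown in this section; the listing only relies on a costly constructor that takes a label and on a get_id() method. A minimal, purely illustrative sketch under those assumptions:

import os
import threading
import time


class HeavyObject:
    """Illustrative stand-in for an object that is expensive to construct."""

    def __init__(self, label: str):
        time.sleep(1)  # simulate a costly initialization, e.g. opening a connection
        self._id = '{}-{}-{}'.format(label, os.getpid(), threading.current_thread().ident)

    def get_id(self) -> str:
        return self._id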
from pyspark.rdd import RDD
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import sample_wet_loc

if __name__ == "__main__":
    session = create_session(2, "Proper crawl parsing")
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of INFO messages

    # Split the input on the "WARC/1.0" marker instead of on newlines, so that each
    # element of the RDD is a whole WARC record rather than a single line of text.
    hadoop_conf = {"textinputformat.record.delimiter": "WARC/1.0"}
    input_format_name = 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat'
    record_pairs: RDD = session.sparkContext \
        .newAPIHadoopFile(path=sample_wet_loc,
                          inputFormatClass=input_format_name,
                          keyClass="org.apache.hadoop.io.LongWritable",
                          valueClass="org.apache.hadoop.io.Text",
                          conf=hadoop_conf)
    record_texts = record_pairs.map(lambda pair: pair[1].strip())

    for record in record_texts.take(5):
        print(record)
        print('-' * 20)
    print('#' * 40)
    print('Total # of records: ' + str(record_texts.count()))
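If the file itself begins with the "WARC/1.0" marker, the split above usually yields an empty first fragment. A small, optional continuation of the main block above that drops such empties before counting (names are illustrative):

    # Continues the main block above: filter out empty fragments produced by the delimiter split.
    non_empty_records = record_texts.filter(lambda text: len(text) > 0)
    print('Total # of non-empty records: ' + str(non_empty_records.count()))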
from pyspark.rdd import RDD
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import sample_wet_loc

if __name__ == "__main__":
    session = create_session(2, "Default crawl parsing")
    session.sparkContext.setLogLevel('ERROR')  # skips INFO messages

    records: RDD = session.sparkContext.textFile(sample_wet_loc)
    for record in records.take(50):
        print(record)
        print('-' * 20)
    print('#' * 40)
    print('Total # of records: ' + str(records.count()))
import datetime
import threading
import time

from pyspark.sql import SparkSession
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import extract_raw_records, parse_raw_warc


def fall_asleep(record):
    current_uri = record.target_uri  # attribute name assumed; the Scala sketch below uses record.targetURI
    start_time = str(datetime.datetime.now())
    # thread_id = str(threading.currentThread().daemon.real)
    thread_id = str(threading.currentThread().ident)
    print('@@1 falling asleep in thread ' + thread_id + ' at ' + start_time + ' accessing ' + current_uri)
    time.sleep(2)
    end_time = str(datetime.datetime.now())
    print('@@2 awakening in thread ' + thread_id + ' at ' + end_time + ' accessing ' + current_uri)
    return thread_id, current_uri


if __name__ == "__main__":  # main method of Exercise4_01.py comes here
    input_warc = "/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc"  # ToDo: Change
    session: SparkSession = create_session(3, "Wave exploration")

    raw_records = extract_raw_records(input_warc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))
    thread_ids = warc_records.map(lambda record: fall_asleep(record))
    print(thread_ids.count())

# Scala equivalent of the mapping step:
# val threadIdsRDD: RDD[(Long, Long)] = warcRecords
#   .map(record => {
#     val currentUri = record.targetURI
#     val startTime = LocalDateTime.now()
#     val threadId: Long = Thread.currentThread().getId
#     println(s"@@1 falling asleep in thread $threadId at $startTime accessing $currentUri")
#     Thread.sleep(2000)
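A small, optional continuation of the main block in the listing above: grouping the (thread_id, uri) pairs shows how many records each executor thread handled, and hence the degree of parallelism per wave. The variable names are illustrative.

    # Continues the main block above: count records per thread id to see the parallelism.
    per_thread_counts = thread_ids \
        .map(lambda pair: (pair[0], 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .collect()
    print('Records handled per thread: ' + str(per_thread_counts))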