from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc
from pyspark.sql import SparkSession
import time
from time import gmtime, strftime


def heavy_computation(record):
    time.sleep(0.2)  # simulate an expensive operation on a single record
    return record


if __name__ == "__main__":
    session: SparkSession = create_session(2, "Activity 2")
    raw_records = extract_raw_records(sample_warc_loc, session)
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))  # start timestamp
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))
    warc_records \
        .map(lambda record: heavy_computation(record)) \
        .foreach(lambda _: None)  # foreach is an action: it forces evaluation without collecting results
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))  # end timestamp
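The create_session helper from utilities01_py is used by every listing in this section but its body is not part of this excerpt. A minimal sketch, assuming it simply builds a local SparkSession with the requested number of worker threads and application name, could look like the following; only the signature is taken from the calls above, the body is an assumption.

from pyspark.sql import SparkSession


def create_session(threads: int, app_name: str) -> SparkSession:
    # hypothetical implementation: run locally with the given number of threads
    return SparkSession.builder \
        .master('local[' + str(threads) + ']') \
        .appName(app_name) \
        .getOrCreate()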
Example #2
from pyspark.rdd import RDD
from pyspark.sql import SparkSession

from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import sample_wet_loc

if __name__ == "__main__":
    session: SparkSession = create_session(2, "Default parsing")
    session.sparkContext.setLogLevel('ERROR')  # skips INFO messages
    records: RDD = session.sparkContext.textFile(sample_wet_loc)
    for record in records.take(50):
        print(record)
        print('-' * 20)
    print('#' * 40)
    print('Total # of records: ' + str(records.count()))
Example #3
from pyspark.sql import SparkSession

from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import extract_raw_records, parse_raw_warc, sample_warc_loc
from utilities03_py.helper_python import HeavyObject  # import path assumed; HeavyObject is not shown in this excerpt


def map_function(_):
    # with map(), a new HeavyObject is constructed for every single record
    new_heavy_object = HeavyObject('MapRecord')
    object_id = new_heavy_object.get_id()
    return object_id


def partition_function(partition):
    # with mapPartitions(), a single HeavyObject is constructed per partition and reused for all of its records
    new_heavy_object = HeavyObject('MapPartition')
    object_id = new_heavy_object.get_id()
    return iter([object_id for _ in partition])


if __name__ == "__main__":
    warc_loc = sample_warc_loc
    session: SparkSession = create_session(2, "PerRecordVsPerPartition")

    raw_records = extract_raw_records(warc_loc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))

    input_partitions = warc_records.getNumPartitions()
    number_of_records = warc_records.count()

    ids_of_map = warc_records \
        .map(map_function) \
        .count()
    print('@' * 50)

    ids_of_mappartition = warc_records \
        .mapPartitions(partition_function) \
        .count()
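HeavyObject itself is also not shown in this excerpt. Assuming its purpose is to make object construction visibly expensive, a sketch along the following lines would reproduce the effect this comparison relies on; the sleep, the uuid-based id, and the print are illustrative only.

import time
from uuid import uuid4


class HeavyObject:
    def __init__(self, label: str):
        time.sleep(1)  # simulate costly initialization, e.g. loading a model or opening a connection
        self._id = str(uuid4())
        print('Instantiated HeavyObject ' + self._id + ' in ' + label)

    def get_id(self) -> str:
        return self._id

With such a class, the map() variant pays the construction cost once per record, while the mapPartitions() variant pays it only once per partition.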
Example #4
from pyspark.rdd import RDD
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import sample_wet_loc

if __name__ == "__main__":
    session = create_session(2, "Proper crawl parsing")
    session.sparkContext.setLogLevel('ERROR')  # avoids printing of INFO messages

    # split the input on WARC record boundaries instead of on newlines
    hadoop_conf = {"textinputformat.record.delimiter": "WARC/1.0"}
    input_format_name = 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat'
    record_pairs: RDD = session.sparkContext \
        .newAPIHadoopFile(path=sample_wet_loc, inputFormatClass=input_format_name,
                          keyClass="org.apache.hadoop.io.LongWritable", valueClass="org.apache.hadoop.io.Text",
                          conf=hadoop_conf)
    record_texts = record_pairs.map(lambda pair: pair[1].strip())  # keep the record text, drop the byte-offset key

    for record in record_texts.take(5):
        print(record)
        print('-' * 20)
    print('#' * 40)
    print('Total # of records: ' + str(record_texts.count()))
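Because a WET file typically starts with the WARC/1.0 marker itself, the first split produced by this delimiter is usually empty. A hedged follow-up step, not part of the original listing, would drop such empty records before counting:

    non_empty_records = record_texts.filter(lambda text: len(text) > 0)
    print('Total # of non-empty records: ' + str(non_empty_records.count()))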
Example #5
from pyspark.rdd import RDD

from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import sample_wet_loc

if __name__ == "__main__":
    session = create_session(2, "Default crawl parsing")
    session.sparkContext.setLogLevel('ERROR')  # skips INFO messages
    records: RDD = session.sparkContext.textFile(sample_wet_loc)
    for record in records.take(50):
        print(record)
        print('-' * 20)
    print('#' * 40)
    print('Total # of records: ' + str(records.count()))
Example #6
import datetime
import threading
import time

from pyspark.sql import SparkSession
from utilities01_py.helper_python import create_session
from utilities02_py.helper_python import extract_raw_records, parse_raw_warc


def fall_asleep(record):
    # the attribute holding the record's target URI is assumed from the Scala excerpt below (record.targetURI)
    current_uri = record.target_uri
    start_time = str(datetime.datetime.now())
    thread_id = str(threading.current_thread().ident)
    print('@@1 falling asleep in thread ' + thread_id + ' at ' + start_time +
          ' accessing ' + current_uri)
    time.sleep(2)
    end_time = str(datetime.datetime.now())
    print('@@2 awakening in thread ' + thread_id + ' at ' + end_time +
          ' accessing ' + current_uri)
    return thread_id, current_uri


if __name__ == "__main__":
    # main method of Exercise4_01.py comes here
    input_warc = "/Users/a/Desktop/Buch/CC-MAIN-20191013195541-20191013222541-00000.warc"  # ToDo: Change
    session: SparkSession = create_session(3, "Wave exploration")

    raw_records = extract_raw_records(input_warc, session)
    warc_records = raw_records \
        .flatMap(lambda record: parse_raw_warc(record))

    thread_ids = warc_records.map(lambda record: fall_asleep(record))
    print(thread_ids.count())  # count() is the action that actually triggers fall_asleep on every record

# Scala equivalent of the mapping above (excerpt):
# val threadIdsRDD: RDD[(Long, Long)] = warcRecords
#   .map(record => {
#     val currentUri = record.targetURI
#     val startTime = LocalDateTime.now()
#     val threadId: Long = Thread.currentThread().getId
#     println(s"@@1 falling asleep in thread $threadId at $startTime accessing $currentUri")
#     Thread.sleep(2000)