def reducer(stream):
    """Sum the counts received for each airport and emit one total per airport.

    Every ``(airport, count)`` pair yielded by
    ``mapred.iter_key_values(stream)`` is accumulated in an in-memory dict;
    ``count`` is converted with ``int`` before being added. Once the stream
    is exhausted, each airport is passed to ``mapred.send`` together with
    its accumulated total.
    """
    airport_acc = {}
    for airport, count in mapred.iter_key_values(stream):
        airport_acc[airport] = airport_acc.get(airport, 0) + int(count)

    # ``items()`` is available on both Python 2 and Python 3, whereas
    # ``iteritems()`` only exists on Python 2.
    for airport, count in airport_acc.items():
        mapred.send(airport, count)
def mapper_step2(stream):
    """Store every connecting pair of flight legs in Cassandra.

    This mapper receives records from the SuperSpecificCartesianInputFormat,
    aka the cartesian product of the morning flights and +2 days afternoon
    flights. It simply has to reduce the input to the pairs where
    dest1 == origin2.

    Each record yielded by ``mapred.iter_key_values(stream)`` is a
    ``(first_leg, second_leg)`` pair, and each leg unpacks into seven fields:
    date, origin, dest, delay, time, carrier, flight. For every pair whose
    first destination equals the second origin, one ``Q3Entry`` row is
    created; nothing is emitted back to the stream.
    """
    from decimal import Decimal

    from cassandra.cluster import Cluster
    from cassandra.cqlengine import connection
    from setup_cassandradb import Q3Entry

    cluster = Cluster(['node7', 'node8', 'node9'])
    try:
        connection.register_connection('con', session=cluster.connect(),
                                       default=True)

        for first_leg, second_leg in mapred.iter_key_values(stream):
            date1, origin1, dest1, delay1, time1, carrier1, flight1 = first_leg
            date2, origin2, dest2, delay2, time2, carrier2, flight2 = second_leg

            # Only keep pairs where the second leg departs from the airport
            # the first leg arrived at.
            if dest1 != origin2:
                continue

            total_delay = float(delay1) + float(delay2)
            Q3Entry.create(
                date_origin_dest1='_'.join([date1, origin1, dest1]),
                date1=date1,
                origin=origin1,
                dest1=dest1,
                dest2=dest2,
                # Decimal(float) converts the binary float value exactly.
                total_delay=Decimal(total_delay),
                datetime1=date1 + ' ' + time1,
                flight1=carrier1 + ' ' + flight1,
                datetime2=date2 + ' ' + time2,
                flight2=carrier2 + ' ' + flight2,
            )
    finally:
        # Release the driver's connections even if the stream or an insert
        # raises part-way through.
        cluster.shutdown()
def load_db(stream):
    """Load the ``(airport, count)`` records of ``stream`` into DynamoDB.

    Records are read with ``mapred.iter_key_values`` and written through a
    batch writer to the ``ccproject_q1.1`` table in ``us-east-1``. Each item
    carries a constant ``dummy_key`` of 1, the airport, and the count
    converted to ``int``.
    """
    import boto3

    resource = boto3.resource('dynamodb', region_name='us-east-1')
    target = resource.Table('ccproject_q1.1')

    with target.batch_writer() as writer:
        for airport, count in mapred.iter_key_values(stream):
            item = {
                'dummy_key': 1,
                'airport': airport,
                'count': int(count),
            }
            writer.put_item(Item=item)
def load_db(stream):
    """Load the per-``dow`` delay records of ``stream`` into DynamoDB.

    Each record yielded by ``mapred.iter_key_values(stream)`` is expected to
    unpack as ``dow, (delay, _)``; the second element of the value is
    ignored. Items are written through a batch writer to the
    ``ccproject_q1.3`` table in ``us-east-1`` with a constant ``dummy_key``
    of 1, the ``dow`` key and the delay converted to ``Decimal``.
    """
    # Imported locally, like the sibling loaders do, so ``Decimal`` is
    # guaranteed to be in scope for this function.
    from decimal import Decimal

    import boto3

    dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
    table = dynamodb.Table('ccproject_q1.3')

    with table.batch_writer() as batch:
        for dow, (delay, _) in mapred.iter_key_values(stream):
            batch.put_item(Item={
                'dummy_key': 1,
                'dow': dow,
                'delay': Decimal(delay),
            })
def load_db(stream):
    """Load the per-route delay records of ``stream`` into Cassandra.

    Each record yielded by ``mapred.iter_key_values(stream)`` is expected to
    unpack as ``(origin, dest), (delay, _)``; the second element of the value
    is ignored. One ``Q22Entry`` row is created per record, with the delay
    converted to ``Decimal``.
    """
    from decimal import Decimal

    from cassandra.cluster import Cluster
    from cassandra.cqlengine import connection
    from setup_cassandradb import Q22Entry

    cluster = Cluster(['node7', 'node8', 'node9'])
    try:
        connection.register_connection('con', session=cluster.connect(),
                                       default=True)

        for (origin, dest), (delay, _) in mapred.iter_key_values(stream):
            Q22Entry.create(origin=origin, dest=dest, delay=Decimal(delay))
    finally:
        # Release the driver's connections even if the stream or an insert
        # raises part-way through.
        cluster.shutdown()
def reducer_step1(stream):
    """Keep, for each (key, origin, dest), the leg with the smallest delay.

    This implements the first reduction step. The output is keyed by
    day/day-part so that the specific KeySplitOutputFormat can split the
    files.

    Each value yielded by ``mapred.iter_key_values(stream)`` must unpack into
    seven fields, of which the second, third and fourth are used as origin,
    destination and delay. Delays are compared as ``float``. On a tie the leg
    seen first is kept, because only a strictly smaller delay replaces the
    stored one. After the stream is exhausted, every retained leg is sent
    under its original key through ``mapred.send``.
    """
    least_delay_legs = {}
    for key, value in mapred.iter_key_values(stream):
        _, origin, dest, delay, _, _, _ = value
        leg_key = (key, origin, dest)

        if leg_key not in least_delay_legs:
            least_delay_legs[leg_key] = value
            continue

        _, _, _, best_delay, _, _, _ = least_delay_legs[leg_key]
        if float(delay) < float(best_delay):
            least_delay_legs[leg_key] = value

    # ``items()`` is available on both Python 2 and Python 3, whereas
    # ``iteritems()`` only exists on Python 2.
    for (key, _, _), value in least_delay_legs.items():
        mapred.send(key, value)