Beispiel #1
0
def reducer(stream):
    """Sum the counts received for each airport and emit one total per airport.

    Reads ``(airport, count)`` pairs from ``stream`` through
    ``mapred.iter_key_values``, converts every count with ``int`` and
    accumulates it per airport. Once the input is exhausted, each airport is
    sent exactly once, together with its accumulated total, via
    ``mapred.send``.

    All totals are kept in memory until the whole stream has been consumed.
    """
    airport_acc = {}
    for airport, count in mapred.iter_key_values(stream):
        airport_acc[airport] = airport_acc.get(airport, 0) + int(count)

    # Iterate over the keys instead of calling ``iteritems()``: that method
    # only exists on Python 2, whereas key iteration behaves the same (and
    # copies nothing) on both Python 2 and Python 3.
    for airport in airport_acc:
        mapred.send(airport, airport_acc[airport])
Beispiel #2
0
def mapper_step2(stream):
    """Store every connecting two-leg itinerary found in the input stream.

    This mapper receives records from the SuperSpecificCartesianInputFormat,
    aka the cartesian product of the morning flights and +2 days afternoon
    flights. It simply has to keep the pairs where dest1 == origin2.

    Each record read through ``mapred.iter_key_values`` is a
    ``(first_leg, second_leg)`` pair; both legs unpack to seven fields:
    date, origin, destination, delay, time, carrier and flight. For every
    pair whose first destination equals the second origin, one ``Q3Entry``
    row is created with the summed delay of both legs.

    Nothing is emitted downstream; the only output is the rows written to
    Cassandra.
    """
    from decimal import Decimal

    from cassandra.cluster import Cluster
    from cassandra.cqlengine import connection

    from setup_cassandradb import Q3Entry

    cluster = Cluster(['node7', 'node8', 'node9'])
    try:
        connection.register_connection('con',
                                       session=cluster.connect(),
                                       default=True)

        for first_leg, second_leg in mapred.iter_key_values(stream):
            date1, origin1, dest1, delay1, time1, carrier1, flight1 = first_leg
            date2, origin2, dest2, delay2, time2, carrier2, flight2 = second_leg
            if dest1 == origin2:
                total_delay = float(delay1) + float(delay2)
                # NOTE(review): Decimal(float) keeps the exact binary value of
                # the float, so the stored number may carry a long fractional
                # tail -- confirm this is the intended precision.
                Q3Entry.create(date_origin_dest1='_'.join([date1, origin1, dest1]),
                               date1=date1,
                               origin=origin1,
                               dest1=dest1,
                               dest2=dest2,
                               total_delay=Decimal(total_delay),
                               datetime1=date1 + ' ' + time1,
                               flight1=carrier1 + ' ' + flight1,
                               datetime2=date2 + ' ' + time2,
                               flight2=carrier2 + ' ' + flight2)
    finally:
        # Always release the driver's connections, even when a record is
        # malformed or a write fails.
        cluster.shutdown()
Beispiel #3
0
def load_db(stream):
    """Load the ``(airport, count)`` records of ``stream`` into DynamoDB.

    Records are read through ``mapred.iter_key_values`` and written with a
    batch writer to the ``ccproject_q1.1`` table in region ``us-east-1``.
    Every item carries a constant ``dummy_key`` of 1, the airport, and the
    count converted with ``int``.
    """
    import boto3

    resource = boto3.resource('dynamodb', region_name='us-east-1')
    target = resource.Table('ccproject_q1.1')
    with target.batch_writer() as writer:
        for airport, count in mapred.iter_key_values(stream):
            item = {
                'dummy_key': 1,
                'airport': airport,
                'count': int(count),
            }
            writer.put_item(Item=item)
Beispiel #4
0
def load_db(stream):
    """Load the ``(dow, (delay, _))`` records of ``stream`` into DynamoDB.

    Records are read through ``mapred.iter_key_values`` and written with a
    batch writer to the ``ccproject_q1.3`` table in region ``us-east-1``.
    Every item carries a constant ``dummy_key`` of 1, the ``dow`` key and the
    delay converted to ``Decimal``; the second element of each value is
    ignored.
    """
    # Decimal is imported locally because this function uses it and nothing
    # else in the block brings it into scope (previously a NameError).
    from decimal import Decimal

    import boto3

    dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
    table = dynamodb.Table('ccproject_q1.3')
    with table.batch_writer() as batch:
        for dow, (delay, _) in mapred.iter_key_values(stream):
            # NOTE(review): if ``delay`` arrives as a float rather than a
            # string, Decimal(delay) yields its exact binary expansion --
            # confirm the upstream value type.
            batch.put_item(Item={
                'dummy_key': 1,
                'dow': dow,
                'delay': Decimal(delay),
            })
Beispiel #5
0
def load_db(stream):
    """Load the ``((origin, dest), (delay, _))`` records into Cassandra.

    Records are read through ``mapred.iter_key_values``; for each one a
    ``Q22Entry`` row is created with the origin, the destination and the
    delay converted to ``Decimal``. The second element of each value is
    ignored.

    The cluster connection is opened for the duration of the load and shut
    down afterwards, whether or not the load succeeds.
    """
    from decimal import Decimal

    from cassandra.cluster import Cluster
    from cassandra.cqlengine import connection

    from setup_cassandradb import Q22Entry

    cluster = Cluster(['node7', 'node8', 'node9'])
    try:
        connection.register_connection('con',
                                       session=cluster.connect(),
                                       default=True)

        for (origin, dest), (delay, _) in mapred.iter_key_values(stream):
            Q22Entry.create(origin=origin, dest=dest, delay=Decimal(delay))
    finally:
        # Always release the driver's connections, even when a record is
        # malformed or a write fails.
        cluster.shutdown()
Beispiel #6
0
def reducer_step1(stream):
    """Keep, per key and route, the leg with the smallest delay.

    This implements the first reduction step. The output is keyed by
    day/day-part so that the specific KeySplitOutputFormat can split the
    files.

    Each value read through ``mapred.iter_key_values`` unpacks to seven
    fields, of which the second, third and fourth are the origin, the
    destination and the delay. For every ``(key, origin, dest)`` combination
    only the value with the lowest delay (compared as ``float``) is retained;
    on a tie the first value seen wins. After the stream is exhausted, each
    retained value is sent under its original key via ``mapred.send``.
    """

    least_delay_legs = {}
    for key, value in mapred.iter_key_values(stream):
        _, origin, dest, delay, _, _, _ = value
        leg_key = (key, origin, dest)
        if leg_key not in least_delay_legs:
            least_delay_legs[leg_key] = value
        else:
            _, _, _, best_delay, _, _, _ = least_delay_legs[leg_key]
            # Strict comparison: an equally delayed later leg does not
            # replace the one already stored.
            if float(delay) < float(best_delay):
                least_delay_legs[leg_key] = value

    # Iterate over the keys instead of calling ``iteritems()``: that method
    # only exists on Python 2, whereas key iteration behaves the same (and
    # copies nothing) on both Python 2 and Python 3.
    for leg_key in least_delay_legs:
        mapred.send(leg_key[0], least_delay_legs[leg_key])