Example #1
0
def prepare_count(df, direction):
    """
    Prepare flight-count tables per airport and per state for one direction.

    Flights whose 'CANCELLED' value equals 1 are left out. For "DEPARTURE" the
    'ORIGIN' column is counted into 'ORIGIN_COUNT'; for "ARRIVAL" the 'DEST'
    column is counted into 'DEST_COUNT'. The counting, joining and grouping are
    delegated to the project helpers count(), merge() and aggregate().
    @param df: input flight dataFrame (must provide 'CANCELLED' and the
        airport column selected by the direction)
    @type df: pd.DataFrame
    @param direction: constants.DIRECTION_DEPARTURE or
        constants.DIRECTION_ARRIVAL
    @type direction: str
    @return: a pair (airport-level table, state-level table); the first is the
        airport data joined with the counts with incomplete rows dropped, the
        second is that table aggregated over 'iso_region'
    @rtype: tuple of (pd.DataFrame, pd.DataFrame)
    """
    assert isinstance(df, pd.DataFrame)
    assert isinstance(direction, str)
    assert direction == constants.DIRECTION_ARRIVAL or direction == constants.DIRECTION_DEPARTURE

    # Select which airport column is counted and how the count column is named.
    if direction == constants.DIRECTION_DEPARTURE:
        airport_col, count_col = 'ORIGIN', "ORIGIN_COUNT"
    elif direction == constants.DIRECTION_ARRIVAL:
        airport_col, count_col = 'DEST', "DEST_COUNT"

    not_cancelled = df['CANCELLED'] != 1
    flights = df[not_cancelled]
    airports = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    per_airport = count(flights, airport_col, count_col)
    # Join the counts onto the airport table, then discard rows with gaps.
    airport_counts = merge(airports, per_airport, 'iata_code', airport_col)
    airport_counts = airport_counts.dropna()
    state_counts = aggregate(airport_counts, 'iso_region', count_col)
    return airport_counts, state_counts
Example #2
0
def total_delay(df):
    """
    Calculate the average total delay (DEP_DELAY + ARR_DELAY) per airline.

    Rows whose 'CANCELLED' value equals 1 are excluded. The per-carrier flight
    counts, delay totals, join and final averaging are delegated to the project
    helpers count(), aggregate(), merge() and average(). The input dataFrame is
    only read, never written to.
    @param df: input airline delay dataFrame; must provide the columns
        'CANCELLED', 'OP_CARRIER', 'DEP_DELAY' and 'ARR_DELAY'
    @type df: pd.DataFrame
    @return: the dataFrame produced by average() from the per-carrier
        'total delay' and 'counts' columns
    @rtype: pd.DataFrame
    """
    assert isinstance(df, pd.DataFrame)

    flown = df[df['CANCELLED'] != 1]
    # Build the two-column frame directly instead of assigning a new column
    # onto the filtered slice: assigning into a slice of the caller's frame
    # triggers SettingWithCopyWarning and is not guaranteed to behave the same
    # across pandas versions.
    df_cur_valid = pd.DataFrame({
        'OP_CARRIER': flown['OP_CARRIER'],
        'total delay': flown['DEP_DELAY'] + flown['ARR_DELAY'],
    })
    # carrier counts
    df_airline_counts = count(df_cur_valid, 'OP_CARRIER', 'counts')
    # carrier total delay
    df_airline_delay = aggregate(df_cur_valid, 'OP_CARRIER', 'total delay')
    # merge totals and counts on the carrier code
    df_airline_merge_delay = merge(df_airline_delay, df_airline_counts,
                                   'OP_CARRIER', 'OP_CARRIER')
    # average
    df_airline_avg_delay = average(df_airline_merge_delay, 'total delay',
                                   'counts')

    return df_airline_avg_delay
Example #3
0
def count_cancellation_by_airport():
    """
    Collect flight-volume and cancellation statistics for every year in
    constants.YEAR_LIST.

    For each year the flight data is loaded with get_flight_data_by_year() and
    joined to the airport table (via the project helper merge()) on
    'iata_code' / 'ORIGIN'. Two tables are produced per year:
      - the number of rows per ('iso_region', 'month'), where 'month' is the
        second '-'-separated field of 'FL_DATE' (presumably 'YYYY-MM-DD';
        verify against the data loader);
      - the rows whose 'CANCELLED' value is not 0, reduced to 'FL_DATE',
        'iso_region' and 'CANCELLATION_CODE', with 'FL_DATE' truncated at its
        last '-'.
    The per-year tables are concatenated once after the loop; each per-year
    table keeps its own row index in the result.
    @return: (all_records, cancel_records, code_a, code_b, code_c, code_d)
        where the first two are the concatenated dataFrames (empty dataFrames
        when constants.YEAR_LIST is empty) and code_a..code_d are lists with
        one entry per year holding the number of cancelled rows whose
        'CANCELLATION_CODE' is 'A', 'B', 'C' and 'D' respectively
    @rtype: tuple
    """
    code_a = []
    code_b = []
    code_c = []
    code_d = []
    # Per-year frames are gathered in plain lists and concatenated once at the
    # end: DataFrame.append was removed in pandas 2.0 and, used in a loop,
    # re-copies everything accumulated so far on every iteration.
    all_frames = []
    cancel_frames = []

    df_us_airport = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    for year in constants.YEAR_LIST:
        df_cur = get_flight_data_by_year(year, [])

        # assign() returns a new frame, so 'month' is never written into a
        # column slice of df_cur (avoids SettingWithCopyWarning).
        df_all = df_cur[['FL_DATE', 'ORIGIN']].assign(
            month=df_cur['FL_DATE'].str.split('-').str[1])
        df_all = merge(df_us_airport, df_all, 'iata_code', 'ORIGIN')
        df_all = df_all[['iso_region', 'month']].dropna()
        df_all = df_all.groupby(['iso_region',
                                 'month']).size().reset_index(name='counts')
        all_frames.append(df_all)

        df_cancel = df_cur[df_cur['CANCELLED'] != 0]
        df_cancel = merge(df_us_airport, df_cancel, 'iata_code', 'ORIGIN')
        df_cancel = df_cancel[['FL_DATE', 'iso_region',
                               'CANCELLATION_CODE']].dropna()
        # Drop everything after the last '-' of the date string.
        df_cancel = df_cancel.assign(
            FL_DATE=df_cancel['FL_DATE'].str.rsplit(pat='-', n=1).str[0])

        # Number of cancelled rows per cancellation code for this year.
        codes = df_cancel['CANCELLATION_CODE']
        code_a.append(int((codes == 'A').sum()))
        code_b.append(int((codes == 'B').sum()))
        code_c.append(int((codes == 'C').sum()))
        code_d.append(int((codes == 'D').sum()))
        cancel_frames.append(df_cancel.reset_index(drop=True))

    # pd.concat raises on an empty list, so fall back to an empty frame.
    all_records = pd.concat(all_frames) if all_frames else pd.DataFrame()
    cancel_records = (pd.concat(cancel_frames)
                      if cancel_frames else pd.DataFrame())
    return all_records, cancel_records, code_a, code_b, code_c, code_d
Example #4
0
def plot_airline_routes():
    """
    Show a 3x4 grid of pie charts, one per airline code in
    constants.AIRLINE_CODES_STILL_WORKING, built from the 2018 flight data.

    Each pie uses the 'Region' and 'route_counts' columns returned by
    get_airline_route_by_state() for that airline; subplot titles come from
    constants.AIRLINE_FULLNAME_MAP.
    :return: None (the figure is displayed with fig.show())
    """
    n_rows, n_cols = 3, 4

    raw_airports = pd.read_csv(constants.AIRPORT_DATA_PATH)
    us_airports = extract_us_airport(raw_airports)
    flights_2018 = get_flight_data_by_year(2018,
                                           ['OP_CARRIER', 'ORIGIN', 'DEST'])
    flights_by_origin = merge(us_airports, flights_2018, 'iata_code', 'ORIGIN')
    flights_by_dest = merge(us_airports, flights_2018, 'iata_code', 'DEST')

    airlines = constants.AIRLINE_CODES_STILL_WORKING
    titles = [constants.AIRLINE_FULLNAME_MAP[code] for code in airlines]
    # Every grid cell is declared as a "domain" subplot, the type used here
    # for the pie traces.
    pie_specs = [[{"type": "domain"} for _ in range(n_cols)]
                 for _ in range(n_rows)]

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        specs=pie_specs,
        subplot_titles=titles,
        vertical_spacing=0.01,
    )

    for position, airline in enumerate(airlines):
        # Fill the grid row by row; subplot coordinates are 1-based.
        grid_row, grid_col = divmod(position, n_cols)
        region_counts = get_airline_route_by_state(flights_by_origin,
                                                   flights_by_dest, airline)
        pie = go.Pie(
            labels=region_counts['Region'],
            values=region_counts['route_counts'],
        )
        fig.add_trace(pie, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(
        autosize=False,
        width=900,
        height=800,
        title_text=
        '2018 US Domestic Airline Origin and Destination Statistics (by Region)',
        margin=dict(b=0),
    )
    fig.show()
Example #5
0
def get_airline_route_by_state(df_origin, df_dest, airline):
    """
    Compute how one airline's flights are distributed over US regions.

    For the given carrier, the occurrences of each airport in the 'ORIGIN'
    column of df_origin and in the 'DEST' column of df_dest are counted, the
    two counts are joined per airport and added up as 'route_counts'. The
    result is joined with the airport table to obtain 'iso_region', aggregated
    per 'iso_region', mapped to a 'Region' through the region-division table
    (matching 'iso_region' against 'State Code'), and aggregated per 'Region'.
    Joins and aggregations other than the final pd.merge are done by the
    project helpers merge() and aggregate().
    @param df_origin: the input origin dataFrame (needs 'OP_CARRIER', 'ORIGIN')
    @type df_origin: pd.DataFrame
    @param df_dest: the input destination dataFrame (needs 'OP_CARRIER', 'DEST')
    @type df_dest: pd.DataFrame
    @param airline: the given airline code, e.g. "AA"
    @type airline: str
    @return: dataFrame of 'route_counts' aggregated per 'Region' for this
        airline
    @rtype: pd.DataFrame
    """
    assert isinstance(df_origin, pd.DataFrame)
    assert isinstance(df_dest, pd.DataFrame)
    assert isinstance(airline, str)

    def _counts_per_airport(flights, airport_col, counts_col):
        # Occurrences of each airport code among this airline's rows, as a
        # two-column frame: 'iata_code' and the requested counts column.
        carrier_rows = flights[flights['OP_CARRIER'] == airline]
        tally = carrier_rows[airport_col].value_counts()
        return tally.rename_axis('iata_code').reset_index(name=counts_col)

    region_table = read_csv_file(constants.US_REGION_DIVISION_DATA_PATH)
    airport_table = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    origin_cnts = _counts_per_airport(df_origin, 'ORIGIN', 'origin_counts')
    dest_cnts = _counts_per_airport(df_dest, 'DEST', 'dest_counts')

    per_airport = merge(origin_cnts, dest_cnts, 'iata_code', 'iata_code')
    per_airport['route_counts'] = (per_airport['origin_counts'] +
                                   per_airport['dest_counts'])

    # Attach the region code of every airport and keep only what is needed.
    located = merge(per_airport, airport_table, 'iata_code', 'iata_code')
    located = located[['iata_code', 'route_counts', 'iso_region']]

    per_state = aggregate(located, 'iso_region', 'route_counts')
    per_state_with_region = pd.merge(per_state,
                                     region_table,
                                     left_on='iso_region',
                                     right_on='State Code')
    return aggregate(per_state_with_region, 'Region', 'route_counts')
Example #6
0
def prepare_delay(df, direction):
    """
    Prepare average-delay tables per airport and per state for one direction.

    Flights whose 'CANCELLED' value equals 1 are left out. For "DEPARTURE" the
    'ORIGIN' / 'DEP_DELAY' columns are used with the count column
    'ORIGIN_COUNT'; for "ARRIVAL" the 'DEST' / 'ARR_DELAY' columns with
    'DEST_COUNT'. Only airports whose count column exceeds 50 are kept.
    Counting, totalling, joining and averaging are delegated to the project
    helpers count(), aggregate(), merge() and average().
    @param df: input flight dataFrame
    @type df: pd.DataFrame
    @param direction: constants.DIRECTION_DEPARTURE or
        constants.DIRECTION_ARRIVAL
    @type direction: str
    @return: a pair (airport-level delay table, state-level delay table), both
        as returned by average()
    @rtype: tuple of (pd.DataFrame, pd.DataFrame)
    """
    assert isinstance(df, pd.DataFrame)
    assert isinstance(direction, str)
    assert direction == constants.DIRECTION_ARRIVAL or direction == constants.DIRECTION_DEPARTURE

    if direction == constants.DIRECTION_DEPARTURE:
        airport_type = 'ORIGIN'
        delay_type = 'DEP_DELAY'
        count_type = "ORIGIN_COUNT"
    elif direction == constants.DIRECTION_ARRIVAL:
        airport_type = 'DEST'
        delay_type = 'ARR_DELAY'
        count_type = "DEST_COUNT"

    df_delay = df[df['CANCELLED'] != 1]
    df_us_airport = read_csv_file(constants.CLEANED_AIRPORT_DATA_PATH)

    # per-airport flight counts
    df_cnts = count(df_delay, airport_type, count_type)

    # per-airport delay totals
    df_delay_cnts = aggregate(df_delay, airport_type, delay_type)

    # per-airport counts and delay totals side by side
    df_delay_cnts = merge(df_cnts, df_delay_cnts, airport_type, airport_type)
    # keep only airports with more than 50 counted flights
    df_delay_cnts = df_delay_cnts[df_delay_cnts[count_type] > 50]

    # airport info joined with counts and delay totals, incomplete rows dropped
    df_delay_by_airport = merge(df_us_airport, df_delay_cnts, 'iata_code',
                                airport_type).dropna()

    # Per-state delay total and flight count in a single pass. The named
    # reduction .sum() is used instead of passing the builtin `sum` to agg(),
    # which pandas >= 2.1 flags with a FutureWarning.
    df_delay_cnts_by_state = df_delay_by_airport \
        .groupby('iso_region')[[delay_type, count_type]] \
        .sum() \
        .reset_index()

    # per-state average delay
    df_delay_by_state = average(df_delay_cnts_by_state, delay_type, count_type)

    # per-airport average delay
    df_delay_by_airport = average(df_delay_by_airport, delay_type, count_type)

    return df_delay_by_airport, df_delay_by_state