def new_seg(origin, destination, airline, airline_ref_code, ym, pax, rev,
             unique_id, provider, loaded_from_file):
     ym_ym = YearMonth(ym)
     # Basic segment in the case where external_segment is sufficiently atomical and
     # corresponding new_segment_initial_data does not exist
     seg = NewSegmentInitialData(
         leg_origin=origin,
         origin=origin,
         trip_origin=origin,
         leg_destination=destination,
         destination=destination,
         trip_destination=destination,
         trip=[origin, destination],
         operating_airline=airline,
         operating_airline_ref_code=airline_ref_code,
         year_month=ym,
         year=ym_ym.year,
         month=ym_ym.month,
         passengers=pax,
         segment_revenue_usd=rev,
         od_revenue_usd=rev,
         cabin_class='EC',
         segment_split='Local',
         loaded_from_file=loaded_from_file,
         loaded_from_record=unique_id,
         record_ok=True,
         source='external_source',
         external_provider=provider,
         loaded_from_date=now)
     return seg
Ejemplo n.º 2
0
def launch_import(year_months, selected_providers):
    providers = {
        'USA': 'load_files_from_USA.py',
        'Brazil': 'load_files_from_Brazil.py',
        'Ireland': 'load_files_from_Ireland.py',
        'Colombia': 'load_files_from_Colombia.py',
        'Mexico': 'load_files_from_Mexico.py'
    }
    for prov in providers:
        if prov in selected_providers:
            os.system('python %s %s' % providers.get(prov),
                      YearMonth(year_months))
def get_data(csv_files, year_months):
    """
    Populate the database with data extract in csv files
    :return:
    """
    now = utcnow()
    ref_code.init_cache()
    airport_replacement = {
        "SBCD": "SSCC",
        "SWUY": "SBUY",
        "SBJI": "SWJI",
        "RJNN": "RJNA",
        "SBPM": "SBPJ",
        "SEQU": "SEQM",
        "SNQY": "SBJU",
        "SJDB": "SBDB",
        "SWJH": "SIZX",
        "SNNG": "SJNP",
        "SDFR": "SDDN",
        "1AON": "SDOW",
        "SMPB": "SMJP",
        "2NHT": "SBTC",
        "SWIQ": "SBMC",
        "SWKK": "SSKW",
        "SAIG": "SARI",
        "SBER": "SWEI"
    }
    airport_exclusions = ["SBNT", "SUPE", "6ASO", "SAMQ"]
    airline_replacements = {"VIP": "FPG", "BLC": "TAM"}

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for csv_f in csv_files:  # loop through each file
        print('******************** processed csv:  ', csv_f)
        with open('%s/%s' % (tmp_dir, csv_f)) as csv_file:
            dict_reader = csv.DictReader(csv_file)
            all_rows = len(
                list(csv.DictReader(open('%s/%s' % (tmp_dir, csv_f)))))
            row_nb = 0
            previous_data = pd.DataFrame(columns=[
                'origin', 'destination', 'year_month', 'airline', 'ref_code',
                'passengers'
            ])

            with External_Segment_Tmp.unordered_bulk(
                    1000, execute_callback=log_bulk) as bulk:

                for row in dict_reader:  # loop through each row (origin, destination) in file
                    row_nb += 1
                    for key, value in row.items():
                        if value == ':':
                            row[key] = ''

                    if ((row['PASSAGEIROS PAGOS'] == '0') and (row['PASSAGEIROS PAGOS'] == '0')) or \
                            (row['PASSAGEIROS PAGOS'] == ''):  # skip rows with no pax
                        continue

                    total_pax = int(row['PASSAGEIROS PAGOS']) + int(
                        row['PASSAGEIROS GRÁTIS'])

                    row_airline = get_airline_by_icao(row['EMPRESA (SIGLA)'],
                                                      row['EMPRESA (NOME)'],
                                                      total_pax)

                    if row['AEROPORTO DE ORIGEM (SIGLA)'] in airport_exclusions or \
                        row['AEROPORTO DE DESTINO (SIGLA)'] in airport_exclusions:  # skip exclusions
                        continue

                    airport_origin = get_airport_by_icao(
                        row['AEROPORTO DE ORIGEM (SIGLA)'],
                        row['AEROPORTO DE ORIGEM (NOME)'], total_pax)
                    airport_destination = get_airport_by_icao(
                        row['AEROPORTO DE DESTINO (SIGLA)'],
                        row['AEROPORTO DE DESTINO (NOME)'], total_pax)
                    if airport_destination is None:
                        continue
                    if airport_origin is None:
                        continue
                    if row_airline in airline_replacements:
                        row_airline = airline_replacements.get(row_airline)
                    if airport_origin in airport_replacement:
                        airport_origin = airport_replacement.get(
                            airport_origin)
                    if airport_destination in airport_replacement:
                        airport_destination = airport_replacement.get(
                            airport_destination)

                    year_month = '%04d-%02d' % (int(row['ANO']), int(
                        row['MÊS']))
                    # Only treat the requested year_months
                    if year_month not in year_months:
                        continue

                    if year_month not in previous_data['year_month'].values:
                        if External_Segment_Tmp.find_one({
                                'year_month': year_month,
                                'provider': provider
                        }):
                            log.warning(
                                "This year_month (%s) already exists for provider %s",
                                year_month, provider)

                    airline_ref_code = ref_code.get_airline_ref_code(
                        row_airline, airport_origin, airport_destination,
                        YearMonth(year_month))

                    if ((previous_data['origin'] == airport_origin) &
                        (previous_data['destination'] == airport_destination)
                            & (previous_data['year_month'] == year_month)
                            & (previous_data['airline'] == row_airline)
                            &
                        (previous_data['ref_code'] == airline_ref_code)).any():
                        new_row = False
                        # Add to Excel file's total_pax the number of passengers you get from
                        # filtering previous_data on other columns
                        total_pax += int(previous_data['passengers'][
                            (previous_data['origin'] == airport_origin)
                            & (previous_data['destination'] ==
                               airport_destination)
                            & (previous_data['year_month'] == year_month)
                            & (previous_data['airline'] == row_airline)
                            & (previous_data['ref_code'] == airline_ref_code)])
                    else:
                        new_row = True

                    dic = dict(provider=provider,
                               data_type='airport',
                               airline=[row_airline],
                               airline_ref_code=[airline_ref_code],
                               origin=[airport_origin],
                               destination=[airport_destination],
                               year_month=[year_month],
                               total_pax=total_pax,
                               overlap=[],
                               raw_rec=dict(row),
                               both_ways=False,
                               from_line=row_nb,
                               from_filename=csv_f,
                               url=full_url)

                    new_data = pd.Series({
                        'origin': airport_origin,
                        'destination': airport_destination,
                        'year_month': year_month,
                        'airline': row_airline,
                        'ref_code': airline_ref_code,
                        'passengers': total_pax
                    }).to_frame()
                    if new_row:
                        previous_data = previous_data.append(new_data.T,
                                                             ignore_index=True)
                    else:
                        previous_data['passengers'][
                            (previous_data['origin'] == airport_origin)
                            & (previous_data['destination'] ==
                               airport_destination) &
                            (previous_data['airline'] == row_airline) &
                            (previous_data['ref_code'] == airline_ref_code) &
                            (previous_data['year_month'] == year_month
                             )] = total_pax  # Modify previous_data's pax

                    query = dict(
                        (k, dic[k])
                        for k in ('origin', 'destination', 'year_month',
                                  'provider', 'data_type', 'airline'))
                    bulk.find(query).upsert().update_one({
                        '$set':
                        dic,
                        '$setOnInsert':
                        dict(inserted=now)
                    })
                    if row_nb % 1000 == 0:
                        print('{0:.3g}'.format(
                            float(row_nb) / float(all_rows) * 100) + '%')
            log.info('stored: %r', bulk.nresult)
def get_data(xlsx_files):
    """
    Populate the database with data extract in xlsx files
    :return:
    """
    now = utcnow()
    ref_code.init_cache()
    airport_replacement = {"BUE": "EZE", "RIO": "GIG", "LMC": "LMC", "LMA": "MCJ", "VGP": "VGZ", "PTL": "PTX",
                           "MIL": "MXP", "LON": "LHR", "SAO": "CGH", "BSL": "BSL", "TRP": "TCD", "RLB": "LIR",
                           "NYC": "JFK", "GTK": "FRS", "AWH": "USH", "STO": "ARN", "WAS": "IAD", "BHZ": "PLU"}

    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    for xlsx_f in xlsx_files:  # loop through each file
        print('******************** processing Excel file:', xlsx_f)
        xls = pd.read_excel(tmp_dir + "/" + xlsx_f)
        header = np.where(xls.loc[:, :] == "Pasajeros")[0] + 1  # Look for column names
        xls = pd.read_excel(tmp_dir + "/" + xlsx_f, header=header)  # Re-load file with headers
        xls = format_columns(xls)
        # Create a dataframe to save data line after line, so we can check later on
        previous_data = pd.DataFrame(columns=['origin', 'destination', 'year_month', 'airline', 'airline_ref_code', 'passengers'])

        with External_Segment_Tmp.unordered_bulk(1000, execute_callback=log_bulk) as bulk:
            for row in range(0, len(xls)):    # loop through each row (origin, destination) in file
                full_row = xls.iloc[row]
                # skip rows with no pax
                if np.isnan(full_row['Passengers']) or full_row['Passengers'] == "" or int(full_row['Passengers']) == 0:
                    continue

                year_month = full_row['Year_Month']
                # Only treat the requested year_months
                if year_month not in year_months:
                    continue

                total_pax = int(full_row['Passengers'])

                row_airline = get_airline_by_icao(full_row['Airline'], full_row['Airline_Name'], total_pax)
                if row_airline is None:
                    continue

                airport_origin = full_row['Origen']
                airport_destination = full_row['Destino']
                if airport_origin in airport_replacement:  # correct the wrong codes
                    airport_origin = airport_replacement.get(airport_origin)
                if airport_destination in airport_replacement:  # correct the wrong codes
                    airport_destination = airport_replacement.get(airport_destination)

                if not check_airport(airport_origin, full_row['Ciudad Origen'],
                                     full_row['Pais Origen'], total_pax):
                    continue
                if not check_airport(airport_destination, full_row['Ciudad Destino'],
                                     full_row['Pais Destino'], total_pax):
                    continue

                airline_ref_code = ref_code.get_airline_ref_code(row_airline, airport_origin,
                                                                 airport_destination,
                                                                 YearMonth(year_month))

                # if External_Segment_Tmp.find_one({'year_month': year_month, 'provider': provider}):
                #     log.warning("This year_month (%s) already exists for provider %s", year_month, provider)

                if ((previous_data['origin'] == airport_origin) &
                        (previous_data['destination'] == airport_destination)
                        & (previous_data['year_month'] == year_month)
                        & (previous_data['airline'] == row_airline)
                        & (previous_data['airline_ref_code'] == airline_ref_code)).any():
                    new_row = False
                    # Add to Excel file's total_pax the "passenger" integer you get from filtering previous_data on other columns
                    total_pax += int(previous_data['passengers'][
                        (previous_data['origin'] == airport_origin) &
                        (previous_data['destination'] == airport_destination)
                        & (previous_data['year_month'] == year_month)
                        & (previous_data['airline'] == row_airline)
                        & (previous_data['airline_ref_code'] == airline_ref_code)])
                else:
                    new_row = True

                dic = dict(provider=provider,
                           data_type='airport',
                           airline=[row_airline],
                           airline_ref_code=[airline_ref_code],
                           origin=[airport_origin],
                           destination=[airport_destination],
                           year_month=[year_month],
                           total_pax=total_pax,
                           overlap=[],
                           raw_rec=dict(full_row),
                           both_ways=False,
                           from_line=row,
                           from_filename=xlsx_f,
                           url=full_url)

                new_data = pd.Series({'origin': airport_origin, 'destination': airport_destination,
                                      'year_month': year_month, 'airline': row_airline,
                                      'airline_ref_code': airline_ref_code, 'passengers': total_pax}).to_frame()
                if new_row:
                    previous_data = previous_data.append(new_data.T, ignore_index=True)
                else:
                    # Update the previous_data data frame with the new passengers count
                    previous_data['passengers'][
                        (previous_data['origin'] == airport_origin) &
                        (previous_data['destination'] == airport_destination) &
                        (previous_data['airline'] == row_airline) &
                        (previous_data['airline_ref_code'] == airline_ref_code) &
                        (previous_data['year_month'] == year_month)] = total_pax

                query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider',
                                                   'data_type', 'airline'))
                bulk.find(query).upsert().update_one({'$set': dic, '$setOnInsert': dict(inserted=now)})
                if row % 1000 == 0:
                    print('{0:.3g}'.format(float(row) / float(len(xls)) * 100) + '%')
        log.info('stored: %r', bulk.nresult)
def get_data(csv_files):
    """
    Populate the database with data from csv files
    :param csv_files: list of files
    :return:
    """
    global unknown_airlines
    global airports_codes
    now = utcnow()
    ref_code.init_cache()
    airports_codes = get_airports_codes()
    airline_codes = get_airline_codes()
    # Some codes are incorrect in the files from the FAA, "replacements" are the corrected codes):
    airport_replacement = {
        "JQF": "USA",
        "JRV": "NRR",
        "1G4": "GCW",
        "L41": "MYH",
        "7AK": "KQA",
        "UXR": "GMW",
        "NYL": "YUM",
        "YR1": "YRC",
        "PBX": "PVL",
        "ZXA": "ROZ",
        "ZXU": "OQU",
        "NZC": "VQQ",
        "FAQ": "FTI"
    }
    airline_replacements = {
        "09Q": "Q7",
        "SEB": "BB",
        "TQQ": "TA",
        "1BQ": "2D",
        "0MQ": "3E",
        "1YQ": "F4",
        "KAH": "M5",
        "0JQ": "V2",
        "1DQ": "IS",
        "23Q": "5K",
        "1AQ": "VC",
        "1WQ": "UE",
        "J5": "J5",
        "FCQ": "6F",
        "04Q": "TJ",
        "1RQ": "6G",
        "1QQ": "V9",
        "PBQ": "PV",
        "3F": "3F",
        "AMQ": "7Z",
        "3SD": "3S",
        "20Q": "O5",
        "28Q": "ZB",
        "AAT": "YI",
        "AJQ": "4A",
        "2HQ": "7Q",
        "NLQ": "N5",
        "02Q": "ZT",
        "07Q": "F8",
        "GCH": "ZS",
        "4EQ": "4E",
        "15Q": "6I"
    }
    # Some FAA codes are incorrect and have existing IATA equivalent, so they should be skipped:
    airport_exclusions = {
        "XWA", "NAD", "QMA", "ZXM", "ZXN", "RMN", "MQJ", "QMN", "QSO"
    }

    def log_bulk(self):
        log.info('store external_segment: %r', self.nresult)

    for csv_f in csv_files:  # loop through each file
        print('******************** processed csv:  ', csv_f)
        with open('%s/%s' % (tmp_dir, csv_f)) as csv_file:
            dict_reader = csv.DictReader(csv_file)
            row_nb = 0
            previous_data = pd.DataFrame(columns=[
                'origin', 'destination', 'year_month', 'airline', 'passengers'
            ])
            all_rows = len(
                list(csv.DictReader(open('%s/%s' % (tmp_dir, csv_f)))))
            """
                In previous_data, we store all the lines that are sent to bulk, to refer to for next lines.
                This allows sum of passengers for similar origin/destination/year_month/airline tuples.
                """

            with External_Segment_Tmp.unordered_bulk(
                    1000, execute_callback=log_bulk) as bulk:

                for row in dict_reader:  # loop through each row (origin, destination) in file
                    row_nb += 1
                    for key, value in row.items():
                        if value == ':':
                            row[key] = ''

                    passengers = int(row['PASSENGERS'].split('.')[0])
                    if passengers <= 0:  # skip rows with no pax
                        continue

                    row_airline = row['UNIQUE_CARRIER']
                    airport_origin = row['ORIGIN']
                    airport_destination = row['DEST']
                    if airport_origin in airport_exclusions or airport_destination in airport_exclusions:  # skip exclusions
                        continue
                    if row_airline in airline_replacements:
                        row_airline = airline_replacements.get(row_airline)
                    if airport_origin in airport_replacement:  # correct the wrong codes
                        airport_origin = airport_replacement.get(
                            airport_origin)
                    if airport_destination in airport_replacement:  # correct the wrong codes
                        airport_destination = airport_replacement.get(
                            airport_destination)

                    if row_airline not in airline_codes:  # Check airline
                        if row_airline in unknown_airlines['code'].values:
                            unknown_airlines.loc[unknown_airlines['code'] ==
                                                 row_airline,
                                                 'passengers'] += passengers
                        else:
                            info = pd.Series({
                                'code': row_airline,
                                'name': row['CARRIER_NAME'],
                                'passengers': passengers
                            })
                            unknown_airlines = unknown_airlines.append(
                                info, ignore_index=True)
                        continue

                    if not check_airport(
                            airport_origin, row['ORIGIN_CITY_NAME'],
                            row['ORIGIN_COUNTRY'], row['ORIGIN_STATE_ABR'],
                            passengers):
                        continue
                    if not check_airport(
                            airport_destination, row['DEST_CITY_NAME'],
                            row['DEST_COUNTRY'], row['DEST_STATE_ABR'],
                            passengers):
                        continue

                    year_month = '%04d-%02d' % (int(
                        row['YEAR']), int(row['MONTH']))
                    airline_ref_code = ref_code.get_airline_ref_code(
                        row_airline, airport_origin, airport_destination,
                        YearMonth(year_month))
                    if ((previous_data['origin'] == airport_origin) &
                        (previous_data['destination'] == airport_destination)
                            & (previous_data['year_month'] == year_month)
                            & (previous_data['airline'] == row_airline)).any():
                        new_row = False
                        # Add to Excel file's total_pax the "passenger" integer you get from filtering
                        # previous_data on other columns
                        passengers += int(previous_data['passengers'][
                            (previous_data['origin'] == airport_origin)
                            & (previous_data['destination'] ==
                               airport_destination) &
                            (previous_data['year_month'] == year_month) &
                            (previous_data['airline'] == row_airline)])
                    else:
                        new_row = True

                    dic = dict(provider=provider,
                               data_type='airport',
                               airline=[row_airline],
                               airline_ref_code=[airline_ref_code],
                               total_pax=passengers,
                               overlap=[],
                               origin=[airport_origin],
                               destination=[airport_destination],
                               year_month=[year_month],
                               raw_rec=dict(row),
                               both_ways=False,
                               from_line=row_nb,
                               from_filename=csv_f,
                               url=full_url)

                    new_data = pd.Series({
                        'origin': airport_origin,
                        'destination': airport_destination,
                        'year_month': year_month,
                        'airline': row_airline,
                        'passengers': passengers
                    }).to_frame()
                    if new_row:
                        previous_data = previous_data.append(new_data.T,
                                                             ignore_index=True)
                    else:
                        previous_data['passengers'][
                            (previous_data['origin'] == airport_origin)
                            & (previous_data['destination'] ==
                               airport_destination) &
                            (previous_data['airline'] == row_airline) &
                            (previous_data['year_month'] == year_month
                             )] = passengers  # Modify previous_data's pax

                    query = dict(
                        (k, dic[k])
                        for k in ('origin', 'destination', 'year_month',
                                  'provider', 'data_type', 'airline'))
                    bulk.find(query).upsert().update_one({
                        '$set':
                        dic,
                        '$setOnInsert':
                        dict(inserted=now)
                    })
                    if row_nb % 1000 == 0:
                        print('{0:.3g}'.format(row_nb / all_rows * 100) + '%')
            log.info('stored: %r', bulk.nresult)
def get_data():
    """
    Populate the database with data extract in xlsx files. One line per year_month.
    Back/Forth routes in rows, one column per way.
    :param xlsx_files: dict of file names
    :return:
    """
    def log_bulk(self):
        log.info('  store external_segment: %r', self.nresult)

    now = utcnow()
    provider = "CRK"
    tmp_dir = "/home/laurent"
    xlsx_f = "CRK.xlsx"
    ref_code.init_cache()

    airports_codes = get_airports_codes()
    airlines_codes = get_airlines_codes()

    log.info('******************** processing Excel file: %s', xlsx_f)
    xl = pd.ExcelFile(tmp_dir + "/" + xlsx_f)
    for tab in xl.sheet_names:
        log.info('************ processing tab %s', tab)
        xls = xl.parse(sheetname=tab, header=1)
        all_rows = len(xls.index)
        row_nb = 0

        with External_Segment_Tmp.unordered_bulk(
                1000, execute_callback=log_bulk) as bulk:
            for row_index, full_row in xls.iterrows(
            ):  # loop through each row (origin, destination, airline, ym) in file
                row_nb += 1
                # Skip empty rows (no text in Origin column, or year Total = 0)
                if pd.isnull(full_row['ORIGIN']):
                    continue
                else:
                    total_pax = int(full_row['TRAFFIC'])
                year = int(full_row['YEAR'])
                month = int(full_row['MONTH'])
                year_month = str(year) + '-' + format(month, '02d')
                airline = full_row['AIRLINE']
                origin = full_row['ORIGIN']
                destination = full_row['DESTINATION']

                if not check_airport(origin, total_pax, airports_codes):
                    continue
                if not check_airport(destination, total_pax, airports_codes):
                    continue
                if not check_airline(airline, total_pax, airlines_codes):
                    continue
                airline_ref_code = ref_code.get_airline_ref_code(
                    airline, origin, destination, YearMonth(year_month))
                dic = dict(provider=provider,
                           data_type='airport',
                           airline=[airline],
                           airline_ref_code=[airline_ref_code],
                           origin=[origin],
                           destination=[destination],
                           year_month=[year_month],
                           total_pax=total_pax,
                           overlap=[],
                           raw_rec=dict(full_row),
                           both_ways=True,
                           from_line=row_nb,
                           from_filename=xlsx_f)
                query = dict((k, dic[k])
                             for k in ('origin', 'destination', 'year_month',
                                       'provider', 'data_type', 'airline'))
                bulk.find(query).upsert().update_one({
                    '$set':
                    dic,
                    '$setOnInsert':
                    dict(inserted=now)
                })

                if row_nb % 100 == 0:
                    print('{0:.3g}'.format(row_nb / all_rows * 100) + '%')
        log.info('stored: %r', bulk.nresult)
import os
import sys
sys.path.append('../')
from utils import YearMonth, utcnow

year_months = [
    '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
    '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12'
]
for ym in year_months:
    os.system('python treat_sources_scope.py %s --reset_overlap' %
              YearMonth(ym))