def new_seg(origin, destination, airline, airline_ref_code, ym, pax, rev, unique_id, provider, loaded_from_file): ym_ym = YearMonth(ym) # Basic segment in the case where external_segment is sufficiently atomical and # corresponding new_segment_initial_data does not exist seg = NewSegmentInitialData( leg_origin=origin, origin=origin, trip_origin=origin, leg_destination=destination, destination=destination, trip_destination=destination, trip=[origin, destination], operating_airline=airline, operating_airline_ref_code=airline_ref_code, year_month=ym, year=ym_ym.year, month=ym_ym.month, passengers=pax, segment_revenue_usd=rev, od_revenue_usd=rev, cabin_class='EC', segment_split='Local', loaded_from_file=loaded_from_file, loaded_from_record=unique_id, record_ok=True, source='external_source', external_provider=provider, loaded_from_date=now) return seg
def launch_import(year_months, selected_providers): providers = { 'USA': 'load_files_from_USA.py', 'Brazil': 'load_files_from_Brazil.py', 'Ireland': 'load_files_from_Ireland.py', 'Colombia': 'load_files_from_Colombia.py', 'Mexico': 'load_files_from_Mexico.py' } for prov in providers: if prov in selected_providers: os.system('python %s %s' % providers.get(prov), YearMonth(year_months))
def get_data(csv_files, year_months): """ Populate the database with data extract in csv files :return: """ now = utcnow() ref_code.init_cache() airport_replacement = { "SBCD": "SSCC", "SWUY": "SBUY", "SBJI": "SWJI", "RJNN": "RJNA", "SBPM": "SBPJ", "SEQU": "SEQM", "SNQY": "SBJU", "SJDB": "SBDB", "SWJH": "SIZX", "SNNG": "SJNP", "SDFR": "SDDN", "1AON": "SDOW", "SMPB": "SMJP", "2NHT": "SBTC", "SWIQ": "SBMC", "SWKK": "SSKW", "SAIG": "SARI", "SBER": "SWEI" } airport_exclusions = ["SBNT", "SUPE", "6ASO", "SAMQ"] airline_replacements = {"VIP": "FPG", "BLC": "TAM"} def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for csv_f in csv_files: # loop through each file print('******************** processed csv: ', csv_f) with open('%s/%s' % (tmp_dir, csv_f)) as csv_file: dict_reader = csv.DictReader(csv_file) all_rows = len( list(csv.DictReader(open('%s/%s' % (tmp_dir, csv_f))))) row_nb = 0 previous_data = pd.DataFrame(columns=[ 'origin', 'destination', 'year_month', 'airline', 'ref_code', 'passengers' ]) with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for row in dict_reader: # loop through each row (origin, destination) in file row_nb += 1 for key, value in row.items(): if value == ':': row[key] = '' if ((row['PASSAGEIROS PAGOS'] == '0') and (row['PASSAGEIROS PAGOS'] == '0')) or \ (row['PASSAGEIROS PAGOS'] == ''): # skip rows with no pax continue total_pax = int(row['PASSAGEIROS PAGOS']) + int( row['PASSAGEIROS GRÁTIS']) row_airline = get_airline_by_icao(row['EMPRESA (SIGLA)'], row['EMPRESA (NOME)'], total_pax) if row['AEROPORTO DE ORIGEM (SIGLA)'] in airport_exclusions or \ row['AEROPORTO DE DESTINO (SIGLA)'] in airport_exclusions: # skip exclusions continue airport_origin = get_airport_by_icao( row['AEROPORTO DE ORIGEM (SIGLA)'], row['AEROPORTO DE ORIGEM (NOME)'], total_pax) airport_destination = get_airport_by_icao( row['AEROPORTO DE DESTINO (SIGLA)'], row['AEROPORTO DE DESTINO (NOME)'], total_pax) if airport_destination is None: continue if airport_origin is None: continue if row_airline in airline_replacements: row_airline = airline_replacements.get(row_airline) if airport_origin in airport_replacement: airport_origin = airport_replacement.get( airport_origin) if airport_destination in airport_replacement: airport_destination = airport_replacement.get( airport_destination) year_month = '%04d-%02d' % (int(row['ANO']), int( row['MÊS'])) # Only treat the requested year_months if year_month not in year_months: continue if year_month not in previous_data['year_month'].values: if External_Segment_Tmp.find_one({ 'year_month': year_month, 'provider': provider }): log.warning( "This year_month (%s) already exists for provider %s", year_month, provider) airline_ref_code = ref_code.get_airline_ref_code( row_airline, airport_origin, airport_destination, YearMonth(year_month)) if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline) & (previous_data['ref_code'] == airline_ref_code)).any(): new_row = False # Add to Excel file's total_pax the number of passengers you get from # filtering previous_data on other columns total_pax += int(previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline) & (previous_data['ref_code'] == airline_ref_code)]) else: new_row = True dic = dict(provider=provider, data_type='airport', airline=[row_airline], airline_ref_code=[airline_ref_code], origin=[airport_origin], destination=[airport_destination], year_month=[year_month], total_pax=total_pax, overlap=[], raw_rec=dict(row), both_ways=False, from_line=row_nb, from_filename=csv_f, url=full_url) new_data = pd.Series({ 'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'airline': row_airline, 'ref_code': airline_ref_code, 'passengers': total_pax }).to_frame() if new_row: previous_data = previous_data.append(new_data.T, ignore_index=True) else: previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['airline'] == row_airline) & (previous_data['ref_code'] == airline_ref_code) & (previous_data['year_month'] == year_month )] = total_pax # Modify previous_data's pax query = dict( (k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic, '$setOnInsert': dict(inserted=now) }) if row_nb % 1000 == 0: print('{0:.3g}'.format( float(row_nb) / float(all_rows) * 100) + '%') log.info('stored: %r', bulk.nresult)
def get_data(xlsx_files): """ Populate the database with data extract in xlsx files :return: """ now = utcnow() ref_code.init_cache() airport_replacement = {"BUE": "EZE", "RIO": "GIG", "LMC": "LMC", "LMA": "MCJ", "VGP": "VGZ", "PTL": "PTX", "MIL": "MXP", "LON": "LHR", "SAO": "CGH", "BSL": "BSL", "TRP": "TCD", "RLB": "LIR", "NYC": "JFK", "GTK": "FRS", "AWH": "USH", "STO": "ARN", "WAS": "IAD", "BHZ": "PLU"} def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for xlsx_f in xlsx_files: # loop through each file print('******************** processing Excel file:', xlsx_f) xls = pd.read_excel(tmp_dir + "/" + xlsx_f) header = np.where(xls.loc[:, :] == "Pasajeros")[0] + 1 # Look for column names xls = pd.read_excel(tmp_dir + "/" + xlsx_f, header=header) # Re-load file with headers xls = format_columns(xls) # Create a dataframe to save data line after line, so we can check later on previous_data = pd.DataFrame(columns=['origin', 'destination', 'year_month', 'airline', 'airline_ref_code', 'passengers']) with External_Segment_Tmp.unordered_bulk(1000, execute_callback=log_bulk) as bulk: for row in range(0, len(xls)): # loop through each row (origin, destination) in file full_row = xls.iloc[row] # skip rows with no pax if np.isnan(full_row['Passengers']) or full_row['Passengers'] == "" or int(full_row['Passengers']) == 0: continue year_month = full_row['Year_Month'] # Only treat the requested year_months if year_month not in year_months: continue total_pax = int(full_row['Passengers']) row_airline = get_airline_by_icao(full_row['Airline'], full_row['Airline_Name'], total_pax) if row_airline is None: continue airport_origin = full_row['Origen'] airport_destination = full_row['Destino'] if airport_origin in airport_replacement: # correct the wrong codes airport_origin = airport_replacement.get(airport_origin) if airport_destination in airport_replacement: # correct the wrong codes airport_destination = airport_replacement.get(airport_destination) if not check_airport(airport_origin, full_row['Ciudad Origen'], full_row['Pais Origen'], total_pax): continue if not check_airport(airport_destination, full_row['Ciudad Destino'], full_row['Pais Destino'], total_pax): continue airline_ref_code = ref_code.get_airline_ref_code(row_airline, airport_origin, airport_destination, YearMonth(year_month)) # if External_Segment_Tmp.find_one({'year_month': year_month, 'provider': provider}): # log.warning("This year_month (%s) already exists for provider %s", year_month, provider) if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline) & (previous_data['airline_ref_code'] == airline_ref_code)).any(): new_row = False # Add to Excel file's total_pax the "passenger" integer you get from filtering previous_data on other columns total_pax += int(previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline) & (previous_data['airline_ref_code'] == airline_ref_code)]) else: new_row = True dic = dict(provider=provider, data_type='airport', airline=[row_airline], airline_ref_code=[airline_ref_code], origin=[airport_origin], destination=[airport_destination], year_month=[year_month], total_pax=total_pax, overlap=[], raw_rec=dict(full_row), both_ways=False, from_line=row, from_filename=xlsx_f, url=full_url) new_data = pd.Series({'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'airline': row_airline, 'airline_ref_code': airline_ref_code, 'passengers': total_pax}).to_frame() if new_row: previous_data = previous_data.append(new_data.T, ignore_index=True) else: # Update the previous_data data frame with the new passengers count previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['airline'] == row_airline) & (previous_data['airline_ref_code'] == airline_ref_code) & (previous_data['year_month'] == year_month)] = total_pax query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({'$set': dic, '$setOnInsert': dict(inserted=now)}) if row % 1000 == 0: print('{0:.3g}'.format(float(row) / float(len(xls)) * 100) + '%') log.info('stored: %r', bulk.nresult)
def get_data(csv_files): """ Populate the database with data from csv files :param csv_files: list of files :return: """ global unknown_airlines global airports_codes now = utcnow() ref_code.init_cache() airports_codes = get_airports_codes() airline_codes = get_airline_codes() # Some codes are incorrect in the files from the FAA, "replacements" are the corrected codes): airport_replacement = { "JQF": "USA", "JRV": "NRR", "1G4": "GCW", "L41": "MYH", "7AK": "KQA", "UXR": "GMW", "NYL": "YUM", "YR1": "YRC", "PBX": "PVL", "ZXA": "ROZ", "ZXU": "OQU", "NZC": "VQQ", "FAQ": "FTI" } airline_replacements = { "09Q": "Q7", "SEB": "BB", "TQQ": "TA", "1BQ": "2D", "0MQ": "3E", "1YQ": "F4", "KAH": "M5", "0JQ": "V2", "1DQ": "IS", "23Q": "5K", "1AQ": "VC", "1WQ": "UE", "J5": "J5", "FCQ": "6F", "04Q": "TJ", "1RQ": "6G", "1QQ": "V9", "PBQ": "PV", "3F": "3F", "AMQ": "7Z", "3SD": "3S", "20Q": "O5", "28Q": "ZB", "AAT": "YI", "AJQ": "4A", "2HQ": "7Q", "NLQ": "N5", "02Q": "ZT", "07Q": "F8", "GCH": "ZS", "4EQ": "4E", "15Q": "6I" } # Some FAA codes are incorrect and have existing IATA equivalent, so they should be skipped: airport_exclusions = { "XWA", "NAD", "QMA", "ZXM", "ZXN", "RMN", "MQJ", "QMN", "QSO" } def log_bulk(self): log.info('store external_segment: %r', self.nresult) for csv_f in csv_files: # loop through each file print('******************** processed csv: ', csv_f) with open('%s/%s' % (tmp_dir, csv_f)) as csv_file: dict_reader = csv.DictReader(csv_file) row_nb = 0 previous_data = pd.DataFrame(columns=[ 'origin', 'destination', 'year_month', 'airline', 'passengers' ]) all_rows = len( list(csv.DictReader(open('%s/%s' % (tmp_dir, csv_f))))) """ In previous_data, we store all the lines that are sent to bulk, to refer to for next lines. This allows sum of passengers for similar origin/destination/year_month/airline tuples. """ with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for row in dict_reader: # loop through each row (origin, destination) in file row_nb += 1 for key, value in row.items(): if value == ':': row[key] = '' passengers = int(row['PASSENGERS'].split('.')[0]) if passengers <= 0: # skip rows with no pax continue row_airline = row['UNIQUE_CARRIER'] airport_origin = row['ORIGIN'] airport_destination = row['DEST'] if airport_origin in airport_exclusions or airport_destination in airport_exclusions: # skip exclusions continue if row_airline in airline_replacements: row_airline = airline_replacements.get(row_airline) if airport_origin in airport_replacement: # correct the wrong codes airport_origin = airport_replacement.get( airport_origin) if airport_destination in airport_replacement: # correct the wrong codes airport_destination = airport_replacement.get( airport_destination) if row_airline not in airline_codes: # Check airline if row_airline in unknown_airlines['code'].values: unknown_airlines.loc[unknown_airlines['code'] == row_airline, 'passengers'] += passengers else: info = pd.Series({ 'code': row_airline, 'name': row['CARRIER_NAME'], 'passengers': passengers }) unknown_airlines = unknown_airlines.append( info, ignore_index=True) continue if not check_airport( airport_origin, row['ORIGIN_CITY_NAME'], row['ORIGIN_COUNTRY'], row['ORIGIN_STATE_ABR'], passengers): continue if not check_airport( airport_destination, row['DEST_CITY_NAME'], row['DEST_COUNTRY'], row['DEST_STATE_ABR'], passengers): continue year_month = '%04d-%02d' % (int( row['YEAR']), int(row['MONTH'])) airline_ref_code = ref_code.get_airline_ref_code( row_airline, airport_origin, airport_destination, YearMonth(year_month)) if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline)).any(): new_row = False # Add to Excel file's total_pax the "passenger" integer you get from filtering # previous_data on other columns passengers += int(previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline)]) else: new_row = True dic = dict(provider=provider, data_type='airport', airline=[row_airline], airline_ref_code=[airline_ref_code], total_pax=passengers, overlap=[], origin=[airport_origin], destination=[airport_destination], year_month=[year_month], raw_rec=dict(row), both_ways=False, from_line=row_nb, from_filename=csv_f, url=full_url) new_data = pd.Series({ 'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'airline': row_airline, 'passengers': passengers }).to_frame() if new_row: previous_data = previous_data.append(new_data.T, ignore_index=True) else: previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['airline'] == row_airline) & (previous_data['year_month'] == year_month )] = passengers # Modify previous_data's pax query = dict( (k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic, '$setOnInsert': dict(inserted=now) }) if row_nb % 1000 == 0: print('{0:.3g}'.format(row_nb / all_rows * 100) + '%') log.info('stored: %r', bulk.nresult)
def get_data(): """ Populate the database with data extract in xlsx files. One line per year_month. Back/Forth routes in rows, one column per way. :param xlsx_files: dict of file names :return: """ def log_bulk(self): log.info(' store external_segment: %r', self.nresult) now = utcnow() provider = "CRK" tmp_dir = "/home/laurent" xlsx_f = "CRK.xlsx" ref_code.init_cache() airports_codes = get_airports_codes() airlines_codes = get_airlines_codes() log.info('******************** processing Excel file: %s', xlsx_f) xl = pd.ExcelFile(tmp_dir + "/" + xlsx_f) for tab in xl.sheet_names: log.info('************ processing tab %s', tab) xls = xl.parse(sheetname=tab, header=1) all_rows = len(xls.index) row_nb = 0 with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for row_index, full_row in xls.iterrows( ): # loop through each row (origin, destination, airline, ym) in file row_nb += 1 # Skip empty rows (no text in Origin column, or year Total = 0) if pd.isnull(full_row['ORIGIN']): continue else: total_pax = int(full_row['TRAFFIC']) year = int(full_row['YEAR']) month = int(full_row['MONTH']) year_month = str(year) + '-' + format(month, '02d') airline = full_row['AIRLINE'] origin = full_row['ORIGIN'] destination = full_row['DESTINATION'] if not check_airport(origin, total_pax, airports_codes): continue if not check_airport(destination, total_pax, airports_codes): continue if not check_airline(airline, total_pax, airlines_codes): continue airline_ref_code = ref_code.get_airline_ref_code( airline, origin, destination, YearMonth(year_month)) dic = dict(provider=provider, data_type='airport', airline=[airline], airline_ref_code=[airline_ref_code], origin=[origin], destination=[destination], year_month=[year_month], total_pax=total_pax, overlap=[], raw_rec=dict(full_row), both_ways=True, from_line=row_nb, from_filename=xlsx_f) query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic, '$setOnInsert': dict(inserted=now) }) if row_nb % 100 == 0: print('{0:.3g}'.format(row_nb / all_rows * 100) + '%') log.info('stored: %r', bulk.nresult)
import os import sys sys.path.append('../') from utils import YearMonth, utcnow year_months = [ '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12' ] for ym in year_months: os.system('python treat_sources_scope.py %s --reset_overlap' % YearMonth(ym))