def get_task(n):
    """Return the ``n`` busiest arrival airports of ``bookings.csv`` as JSON.

    The bookings file is read through Spark as a ``^``-delimited CSV with a
    header row, ``pax`` is summed per ``arr_port`` and the result is ordered
    by that sum, largest first. The airport name is looked up in the GeoBases
    ``ori_por`` data after removing spaces from the airport code.

    Args:
        n: Number of airports to keep; expected to be a non-negative integer.

    Returns:
        The string produced by ``DataFrame.to_json()`` for a frame with the
        columns ``arr_port``, ``num_pass`` and ``arr_port_name``.
    """
    findspark.init()
    sc = pyspark.SparkContext()
    try:
        sqlContext = SQLContext(sc)
        raw_bookings = sqlContext.read \
            .format('com.databricks.spark.csv') \
            .options(header='true', delimiter='^', inferSchema='true') \
            .load('bookings.csv')
        arr_port_by_pass = raw_bookings.select(['arr_port', 'pax']) \
            .groupby('arr_port') \
            .sum('pax') \
            .orderBy('sum(pax)', ascending=False)
        # take(n) brings only the first n rows to the driver instead of
        # collecting the whole aggregate and slicing it locally.
        top_rows = arr_port_by_pass.take(n)
    finally:
        # Release the context so that a later call can create a new one.
        sc.stop()

    df = pd.DataFrame(data=top_rows, columns=['arr_port', 'num_pass'])
    geo_o = GeoBase(data='ori_por', verbose=False)
    df['arr_port_name'] = df['arr_port'].map(
        lambda x: geo_o.get(str(x).replace(' ', ''), 'name'))
    return df.to_json()
def airportsNearCoords(location, radius):
    """Return the airports near ``location`` that are listed in ``airports.txt``.

    ``location`` is geocoded with the Google V3 geocoder (using the
    module-level ``GMAPS_KEY``) and the first match is used as the centre of
    the search. Candidates come from the GeoBases ``airports`` data and are
    returned in the sorted order of ``findNearPoint``'s results.

    Args:
        location: Free-text place description handed to the geocoder.
        radius: Search radius, forwarded unchanged to ``findNearPoint``.

    Returns:
        A list of airport keys; empty when the location cannot be geocoded.
    """
    g = geocoders.GoogleV3(api_key=GMAPS_KEY)
    geo_a = GeoBase(data='airports', verbose=False)

    # Codes allowed in the result; a set keeps the membership test cheap.
    with open('airports.txt', 'r') as f:
        allowed_codes = set(line.strip() for line in f)

    matches = g.geocode(location, exactly_one=False)
    if not matches:
        # The geocoder found nothing, so there is no centre to search around.
        return []
    _, (lat0, long0) = matches[0]

    near = sorted(geo_a.findNearPoint((lat0, long0), radius))
    return [k for _, k in near if k in allowed_codes]
def main():
    """Report ``ori_por`` entries that lack a name or city lists.

    Walks every key of the GeoBases ``ori_por`` data and prints one line for
    each empty ``name``, each empty ``city_code_list`` and each empty
    ``city_name_list`` (the last one with the entry's IATA code and the other
    looked-up values for context).
    """
    base = GeoBase('ori_por', verbose=False)

    for key in base:
        name = base.get(key, 'name')
        city_codes = base.get(key, 'city_code_list')
        city_names = base.get(key, 'city_name_list')

        if not name:
            print('No name for {0}'.format(key))

        if not city_codes:
            print('No city_code_list for {0}'.format(key))

        if not city_names:
            summary = '{0} with name {1} has city_code_list {2} and city_name_list {3}'
            print(summary.format(base.get(key, 'iata_code'), name,
                                 city_codes, city_names))
def create_currencies(apps, schema_editor):
    """Create one ``refgeo.Currency`` row per GeoBases currency.

    Written in the shape of a Django data-migration function: the model is
    obtained through ``apps.get_model`` rather than imported.

    Args:
        apps: App registry providing ``get_model``.
        schema_editor: Unused here.
    """
    Currency = apps.get_model("refgeo", "Currency")
    geo_c = GeoBase(data='currencies', verbose=False)
    for code in geo_c:
        cdata = geo_c.get(code)
        try:
            p = int(cdata.get('digits_number'))
        except (TypeError, ValueError):
            # Missing or non-numeric precision: fall back to two digits.
            p = 2
        Currency.objects.create(
            code=code,
            prec=p,
            # Names are cut to 64 characters; a missing name becomes ''.
            name=(cdata.get('currency_name') or '')[:64],
            html=None,
            one_dollar=1.0,
        )
def main():
    """Write time-zone suggestions for the entries from ``pors_with_unk_tz``.

    For each tuple yielded by ``pors_with_unk_tz(db_oripor)`` that carries a
    geocode, the closest entry of the GeoBases ``cities`` data is looked up.
    One ``iata,tz,matched_tz,distance`` line is written to ``tz_fixes.csv``
    and a readable summary is printed. Entries without a geocode are reported
    and skipped.
    """
    db_oripor = GeoBase('ori_por', verbose=False)
    db_geonames = GeoBase('cities', verbose=False)

    with open('tz_fixes.csv', 'w') as out:
        for p, p_tz, p_iata, p_city, p_geocode in pors_with_unk_tz(db_oripor):
            if p_geocode is None:
                print('! Could not find geocode for {0}'.format(p))
                continue

            # Closest match in GeoNames: only the first result is needed.
            # The next() builtin works on Python 2.6+ and Python 3.
            dist, id_ = next(db_geonames.findClosestFromPoint(p_geocode))
            g_city = db_geonames.get(id_, 'name')
            g_tz = db_geonames.get(id_, 'timezone')

            out.write('{0},{1},{2},{3:.2f}\n'.format(p_iata, p_tz, g_tz, dist))
            # Format first, then print: calling .format() on the result of
            # print(...) fails once print is a function.
            message = ('{0} with tz "{1}" matches tz "{2}" '
                       '(dist {3:.1f}km, "{4}" -> "{5}")').format(
                           p_iata, p_tz, g_tz, dist, p_city, g_city)
            print(message)
def create_countries(apps, schema_editor):
    """Create one ``refgeo.Country`` row per GeoBases country.

    Written in the shape of a Django data-migration function. The currency
    is resolved through ``get_or_none`` using the country's
    ``currency_code``. Alternate names, capital code and continent code are
    stored as empty strings.

    Args:
        apps: App registry providing ``get_model``.
        schema_editor: Unused here.
    """
    Currency = apps.get_model("refgeo", "Currency")
    Country = apps.get_model("refgeo", "Country")
    geo_c = GeoBase(data='countries', verbose=False)
    for code in geo_c:
        cdata = geo_c.get(code)
        try:
            geoname_id = int(cdata.get("geoname_id"))
        except (TypeError, ValueError):
            # Missing or non-numeric id: store NULL instead.
            geoname_id = None
        Country.objects.create(
            code=code,
            code3=cdata.get("iso_alpha3"),
            name=cdata.get("name"),
            alternateNames="",
            capitalCode="",
            currency=get_or_none(Currency, code=cdata.get("currency_code")),
            geonameId=geoname_id,
            population=int(cdata.get("population", 0)),
            continentCode="",
        )
def create_airports(apps, schema_editor):
    """Create ``refgeo.Airport`` rows from the GeoBases ``ori_por`` data.

    Written in the shape of a Django data-migration function. Entries whose
    page rank is not strictly greater than the
    ``PEPR_REFERENCE_GEO_AIRPORT_PAGERANK_GT`` setting (default ``-1``) are
    skipped. Each insert runs in its own atomic block; an insert rejected
    with ``IntegrityError`` is logged and the loop continues.

    Args:
        apps: App registry providing ``get_model``.
        schema_editor: Unused here.
    """
    Airport = apps.get_model("refgeo", "Airport")
    Country = apps.get_model("refgeo", "Country")
    PEPR_REFERENCE_GEO_AIRPORT_PAGERANK_GT = getattr(
        settings, "PEPR_REFERENCE_GEO_AIRPORT_PAGERANK_GT", -1)
    geo_por = GeoBase(data='ori_por',
                      key_fields=['city_code', 'iata_code'],
                      discard_dups=True)

    for code in geo_por:
        cdata = geo_por.get(code)
        location_type = cdata.get("location_type")
        page_rank = float_or_default(cdata.get("page_rank"), 0)
        # Decide whether to skip before opening a transaction.
        if page_rank <= PEPR_REFERENCE_GEO_AIRPORT_PAGERANK_GT:
            continue

        # The atomic block sits inside the try so that the failed insert is
        # rolled back before the IntegrityError is handled.
        try:
            with transaction.atomic():
                Airport.objects.create(
                    iataCode=cdata.get("iata_code"),
                    icao_code=cdata.get("icao_code"),
                    location_type=location_type,
                    is_airport=('A' in location_type),
                    all_airports=('C' in location_type),
                    #geoname = cdata.get("geoname_id")
                    name=cdata.get("name"),
                    #alternateNames = parse_alternate_name_section(cdata.get('alt_name_section')),
                    timezone=cdata.get("timezone"),
                    stateCode=cdata.get("state_code"),
                    country=get_or_none(Country, pk=cdata.get("country_code")),
                    cityCode=cdata.get("city_code"),
                    cityName=cdata.get("city_name_utf"),
                    lat=float_or_default(cdata.get("lat")),
                    lng=float_or_default(cdata.get("lng")),
                    page_rank=page_rank,
                )
        except IntegrityError:
            lg.error("Not unique: %s", code)
def create_airlines(apps, schema_editor):
    """Create ``refair.Airline`` rows from the GeoBases ``airlines`` data.

    Written in the shape of a Django data-migration function. Each airline is
    inserted in its own atomic block; a non-empty ``alliance_code`` is turned
    into an ``Alliance`` row via ``get_or_create``. When the insert raises
    ``IntegrityError`` the already stored airline with the same two-character
    code is fetched and both records are written to stderr.

    Args:
        apps: App registry providing ``get_model``.
        schema_editor: Unused here.
    """
    Airline = apps.get_model("refair", "Airline")
    Alliance = apps.get_model("refair", "Alliance")
    airline_base = GeoBase(data='airlines', verbose=False,
                           key_fields=['2char_code'])

    for key in airline_base:
        record = airline_base.get(key)
        try:
            with transaction.atomic():
                alliance = None
                alliance_name = record.get('alliance_code') or None
                if alliance_name:
                    alliance = Alliance.objects.get_or_create(
                        name=alliance_name)[0]
                Airline.objects.create(
                    code=record.get('2char_code'),
                    code3=record.get('3char_code'),
                    name=record.get('name'),
                    alliance=alliance,
                    alliance_status=record.get('alliance_status') or None,
                )
        except IntegrityError:
            # Show the incoming record next to the row that is already stored.
            existing = Airline.objects.get(code=record.get('2char_code'))
            print >> sys.stderr, record, existing.name, existing.code3
def __init__(self, fnam, auto = False):
    """
    Open the file and read the header, also try to load a GeoBases airport
    object and gracefully handle installation issues.

    fnam -- path of the ^-separated file to open; the handle is kept in
            self.f, positioned just after the header line.
    auto -- flag stored as self.auto; it is not used in this method.

    After construction self.hea maps each header name (surrounding spaces
    removed) to its column number, and self.prts is either the GeoBases
    airports object or None when GeoBases cannot be imported.
    """
    self.fnam = fnam
    self.auto = auto
    # Always define prts on the instance so that later "is None" checks
    # work even if the import below fails.
    self.prts = None
    self.f = open(fnam)
    try:
        from GeoBases import GeoBase
        self.prts = GeoBase(data = 'airports', verbose = False)
    except ImportError:
        # GeoBases is optional; prts stays None.
        pass
    li = self.f.readline()
    ky = [i.strip(' ') for i in li.rstrip('\n').split('^')]
    self.hea = dict(zip(ky, range(len(ky))))
def main():
    """Match entries from ``pors_with_unk_tz`` to the nearest GeoNames city.

    Every tuple yielded by ``pors_with_unk_tz(db_oripor)`` is processed as
    follows: without a geocode a warning is printed and the entry is skipped;
    otherwise the first result of ``findClosestFromPoint`` on the ``cities``
    data supplies a city name and time zone. The match is appended to
    ``tz_fixes.csv`` (IATA code, current tz, matched tz, distance with two
    decimals) and echoed to stdout.
    """
    db_oripor = GeoBase('ori_por', verbose=False)
    db_geonames = GeoBase('cities', verbose=False)
    csv_line = '{0},{1},{2},{3:.2f}\n'
    report_line = ('{0} with tz "{1}" matches tz "{2}" '
                   '(dist {3:.1f}km, "{4}" -> "{5}")')

    with open('tz_fixes.csv', 'w') as out:
        for p, p_tz, p_iata, p_city, p_geocode in pors_with_unk_tz(db_oripor):
            if p_geocode is None:
                print('! Could not find geocode for {0}'.format(p))
                continue

            # Closest match in GeoNames. The next() builtin replaces the
            # Python-2-only .next() method.
            closest = db_geonames.findClosestFromPoint(p_geocode)
            dist, id_ = next(closest)
            g_city = db_geonames.get(id_, 'name')
            g_tz = db_geonames.get(id_, 'timezone')

            out.write(csv_line.format(p_iata, p_tz, g_tz, dist))
            # The text is formatted before printing. The original applied
            # .format() to the result of print(...), which is None once
            # print is a function.
            print(report_line.format(p_iata, p_tz, g_tz, dist, p_city, g_city))
if CREATE_HDFS: createHDhdfile(hdfile, bookingsFile) # Open the created store store = pd.HDFStore(hdfile, mode='r') try: # get groups groups = store.select_column('df','arr_port').unique() # create an empty data frame as result df = pd.DataFrame(columns=['airport', 'total'], index=groups) geo_o = GeoBase(data='ori_por', verbose=False) # iterate over groups and apply my operations for g in groups: grp = store.select('df', where = "arr_port='%s'" % g) # Set the sum in the empty dataframe total = grp[['pax']].sum() df['total'][g] = total['pax'] df['airport'][g] = geo_o.get(g.strip(), 'name', default="Undefined") # Sort in descending order result = df.sort(['total'], ascending=[0])
import sys
from pyspark import SparkContext
from GeoBases import GeoBase

# GeoBases airports data, loaded once at import time.
geo_a = GeoBase(data='airports', verbose=False)


def get_arrival_airport(line):
    """Return the arrival-airport field of a ``^``-separated line.

    Field 12 is used when the line has at least 35 fields, field 10
    otherwise; only the text before the first space is returned.
    """
    line_split=line.split("^")
    if len(line_split)<35:
        airport=line_split[10]
        return airport.split(" ")[0]
    airport=line_split[12]
    return airport.split(" ")[0]


def get_pax(line):
    """Return the pax field of a ``^``-separated line as an int.

    Field 34 is used when the line has at least 35 fields; otherwise the
    field four positions from the end is used.
    """
    line_split=line.split("^")
    if len(line_split)<35:
        return int(line_split[len(line_split)-4])
    return int(line_split[34])


def is_valid_line(line):
    """Return False when the pax field holds the literal text ``pax``.

    Presumably this filters out the header row - confirm against the caller.
    NOTE(review): the last branch of this function lies outside the reviewed
    excerpt.
    """
    line_split=line.split("^")
    if len(line_split)>=35:
        if line_split[34].split(" ")[0]=='pax':
            return False
        else:
            return True
    else:
        if line_split[len(line_split)-4].split(" ")[0]=='pax':
            return False
        else:
from GeoBases import GeoBase
import numpy as np
import matplotlib.pyplot as plt
import csv
from tzwhere import tzwhere
import scipy.optimize as sp
from scipy.integrate import odeint

# GeoBases airports data.
geo_a = GeoBase(data='airports', verbose=False)

# Whole CSV file as a NumPy array of strings; the handle is closed as soon
# as the rows have been read.
with open("UTFData.csv", encoding='utf-8') as csv_file:
    csvdata = list(csv.reader(csv_file))
csvdata = np.array(csvdata)

# Two alternative sets of [latitude, longitude] pairs.
dataset1 = [[36.6, -121.89846], [52.1427, 6.1961], [-37.8136, 144.9631],
            [31.2304, 121.4737], [22.39, 114.1095], [55.7558, 37.6173]]
dataset2 = [[42.3601, -71.0589], [42.3601, -71.0589], [1.3521, 103.8198],
            [39.91, 160.3636], [22.3964, 114.1095], [22.3964, 114.1095],
            [55.7558, 37.6173], [52.0907, 5.1412], [52.2297, 21.0122],
            [55.6761, 12.5683], [-37.8136, 144.9631]]

# Data set used for this run.
latlongarray = dataset1
#conference=[59.9343,30.3351]
#conference=[24.958202,46.700779]
#conference=[-11.2027,17.8739]

tz = tzwhere.tzwhere()
panic = False

# zonerestriction depends on which data set is selected; it stays undefined
# if latlongarray matches neither.
if latlongarray == dataset1:
    zonerestriction = 3
elif latlongarray == dataset2:
    zonerestriction = 1
#loc1 = (37.5665,126.9780)
__author__ = 'dani'

import pandas as pd
from Timer import Timer
from config import getFilePath
from GeoBases import GeoBase

geo_o = GeoBase(data='ori_por', verbose=False)

# Only these three columns are loaded from the bookings file.
cols = ['year', 'arr_port', 'pax']

with Timer() as t:
    data = pd.read_csv(
        getFilePath('bookings', '2'),
        sep='^',
        error_bad_lines=False,
        warn_bad_lines=True,
        usecols=cols,
    )
l = len(data)
print("=> read: %s s" % t.secs)
print(l)
print(data.columns)

# Keep the 2013 bookings only, reduced to the two columns needed below.
# The result is a DataFrame that keeps the original row counter as index.
is_2013 = data.year == 2013
filtered_data = data[is_2013]
filtered_data = filtered_data[['arr_port', 'pax']]

# Method 1
# Group on arr_port, walk the groups and add up pax for each of them.
result = {}
with Timer() as t:
    for arr_port, group in filtered_data.groupby('arr_port'):
        result[arr_port] = group["pax"].sum()
    # (airport, total) pairs, largest total first
    result = sorted(result.items(), key=lambda x: x[1], reverse=True)
print("=> top computation - 1: %s s" % t.secs)
#for k,v in result[:10]:
#    print k,v
import os

from geopy import geocoders, distance
import googlemaps
from GeoBases import GeoBase

# The API key is taken from the environment instead of being committed with
# the source. NOTE(review): the key that was previously hard-coded here
# should be treated as leaked and revoked.
GMAPS_KEY = os.environ['GMAPS_KEY']

# geopy geocoder
g = geocoders.GoogleV3(api_key=GMAPS_KEY)

# GoogleMaps for driving directions
gmaps = googlemaps.Client(key=GMAPS_KEY)

# GeoBase for geocoding data for all IATA-coded airports
geo_a = GeoBase(data='airports', verbose=False)

# Build list of primary airports (as defined by the FAA)
# for filtering IATA-coded airports
with open('airports.txt', 'r') as f:
    IATA = [line.strip() for line in f]

# Lookup airports near a destination; the first geocoder match is used
destination = "Chattanooga, TN"
address0, (lat0, long0) = g.geocode(destination, exactly_one=False)[0]
latlong0 = (lat0, long0)
near = sorted(geo_a.findNearPoint((lat0, long0), 160))
#print(near);
# Reuse the sorted search result instead of running the search a second time.
airports = [k for _, k in near if k in IATA]
print(airports)
# Scratch script: counts bookings per arrival port and experiments with
# GeoBases lookups.
# NOTE(review): several identifiers and one debug string below are crude
# placeholders; rename them before this is shared.
import pandas as pd
from pandas import DataFrame
import sys, datetime, pandas.io.data
import matplotlib.pyplot as plt
from GeoBases import GeoBase

# Bookings file given on the command line, '^'-separated
df = pd.read_csv(sys.argv[1], sep = '^')
# Column names are stripped of surrounding whitespace
df = df.rename(columns=lambda x: x.strip())
# Number of rows per arrival port
# NOTE(review): the level= argument does not name anything defined in this
# script; it looks like a leftover - confirm and remove.
df2 = df.groupby(by='arr_port',level='shit').size()
print df2
# NOTE(review): df3 is never assigned in the visible script, so the next two
# statements raise NameError as written; the result of replace() is also
# discarded. Presumably df2 was meant - confirm the intent.
df3.replace(" ", "")
df3.sort(ascending=False)
# First ten entries of the (unsorted) counts
df4 = df2.head(n=10)

geo_a = GeoBase(data='ori_por', verbose = False)
# Trial lookups for a single airport code
hijodeputa = geo_a.get('AAR', 'city_name_utf')
print hijodeputa
joder = geo_a.get('AAR', 'name')
print joder

# Trial lookups for a handful of codes (the first assignment is overwritten)
lista = []
lista = ['AAE', 'AAL', 'AAQ', 'AAR', 'ABE']
for i in lista:
    shit = geo_a.get(i, 'name')
    print "f**k", i, shit

# NOTE(review): cities is not filled in the visible part of the script.
cities = []
for airport, size in df4.iteritems():
    print airport
if __name__ == '__main__':
    # Usage: <script> <booking file>
    # Sums pax per arrival port, prints the totals in descending order and
    # writes the same rows to test2_result.csv.
    if len(sys.argv) != 2:
        print('Please provide a booking file.')
        sys.exit(0)
    aFile = sys.argv[1]
    aData = None
    if not os.path.exists(aFile):
        print('Please check if the booking file %s does exist' % aFile)
        sys.exit(0)
    else:
        # extract data from file, 10000 rows at a time
        aChunks = pandas.read_table(aFile, chunksize=10000, sep='^',
                                    usecols=['arr_port', 'pax'])
        aData = pandas.concat(chunk for chunk in aChunks)
        grouped = aData.groupby(aData['arr_port']).sum()
        sgrouped = grouped.sort_values(by='pax', ascending=False)
        # GeoBase
        geo = GeoBase(data='ori_por', verbose=False)
        # write into csv file ('wb' is the mode the Python 2 csv module
        # expects); the with block closes it even if a row fails
        with open('test2_result.csv', 'wb') as csvFile:
            csvWriter = csv.writer(csvFile)
            for i in sgrouped.index:
                # label-based row access; .ix is deprecated
                item = sgrouped.loc[i]
                code = item.name.strip()
                # Unknown airport codes are reported under the code itself.
                # Resolving the label first means each row is printed and
                # written exactly once.
                label = geo.get(code, 'name', default=code)
                print('%s %s' % (label, item.pax))
                csvWriter.writerow([label, item.pax])
# -*- coding: utf-8 -*-
import web  # Import the library web to create the web service
import pandas as pd  # import the library pandas
import json  # import the library json for JSON files
from GeoBases import GeoBase  # import GeoBase to retrieve airport data

# Columns names and filenames
filenameBooking = "bookings.csv"  # name of the bookings file
usedColumns = ['arr_port', 'pax']  # columns used to process the file
geo_o = GeoBase(data='ori_por', verbose=False)  # load the GeoBase data


def strCountryName(x):
    """Return the ``city_name_ascii`` value for airport code ``x``.

    The code is stripped of surrounding whitespace first (otherwise no
    airport code is recognized); unknown codes yield ``'UNKNOWN'``.

    NOTE(review): despite the function name and the 'Country' column it
    feeds, the field looked up is the city name - confirm which is intended.
    """
    return geo_o.get(x.strip(), 'city_name_ascii', default='UNKNOWN')


def writeJsonFile(topNumber, filenameJSON):
    """Build the ranked table of the ``topNumber`` busiest arrival ports.

    Args:
        topNumber: Maximum number of airports to keep.
        filenameJSON: Target file name. NOTE(review): no statement in the
            reviewed excerpt uses it.
    """
    # read the CSV file, keeping only columns 'arr_port' and 'pax'
    # NOTE(review): nrows=1000 limits the input to the first 1000 rows;
    # confirm this is intended.
    df = pd.read_csv(filenameBooking, sep='^', usecols=usedColumns, nrows=1000)
    dfTop = df['pax'].groupby(df['arr_port']).sum().reset_index() \
        .sort_values(by='pax', ascending=False)[:topNumber]
    # Rank 1..k where k is the number of rows actually present. Using
    # topNumber here raised a length mismatch when fewer airports exist.
    dfTop['Rank'] = range(1, len(dfTop) + 1)
    dfTop = dfTop.reindex(columns=['Rank', 'arr_port', 'pax'])
    dfTopJSON = dfTop.rename(columns={'arr_port': 'Airport',
                                      'pax': 'Number of bookings'})
    # Map 'strCountryName' over the 'Airport' column to create the
    # 'Country' column
    dfTopJSON['Country'] = dfTopJSON['Airport'].map(strCountryName)
import sys
import os
import pandas
import csv
from GeoBases import GeoBase

if __name__ == '__main__':
    # Usage: <script> <booking file>
    # Prints the ten arrival ports with the highest pax totals.
    if len(sys.argv) != 2:
        print('Please provide a booking file.')
        sys.exit(0)
    aFile = sys.argv[1]
    aData = None
    if not os.path.exists(aFile):
        print('Please check if the booking file %s does exist' % aFile)
        sys.exit(0)
    else:
        # extract data from file, 10000 rows at a time
        aChunks = pandas.read_table(aFile, chunksize=10000, sep='^',
                                    usecols=['arr_port', 'pax'])
        aData = pandas.concat(chunk for chunk in aChunks)
        grouped = aData.groupby(aData['arr_port']).sum()
        sgrouped = grouped.sort_values(by='pax', ascending=False)
        geo = GeoBase(data='ori_por', verbose=False)

        top = sgrouped.head(10)
        for raw_code, item in top.iterrows():
            code = raw_code.strip()
            # An airport code missing from GeoBases is printed as the code
            # itself. The original reported "no more airports" and stopped.
            print('%s %s' % (geo.get(code, 'name', default=code), item.pax))
        if len(top) < 10:
            # Fewer than ten arrival ports in the file.
            print('no more airports...')
            sys.exit(0)
) ]
# NOTE(review): the list closed above starts outside the reviewed excerpt.

# Figure layout: equirectangular map with land and lakes drawn and a grid-less
# linear axis on both latitude and longitude.
# NOTE(review): the title mentions Pyongyang/Chicago while the data variable
# is called nyc_london - confirm which route is plotted.
layout = dict(title='Pyongyang to Chicago',
              showlegend=False,
              geo=dict(
                  resolution=100,
                  showland=True,
                  showlakes=True,
                  landcolor='rgb(204, 204, 204)',
                  countrycolor='rgb(204, 204, 204)',
                  lakecolor='rgb(255, 255, 255)',
                  projection=dict(type="equirectangular"),
                  coastlinewidth=2,
                  lataxis=dict(range=[-100, 100], showgrid=False, tickmode="linear", dtick=20),
                  lonaxis=dict(range=[-100, 100], showgrid=False, tickmode="linear", dtick=20),
              ))
fig = dict(data=nyc_london, layout=layout)
py.iplot(fig, validate=False, filename='d3-great-circle')

# Airports from the GeoBases data closest to a fixed point.
# presumably (latitude, longitude) - TODO confirm
point = (40, -75)
geo_a = GeoBase(data='airports', verbose=False)
print(list(geo_a.findClosestFromPoint(point)))
print()
if (index > block_limit) : break
# NOTE(review): the loop that the break above belongs to starts outside the
# reviewed excerpt; the statements below presumably run after that loop.

# Remove spaces, done here because it's costly
aggregated['arr_port'] = aggregated['arr_port'].apply(lambda x : x.strip())
# Total per arrival port, largest pax first
# NOTE(review): DataFrame.sort(columns=...) is not available in recent
# pandas - confirm the pandas version in use.
sorted_data = aggregated.groupby('arr_port').sum().sort(columns='pax',ascending=False)
sorted_data.to_csv(path_or_buf=config.data_folder + 'sorted_list.csv',sep='^')
final_data = sorted_data[:10]
# Bare expression: has no effect outside an interactive session
final_data

# Bonus point: Look up the names for each airport
from GeoBases import GeoBase
# NOTE(review): record_count is not used in the visible code.
record_count = 200
airport_data = pd.read_table(config.data_folder + 'sorted_list.csv', sep='^')
# Placeholder text that is overwritten for every airport found below
airport_data['city_name'] = 'None'
geo_o = GeoBase(data='ori_por', verbose=False)
# Lookup helper: stripped airport code -> 'city_name_ascii'
find_city = lambda x,geo_o : geo_o.get(x.strip(),'city_name_ascii')
for i, row in enumerate(airport_data.values):
    try:
        city_name = find_city(airport_data['arr_port'][i],geo_o)
        # NOTE(review): chained assignment; pandas may warn about it or
        # write to a copy - confirm.
        airport_data['city_name'][i] = city_name
        print city_name
    # NOTE(review): the bare except hides every error, and the message text
    # refers to a "number" although the lookup is by airport code.
    except:
        print "Oops! That was no valid number. Try again...",i,airport_data['arr_port'][i]
# Save it for using it later on in the bonus exercise (Web service)
airport_data.to_csv(path_or_buf= config.data_folder + 'sorted_named_list.csv',sep='^')
# Bare expression: has no effect outside an interactive session
airport_data[:10]
class csvContainer:
    """
    Container to do simple operations with csv files

    The file is ^-separated with a header line. It is opened once by the
    constructor and stays open; every scanning method reads from the current
    position to the end of the file and then seeks back, so the methods can be
    called in any order. Lines whose number of columns differs from the header
    are ignored by all methods except audit(), which reports them.

    Call close() (or use the object in a with statement) to release the file.
    """

    f    = None     # The file. Is always open and positioned after the header for reading.
    hea  = None     # The header as a dictionary converting names into column numbers
    fnam = None     # The file name. For creating related names for audit()
    auto = False    # Automatically convert comma separated lines when ^-separated fails
    prts = None     # GeoBases airports object. If == None, gracefully reject the search

    def __init__(self, fnam, auto = False):
        """
        Open the file and read the header, also try to load a GeoBases airport
        object and gracefully handle installation issues.

        fnam -- path of the file to open
        auto -- when True, a line without any ^ is split on commas instead;
                this also applies to the header line
        """
        self.fnam = fnam
        self.auto = auto
        self.f = open(fnam)
        try:
            from GeoBases import GeoBase
            self.prts = GeoBase(data = 'airports', verbose = False)
        except ImportError:
            # GeoBases is optional; prts keeps its class default (None).
            pass

        li = self.f.readline()
        # The header goes through the same splitter as the data lines so that
        # a comma separated file works with auto = True.
        ky = [i.strip(' ') for i in self._split(li)]
        self.hea = dict(zip(ky, range(len(ky))))

    def close(self):
        """ Close the underlying file. Calling it again is harmless. """
        if self.f is not None:
            self.f.close()
            self.f = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    # Private helpers
    # ---------------

    def _split(self, li):
        """ Split one raw line into columns, with the comma fallback if auto is set. """
        row = li.rstrip('\n').split('^')
        if self.auto and len(row) == 1:
            row = li.rstrip('\n').split(',')
        return row

    def _scan(self):
        """
        Yield (raw line, columns) for every line from the current position to
        the end of the file, then seek back to where the scan started.
        readline() is used rather than file iteration so that tell() stays
        usable while scanning.
        """
        stpos = self.f.tell()
        try:
            li = self.f.readline()
            while li != '':
                yield li, self._split(li)
                li = self.f.readline()
        finally:
            self.f.seek(stpos)

    def _key(self, row, idx, numchars):
        """ Build the tuple of stripped (and cut to numchars[i]) values of columns idx. """
        return tuple([row[idx[i]].strip(' ')[:numchars[i]] for i in range(len(idx))])

    def _count(self, idx, numchars):
        """ Count the valid rows per key built from columns idx. """
        numcol = len(self.hea)
        grby = {}
        for _, row in self._scan():
            if len(row) == numcol:
                g = self._key(row, idx, numchars)
                grby[g] = grby.get(g, 0) + 1
        return grby

    def _write_filtered(self, idx, numchars, filtr):
        """ Copy to <fnam>.filter.csv the valid lines whose key is in filtr. """
        numcol = len(self.hea)
        with open(self.fnam + '.filter.csv', 'w') as ofile:
            for li, row in self._scan():
                if len(row) == numcol and self._key(row, idx, numchars) in filtr:
                    ofile.write(li)

    # Public operations
    # -----------------

    def audit(self):
        """
        Scan the entire file counting columns and writing problematic lines to
        the file <fnam>.audit.csv. For every such line the line number
        (1 = first line after the header), the number of columns found, the
        columns and the file position are also printed.
        """
        numcol = len(self.hea)
        with open(self.fnam + '.audit.csv', 'w') as errfile:
            for ln, (li, row) in enumerate(self._scan(), 1):
                if len(row) != numcol:
                    errfile.write(li)
                    print('%s %s %s %s \n' % (ln, len(row), row, self.f.tell()))

    def countlines(self):
        """ Count lines not counting lines with errors """
        numcol = len(self.hea)
        return sum(1 for _, row in self._scan() if len(row) == numcol)

    def niceAirportDescription (self, iatacode):
        """
        Returns a string with a nice description of airport iatacode if
        GeoBases is available
        """
        if self.prts is None:
            return('** GeoBases is not installed. **')

        dd = self.prts.get(iatacode)
        return(dd['iata_code'] + ': ' + dd['name'] + ', ' + dd['city_code'] + ', ' + dd['country_name'])

    def aggregate(self, x, groupby, fun):
        """
        Apply function fun to variable x grouping by groupby. x and groupby
        are variable names and fun is a function, possibly a lambda like
        lambda a, b: a+b. The function is computed incrementally where a is
        the accumulated value and b is the new value. Values are converted to
        float; the first value of each group is stored as is.

        Returns a dictionary {group value: accumulated float}.
        """
        numcol = len(self.hea)
        idx_x = self.hea[x]
        idx_g = self.hea[groupby]
        grby = {}
        for _, row in self._scan():
            if len(row) == numcol:
                val = float(row[idx_x].strip(' '))
                g = row[idx_g].strip(' ')
                if g in grby:
                    val = fun(grby[g], val)
                grby[g] = float(val)
        return(grby)

    def groupby(self, vnames, numchars):
        """
        Group by all variables in vnames and count the number of rows. Take
        only the first numchars[i] characters of the value of vnames[i].

        Returns a dictionary {tuple of values: number of rows}.
        """
        idx = tuple([self.hea[i] for i in vnames])
        return(self._count(idx, numchars))

    def writeSubset(self, vname, rexp, outfi):
        """
        Writes the header and all lines where variable vname matches the
        regular expression rexp (from the start of the value) to a new file
        outfi.
        """
        import re

        numcol = len(self.hea)
        # Look up the column and compile the pattern before touching outfi,
        # so bad arguments do not leave a truncated output file behind.
        idx = self.hea[vname]
        rex = re.compile(rexp)
        colnames = sorted(self.hea, key = self.hea.get)

        with open(outfi, 'w') as ofile:
            ofile.write('^'.join(colnames) + '\n')
            for li, row in self._scan():
                if len(row) == numcol and rex.match(row[idx].strip(' ')) is not None:
                    ofile.write(li)

    # Sandbox functions: These functions do not make part of the "neat"
    # object but are kept to understand what was done.
    # -------------------------

    def join_step1(self):
        """
        This function was only implemented for researching the datafiles. It
        is NOT general purpose but includes file specifics. See the document
        OnJoiningSearchesWithBookings.txt for an explanation.

        Counts the rows per (Origin, Destination, Seg1Date).
        """
        idx = (self.hea['Origin'], self.hea['Destination'], self.hea['Seg1Date'])
        # None as a slice end keeps the whole value
        return(self._count(idx, (None, None, None)))

    def join_step2_search(self, filtr):
        """
        This function was only implemented for researching the datafiles. It
        is NOT general purpose but includes file specifics. See the document
        OnJoiningSearchesWithBookings.txt for an explanation.

        Writes to <fnam>.filter.csv the lines whose
        (Origin, Destination, Seg1Date) is contained in filtr.
        """
        idx = (self.hea['Origin'], self.hea['Destination'], self.hea['Seg1Date'])
        self._write_filtered(idx, (None, None, None), filtr)

    def join_step2_book(self, filtr):
        """
        This function was only implemented for researching the datafiles. It
        is NOT general purpose but includes file specifics. See the document
        OnJoiningSearchesWithBookings.txt for an explanation.

        Writes to <fnam>.filter.csv the lines whose
        (dep_port, arr_port, first 10 characters of brd_time) is contained in
        filtr.
        """
        idx = (self.hea['dep_port'], self.hea['arr_port'], self.hea['brd_time'])
        self._write_filtered(idx, (None, None, 10), filtr)

## end of csvContainer