def get_task(n):
	"""Return the top ``n`` arrival airports by passenger count as JSON.

	Reads ``bookings.csv`` (``^``-delimited, with a header row) through Spark,
	sums ``pax`` per ``arr_port``, orders the totals in descending order and
	keeps the first ``n`` rows. The airport name is then looked up in the
	GeoBases ``ori_por`` table.

	:param n: number of leading rows to keep (used as a slice bound).
	:return: ``DataFrame.to_json()`` output with the columns ``arr_port``,
		``num_pass`` and ``arr_port_name``.
	"""
	findspark.init()
	sc = pyspark.SparkContext()
	try:
		sqlContext = SQLContext(sc)
		raw_bookings = sqlContext.read \
			.format('com.databricks.spark.csv') \
			.options(header='true', delimiter='^', inferSchema='true') \
			.load('bookings.csv')
		arr_port_by_pass = raw_bookings.select(['arr_port', 'pax']) \
			.groupby('arr_port') \
			.sum('pax') \
			.orderBy('sum(pax)', ascending=0)
		top_rows = arr_port_by_pass.collect()[:n]
	finally:
		# The context is private to this call; stop it so that the next call
		# can create its own instead of failing on an already-running one.
		sc.stop()

	df = pd.DataFrame(data=top_rows, columns=['arr_port', 'num_pass'])
	geo_o = GeoBase(data='ori_por', verbose=False)
	# Codes are stripped of spaces before the lookup.
	df['arr_port_name'] = df['arr_port'].map(
		lambda x: geo_o.get(str(x).replace(' ', ''), 'name'))
	return df.to_json()
Example #2
0
def airportsNearCoords(location, radius):
    """Return the listed airports found near a geocoded location.

    ``location`` is geocoded with Google (the first match is used). Entries of
    the GeoBases ``airports`` table near that point are kept only when their
    key appears in ``airports.txt`` (one code per line).

    :param location: free-text place description to geocode.
    :param radius: search radius passed to ``GeoBase.findNearPoint``
        (units are whatever GeoBases uses -- presumably km, TODO confirm).
    :return: list of airport keys, in the sorted order of the
        ``(value, key)`` pairs yielded by ``findNearPoint``.
    """
    geocoder = geocoders.GoogleV3(api_key=GMAPS_KEY)
    geo_a = GeoBase(data='airports', verbose=False)

    # Codes allowed in the result; a set keeps the filter below O(1) per key.
    with open('airports.txt', 'r') as f:
        primary = set(line.strip() for line in f)

    # Only the coordinates of the first geocoding match are needed.
    _, (lat0, long0) = geocoder.geocode(location, exactly_one=False)[0]
    near = sorted(geo_a.findNearPoint((lat0, long0), radius))
    return [k for _, k in near if k in primary]
def main():
    """Report ``ori_por`` entries with an empty name or city list.

    For every key of the GeoBases ``ori_por`` table, print one line per
    missing ``name`` and ``city_code_list``, and a summary line when
    ``city_name_list`` is empty.
    """
    g = GeoBase('ori_por', verbose=False)

    # print() with one pre-formatted argument emits the same text under
    # Python 2 and Python 3.
    for p in g:
        if not g.get(p, 'name'):
            print('No name for {0}'.format(p))
        if not g.get(p, 'city_code_list'):
            print('No city_code_list for {0}'.format(p))
        if not g.get(p, 'city_name_list'):
            print('{0} with name {1} has city_code_list {2} and city_name_list {3}'.format(
                g.get(p, 'iata_code'), g.get(p, 'name'),
                g.get(p, 'city_code_list'), g.get(p, 'city_name_list')))
def create_currencies(apps, schema_editor):
    """Create one ``refgeo.Currency`` row per GeoBases currency.

    The ``(apps, schema_editor)`` signature is the one Django passes to
    ``RunPython`` callables.

    :param apps: app registry used to resolve the ``Currency`` model.
    :param schema_editor: unused.
    """
    Currency = apps.get_model("refgeo", "Currency")
    geo_c = GeoBase(data='currencies', verbose=False)
    for code in geo_c:
        cdata = geo_c.get(code)

        try:
            p = int(cdata.get('digits_number'))
        except (TypeError, ValueError):
            # Missing or non-numeric precision: fall back to two digits.
            p = 2

        Currency.objects.create(
            code=code,
            prec=p,
            name=cdata.get('currency_name')[:64],  # name truncated to 64 chars
            html=None,
            one_dollar=1.0,
        )
def main():
    """Suggest time zones for ``ori_por`` entries whose time zone is unknown.

    For every entry yielded by ``pors_with_unk_tz`` the closest ``cities``
    entry is looked up. Its time zone is written to ``tz_fixes.csv`` as
    ``iata,current_tz,suggested_tz,distance`` and a summary line is printed.
    Entries without a geocode are reported and skipped.
    """
    db_oripor = GeoBase('ori_por', verbose=False)
    db_geonames = GeoBase('cities', verbose=False)

    with open('tz_fixes.csv', 'w') as out:
        for p, p_tz, p_iata, p_city, p_geocode in pors_with_unk_tz(db_oripor):
            if p_geocode is None:
                print('! Could not find geocode for {0}'.format(p))
                continue
            # Closest match in GeoNames; next() works on Python 2.6+ and 3,
            # unlike the iterator's .next() method.
            dist, id_ = next(db_geonames.findClosestFromPoint(p_geocode))
            g_city = db_geonames.get(id_, 'name')
            g_tz = db_geonames.get(id_, 'timezone')

            out.write('{0},{1},{2},{3:.2f}\n'.format(p_iata, p_tz, g_tz, dist))

            # Format first, then print: calling .format() on the result of
            # print(...) fails under Python 3 because print returns None.
            print(('{0} with tz "{1}" matches tz "{2}" '
                   '(dist {3:.1f}km, "{4}" -> "{5}")').format(
                       p_iata, p_tz, g_tz, dist, p_city, g_city))
def create_countries(apps, schema_editor):
    """Create one ``refgeo.Country`` row per GeoBases country.

    The ``(apps, schema_editor)`` signature is the one Django passes to
    ``RunPython`` callables. The currency is linked through ``get_or_none``.

    :param apps: app registry used to resolve the models.
    :param schema_editor: unused.
    """
    Currency = apps.get_model("refgeo", "Currency")
    Country = apps.get_model("refgeo", "Country")
    geo_c = GeoBase(data='countries', verbose=False)
    for code in geo_c:
        cdata = geo_c.get(code)
        try:
            geoname_id = int(cdata.get("geoname_id"))
        except (TypeError, ValueError):
            # Missing or non-numeric id: store no GeoNames reference.
            geoname_id = None
        Country.objects.create(
            code=code,
            code3=cdata.get("iso_alpha3"),
            name=cdata.get("name"),
            alternateNames="",
            capitalCode="",
            currency=get_or_none(Currency, code=cdata.get("currency_code")),
            geonameId=geoname_id,
            # An empty or missing population counts as 0 instead of raising.
            population=int(cdata.get("population") or 0),
            continentCode="",
        )
def create_airports(apps, schema_editor):
    """Create ``refgeo.Airport`` rows from the GeoBases ``ori_por`` table.

    Entries whose page rank is not strictly greater than the
    ``PEPR_REFERENCE_GEO_AIRPORT_PAGERANK_GT`` setting (default ``-1``) are
    skipped. A row that violates a uniqueness constraint is logged and
    skipped rather than aborting the whole run.

    :param apps: app registry used to resolve the models.
    :param schema_editor: unused.
    """
    Airport = apps.get_model("refgeo", "Airport")
    Country = apps.get_model("refgeo", "Country")

    min_page_rank = getattr(
        settings, "PEPR_REFERENCE_GEO_AIRPORT_PAGERANK_GT", -1)

    geo_por = GeoBase(data='ori_por',
                      key_fields=['city_code', 'iata_code'],
                      discard_dups=True)
    for code in geo_por:
        cdata = geo_por.get(code)
        location_type = cdata.get("location_type")
        page_rank = float_or_default(cdata.get("page_rank"), 0)
        if page_rank <= min_page_rank:
            continue
        # The try wraps the atomic block (not the reverse) so that the
        # savepoint is rolled back before the error is handled; this is the
        # same shape create_airlines uses.
        try:
            with transaction.atomic():
                Airport.objects.create(
                    iataCode=cdata.get("iata_code"),
                    icao_code=cdata.get("icao_code"),
                    location_type=location_type,
                    is_airport=('A' in location_type),
                    all_airports=('C' in location_type),
                    #geoname = cdata.get("geoname_id")
                    name=cdata.get("name"),
                    #alternateNames = parse_alternate_name_section(cdata.get('alt_name_section')),
                    timezone=cdata.get("timezone"),
                    stateCode=cdata.get("state_code"),
                    country=get_or_none(Country, pk=cdata.get("country_code")),
                    cityCode=cdata.get("city_code"),
                    cityName=cdata.get("city_name_utf"),
                    lat=float_or_default(cdata.get("lat")),
                    lng=float_or_default(cdata.get("lng")),
                    page_rank=page_rank,
                )
        except IntegrityError:
            lg.error("Not unique: %s", code)
Example #8
0
def create_airlines(apps, schema_editor):
    """Create ``refair.Airline`` rows (and their alliances) from GeoBases.

    Each airline is inserted inside its own atomic block. When the insert
    raises ``IntegrityError`` the airline already stored under the same
    two-character code is fetched and reported on stderr.

    :param apps: app registry used to resolve the models.
    :param schema_editor: unused.
    """
    Airline = apps.get_model("refair", "Airline")
    Alliance = apps.get_model("refair", "Alliance")
    carriers = GeoBase(data='airlines', verbose=False, key_fields=['2char_code'])
    for code in carriers:
        record = carriers.get(code)
        try:
            with transaction.atomic():
                # An empty alliance code means "no alliance".
                alliance = None
                alliance_code = record.get('alliance_code') or None
                if alliance_code:
                    alliance, _ = Alliance.objects.get_or_create(
                        name=alliance_code)
                Airline.objects.create(
                    code=record.get('2char_code'),
                    code3=record.get('3char_code'),
                    name=record.get('name'),
                    alliance=alliance,
                    alliance_status=record.get('alliance_status') or None,
                )
        except IntegrityError:
            existing = Airline.objects.get(code=record.get('2char_code'))
            print >> sys.stderr, record, existing.name, existing.code3
Example #9
0
  def __init__(self, fnam, auto = False):
    """ Open fnam and index its ^-separated header line; load the GeoBases 'airports' table when GeoBases can be imported. """

    self.fnam, self.auto = fnam, auto
    self.f = open(fnam)

    # GeoBases is optional: on ImportError self.prts is simply not assigned here.
    try:
      from GeoBases import GeoBase
      self.prts = GeoBase(data = 'airports', verbose = False)
    except ImportError:
      pass

    # First line is the header: map each space-stripped column name to its position.
    header = self.f.readline().rstrip('\n')
    names  = [col.strip(' ') for col in header.split('^')]

    self.hea = dict((name, pos) for pos, name in enumerate(names))
Example #10
0
def main():
    """Suggest time zones for ``ori_por`` entries whose time zone is unknown.

    For every entry yielded by ``pors_with_unk_tz`` the closest ``cities``
    entry is looked up. Its time zone is written to ``tz_fixes.csv`` as
    ``iata,current_tz,suggested_tz,distance`` and a summary line is printed.
    Entries without a geocode are reported and skipped.
    """
    db_oripor = GeoBase('ori_por', verbose=False)
    db_geonames = GeoBase('cities', verbose=False)

    with open('tz_fixes.csv', 'w') as out:
        for p, p_tz, p_iata, p_city, p_geocode in pors_with_unk_tz(db_oripor):
            if p_geocode is None:
                print('! Could not find geocode for {0}'.format(p))
                continue
            # Closest match in GeoNames; next() works on Python 2.6+ and 3,
            # unlike the iterator's .next() method.
            dist, id_ = next(db_geonames.findClosestFromPoint(p_geocode))
            g_city = db_geonames.get(id_, 'name')
            g_tz = db_geonames.get(id_, 'timezone')

            out.write('{0},{1},{2},{3:.2f}\n'.format(p_iata, p_tz, g_tz, dist))

            # Format first, then print: calling .format() on the result of
            # print(...) fails under Python 3 because print returns None.
            message = (
                '{0} with tz "{1}" matches tz "{2}" '
                '(dist {3:.1f}km, "{4}" -> "{5}")').format(
                    p_iata, p_tz, g_tz, dist, p_city, g_city)
            print(message)
Example #11
0
# Optionally (re)build the HDF store from the bookings file first
# (presumably what createHDhdfile does -- confirm against its definition).
if CREATE_HDFS:
    
    createHDhdfile(hdfile, bookingsFile)

# Open the created store read-only
store = pd.HDFStore(hdfile, mode='r')

# NOTE(review): the except/finally clause that closes this try is not visible here.
try:
    # Distinct values of the 'arr_port' column of table 'df'

    groups = store.select_column('df','arr_port').unique()

    # Empty result frame with one row per arrival port, filled in by the loop below
    df = pd.DataFrame(columns=['airport', 'total'], index=groups)

    geo_o = GeoBase(data='ori_por', verbose=False)

    # One store query per arrival port: sum its 'pax' and look up its name

    for g in groups:

        grp = store.select('df', where = "arr_port='%s'" % g)

        # Store the passenger total and the airport name ("Undefined" when the lookup has no result)
        # NOTE(review): df[col][g] = ... is chained assignment; confirm it writes into df with the pandas version in use
        total = grp[['pax']].sum()
        df['total'][g] = total['pax']
        df['airport'][g] = geo_o.get(g.strip(), 'name', default="Undefined")
 
    # Sort by total in descending order

    result = df.sort(['total'], ascending=[0])
import sys
from pyspark import SparkContext
from GeoBases import GeoBase

geo_a = GeoBase(data='airports', verbose=False)

def get_arrival_airport(line):
	"""Return the arrival-airport token of a ``^``-separated booking line.

	Lines with fewer than 35 fields carry the airport in field 10, all other
	lines in field 12. Only the text before the first space of that field is
	returned.
	"""
	fields = line.split("^")
	column = 10 if len(fields) < 35 else 12
	return fields[column].split(" ")[0]

def get_pax(line):
	"""Return the passenger count of a ``^``-separated booking line as an int.

	Lines with at least 35 fields carry the count in field 34; shorter lines
	carry it in the field at index ``len(fields) - 4``.
	"""
	fields = line.split("^")
	position = 34 if len(fields) >= 35 else len(fields) - 4
	return int(fields[position])

def is_valid_line(line):
	line_split=line.split("^")
	if len(line_split)>=35:
		if line_split[34].split(" ")[0]=='pax':
			return False
		else:
			return True
	else:
		if line_split[len(line_split)-4].split(" ")[0]=='pax':
			return False
		else:
from GeoBases import GeoBase
import numpy as np
import matplotlib.pyplot as plt
import csv
from tzwhere import tzwhere
import scipy.optimize as sp
from scipy.integrate import odeint

# GeoBases 'airports' table
geo_a = GeoBase(data='airports', verbose=False)

# Read the whole CSV into a NumPy array; the with-block closes the file as
# soon as it has been read.
with open("UTFData.csv", encoding='utf-8') as csvfile:
    csvdata = np.array(list(csv.reader(csvfile)))

# Candidate coordinate sets as [latitude, longitude] pairs
# (presumably attendee locations -- confirm against the rest of the script).
dataset1 = [[36.6, -121.89846], [52.1427, 6.1961], [-37.8136, 144.9631],
            [31.2304, 121.4737], [22.39, 114.1095], [55.7558, 37.6173]]
dataset2 = [[42.3601, -71.0589], [42.3601, -71.0589], [1.3521, 103.8198],
            [39.91, 160.3636], [22.3964, 114.1095], [22.3964, 114.1095],
            [55.7558, 37.6173], [52.0907, 5.1412], [52.2297, 21.0122],
            [55.6761, 12.5683], [-37.8136, 144.9631]]
# Data set currently in use
latlongarray = dataset1
#conference=[59.9343,30.3351]
#conference=[24.958202,46.700779]
#conference=[-11.2027,17.8739]

tz = tzwhere.tzwhere()

panic = False
# zonerestriction depends on which data set is selected; it stays undefined
# if latlongarray is neither of the two.
if latlongarray == dataset1:
    zonerestriction = 3
elif latlongarray == dataset2:
    zonerestriction = 1
#loc1 = (37.5665,126.9780)
Example #14
0
__author__ = 'dani'
import pandas as pd
from Timer import Timer
from config import getFilePath
from GeoBases import GeoBase

geo_o = GeoBase(data='ori_por', verbose=False)
cols = ['year', 'arr_port', 'pax']
# Load only the needed columns; the read is timed.
with Timer() as t:
    data = pd.read_csv(getFilePath('bookings', '2'), sep='^', error_bad_lines=False, warn_bad_lines=True, usecols=cols)
    l = len(data)
# print() with a single argument emits the same text under Python 2 and 3.
print("=> read: %s s" % t.secs)
print(l)
print(data.columns)

filtered_data = data[data.year == 2013]
filtered_data = filtered_data[['arr_port', 'pax']]
#filtered_data has the counter row as index being a DataFrame
#and two series objects: arr_port, pax

#Method 1
#groupby on the arr_port column and then iterate on groups
result = {}
with Timer() as t:
    for arr_port, group in filtered_data.groupby('arr_port'):
        result[arr_port] = group["pax"].sum()
# (arr_port, total pax) pairs, largest total first
result = sorted(result.items(), key=lambda x:x[1], reverse=True)
print("=> top computation - 1: %s s" % t.secs)
#for k,v in result[:10]:
#    print k,v
Example #15
0
from geopy import geocoders, distance
import googlemaps
from GeoBases import GeoBase

# NOTE(review): API key committed in source; consider loading it from the
# environment or configuration instead, and rotating this key.
GMAPS_KEY = 'AIzaSyCC7J4WeMBCJiwoEPUhm9-mOZlc8NDR7Kc'

# geopy geocoder
g = geocoders.GoogleV3(api_key=GMAPS_KEY)
# GoogleMaps for driving directions
gmaps = googlemaps.Client(key=GMAPS_KEY)
# GeoBase for geocoding data for all IATA-coded airports
geo_a = GeoBase(data='airports', verbose=False)

# Build list of primary airports (as defined by the FAA)
# for filtering IATA-coded airports
with open('airports.txt', 'r') as f:
    IATA = [line.strip() for line in f]

# Lookup airports near a destination
destination = "Chattanooga, TN"
address0, (lat0, long0) = g.geocode(destination, exactly_one=False)[0]
latlong0 = (lat0, long0)
# Query GeoBases once and reuse the sorted result for the filter below.
near = sorted(geo_a.findNearPoint(latlong0, 160))
#print(near);
airports = [k for _, k in near if k in IATA]
print(airports)
Example #16
0
import pandas as pd
from pandas import DataFrame
import sys, datetime, pandas.io.data
import matplotlib.pyplot as plt
from GeoBases import GeoBase


# Load the '^'-separated file named on the command line and strip whitespace
# from its column names.
df = pd.read_csv(sys.argv[1], sep = '^')
df = df.rename(columns=lambda x: x.strip())

# Number of rows per 'arr_port' value.
# NOTE(review): level='shit' does not look like a real index level -- confirm this call runs at all.
df2 = df.groupby(by='arr_port',level='shit').size()
print df2
# NOTE(review): df3 is not assigned anywhere in the visible code, so the next
# two lines presumably raise NameError; their results are discarded as well.
df3.replace(" ", "")
df3.sort(ascending=False)
# First 10 entries of df2 (df2 itself is not sorted in the visible code)
df4 = df2.head(n=10)
geo_a = GeoBase(data='ori_por', verbose = False)
# Sample lookups for code 'AAR': UTF city name, then name
hijodeputa = geo_a.get('AAR', 'city_name_utf')
print hijodeputa

joder = geo_a.get('AAR', 'name')
print joder

# Name lookup for a hard-coded handful of codes
lista = []
lista = ['AAE', 'AAL', 'AAQ', 'AAR', 'ABE']
for i in lista:
    shit = geo_a.get(i, 'name')
    print "f**k", i, shit

# Print the 'arr_port' value of each of the entries kept in df4
# ('cities' and 'size' are not used in the visible code)
cities = []
for airport, size in df4.iteritems():
    print airport
if __name__ == '__main__':
    # Sum 'pax' per 'arr_port' for the booking file named on the command line,
    # then print and save (test2_result.csv) every airport, largest total first.
    if len(sys.argv) != 2:
        print('Please provide a booking file.')
        sys.exit(0)
    aFile = sys.argv[1]
    aData = None
    if not os.path.exists(aFile):
        print('Please check if the booking file %s does exist' % aFile)
        sys.exit(0)
    else:
        # extract data from file (read in chunks of 10000 rows)
        aChunks = pandas.read_table(aFile,chunksize=10000,sep='^',usecols=['arr_port','pax'])
        aData = pandas.concat(chunk for chunk in aChunks)
        grouped = aData.groupby(aData['arr_port']).sum()
        sgrouped = grouped.sort_values(by='pax',ascending=False)
        # GeoBase
        geo = GeoBase(data='ori_por',verbose=False)
        # write into csv file; 'wb' is the mode Python 2's csv module expects.
        # The with-block closes the file even if a row fails.
        with open('test2_result.csv','wb') as csvFile:
            csvWriter = csv.writer(csvFile)
            for i in sgrouped.index:
                # i is an index label, so .loc selects the same row .ix did.
                item = sgrouped.loc[i]
                code = item.name.strip()
                try:
                    label = geo.get(code,'name')
                except Exception:
                    # Name lookup failed: fall back to the bare airport code.
                    label = code
                print('%s %s' % (label, item.pax))
                csvWriter.writerow([label,item.pax])
# -*- coding: utf-8 -*-

import web # Import the library web to create the web service
import pandas as pd # import the library pandas
import json # import the library json for JSON files
from GeoBases import GeoBase # import GeoBase to retrieve airport countries

# Columns names and filenames
filenameBooking="bookings.csv" # name of the bookings file
usedColumns = ['arr_port','pax'] # columns used to process the file

geo_o = GeoBase(data='ori_por', verbose=False) # load the GeoBase data

# Look up the ASCII city name ('city_name_ascii') for an airport code; despite the function's name, no country field is read.
# If the airport code is unknown, the default value "UNKNOWN" is returned.
# The code is stripped first because surrounding whitespace prevents it from being recognized.
def strCountryName(x):
    """Return the 'city_name_ascii' field for airport code ``x``, or 'UNKNOWN'.

    The code is stripped of surrounding whitespace before the lookup.
    """
    airport_code = x.strip()
    return geo_o.get(airport_code, 'city_name_ascii', default='UNKNOWN')
    
# Define the function that writes the JSON file
def writeJsonFile(topNumber,filenameJSON):
    """Build the ranked table of the busiest arrival airports.

    Reads the first 1000 rows of the bookings file, sums ``pax`` per
    ``arr_port``, keeps the ``topNumber`` largest totals and adds a ``Rank``
    column plus a ``Country`` column filled through ``strCountryName``.

    NOTE(review): nothing is written to ``filenameJSON`` in the visible
    body -- confirm whether the remainder of this function is missing.

    :param topNumber: maximum number of airports to keep.
    :param filenameJSON: output file name (unused in the visible body).
    """
    # read the CSV file, keeping only columns 'arr_port' and 'pax'
    df = pd.read_csv(filenameBooking, sep='^', usecols=usedColumns, nrows=1000)
    totals = df['pax'].groupby(df['arr_port']).sum().reset_index()
    dfTop = totals.sort_values(by='pax', ascending=False)[:topNumber]

    # Rank the rows actually kept: there may be fewer than topNumber airports,
    # and a fixed-length range would then not match the frame's length.
    dfTop['Rank'] = range(1, len(dfTop) + 1)
    dfTop = dfTop.reindex(columns=['Rank','arr_port','pax'])
    dfTopJSON = dfTop.rename(columns={'arr_port': 'Airport','pax': 'Number of bookings'})

    # Map 'strCountryName' over the 'Airport' column to create the 'Country' column
    dfTopJSON['Country'] = dfTopJSON['Airport'].map(strCountryName)
Example #19
0
import sys,os,pandas,csv
from GeoBases import GeoBase

if __name__ == '__main__':
    # Print the (up to) ten arrival airports with the most passengers in the
    # booking file named on the command line.
    if len(sys.argv) != 2:
        print('Please provide a booking file.')
        sys.exit(0)
    aFile = sys.argv[1]
    aData = None
    if not os.path.exists(aFile):
        print('Please check if the booking file %s does exist' % aFile)
        sys.exit(0)
    else:
        # extract data from file (read in chunks of 10000 rows)
        aChunks = pandas.read_table(aFile,chunksize=10000,sep='^',usecols=['arr_port','pax'])
        aData = pandas.concat(chunk for chunk in aChunks)
        grouped = aData.groupby(aData['arr_port']).sum()
        sgrouped = grouped.sort_values(by='pax',ascending=False)
        geo = GeoBase(data='ori_por',verbose=False)
        for i in range(10):
            try:
                # i is a row position, so .iloc is the explicit form of .ix here.
                item = sgrouped.iloc[i]
                print('%s %s' % (geo.get(item.name.strip(),'name'), item.pax))
            except Exception:
                # Fewer than ten rows (or a failed lookup): stop here.
                print('no more airports...')
                sys.exit(0)
Example #20
0
    )
]

# Figure layout: equirectangular map without legend, both axes drawn from
# -100 to 100 with a tick every 20 and no grid.
layout = {
    'title': 'Pyongyang to Chicago',
    'showlegend': False,
    'geo': {
        'resolution': 100,
        'showland': True,
        'showlakes': True,
        'landcolor': 'rgb(204, 204, 204)',
        'countrycolor': 'rgb(204, 204, 204)',
        'lakecolor': 'rgb(255, 255, 255)',
        'projection': {'type': "equirectangular"},
        'coastlinewidth': 2,
        'lataxis': {
            'range': [-100, 100],
            'showgrid': False,
            'tickmode': "linear",
            'dtick': 20,
        },
        'lonaxis': {
            'range': [-100, 100],
            'showgrid': False,
            'tickmode': "linear",
            'dtick': 20,
        },
    },
}

fig = {'data': nyc_london, 'layout': layout}
py.iplot(fig, validate=False, filename='d3-great-circle')

# Entries of the GeoBases 'airports' table closest to a sample point
point = (40, -75)
geo_a = GeoBase(data='airports', verbose=False)
closest = list(geo_a.findClosestFromPoint(point))
print(closest)
print()
Example #21
0
    if (index > block_limit) : break 

# Remove spaces, done here because it's costly
aggregated['arr_port'] = aggregated['arr_port'].apply(lambda x : x.strip())

# Total per arrival port, largest 'pax' first; saved for later use
sorted_data = aggregated.groupby('arr_port').sum().sort(columns='pax',ascending=False)
sorted_data.to_csv(path_or_buf=config.data_folder + 'sorted_list.csv',sep='^')
final_data = sorted_data[:10]
final_data

# Bonus point: Look up the names for each airport 

from GeoBases import GeoBase
record_count = 200
airport_data = pd.read_table(config.data_folder + 'sorted_list.csv', sep='^')
airport_data['city_name'] = 'None'
geo_o = GeoBase(data='ori_por', verbose=False)


def find_city(x, geo_o):
    """Return the 'city_name_ascii' field for the stripped airport code x."""
    return geo_o.get(x.strip(), 'city_name_ascii')


for i in airport_data.index:
    try:
        city_name = find_city(airport_data['arr_port'][i],geo_o)
        # .loc writes into the frame itself; the chained form
        # airport_data['city_name'][i] = ... may assign to a temporary copy.
        airport_data.loc[i, 'city_name'] = city_name
        print(city_name)
    except Exception:
        # Lookup failed for this code: report it and keep the 'None' placeholder.
        print('%s %s %s' % ("Oops!  That was no valid number.  Try again...", i, airport_data['arr_port'][i]))

# Save it for using it later on in the bonus exercise (Web service)
airport_data.to_csv(path_or_buf= config.data_folder + 'sorted_named_list.csv',sep='^')
airport_data[:10]
Example #22
0
class csvContainer:
  """ Container to do simple operations with ^-separated csv files.

  The constructor opens the file and reads the header. Every scanning method reads from the current position (just
  after the header) to the end of the file and then seeks back, so the methods can be called repeatedly. """
  
  f    = None    # The file. Is always open and positioned after the header for reading.
  hea  = None    # The header as a dictionary converting names into column numbers
  fnam = None    # The file name. For creating related names for audit()
  auto = False   # Automatically convert comma separated files when ^-separated fails
  prts = None    # GeoBases airports object. If == None, gracefully reject the search

  
  def __init__(self, fnam, auto = False):
    """ Open the file and read the header, also try to load a GeoBases airport object and gracefully handle installation issues. """
    
    self.fnam = fnam
    self.auto = auto
    self.f    = open(fnam)
    
    # GeoBases is optional: without it self.prts keeps the class default (None).
    try:
      from GeoBases import GeoBase
      self.prts = GeoBase(data = 'airports', verbose = False)
    except ImportError:
      pass

    li = self.f.readline()
    ky = [i.strip(' ') for i in li.rstrip('\n').split('^')]
    
    self.hea = dict(zip(ky, range(len(ky))))
   
   
  def _scan(self):
    """ Yield (raw line, list of fields) for every remaining line, then seek back to where the scan started.
    A line with no '^' is split on ',' instead when self.auto is set. """
    
    stpos = self.f.tell()
    
    try:
      # readline() rather than iterating the file object: audit() calls tell() during the scan,
      # which is not reliable (Python 2) or not allowed (Python 3) while iterating a file.
      for li in iter(self.f.readline, ''):
        row = li.rstrip('\n').split('^')
        
        if self.auto and len(row) == 1: row = li.rstrip('\n').split(',')
        
        yield li, row
    finally:
      self.f.seek(stpos)


  def _write_matching(self, filtr, make_key):
    """ Copy to '<fnam>.filter.csv' every line with the header's column count whose make_key(row) is in filtr. """
    
    numcol = len(self.hea)
    
    with open(self.fnam + '.filter.csv', 'w') as ofile:
      for li, row in self._scan():
        if len(row) == numcol and make_key(row) in filtr:
          ofile.write(li)


  def audit(self):
    """ Scan the entire file counting columns and writing problematic lines to a file.
    Each line whose column count differs from the header is copied to '<fnam>.audit.csv' and reported on stdout
    (line number counted from 1 after the header, column count, fields, file offset after the line). """
    
    numcol = len(self.hea)
    
    with open(self.fnam + '.audit.csv', 'w') as errfile:
      for ln, (li, row) in enumerate(self._scan(), 1):
        if len(row) != numcol:
          errfile.write(li)
          
          # One pre-formatted argument prints the same text under Python 2 and Python 3.
          print('%s %s %s %s \n' % (ln, len(row), row, self.f.tell()))


  def countlines(self):
    """ Count lines not counting lines with errors (lines whose column count differs from the header). """
    
    numcol = len(self.hea)
    
    return sum(1 for _, row in self._scan() if len(row) == numcol)
    
    
  def niceAirportDescription (self, iatacode):
    """ Returns a string with a nice description of airport iatacode if GeoBases is available """
    
    if self.prts is None:
      return('** GeoBases is not installed. **')
      
    dd = self.prts.get(iatacode)
    
    return(dd['iata_code'] + ': ' + dd['name'] + ', ' + dd['city_code'] + ', ' + dd['country_name'])
    

  def aggregate(self, x, groupby, fun):
    """ Apply function fun to variable x grouping by groupby. x and groupby are variable names and fun is a function possibly 
    a lambda like lambda a, b: a+b. The function is computed incrementally where a is the accumulated value and b is the new value.
    Returns a dictionary mapping each group value to the accumulated float. """

    numcol = len(self.hea)
    idx_x  = self.hea[x]
    idx_g  = self.hea[groupby]
    grby   = {}
    
    for _, row in self._scan():
      if len(row) != numcol: continue
      
      val = float(row[idx_x].strip(' '))
      g   = row[idx_g].strip(' ')
      
      # The first value of a group is stored as is; later ones are folded in with fun.
      if g in grby:
        val = float(fun(grby[g], val))
        
      grby[g] = val

    return(grby)
    

  def groupby(self, vnames, numchars):
    """ Group by all variables in vnames and count the number of rows. Take only the first numchars characters of each vname's value.
    Returns a dictionary mapping each tuple of (truncated) values to its row count. """

    numcol = len(self.hea)
    idx    = tuple([self.hea[i] for i in vnames])
    n      = len(vnames)
    grby   = {}
    
    for _, row in self._scan():
      if len(row) == numcol:
        g = tuple([row[idx[i]].strip(' ')[:numchars[i]] for i in range(n)])
        
        grby[g] = grby.get(g, 0) + 1

    return(grby)
    

  def writeSubset(self, vname, rexp, outfi):
    """ Writes all lines to a file where variable vname matches the regular expression rexp to a new file outfi.
    The output starts with the header line; the match is anchored at the start of the value (re.match). """

    import re
    
    numcol   = len(self.hea)
    rex      = re.compile(rexp)
    idx      = self.hea[vname]
    colnames = sorted(self.hea, key = self.hea.get)
    
    with open(outfi, 'w') as ofile:
      ofile.write('^'.join(colnames) + '\n')
      
      for li, row in self._scan():
        if len(row) == numcol and rex.match(row[idx].strip(' ')) is not None:
          ofile.write(li)
    
   


#S a n d b o x  functions: These functions do not make part of the "neat" object but are kept to understand what was done.
#-------------------------
    
  def join_step1(self):
    """ This function was only implemented for researching the datafiles. It is NOT general purpose but includes file specifics.
        See the document OnJoiningSearchesWithBookings.txt for an explanation.
        Counts rows per (Origin, Destination, Seg1Date). """

    numcol = len(self.hea)
    idx    = (self.hea['Origin'], self.hea['Destination'], self.hea['Seg1Date'])
    grby   = {}
    
    for _, row in self._scan():
      if len(row) == numcol:
        g = tuple([row[i].strip(' ') for i in idx])
        
        grby[g] = grby.get(g, 0) + 1

    return(grby)       
    

  def join_step2_search(self, filtr):
    """ This function was only implemented for researching the datafiles. It is NOT general purpose but includes file specifics.
        See the document OnJoiningSearchesWithBookings.txt for an explanation.
        Copies to '<fnam>.filter.csv' the lines whose (Origin, Destination, Seg1Date) is in filtr. """

    idx_1 = self.hea['Origin']
    idx_2 = self.hea['Destination']
    idx_3 = self.hea['Seg1Date']
    
    self._write_matching(
      filtr,
      lambda row: (row[idx_1].strip(' '), row[idx_2].strip(' '), row[idx_3].strip(' ')))

    
  def join_step2_book(self, filtr):
    """ This function was only implemented for researching the datafiles. It is NOT general purpose but includes file specifics.
        See the document OnJoiningSearchesWithBookings.txt for an explanation.
        Copies to '<fnam>.filter.csv' the lines whose (dep_port, arr_port, first 10 characters of brd_time) is in filtr. """

    idx_1 = self.hea['dep_port']
    idx_2 = self.hea['arr_port']
    idx_3 = self.hea['brd_time']
    
    self._write_matching(
      filtr,
      lambda row: (row[idx_1].strip(' '), row[idx_2].strip(' '), row[idx_3].strip(' ')[:10]))
    
      
## end of csvContainer