def __init__(self):
    self.database_name = glb('dbName')
    self.hostname = glb('pgHost')
    self.username = glb('pgUser')
    self.password = glb('pgPasswd')
    self.url_connect = "jdbc:postgresql://" + self.hostname + \
                       ":5432/" + self.database_name
    self.properties = {"user": self.username,
                       "password": self.password,
                       "driver": "org.postgresql.Driver"}
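# A minimal sketch of a companion write() method (not shown above), assuming
# the standard PySpark DataFrameWriter.jdbc() API. The optional `db` argument
# mirrors the conn.write(..., db='taxi_aggregates') call in writeResults();
# its handling here is an assumption, not the project's confirmed code.
def write(self, dataframe, tableName, writeMode, db=None):
    url = self.url_connect
    if db is not None:
        # Hypothetical: point the JDBC URL at a different database
        url = "jdbc:postgresql://" + self.hostname + ":5432/" + db
    dataframe.write.jdbc(url=url,
                         table=tableName,
                         mode=writeMode,
                         properties=self.properties)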
def __init__(self, year, month):
    self.year = year
    self.month = month
    self.weatherBucket = glb('s3WeatherBucket')
    self.taxiBucket = glb('s3TaxiBucket')
    self.ylwTaxiPrefix = glb('ylwTaxiPrefix')
    self.weatherS3Key = []
    self.weatherFile = []
    self.wthrRep = []
    self.getYlwTaxiFilename()
    self.getWeatherFilename(
        ['Central-Park', 'La-Guardia', 'JFK-Intl', 'Newark-Intl'])
def writeToPostgres(self, prefix):
    # There are two different kinds of schemas:
    # prior to 2017, use coordinates; after, location ID
    if 'pULocId' in self.ylwTaxi.columns:
        keepCols = glb('pgKeepCols1')
    else:
        keepCols = glb('pgKeepCols2')
    dropCols = [clm for clm in self.ylwTaxi.columns if clm not in keepCols]
    for clm in dropCols:
        self.ylwTaxi = self.ylwTaxi.drop(clm)
    self.ylwTaxi = self.ylwTaxi.select(keepCols)
    self.pgTableName = prefix + '_' + self.year + '_' + self.month
    connector = postgres.PostgresConnector()
    connector.write(self.ylwTaxi, self.pgTableName, glb('pgWriteMode'))
def parseIsdLine(line):
    weatherFields = glb('weatherFields')
    # Initialize all readings to None
    readings = {key: None for key in weatherFields}
    # First 4 fields are year, month, day and hour.
    # Convert them into a timestamp:
    date_hr = []
    fields = line.split()
    for cnt in range(4):
        date_hr.append(fields.pop(0))
    date_hr_str = date_hr[0] + '-' + date_hr[1] + '-' + date_hr[2] + \
                  ' ' + date_hr[3] + ':00:00'
    timeType = dtt.getTimeType(date_hr_str)
    timeStmp = dtt.getTimeStamp(timeType)
    fields.insert(0, timeStmp)
    # Load values into readings and remove scaling:
    for cnt, val in enumerate(fields):
        if not isNone(val):
            key_val = getActualReadings(cnt, val)
            readings[weatherFields[key_val[0]]] = key_val[1]
    return readings
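# The helpers isNone() and getActualReadings() are referenced above but not
# shown. A minimal sketch under stated assumptions: ISD-lite marks missing
# values as -9999 and stores several quantities (temperature, dew point,
# pressure, wind speed, precipitation) scaled by 10. The scaledFields index
# set below is hypothetical and must match the project's weatherFields order.
def isNone(val):
    # Field 0 is the timestamp inserted above; everything else is a raw field
    return str(val).strip() == '-9999'

def getActualReadings(cnt, val):
    scaledFields = {1, 2, 3, 5}      # hypothetical positions of the x10 fields
    if cnt == 0:                     # timestamp passes through unchanged
        return (cnt, val)
    reading = float(val)
    if cnt in scaledFields:
        reading = reading / 10.0     # undo the ISD-lite x10 scaling
    return (cnt, reading)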
def main():
    conf = SparkConf()
    conf.set('spark.executorEnv.PGHOST', os.environ['PGHOST'])
    conf.set('spark.executorEnv.PGUSER', os.environ['PGUSER'])
    conf.set('spark.executorEnv.PGPASSWORD', os.environ['PGPASSWORD'])
    spark = SparkSession.builder \
                        .appName("batchProcess") \
                        .config(conf=conf) \
                        .getOrCreate()
    spark.sparkContext.addPyFile("postgres.py")
    spark.sparkContext.addPyFile("globalVar.py")
    spark.sparkContext.addPyFile("datetimeTools.py")
    spark.sparkContext.addPyFile("batchProcessing.py")
    sqlc = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

    # Years and months of interest: n years back from the current year
    nOfYears = glb('nOfPassYears')
    currYear = dt.now().year
    yearList = [str(cnt + currYear - nOfYears + 1) for cnt in range(nOfYears)]
    #yearList = ['2017', '2018', '2019']
    months = [str(val + 1).zfill(2) for val in range(12)]

    # Create an object for every taxi table;
    # remove the object if the table does not exist
    ptr = 0
    tableObj = []
    for yr in yearList:
        for mn in months:
            tableObj.append(batchProcess(yr, mn))
            if not tableObj[ptr].hasTable(sqlc):
                del tableObj[ptr]
            else:
                ptr = ptr + 1

    # Start calling methods in batchProcessing.py
    for table in tableObj:
        table.readTable(sqlc)    # Read table
        table.addDatetime()      # Add year, month, day and hour to table
        table.addMetrics()       # Add vehicle speed and fare per mile
        table.fixPrecip()        # Fix precipitation values
        for ind, station in enumerate(glb('noaaStations')):
            table.setStation(ind)
            table.aggByHour()                     # Aggregate data by the hour
            table.writeResults('hourly_yellow_')  # Write to DB with prefix 'hourly_yellow_'
    spark.stop()
def writeResults(self, prefix):
    pgTableName = prefix + self.year + '_' + self.month + '_' + \
                  'st' + str(self.station)
    conn = postgres.PostgresConnector()
    conn.write(self.hr, pgTableName, glb('pgWriteMode'), db='taxi_aggregates')
def getWeatherFilename(self, stationNames):
    # Create a list of weather filepaths, one for every station
    s3Prefix = glb('s3Prefix')
    for stationName in stationNames:
        keyName = stationName + '/' + getStationID(stationName) + '-' + \
                  self.year + '-' + self.month + '.csv'
        filepath = s3Prefix + self.weatherBucket + "/" + keyName
        self.weatherS3Key.append(keyName)
        self.weatherFile.append(filepath)
def readData(self, spark):
    # Read both weather reports (all stations) and taxi data
    s3ReadMode = glb('s3ReadMode')
    if self.year == '2016' and self.month in [
            '07', '08', '09', '10', '11', '12']:
        badSchmType = '1'  # Known bad schema
    else:
        badSchmType = '0'
    fields, dTypes = \
        gtf.getColNamesAndTypes(self.taxiBucket, self.ylwTaxiS3Key, badSchmType)
    self.ylwTaxiSchema = makeSchema(fields, dTypes)
    self.ylwTaxi = spark.read.csv(self.ylwTaxiFile, header=True,
                                  mode=s3ReadMode, schema=self.ylwTaxiSchema)
    self.weatherSchema = makeSchema(glb('weatherFields'), glb('weatherDataType'))
    for wFile in self.weatherFile:
        self.wthrRep.append(spark.read.csv(wFile, header=True,
                                           mode=s3ReadMode,
                                           schema=self.weatherSchema))
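# makeSchema() is used above but not shown. A plausible sketch, assuming the
# field names and type names map directly onto pyspark.sql.types classes;
# the typeMap below is an assumption, not the project's actual lookup table.
from pyspark.sql.types import StructType, StructField, \
    StringType, IntegerType, DoubleType, TimestampType

def makeSchema(fieldNames, dataTypes):
    typeMap = {'string': StringType(), 'int': IntegerType(),
               'double': DoubleType(), 'timestamp': TimestampType()}
    # One nullable StructField per column, in the order the names are given
    return StructType([StructField(name, typeMap[dType], True)
                       for name, dType in zip(fieldNames, dataTypes)])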
def noaaGzipToCsv(filename):
    # Translate the ISD-lite format to CSV
    readings = []
    with gzip.open(filename, "r") as text:
        for line in text:
            readings.append(parseISD.parseIsdLine(line))  # See parseISD.py
    outFName = filename.replace('.gz', '.csv')
    with open(outFName, 'w') as outFile:
        writer = csv.DictWriter(outFile, fieldnames=glb('weatherFields'))
        writer.writeheader()
        for row in readings:
            writer.writerow(row)
    return outFName
def fixBadNoaaData():
    print("Central-Park 2012 APR-AUG messed up, use La-Guardia data.")
    weatherBucket = glb('s3WeatherBucket')
    ovr = True  # Overwrite
    for month in ['04', '05', '06', '07', '08']:
        S3Tools.copy_among_buckets(
            weatherBucket, 'La-Guardia/725030-14732-2012-' + month + '.csv',
            weatherBucket, 'Central-Park/725053-94728-2012-' + month + '.csv',
            ovr)
def splitIntoMonth(csvFiles, years):
    # Partition into months.
    # Need to read the year before and after to account for
    # bad taxi data (some entries appear a couple of days late).
    outFiles = []
    for ind, csvFile in enumerate(csvFiles):
        curYearRec = []
        with open(csvFile) as curFile:
            dfCur = csv.DictReader(curFile)
            for row in dfCur:
                curYearRec.append(row)
        if not ind == 0:
            # Previous year data
            ptr = 0
            with open(csvFiles[ind - 1]) as preFile:
                dfPre = csv.DictReader(preFile)
                for row in dfPre:
                    curYearRec.insert(ptr, row)
                    ptr = ptr + 1
        if not ind == len(csvFiles) - 1:
            # Following year data
            with open(csvFiles[ind + 1]) as posFile:
                dfPost = csv.DictReader(posFile)
                for row in dfPost:
                    curYearRec.append(row)

        # Create a timestamp for the beginning of each month
        months = [str(val + 1).zfill(2) for val in range(12)]
        monStr = [str(years[ind]) + '-' + mn + '-01 00:00:00' for mn in months]
        monTS = dtt.strArrayToTimeStamps(monStr)
        monTS.append(monTS[-1] + 31 * 24 * 3600)
        for cnt, month in enumerate(months):
            # 48 hr overlap on each side; 49 at the end because the slice
            # below excludes its last index
            indices = [idx for idx, val in enumerate(curYearRec)
                       if int(val['timeStamp']) >= monTS[cnt] - 48 * 3600 and
                       int(val['timeStamp']) <= monTS[cnt + 1] + 49 * 3600]
            if not indices == []:
                monthRec = curYearRec[indices[0]:indices[-1]]
                outFName = csvFile.replace('.csv', '-' + month + '.csv')
                with open(outFName, 'w') as outFile:
                    writer = csv.DictWriter(outFile,
                                            fieldnames=glb('weatherFields'))
                    writer.writeheader()
                    for row in monthRec:
                        writer.writerow(row)
                outFiles.append(outFName)
    return outFiles
def main():
    # One Spark session to join them all
    conf = SparkConf()
    conf.set('spark.executorEnv.PGHOST', os.environ['PGHOST'])
    conf.set('spark.executorEnv.PGUSER', os.environ['PGUSER'])
    conf.set('spark.executorEnv.PGPASSWORD', os.environ['PGPASSWORD'])
    spark = SparkSession.builder \
                        .appName("timeJoin") \
                        .config(conf=conf) \
                        .getOrCreate()
    spark.sparkContext.addPyFile("postgres.py")
    spark.sparkContext.addPyFile("globalVar.py")
    spark.sparkContext.addPyFile("getTaxiFields.py")
    spark.sparkContext.addPyFile("datetimeTools.py")
    spark.sparkContext.addPyFile("appendWeatherData.py")
    spark.sparkContext.addPyFile("dataProcessing.py")

    # Years and months of interest: n years back from the current year
    nOfYears = glb('nOfPassYears')
    currYear = datetime.now().year
    yearList = [str(cnt + currYear - nOfYears + 1) for cnt in range(nOfYears)]
    months = [str(val + 1).zfill(2) for val in range(12)]

    # Create an object for every taxi data file;
    # remove the object if the file does not exist
    ptr = 0
    dataObj = []
    for yr in yearList:
        for mn in months:
            dataObj.append(dataProcess(yr, mn))
            if not dataObj[ptr].hasData():
                del dataObj[ptr]
            else:
                ptr = ptr + 1

    # Start calling methods in dataProcessing.py
    for dProp in dataObj:
        dProp.readData(spark)            # Read data
        dProp.addTimestamp()             # Convert string to timestamp
        dProp.addWthrStationID()         # Add weather station ID
        dProp.joinTables(spark)          # Main join process
        dProp.writeToPostgres('yellow')  # Write to DB with prefix 'yellow'
        #dProp.printCheck()
    spark.stop()
def main():
    # These variables are sourced from globalVar.py
    noaaFtpDomain = glb('noaaFtpDomain')
    noaaFtpPath = glb('noaaFtpPath')
    noaaStations = glb('noaaStations')
    noaaLogin = glb('noaaLogin')
    noaaPassword = glb('noaaPassword')
    nOfYears = glb('nOfPassYears')
    weatherBucket = glb('s3WeatherBucket')

    # Back up weather data
    print('Moving current data to back-up bucket.')
    backupBucket = weatherBucket + '-bak'
    S3Tools.duplicateBucket(origBucket=weatherBucket, newBucket=backupBucket)

    # Years of interest: n years back from the current year
    currYear = datetime.now().year
    yearList = [cnt + currYear - nOfYears + 1 for cnt in range(nOfYears)]

    # Loop over a few stations of interest
    for station in noaaStations:
        csvFiles = []
        for yrInd, year in enumerate(yearList):
            pathname = noaaFtpPath + str(year)  # According to NOAA dir. structure
            filename = station + '-' + str(year) + '.gz'
            print("Processing: %s" % filename)
            downloadFromFtp(noaaFtpDomain, pathname, noaaLogin, noaaPassword,
                            [filename])
            csvFiles.append(noaaGzipToCsv(filename))
            cleanupLocal([filename])  # Remove *.gz files

        # Partition by month
        print("Partitioning by month...")
        outFiles = splitIntoMonth(csvFiles, yearList)
        uploadToS3(outFiles, weatherBucket, noaaStations.get(station))
        cleanupLocal(csvFiles)
        cleanupLocal(outFiles)

    fixBadNoaaData()
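# downloadFromFtp() and cleanupLocal() are called above but not shown here. A
# minimal sketch using only the standard library (ftplib / os); the exact
# argument handling is an assumption based on the call sites in main():
import os
from ftplib import FTP

def downloadFromFtp(domain, pathname, login, password, filenames):
    ftp = FTP(domain)
    ftp.login(user=login, passwd=password)
    ftp.cwd(pathname)
    for fname in filenames:
        with open(fname, 'wb') as localFile:
            ftp.retrbinary('RETR ' + fname, localFile.write)
    ftp.quit()

def cleanupLocal(filenames):
    # Remove local working files once they have been processed/uploaded
    for fname in filenames:
        if os.path.exists(fname):
            os.remove(fname)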
def getYlwTaxiFilename(self):
    # Define taxi data filepath
    s3Prefix = glb('s3Prefix')
    self.ylwTaxiS3Key = self.ylwTaxiPrefix + self.year + '-' + self.month + '.csv'
    self.ylwTaxiFile = s3Prefix + self.taxiBucket + "/" + self.ylwTaxiS3Key
def getStationID(stationName):
    # Reverse lookup: find the station ID whose value is the given station name.
    # Wrap keys()/values() in list() so the lookup also works on Python 3,
    # where dict views are not indexable.
    noaaStations = glb('noaaStations')
    return list(noaaStations.keys())[list(noaaStations.values()).index(stationName)]
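# Example of the expected noaaStations shape and a reverse lookup, assuming a
# station-ID -> station-name mapping (IDs taken from the filenames used in
# fixBadNoaaData above; the full dictionary lives in globalVar.py):
#   noaaStations = {'725053-94728': 'Central-Park',
#                   '725030-14732': 'La-Guardia', ...}
#   getStationID('Central-Park')  # -> '725053-94728'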