def distanceAllToCities(cls):
    """Run the Hive script that computes earthquake distances to every city."""
    Log.info("Calculating earthquakes distance to all cities..")
    script = '../hive_ql/distance-to-cities.hql'
    ret, out, err = System.command(['hive', '-f', script])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def createEarthquakesTables(cls):
    """Create the earthquake Hive tables from the DDL script."""
    Log.info("Creating hive tables:")
    script = '../hive_ql/create-earthquakes-tables.hql'
    ret, out, err = System.command(['hive', '-f', script])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def distanceToClosestCity(cls):
    """Run the Hive script that finds each earthquake's closest city."""
    Log.info("Calculating earthquakes distance to closest city..")
    script = '../hive_ql/distance-to-city-closest.hql'
    ret, out, err = System.command(['hive', '-f', script])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def clearEarthquakesTables(cls):
    """Drop every earthquake Hive table (runs when --drop-tables is set)."""
    Log.warning(
        "Option 'drop-tables' is enabled. All data will be removed.")
    script = '../hive_ql/clear-tables.hql'
    ret, out, err = System.command(['hive', '-f', script])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def distanceToAllSeismographicStations(cls):
    """Run the Hive script that computes earthquake distances to every station."""
    Log.info(
        "Calculating earthquakes distance to all seismographic stations..")
    script = '../hive_ql/distance-to-stations.hql'
    ret, out, err = System.command(['hive', '-f', script])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def createDB(cls, path):
    """Create the 'earthquakes' Hive database rooted at <path>/earthquakes.db."""
    # The hivevar is consumed as ${path} inside the HQL script.
    hivevar = "path='{}/earthquakes.db'".format(path)
    Log.info("Creating hive database: 'earthquakes'")
    ret, out, err = System.command(
        ['hive', '-hivevar', hivevar, '-f', '../hive_ql/create-database.hql'])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def produceOutputSeismographs(cls):
    """Run the final ETL join producing the seismograph output table."""
    Log.info(
        "ETL pipeline Output: Join earthquakes with closest city,station and produce seismograph.."
    )
    script = '../hive_ql/output-seismograph.hql'
    ret, out, err = System.command(['hive', '-f', script])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def loadEarthquakesData(cls, file):
    """Load one earthquakes CSV (path given by *file*) into the Hive table."""
    # The hivevar is consumed as ${path} inside the HQL script.
    hivevar = "path='{}'".format(file)
    Log.info("Loading earthquakes data to hive:")
    Log.info(file)
    ret, out, err = System.command(
        ['hive', '-hivevar', hivevar, '-f', '../hive_ql/load-earthquakes.hql'])
    # Surface the hive invocation outcome in the logs.
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def Read(cls):
    """Load earthquakes-application.yaml and evaluate it into argument dicts.

    Returns (history_args, hive_args) as produced by cls.Evaluate; exits the
    application when the configuration file cannot be read.
    """
    try:
        with open(r'../../conf/earthquakes-application.yaml') as file:
            config = yaml.load(file, Loader=yaml.FullLoader)
            Log.info("Loading configuration from earthquakes-application.yaml")
            Log.info("values: {}".format(config))
            history_args, hive_args = cls.Evaluate(config)
            return history_args, hive_args
    except EnvironmentError as error:
        Log.error("Configuration can not be loaded.")
        Log.error(error)
        Log.exit()
def validateMagnitude(cls, arg):
    """Parse *arg* as an earthquake magnitude and return it as a float.

    Valid magnitudes are 0.0..8.0 inclusive. Any other value (including
    non-numeric input) logs an error and exits the application.

    Fixes: the original bare `except:` also caught the SystemExit raised by
    Log.exit() on out-of-range values, re-logging a misleading message; the
    error message typo "Exciting" is corrected to "Exiting"; the local
    variable typo `magnutide` is corrected.
    """
    try:
        magnitude = float(arg)
    except (TypeError, ValueError):
        # Non-numeric input — fall through to the shared error path.
        magnitude = None
    if magnitude is not None and 0 <= magnitude <= 8:
        return magnitude
    Log.error(
        "invalid magnitude input, value: '{}'. You can only pass magnitude values from '0.0' to '8.0'. Exiting the application..".format(
            arg))
    Log.exit()
def validateYear(cls, arg):
    """Parse *arg* as a year and return it as an int.

    Valid years run from 1900 through the current UTC year. Any other value
    (including non-numeric input) logs an error and exits the application.

    Fixes: the original bare `except:` also caught the SystemExit raised by
    Log.exit() on out-of-range values, re-logging a misleading message; the
    error message typo "Exciting" is corrected to "Exiting".
    """
    currentYear = datetime.utcnow().year
    try:
        year = int(arg)
    except (TypeError, ValueError):
        # Non-numeric input — fall through to the shared error path.
        year = None
    if year is not None and 1900 <= year <= currentYear:
        return year
    Log.error(
        "invalid year input, value: '{}'. You can only pass year values from '1900' to '{}'. Exiting the application..".format(
            arg, currentYear))
    Log.exit()
def Request(cls, start, end, magnitude_over):
    """Fetch USGS earthquake events between *start* and *end* as parsed CSV.

    Returns the response body as a list of CSV rows, or None when the
    request fails (the error is logged). The whole request is bounded by a
    180-second eventlet timeout.
    """
    eventlet.monkey_patch()
    url = (
        "https://earthquake.usgs.gov/fdsnws/event/1/query?format=csv&starttime={}&endtime={}&minmagnitude={}"
        .format(start, end, str(magnitude_over)))
    with eventlet.Timeout(180):
        try:
            with requests.Session() as s:
                download = s.get(url)
            body = download.content.decode('utf-8')
            rows = csv.reader(body.splitlines(), delimiter=',')
            return list(rows)
        except Exception as error:
            Log.error("Request Error:")
            Log.error(error)
def filesInPath(cls, path):
    """Validate *path* on HDFS and collect every .csv file listed there.

    Stores the path on cls.path, appends each CSV file name (last
    whitespace-separated field of the `hdfs dfs -ls` output line) to
    cls.files, and exits the application when the listing fails.
    """
    cls.path = path
    Log.info("HDFS path validation:")
    ret, out, err = cls.command(['hdfs', 'dfs', '-ls', path])
    if ret == 1:
        Log.error("HDFS path Error. Exiting the Application..")
        Log.error(err)
        Log.exit()
    else:
        Log.info("Valid HDFS path")
        for line in out.splitlines():
            fields = line.split(' ')
            candidate = fields[-1]
            if re.search('.*csv$', candidate):
                cls.files.append(candidate)
def loadCities(cls, path):
    """Load <path>/cities.csv into Hive: staging table first, then final table."""
    # The hivevar is consumed as ${path} inside the staging HQL script.
    hivevar = "path='{}/cities.csv'".format(path)
    Log.info("Loading cities data to hive:")
    Log.info("Creating cities staging table..")
    ret, out, err = System.command(
        ['hive', '-hivevar', hivevar, '-f', '../hive_ql/load-cities-staging.hql'])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
    Log.info("Creating cities final table..")
    # The final-table script reads from the staging table, so no hivevar.
    ret, out, err = System.command(['hive', '-f', '../hive_ql/load-cities.hql'])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def toList(cls, args1, args2):
    """Expand year arguments into a sorted list of validated, unique years.

    When *args2* is None, *args1* is a comma-separated list of years;
    otherwise (*args1*, *args2*) is an inclusive from/to range. An inverted
    range logs an error and exits the application.
    """
    years = []
    if args2 is None:
        for token in str(args1).split(","):
            validated = cls.validateYear(token)
            if validated not in years:
                years.append(validated)
        years.sort()
    else:
        fromYear = cls.validateYear(args1)
        toYear = cls.validateYear(args2)
        if fromYear > toYear:
            Log.error("Input Error. 'from-year' value must be less that 'to-year' value. Exiting the application..")
            Log.exit()
        else:
            years = list(range(fromYear, toYear + 1))
    return years
def loadSeismographicStations(cls, path):
    """Load <path>/seismographic-stations.csv into Hive via staging then final table."""
    # The hivevar is consumed as ${path} inside both HQL scripts.
    hivevar = "path='{}/seismographic-stations.csv'".format(path)
    Log.info("Loading seismographic stations data to hive:")
    Log.info("Creating seismographic stations staging table..")
    ret, out, err = System.command(
        ['hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-seismographic-stations-staging.hql'])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
    Log.info("Creating seismographic stations final table..")
    ret, out, err = System.command(
        ['hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-seismographic-stations.hql'])
    Log.info("return, {}".format(ret))
    Log.info("output, {}".format(out))
    Log.error("error, {}".format(err))
def main():
    """Entry point for the download pipeline.

    Parses CLI arguments, determines which years still need downloading,
    fetches earthquake data from USGS day by day, preprocesses each batch,
    stores it locally, and pushes one CSV per year to HDFS.
    """
    Log.info('-----------------------')
    Log.info('Download process starts')
    Log.info('-----------------------')
    inputArgs = sys.argv
    args = inputArgs[1:]  # drop the script name
    StoreData.createFolder()
    yearsTempList, magnitudeOver, download_again = Input.getValues(args)
    path = HDFS.getPath()
    Log.info("Earthquakes acquisition starts..")
    # The database decides which of the requested years actually need a download.
    years = Database.QueryInput(yearsTempList, magnitudeOver, download_again)
    Log.info(
        "Requesting earthquakes data with magnitude over {}, for years: {}".
        format(magnitudeOver, years))
    for year in years:
        Log.info("Processing year: {}".format(year))
        Log.info("Earthquakes acquisition starts.")
        firstDate = date(year, 1, 1)
        lastDate = date(year, 12, 31)
        # One request per day; [start, end) covers exactly 24 hours.
        for d in dateRange(firstDate, lastDate):
            start = d.strftime("%Y-%m-%d") + "T00:00:00.000Z"
            end = (d + timedelta(days=1)).strftime("%Y-%m-%d") + "T00:00:00.000Z"
            try:
                eq_list_raw = Acquisition.Request(start, end, magnitudeOver)
                eq_list_no_headers = Preprocessing.cleanHeaders(eq_list_raw)
                eq_list_split_date_time = Preprocessing.splitDateTime(
                    eq_list_no_headers)
                eq_list = Preprocessing.checkCountry(eq_list_split_date_time)
                StoreData.toFile(eq_list, year, d, magnitudeOver)
            except Exception as error:
                # Best-effort per day: log and continue with the next date.
                Log.error("Error while processing a Request:")
                Log.error(error)
        Log.info("Earthquakes acquisition for year {} finished".format(year))
        # Ship the accumulated per-year CSV to HDFS.
        HDFS.put(
            '../../data/earthquakes-history/earthquakes{}mag{}.csv'.format(
                year, magnitudeOver), path)
    Log.info('---------------------')
    Log.info('Download process ends')
    Log.info('---------------------')
def Evaluate(cls, configuration):
    """Translate the YAML *configuration* dict into CLI-style argument dicts.

    Returns (history_args, hive_args): option-string keyed dicts consumed by
    the downloader and the hive loader respectively. Exits the application
    when the HDFS path is missing/None, when 'download-group-of-years' does
    not hold exactly two values, or when both year-selection styles are used.

    Fixes: `if list_of_years is not ""` compared string identity (a
    SyntaxWarning that only works by CPython interning accident) — replaced
    with `!= ""`; typos in the conflicting-options message corrected.
    """
    history_args = {}
    hive_args = {}
    for config, values in configuration.items():
        if config == 'hdfs-path':
            # The HDFS path is needed by both pipelines.
            history_args['--hdfs-path='] = values
            hive_args['--hdfs-path='] = values
        elif config == 'download-list-of-years':
            list_of_years = ""
            if values is not None:
                for value in values:
                    if list_of_years == "":
                        list_of_years = str(value)
                    else:
                        list_of_years = list_of_years + "," + str(value)
            # BUG FIX: was `is not ""` (identity comparison on a literal).
            if list_of_years != "":
                history_args['--year='] = list_of_years
        elif config == 'download-group-of-years':
            if values is not None:
                if len(values) == 2:
                    history_args['--from-year='] = str(values[0])
                    history_args['--to-year='] = str(values[1])
                else:
                    Log.error("Property 'download-group-of-years' can take only two values, from-year, to-year")
                    Log.exit()
        elif config == 'download-magnitude-over':
            history_args['--magnitude-over='] = str(values)
        elif config == 'download-again-historical-data':
            if values is True:
                # Flag option: present with an empty value.
                history_args['--download-again'] = ""
        elif config == 'hive-drop-all-tables':
            hive_args['--drop-tables'] = values
    # An HDFS path is mandatory and must be non-None.
    if '--hdfs-path=' in history_args:
        if history_args['--hdfs-path='] is None:
            Log.error("You must specify an HDFS path for the data to be stored.")
            Log.exit()
    else:
        Log.error("You must specify an HDFS path for the data to be stored.")
        Log.exit()
    # The two year-selection styles are mutually exclusive.
    if '--year=' in history_args and (
            '--from-year=' in history_args or '--to-year=' in history_args):
        Log.error(
            "You can not pass values for both 'download-list-of-years' and 'download-group-of-years'. Choose one of these options.")
        Log.exit()
    return history_args, hive_args
def getValues(cls, inputArgs):
    """Parse the hive-loader CLI options and return the drop-tables flag.

    Options: -p/--hdfs-path (required, validated via HDFS.filesInPath) and
    -d/--drop-tables. Duplicated options or a missing HDFS path terminate
    the application.
    """
    Log.info("input arguments: {}".format(inputArgs))
    shortOpts = "p:d"
    longOpts = ["hdfs-path=", "drop-tables"]
    try:
        opts, args = getopt.getopt(inputArgs, shortOpts, longOpts)
    except getopt.GetoptError as err:
        Log.error(err)
        Log.exit()
    hdfsPathArg = None
    hdfsPathFlag = False
    dropTablesFlag = False
    for opt, arg in opts:
        Log.info("processing option: {} with arguments: {}".format(
            opt, arg))
        if opt in ("-p", "--hdfs-path"):
            if not hdfsPathFlag:
                hdfsPathFlag = True
                hdfsPathArg = arg
            else:
                cls.notUniqueArg()
        elif opt in ("-d", "--drop-tables"):
            if not dropTablesFlag:
                dropTablesFlag = True
            else:
                cls.notUniqueArg()
    if not hdfsPathFlag:
        Log.error(
            "Input Error. You must specify a valid HDFS path. Exiting the application.."
        )
        Log.exit()
    else:
        HDFS.filesInPath(hdfsPathArg)
    return dropTablesFlag
def notUniqueArg(cls):
    """Report a duplicated CLI argument and terminate the application."""
    Log.error(
        "Input Error. Can't pass one argument twice. Exiting the application.."
    )
    Log.exit()
def getValues(cls, inputArgs):
    """Parse the downloader CLI options.

    Options: -y/--year (comma-separated list), -f/--from-year and
    -t/--to-year (inclusive range; mutually exclusive with --year),
    -m/--magnitude-over (defaults to 0), -d/--download-again, and the
    mandatory -p/--hdfs-path. Returns (yearsList, magnitudeOver,
    overwriteFlag); invalid or inconsistent input logs an error and exits.

    Fix: the HDFS-path error message was a string literal broken across a
    physical line (a syntax error in the source); it is now one literal.
    """
    Log.info("input arguments: {}".format(inputArgs))
    options = "y:f:t:m:p:d"
    longOptions = ["year=", "from-year=", "to-year=", "magnitude-over=",
                   "download-again", "hdfs-path="]
    try:
        opts, args = getopt.getopt(inputArgs, options, longOptions)
    except getopt.GetoptError as err:
        Log.error(err)
        Log.exit()
    yearFlag = False
    yearArg = None
    fromYearFlag = False
    fromYearArg = None
    toYearFlag = False
    toYearArg = None
    magnOverFlag = False
    magnOverArg = None
    overwriteFlag = False
    hdfsPathFlag = False
    hdfsPathArg = None
    for opt, arg in opts:
        Log.info("processing option: {} with arguments: {}".format(opt, arg))
        # Each option may appear at most once; duplicates abort the run.
        if opt in ("-p", "--hdfs-path"):
            if hdfsPathFlag:
                cls.notUniqueArg()
            else:
                hdfsPathFlag = True
                hdfsPathArg = arg
        elif opt in ("-y", "--year"):
            if yearFlag:
                cls.notUniqueArg()
            else:
                yearFlag = True
                yearArg = arg
        elif opt in ("-f", "--from-year"):
            if fromYearFlag:
                cls.notUniqueArg()
            else:
                fromYearFlag = True
                fromYearArg = arg
        elif opt in ("-t", "--to-year"):
            if toYearFlag:
                cls.notUniqueArg()
            else:
                toYearFlag = True
                toYearArg = arg
        elif opt in ("-m", "--magnitude-over"):
            if magnOverFlag:
                cls.notUniqueArg()
            else:
                magnOverFlag = True
                magnOverArg = arg
        elif opt in ("-d", "--download-again"):
            if overwriteFlag:
                cls.notUniqueArg()
            else:
                overwriteFlag = True
    if hdfsPathFlag is False:
        # BUG FIX: this message was split across a physical source line.
        Log.error("Input Error. You must specify a valid HDFS path. Exiting the application..")
        Log.exit()
    else:
        HDFS.pathValidation(hdfsPathArg)
    # Exactly one year-selection style must have been used.
    fromToOption = False
    yearOption = False
    if fromYearFlag and toYearFlag and not yearFlag:
        fromToOption = True
    elif not fromYearFlag and not toYearFlag and yearFlag:
        yearOption = True
    else:
        Log.error("Input Parameters Error.\r\n"
                  "You must pass parameters in one of the following formats:\r\n"
                  "Example with a range of values: '--from-year=2010 --to-year=2020'\r\n"
                  "Example with a list of unique values: '--year=2010,2011,2012'\r\n"
                  "Exiting the application..")
        Log.exit()
    if fromToOption:
        fromYearInt = cls.validateYear(fromYearArg)
        toYearInt = cls.validateYear(toYearArg)
        yearsList = cls.toList(fromYearInt, toYearInt)
    elif yearOption:
        yearsList = cls.toList(yearArg, None)
    if magnOverArg is None:
        magnOverArg = 0  # default: no magnitude filter
    magnitudeOver = cls.validateMagnitude(magnOverArg)
    return yearsList, magnitudeOver, overwriteFlag