def distanceAllToCities(cls):
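     # Run ../hive_ql/distance-to-cities.hql through the hive CLI and log the
     # return code, stdout and stderr (the same pattern is used by the other
     # Hive helpers below).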
     Log.info("Calculating earthquakes distance to all cities..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/distance-to-cities.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def createEarthquakesTables(cls):
     Log.info("Creating hive tables:")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/create-earthquakes-tables.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def distanceToClosestCity(cls):
     Log.info("Calculating earthquakes distance to closest city..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/distance-to-city-closest.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def clearEarthquakesTables(cls):
     Log.warning(
         "Option 'drop-tables' is enabled. All data will be removed.")
     (ret, out,
      err) = System.command(['hive', '-f', '../hive_ql/clear-tables.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def distanceToAllSeismographicStations(cls):
     Log.info(
         "Calculating earthquakes distance to all seismographic stations..")
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/distance-to-stations.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def createDB(cls, path):
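     # Create the 'earthquakes' Hive database; the target location
     # <path>/earthquakes.db is handed to create-database.hql via '-hivevar path=...'.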
     hivevar = "path='" + path + "/earthquakes.db'"
     Log.info("Creating hive database: 'earthquakes'")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f', '../hive_ql/create-database.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def produceOutputSeismographs(cls):
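     # Final ETL output step: run output-seismograph.hql, which joins earthquakes
     # with the closest city and station and produces the seismograph output.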
     Log.info(
         "ETL pipeline Output: Join earthquakes with closest city,station and produce seismograph.."
     )
     (ret, out, err) = System.command(
         ['hive', '-f', '../hive_ql/output-seismograph.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def loadEarthquakesData(cls, file):
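     # Load a single earthquakes CSV file into Hive; its location is passed to
     # load-earthquakes.hql via '-hivevar path=...'.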
     hivevar = "path='" + file + "'"
     Log.info("Loading earthquakes data to hive:")
     Log.info(file)
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-earthquakes.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def Read(cls):
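     # Read earthquakes-application.yaml, turn it into argument dicts via
     # Evaluate() and exit if the configuration file cannot be opened.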
     try:
         with open(r'../../conf/earthquakes-application.yaml') as file:
             configuration = yaml.load(file, Loader=yaml.FullLoader)
             Log.info("Loading configuration from earthquakes-application.yaml")
             Log.info("values: {}".format(configuration))
             history_args, hive_args = cls.Evaluate(configuration)
             return history_args, hive_args
     except EnvironmentError as error:
         Log.error("Configuration can not be loaded.")
         Log.error(error)
         Log.exit()
 def validateMagnitude(cls, arg):
     # Accept magnitude values between 0.0 and 8.0; anything else, or a value
     # that is not a number, is reported and terminates the application.
     try:
         magnitude = float(arg)
         if 0 <= magnitude <= 8:
             return magnitude
         raise ValueError(arg)
     except (ValueError, TypeError):
         Log.error(
             "invalid magnitude input, value: '{}'. You can only pass magnitude values from '0.0' to '8.0'. Exiting the application..".format(
                 arg))
         Log.exit()
 def validateYear(cls, arg):
     # Accept year values from 1900 up to the current UTC year; anything else,
     # or a value that is not a number, is reported and terminates the application.
     now = datetime.utcnow()
     currentYear = now.year
     try:
         year = int(arg)
         if 1900 <= year <= currentYear:
             return year
         raise ValueError(arg)
     except (ValueError, TypeError):
         Log.error(
             "invalid year input, value: '{}'. You can only pass year values from '1900' to '{}'. Exiting the application..".format(
                 arg, currentYear))
         Log.exit()
 def Request(cls, start, end, magnitude_over):
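     # Query the USGS fdsnws event service for the given time window and minimum
     # magnitude and return the CSV response as a list of rows; the whole request
     # is capped at 180 seconds via eventlet.Timeout.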
     eventlet.monkey_patch()
     with eventlet.Timeout(180):
         try:
             with requests.Session() as s:
                 download = s.get(
                     "https://earthquake.usgs.gov/fdsnws/event/1/query?format=csv&starttime={}&endtime={}&minmagnitude={}"
                     .format(start, end, str(magnitude_over)))
                 decoded_content = download.content.decode('utf-8')
                 eq_csv = csv.reader(decoded_content.splitlines(),
                                     delimiter=',')
                 eq_list = list(eq_csv)
                 return eq_list
         except Exception as error:
             Log.error("Request Error:")
             Log.error(error)
 def filesInPath(cls, path):
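     # List the given HDFS path with 'hdfs dfs -ls'; exit if the path is invalid,
     # otherwise collect every file ending in 'csv' into cls.files.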
     cls.path = path
     Log.info("HDFS path validation:")
     (ret, out, err) = cls.command(['hdfs', 'dfs', '-ls', path])
     if ret == 1:
         Log.error("HDFS path Error. Exiting the Application..")
         Log.error(err)
         Log.exit()
     else:
         Log.info("Valid HDFS path")
         lines = out.splitlines()
         for line in lines:
             line_split = line.split(' ')
             line_len = len(line_split)
             file_exists = re.search('.*csv$', line_split[line_len - 1])
             if file_exists:
                 cls.files.append(line_split[line_len - 1])
 def loadCities(cls, path):
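     # Two-step load: populate the cities staging table from <path>/cities.csv,
     # then run load-cities.hql to build the final cities table.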
     hivevar = "path='" + path + "/cities.csv'"
     Log.info("Loading cities data to hive:")
     Log.info("Creating cities staging table..")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-cities-staging.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
     Log.info("Creating cities final table..")
     (ret, out,
      err) = System.command(['hive', '-f', '../hive_ql/load-cities.hql'])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
 def toList(cls, args1, args2):
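     # Build a sorted list of distinct years, either from a comma-separated list
     # (args2 is None) or from an inclusive from-year/to-year range.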
     yearsList = []
     if args2 is None:
         yearsTempList = str(args1).split(",")
         for year in yearsTempList:
             yearInt = cls.validateYear(year)
             if yearInt not in yearsList:
                 yearsList.append(yearInt)
         yearsList.sort()
     elif args2 is not None:
         fromYear = cls.validateYear(args1)
         toYear = cls.validateYear(args2)
         if fromYear > toYear:
             Log.error("Input Error. 'from-year' value must be less that 'to-year' value. Exiting the application..")
             Log.exit()
         else:
             for year in range(fromYear, toYear + 1):
                 yearsList.append(year)
     return yearsList
 def loadSeismographicStations(cls, path):
     hivevar = "path='" + path + "/seismographic-stations.csv'"
     Log.info("Loading seismographic stations data to hive:")
     Log.info("Creating seismographic stations staging table..")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-seismographic-stations-staging.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
     Log.info("Creating seismographic stations final table..")
     (ret, out, err) = System.command([
         'hive', '-hivevar', hivevar, '-f',
         '../hive_ql/load-seismographic-stations.hql'
     ])
     Log.info("return, {}".format(ret))
     Log.info("output, {}".format(out))
     Log.error("error, {}".format(err))
def main():
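    # Download driver: parse the command-line options, resolve the years to
    # process, fetch USGS earthquake data day by day for each year and push the
    # resulting yearly CSV files to HDFS.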
    Log.info('-----------------------')
    Log.info('Download process starts')
    Log.info('-----------------------')
    inputArgs = sys.argv
    args = inputArgs[1:]
    StoreData.createFolder()
    yearsTempList, magnitudeOver, download_again = Input.getValues(args)
    path = HDFS.getPath()
    Log.info("Earthquakes acquisition starts..")
    years = Database.QueryInput(yearsTempList, magnitudeOver, download_again)
    Log.info(
        "Requesting earthquakes data with magnitude over {}, for years: {}".
        format(magnitudeOver, years))
    for year in years:
        Log.info("Processing year: {}".format(year))
        Log.info("Earthquakes acquisition starts.")
        firstDate = date(year, 1, 1)
        lastDate = date(year, 12, 31)
        for d in dateRange(firstDate, lastDate):
            start = d.strftime("%Y-%m-%d") + "T00:00:00.000Z"
            end = (d +
                   timedelta(days=1)).strftime("%Y-%m-%d") + "T00:00:00.000Z"
            try:
                eq_list_raw = Acquisition.Request(start, end, magnitudeOver)
                eq_list_no_headers = Preprocessing.cleanHeaders(eq_list_raw)
                eq_list_split_date_time = Preprocessing.splitDateTime(
                    eq_list_no_headers)
                eq_list = Preprocessing.checkCountry(eq_list_split_date_time)
                StoreData.toFile(eq_list, year, d, magnitudeOver)
            except Exception as error:
                Log.error("Error while processing a Request:")
                Log.error(error)
        Log.info("Earthquakes acquisition for  year {} finished".format(year))

        HDFS.put(
            '../../data/earthquakes-history/earthquakes{}mag{}.csv'.format(
                year, magnitudeOver), path)
    Log.info('---------------------')
    Log.info('Download process ends')
    Log.info('---------------------')
    def Evaluate(cls, configuration):
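        # Translate the YAML configuration into two dicts of CLI-style arguments,
        # history_args and hive_args, validating the hdfs-path and year options
        # along the way.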
        history_args = {}
        hive_args = {}
        for config, values in configuration.items():
            if config == 'hdfs-path':
                history_args['--hdfs-path='] = values
                hive_args['--hdfs-path='] = values
            elif config == 'download-list-of-years':
                list_of_years = ""
                if values is not None:
                    for value in values:
                        if list_of_years == "":
                            list_of_years = str(value)
                        else:
                            list_of_years = list_of_years + "," + str(value)
                if list_of_years != "":
                    history_args['--year='] = list_of_years
            elif config == 'download-group-of-years':
                if values is not None:
                    if len(values) == 2:
                        history_args['--from-year='] = str(values[0])
                        history_args['--to-year='] = str(values[1])
                    else:
                        Log.error("Property 'download-group-of-years' can take only two values, from-year, to-year")
                        Log.exit()
            elif config == 'download-magnitude-over':
                history_args['--magnitude-over='] = str(values)
            elif config == 'download-again-historical-data':
                if values is True:
                    history_args['--download-again'] = ""
            elif config == 'hive-drop-all-tables':
                hive_args['--drop-tables'] = values

        if '--hdfs-path=' in history_args.keys():
            if history_args['--hdfs-path='] is None:
                Log.error("You must specify an HDFS path for the data to be stored.")
                Log.exit()
        else:
            Log.error("You must specify an HDFS path for the data to be stored.")
            Log.exit()

        if '--year=' in history_args.keys() and (
                '--from-year=' in history_args.keys() or '--to-year=' in history_args.keys()):
            Log.error(
                "You can not pass values for both 'download-list-of-years' and 'download-group-of-years'. Chose one of this options.")
            Log.exit()

        return history_args, hive_args
    def getValues(cls, inputArgs):
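        # Parse the -p/--hdfs-path and -d/--drop-tables options, validate the
        # HDFS path and return whether the tables should be dropped.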
        Log.info("input arguments: {}".format(inputArgs))
        options = "p:d"
        longOptions = ["hdfs-path=", "drop-tables"]
        try:
            opts, args = getopt.getopt(inputArgs, options, longOptions)
        except getopt.GetoptError as err:
            Log.error(err)
            Log.exit()

        hdfsPathFlag = False
        hdfsPathArg = None
        dropTablesFlag = False

        for opt, arg in opts:
            Log.info("processing option: {} with arguments: {}".format(
                opt, arg))
            if opt in ("-p", "--hdfs-path"):
                if hdfsPathFlag:
                    cls.notUniqueArg()
                else:
                    hdfsPathFlag = True
                    hdfsPathArg = arg
            elif opt in ("-d", "--drop-tables"):
                if dropTablesFlag:
                    cls.notUniqueArg()
                else:
                    dropTablesFlag = True

        if hdfsPathFlag is False:
            Log.error(
                "Input Error. You must specify a valid HDFS path. Exiting the application.."
            )
            Log.exit()
        else:
            HDFS.filesInPath(hdfsPathArg)

        return dropTablesFlag
 def notUniqueArg(cls):
     Log.error(
         "Input Error. Can't pass one argument twice. Exiting the application.."
     )
     Log.exit()
    def getValues(cls, inputArgs):
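        # Parse the download options (--hdfs-path, --year or --from-year/--to-year,
        # --magnitude-over, --download-again), validate them and return the years
        # list, the magnitude threshold and the download-again flag.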
        Log.info("input arguments: {}".format(inputArgs))
        options = "y:f:t:m:p:d"
        longOptions = ["year=", "from-year=", "to-year=", "magnitude-over=", "download-again","hdfs-path="]
        try:
            opts, args = getopt.getopt(inputArgs, options, longOptions)
        except getopt.GetoptError as err:
            Log.error(err)
            Log.exit()

        yearFlag = False
        yearArg = None
        fromYearFlag = False
        fromYearArg = None
        toYearFlag = False
        toYearArg = None
        magnOverFlag = False
        magnOverArg = None
        overwriteFlag = False
        hdfsPathFlag = False
        hdfsPathArg = None

        for opt, arg in opts:
            Log.info("processing option: {} with arguments: {}".format(opt,arg))
            if opt in ("-p", "--hdfs-path"):
                if hdfsPathFlag:
                    cls.notUniqueArg()
                else:
                    hdfsPathFlag = True
                    hdfsPathArg = arg
            elif opt in ("-y", "--year"):
                if yearFlag:
                    cls.notUniqueArg()
                else:
                    yearFlag = True
                    yearArg = arg
            elif opt in ("-f", "--from-year"):
                if fromYearFlag:
                    cls.notUniqueArg()
                else:
                    fromYearFlag = True
                    fromYearArg = arg
            elif opt in ("-t", "--to-year"):
                if toYearFlag:
                    cls.notUniqueArg()
                else:
                    toYearFlag = True
                    toYearArg = arg
            elif opt in ("-m", "--magnitude-over"):
                if magnOverFlag:
                    cls.notUniqueArg()
                else:
                    magnOverFlag = True
                    magnOverArg = arg
            elif opt in ("-d", "--download-again"):
                if overwriteFlag:
                    cls.notUniqueArg()
                else:
                    overwriteFlag = True

        if hdfsPathFlag is False:
            Log.error("Input Error. You must specify a valid HDFS path. Exiting the application..")
            Log.exit()
        else:
            HDFS.pathValidation(hdfsPathArg)

        fromToOption = False
        yearOption = False
        if fromYearFlag and toYearFlag and not yearFlag:
            fromToOption = True
        elif not fromYearFlag and not toYearFlag and yearFlag:
            yearOption = True
        else:
            Log.error("Input Parameters Error.\r\n" \
                  "You must pass parameters in one of the following formats:\r\n" \
                  "Example with a range of values:       '--from-year=2010 --to-year=2020'\r\n" \
                  "Example with a list of unique values: '--year=2010,2011,2012'\r\n" \
                  "Exiting the application..")
            Log.exit()

        if fromToOption:
            fromYearInt = cls.validateYear(fromYearArg)
            toYearInt = cls.validateYear(toYearArg)
            yearsList = cls.toList(fromYearInt, toYearInt)
        elif yearOption:
            yearsList = cls.toList(yearArg, None)

        if magnOverArg is None:
            magnOverArg = 0
        magnitudeOver = cls.validateMagnitude(magnOverArg)

        return yearsList, magnitudeOver, overwriteFlag