Code example #1
from pyspark.sql.functions import udf, array
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import col


class Job(ETL_Base):
    """ETL job that parses raw wiki timestamp strings and drops malformed rows."""

    def transform(self, some_events):
        """Add a parsed 'timestamp_obj' column and filter out malformed timestamps.

        :param some_events: Spark DataFrame with a raw string 'timestamp' column
                            (assumed 'YYYYMMDDhhmmss' — TODO confirm with upstream).
        :return: DataFrame with an extra 'timestamp_obj' timestamp column, keeping
                 only rows whose 'timestamp' does not match the "%2.016%" pattern.
        """
        udf_format_datetime = udf(self.format_datetime, StringType())

        # '~' negates the Column predicate; equivalent to the '== False' form
        # but idiomatic PySpark (comparing to False is flagged by E712).
        events_cleaned = some_events \
            .withColumn('timestamp_obj', udf_format_datetime(some_events.timestamp).cast("timestamp")) \
            .where(~col('timestamp').like("%2.016%"))
        return events_cleaned

    @staticmethod
    def format_datetime(wiki_dt):
        """Convert a compact 14-char datetime 'YYYYMMDDhhmmss' to 'YYYY-MM-DD hh:mm:ss'."""
        # Slice the fixed-width fields out of the input string.
        dt = {
            'year': wiki_dt[:4],
            'month': wiki_dt[4:6],
            'day': wiki_dt[6:8],
            'hour': wiki_dt[8:10],
            'minute': wiki_dt[10:12],
            'sec': wiki_dt[12:14],
        }
        return '{year}-{month}-{day} {hour}:{minute}:{sec}'.format(**dt)


if __name__ == "__main__":
    # Entry point: hand the Job class to the yaetos Commandliner,
    # pointing it at the yml file that holds the job parameters.
    Commandliner(Job, job_param_file='conf/jobs_metadata.yml')
Code example #2
from yaetos.etl_utils import Commandliner

# No job class is bound here; the generic launcher script resolves the job.
launcher_kwargs = {'launcher_file': 'jobs/generic/launcher.py'}
Commandliner(Job=None, **launcher_kwargs)
Code example #3
from yaetos.etl_utils import Commandliner

# No job class supplied; Commandliner picks the job from command-line args alone.
job_class = None
Commandliner(Job=job_class)
Code example #4
class Job(ETL_Base):
    """To run/deploy sql jobs, using --sql_file arg."""

    def set_jargs(self, pre_jargs, loaded_inputs=None):
        """Build the job args from the --sql_file command-line argument.

        Called only if running the job directly, i.e.
        "python yaetos/sql_job.py --sql_file=jobs/some_job.sql"; ignored when
        running from "python jobs/generic/launcher.py --job_name=some_job.sql".

        :param pre_jargs: dict with 'cmd_args', 'job_args' and 'defaults_args' sub-dicts.
        :param loaded_inputs: optional mapping of pre-loaded inputs (defaults to {}).
        :return: a populated Job_Args_Parser.
        """
        # Avoid the shared-mutable-default-argument pitfall: normalize None to {}.
        loaded_inputs = {} if loaded_inputs is None else loaded_inputs
        sql_file = pre_jargs['cmd_args']['sql_file']
        # Derive the job name from the sql file path so yml lookups match.
        job_name = Job_Yml_Parser.set_job_name_from_file(sql_file)
        pre_jargs['job_args']['job_name'] = job_name
        return Job_Args_Parser(defaults_args=pre_jargs['defaults_args'],
                               yml_args=None,
                               job_args=pre_jargs['job_args'],
                               cmd_args=pre_jargs['cmd_args'],
                               loaded_inputs=loaded_inputs)

    def transform(self, **ignored):
        """Run the sql file given by the job args and return the resulting DataFrame."""
        sql = self.read_sql_file(self.jargs.sql_file)
        df = self.query(sql)
        # Optional repartition, driven by the merged job arguments.
        if self.jargs.merged_args.get('repartition'):
            df = df.repartition(self.jargs.merged_args['repartition'])
        return df

    @staticmethod
    def read_sql_file(fname):
        """Return the full text content of the sql file at *fname*.

        :param fname: path to the sql file.
        :return: file contents as a string.
        """
        # Context manager guarantees the handle is closed even if read() raises.
        with open(fname, 'r') as fh:
            return fh.read()


if __name__ == "__main__":
    # Entry point: hand the Job class to the yaetos Commandliner for execution.
    Commandliner(Job)
Code example #5
 def define_commandline_args():
     """Extend the base Commandliner CLI parser with the sql-job specific option."""
     arg_parser = Commandliner.define_commandline_args()
     # Extra flag so sql jobs can be pointed at their .sql file from the CLI.
     arg_parser.add_argument("-q", "--sql_file", help="path of sql file to run")
     return arg_parser