def test_parse_deprecated(self):
    dm = luigi.DateMinuteParameter().parse('2013-02-01T18H42')
    self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 42, 0))

def test_serialize(self):
    dm = luigi.DateMinuteParameter().serialize(datetime.datetime(2013, 2, 1, 18, 42, 0))
    self.assertEqual(dm, '2013-02-01T1842')

class DateMinuteTaskOk(luigi.Task):
    minute = luigi.DateMinuteParameter()

    def complete(self):
        # test against 2000.03.01T02H03
        return self.minute in [datetime.datetime(2000, 3, 1, 2, 0),
                               datetime.datetime(2000, 3, 1, 2, 3),
                               datetime.datetime(2000, 3, 1, 2, 4)]

def test_parse_padding_zero(self):
    dm = luigi.DateMinuteParameter().parse('2013-02-01T1807')
    self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 7, 0))

def test_serialize(self):
    dh = luigi.DateMinuteParameter().serialize(datetime.datetime(2013, 1, 1, 18, 42, 0))
    self.assertEqual(dh, '2013-01-01T18H42')

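# A note on the two formats seen in these tests (a minimal sketch, assuming a
# recent luigi release): DateMinuteParameter now serializes with '%Y-%m-%dT%H%M'
# (e.g. '2013-02-01T1842'), while older releases used an 'H' separator
# ('2013-02-01T18H42'); parse() still accepts the deprecated form.
import datetime
import luigi

p = luigi.DateMinuteParameter()
value = datetime.datetime(2013, 2, 1, 18, 42)
assert p.serialize(value) == '2013-02-01T1842'   # current wire format
assert p.parse('2013-02-01T18H42') == value      # deprecated 'H' form still parses
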
class DateMinuteTask(luigi.Task):
    dh = luigi.DateMinuteParameter()

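# Hedged run sketch for the task above: build it in-process, or pass the value on
# the command line as e.g. `--dh 2013-01-01T1842` (module/CLI wiring not shown).
import datetime
import luigi

luigi.build([DateMinuteTask(dh=datetime.datetime(2013, 1, 1, 18, 42))],
            local_scheduler=True)
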
class AvailabilityToCSV(luigi.Task):
    """Turn real-time bike availability into CSV files
    """
    city = luigi.Parameter()
    timestamp = luigi.DateMinuteParameter(default=dt.now(), interval=5)

    @property
    def path(self):
        return os.path.join(DATADIR, self.city, '{year}',
                            '{month:02d}', '{day:02d}', '{ts}.csv')

    def requires(self):
        return BikeAvailability(self.city, self.timestamp)

    def output(self):
        triple = lambda x: (x.year, x.month, x.day)
        year, month, day = triple(self.timestamp)
        ts = self.timestamp.strftime('%HH%M')  # 16H35
        return luigi.LocalTarget(
            self.path.format(year=year, month=month, day=day, ts=ts),
            format=UTF8)

    def run(self):
        with self.input().open() as fobj:
            if self.city == 'bordeaux':
                tree = etree.parse(fobj)
                wfs_ns = '{http://www.opengis.net/wfs/2.0}'
                bm_ns = '{http://data.bordeaux-metropole.fr/wfs}'
                elements = (node.find(bm_ns + 'CI_VCUB_P')
                            for node in tree.findall(wfs_ns + 'member'))
                data = []
                for node in elements:
                    data.append(extract_xml_feature(node))
                df = pd.DataFrame([dict(x) for x in data])
                status_key = config[self.city]['feature_status']
                df[status_key] = df[status_key].apply(
                    lambda x: 'open' if x == 'CONNECTEE' else 'closed')
            elif self.city == 'lyon':
                data = json.load(fobj)
                df = pd.DataFrame(data['values'], columns=data['fields'])
                status_key = config[self.city]['feature_status']
                df[status_key] = df[status_key].apply(
                    lambda x: 'open' if x == 'OPEN' else 'closed')
            else:
                raise ValueError("{} is an unknown city.".format(self.city))
        df = df[[config[self.city]['feature_avl_id'],
                 config[self.city]['feature_timestamp'],
                 config[self.city]['feature_avl_stands'],
                 config[self.city]['feature_avl_bikes'],
                 config[self.city]['feature_status']]]
        df.columns = ["id", "timestamp", "available_stands",
                      "available_bikes", "status"]
        df = df.sort_values(by="id")
        with self.output().open('w') as fobj:
            df.to_csv(fobj, index=False)

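# Hedged aside on `interval=5` above: in recent luigi releases the parameter's
# normalize() floors the value to the nearest 5-minute boundary (counted from the
# Unix epoch by default), which is why filenames land on times such as 16H35.
import datetime
import luigi

p = luigi.DateMinuteParameter(interval=5)
print(p.normalize(datetime.datetime(2017, 6, 1, 16, 38)))  # -> 2017-06-01 16:35:00
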
def test_parse(self):
    dh = luigi.DateMinuteParameter().parse('2013-01-01T18H42')
    self.assertEqual(dh, datetime.datetime(2013, 1, 1, 18, 42, 0))

def _format_datetime(self, dt):
    return luigi.DateMinuteParameter().serialize(dt)

class UpdateCollectionsSolr(luigi.Task):
    task_namespace = 'discovery'
    date = luigi.DateMinuteParameter(default=datetime.datetime.now())
    solr_endpoint = luigi.Parameter(default='http://localhost:8983/solr/collections')

    def requires(self):
        return [TargetList(self.date), CollectionList(self.date), SubjectList(self.date)]

    def output(self):
        return state_file(self.date, 'access-data', 'updated-collections-solr.json')

    @staticmethod
    def add_collection(s, targets_by_id, col, parent_id):
        if col['publish']:
            print("Publishing...", col['name'])

            # add a document to the Solr index
            s.add([
                {
                    "id": col["id"],
                    "type": "collection",
                    "name": col["name"],
                    "description": col["description"],
                    "parentId": parent_id
                }
            ], commit=False)

            # Look up all Targets within this Collection and add them.
            for tid in col.get('target_ids', []):
                target = targets_by_id.get(tid, None)
                if not target:
                    logger.error("Warning! Could not find target %i" % tid)
                    continue

                # Skip items with no URLs:
                if len(target.get('urls', [])) == 0:
                    continue

                # Determine license status:
                licenses = []
                if target.get('isOA', False):
                    licenses = target.get("license_ids", [])
                    # Use a special value to indicate an inherited license:
                    if len(licenses) == 0:
                        licenses = ['1000']

                # add a document to the Solr index
                s.add([{
                    "id": "cid:%i-tid:%i" % (col['id'], target['id']),
                    "type": "target",
                    "parentId": col['id'],
                    "title": target["title"],
                    "description": target["description"],
                    "url": target["urls"][0],
                    "additionalUrl": target["urls"][1:],
                    "language": target["language"],
                    "startDate": target["crawl_start_date"],
                    "endDate": target["crawl_end_date"],
                    "licenses": licenses
                }], commit=False)

            # Add child collections
            for cc in col["children"]:
                UpdateCollectionsSolr.add_collection(s, targets_by_id, cc, col['id'])
        else:
            print("Skipping...", col['name'])

        return

    def run(self):
        targets = json.load(self.input()[0].open())
        collections = json.load(self.input()[1].open())
        subjects = json.load(self.input()[2].open())

        # build look-up table for Target IDs
        targets_by_id = {}
        target_count = 0
        for target in targets:
            tid = target['id']
            targets_by_id[tid] = target
            target_count += 1
        logger.info("Found %i targets..." % target_count)

        s = pysolr.Solr(self.solr_endpoint, timeout=30)

        # First, we delete everything (!)
        s.delete(q="*:*", commit=False)

        # Update the collections:
        for col in collections:
            UpdateCollectionsSolr.add_collection(s, targets_by_id, col, None)

        # Now commit all changes:
        s.commit()

        # Record that we have completed this task successfully:
        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(collections, indent=4)))

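# Hedged usage sketch: the `default=datetime.datetime.now()` above is evaluated
# once at import time, so scheduled runs typically pass the date explicitly, e.g.:
import datetime
import luigi

luigi.build(
    [UpdateCollectionsSolr(date=datetime.datetime(2018, 2, 1, 18, 42))],
    local_scheduler=True)
# Equivalent CLI (task_namespace makes the name `discovery.UpdateCollectionsSolr`;
# the module name below is a placeholder):
#   luigi --module <your_module> discovery.UpdateCollectionsSolr --date 2018-02-01T1842
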
class RangeByMinutesBase(RangeBase):
    """
    Produces a contiguous completed range of a recurring task separated by a
    specified number of minutes.
    """
    start = luigi.DateMinuteParameter(
        default=None,
        description="beginning date-hour-minute, inclusive. Default: None - "
                    "work backward forever (requires reverse=True)")
    stop = luigi.DateMinuteParameter(
        default=None,
        description="ending date-hour-minute, exclusive. Default: None - work forward forever")
    minutes_back = luigi.IntParameter(
        default=60 * 24,  # one day
        description=("extent to which contiguousness is to be assured into "
                     "past, in minutes from current time. Prevents infinite "
                     "loop when start is none. If the dataset has limited "
                     "retention (i.e. old outputs get removed), this should "
                     "be set shorter to that, too, to prevent the oldest "
                     "outputs flapping. Increase freely if you intend to "
                     "process old dates - worker's memory is the limit"))
    minutes_forward = luigi.IntParameter(
        default=0,
        description="extent to which contiguousness is to be assured into future, "
                    "in minutes from current time. Prevents infinite loop when stop is none")
    minutes_interval = luigi.IntParameter(
        default=1,
        description="separation between events in minutes. It must evenly divide 60")

    def datetime_to_parameter(self, dt):
        return dt

    def parameter_to_datetime(self, p):
        return p

    def datetime_to_parameters(self, dt):
        """
        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter
        """
        return self._task_parameters(dt)

    def parameters_to_datetime(self, p):
        """
        Given a dictionary of parameters, will extract the ranged task parameter value
        """
        dt = p[self._param_name]
        return datetime(dt.year, dt.month, dt.day, dt.hour, dt.minute)

    def moving_start(self, now):
        return now - timedelta(minutes=self.minutes_back)

    def moving_stop(self, now):
        return now + timedelta(minutes=self.minutes_forward)

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Simply returns the points in time that correspond to a whole number of minutes intervals.
        """
        # Validate that the minutes_interval can divide 60 and it is greater than 0 and lesser than 60
        if not (0 < self.minutes_interval < 60):
            raise ParameterException('minutes-interval must be within 0..60')
        if (60 / self.minutes_interval) * self.minutes_interval != 60:
            raise ParameterException('minutes-interval does not evenly divide 60')
        # start of a complete interval, e.g. 20:13 and the interval is 5 -> 20:10
        start_minute = int(finite_start.minute / self.minutes_interval) * self.minutes_interval
        datehour_start = datetime(
            year=finite_start.year, month=finite_start.month, day=finite_start.day,
            hour=finite_start.hour, minute=start_minute)
        datehours = []
        for i in itertools.count():
            t = datehour_start + timedelta(minutes=i * self.minutes_interval)
            if t >= finite_stop:
                return datehours
            if t >= finite_start:
                datehours.append(t)

    def _format_datetime(self, dt):
        return luigi.DateMinuteParameter().serialize(dt)

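# Hedged usage sketch for the concrete RangeByMinutes wrapper built on the base
# class above; MyMinuteTask is a hypothetical task with a single DateMinuteParameter.
from datetime import datetime
import luigi
from luigi.tools.range import RangeByMinutes

class MyMinuteTask(luigi.Task):
    minute = luigi.DateMinuteParameter()

# Expands to the 00:00, 00:15, 00:30 and 00:45 instances of MyMinuteTask.
range_task = RangeByMinutes(
    of=MyMinuteTask,
    start=datetime(2016, 1, 1, 0, 0),
    stop=datetime(2016, 1, 1, 1, 0),
    minutes_interval=15)
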
class UrlMin15Generator(CmvBaseTask):
    """Task for url min15 data generation"""
    start_time = luigi.DateMinuteParameter(
        default=datetime(year=2016, month=3, day=21, hour=12, minute=15))
    wario_target_table_name = luigi.Parameter(default='url_min15', significant=False)
    appserver_app_name = luigi.Parameter(default='urlmin15gen', significant=False)

    def get_appserver_job_config(self):
        """Returns job config"""
        tmpl_values = {
            "start_time": CmvLib.date_to_cmvformat(self.start_time),
            "end_time": CmvLib.date_to_cmvformat(self.start_time + timedelta(minutes=15)),
            "cache_namespace": self.cassandra_namespace,
            "cassandra_seeds": self.cassandra_seeds.split(','),
            "cassandra_keyspace": self.cassandra_keyspace,
            "hdfs_name_node": self.hdfs_namenode,
            "hdfs_session_dirs": self.hdfs_session_dirs.split(','),
            "hdfs_cmv_dir": self.hdfs_cmv_dir
        }
        with open(CmvLib.get_template_path('resources/url_min15_template.json')) as tmpl_file:
            cfg = json.load(tmpl_file)
        CmvLib.replace_config_params(cfg, tmpl_values)
        return cfg

    def requires(self):
        return InputSessionFile(cube_time=self.start_time)

    def run(self):
        job_cfg = self.get_appserver_job_config()
        logging.info('Running url min15 job...')
        datadog_start_time = time.time()
        submission_status = CmvLib.submit_config_to_appserver(
            job_cfg,
            CmvLib.get_appserver_job_submit_url(self.appserver_host_port,
                                                self.appserver_app_name))
        job_id = submission_status['payload']['jobId']
        time.sleep(5)
        appserver_resp = CmvLib.poll_appserver_job_status(
            CmvLib.get_appserver_job_status_url(self.appserver_host_port,
                                                self.appserver_app_name, job_id))
        elapsed_time = (time.time() - datadog_start_time) / 60
        DataDogClient.gauge_this_metric('url_min15_delay', elapsed_time)
        if appserver_resp['payload']['status'] != 'Finished':
            logging.error("AppServer responded with an error. AppServer Response: %s",
                          appserver_resp['payload']['result'])
            raise Exception('Error in AppServer Response.')
        else:
            logging.info("Url min15 job completed successfully.")
        self.output().touch()

    def output(self):
        connect_args = {
            'host': self.wario_target_db_host,
            'user': self.wario_target_db_user,
            'password': self.wario_target_db_password,
            'database': self.wario_target_db_name,
            'table': self.wario_target_table_name
        }
        col_values = {'target_id': self.task_id}
        return CmvMysqlTarget(connect_args, col_values)

class Min15AndDailyRollupTrigger(CmvBaseTask):
    start_time = luigi.DateMinuteParameter()
    end_time = luigi.DateMinuteParameter()
    min15_target_table_name = None
    wario_target_table_name = luigi.Parameter(significant=False)
    connect_args = dict()
    row_col_dict = dict()

    def requires(self):
        CmvLib.validate_min15_time(self.start_time)
        CmvLib.validate_min15_time(self.end_time)
        logging.info("Task: Min15AndDailyRollupTrigger, start_time = %s, end_time = %s",
                     self.start_time, self.end_time)
        min15_task = CmvMin15Generator(start_time=self.start_time, end_time=self.end_time)
        self.min15_target_table_name = min15_task.wario_target_table_name
        return min15_task

    def task_init(self):
        self.connect_args['user'] = self.wario_target_db_user
        self.connect_args['password'] = self.wario_target_db_password
        self.connect_args['host'] = self.wario_target_db_host
        self.connect_args['database'] = self.wario_target_db_name
        self.connect_args['table'] = self.wario_target_table_name
        self.row_col_dict['target_id'] = self.task_id
        logging.info('Initializing task params: {cn_args}, {tgt_id}'.format(
            cn_args=self.connect_args, tgt_id=self.task_id))

    def get_ptz_dict_from_db(self):
        connect_args = dict()
        connect_args['user'] = self.wario_target_db_user
        connect_args['password'] = self.wario_target_db_password
        connect_args['host'] = self.wario_target_db_host
        connect_args['database'] = self.wario_target_db_name
        connect_args['table'] = self.min15_target_table_name
        query_string = 'select COLUMN_GET(ptz_dict, {json_item} as char) from {min15_table} where target_id = %s'.\
            format(json_item='\'ptz_items\'', min15_table=self.min15_target_table_name)
        query_values = [
            'CmvMin15Generator(start_time={s}, end_time={e})'.format(
                s=self.start_time.strftime('%Y-%m-%dT%H%M'),
                e=self.end_time.strftime('%Y-%m-%dT%H%M'))
        ]
        rows = CmvMysqlTarget(connect_args=connect_args).query(query_string, query_values)
        return json.loads(str(rows[0][0]))

    def run(self):
        ptz_dict_list = self.get_ptz_dict_from_db()
        upstream_rollup_tasks = []
        for ptz_dict in ptz_dict_list:
            rollup_pcode = ptz_dict['pcode']
            rollup_tz = ptz_dict['timezone']
            rollup_day = DateTime.utc_to_any_tz(self.start_time, rollup_tz).replace(tzinfo=None)
            logging.info('Preparing DailyRollup with params: day = {day}, timezone = {tz}, pcode = {pcode}'
                         .format(day=rollup_day, tz=rollup_tz, pcode=rollup_pcode))
            upstream_rollup_tasks.append(
                CmvRollupDailyGenerator(day=rollup_day, timezone=rollup_tz, pcode=rollup_pcode))
        logging.info('Triggering upstream rollup tasks')
        yield upstream_rollup_tasks
        self.output().touch()

    def output(self):
        self.task_init()
        return CmvMysqlTarget(self.connect_args, self.row_col_dict)

class CheckStatus(luigi.Task):
    """
    """
    task_namespace = 'monitor'
    date = luigi.DateMinuteParameter(default=datetime.datetime.today())

    def output(self):
        return luigi.LocalTarget('%s/monitor/checkstatus.%s' % (
            state().state_folder, self.date.strftime(luigi.DateMinuteParameter.date_format)))

    def run(self):
        servers = self.load_as_json(systems().servers)
        services = self.load_as_json(systems().services)

        for job in services.get('jobs', []):
            server = servers[services['jobs'][job]['server']]
            # app.logger.info(json.dumps(server, indent=4))
            services['jobs'][job]['state'] = self.get_h3_status(services['jobs'][job]['name'], server)
            services['jobs'][job]['url'] = server['url']

        for queue in services.get('queues', []):
            server = servers[services['queues'][queue]['server']]
            services['queues'][queue]['prefix'] = server['user_prefix']
            services['queues'][queue]['state'] = self.get_queue_status(services['queues'][queue]['name'], server)

        for http in services.get('http', []):
            services['http'][http]['state'] = self.get_http_status(services['http'][http]['url'])

        for hdfs in services.get('hdfs', []):
            services['hdfs'][hdfs]['state'] = self.get_hdfs_status(services['hdfs'][hdfs])

        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(services, indent=4)))

    def get_h3_status(self, job, server):
        # Set up connection to H3:
        h = hapyx.HapyX(server['url'], username=server['user'], password=server['pass'], timeout=5)
        state = {}
        try:
            logger.info("Getting status for job %s on %s" % (job, server))
            info = h.get_job_info(job)
            state['details'] = info
            if 'job' in info:
                state['status'] = info['job'].get("crawlControllerState", None)
                if not state['status']:
                    state['status'] = info['job'].get("statusDescription", None)
                state['status'] = state['status'].upper()
        except Exception as e:
            state['status'] = "DOWN"
            state['error'] = "Could not reach Heritrix! %s" % e
            # app.logger.exception(e)
        # Classify
        if state['status'] == "DOWN":
            state['status-class'] = "status-oos"
        elif state['status'] == "RUNNING":
            # Replacing RUNNING with docs/second rate
            rate = state['details']['job']['rateReport']['currentDocsPerSecond']
            state['rate'] = "%.1f" % float(rate)
            if rate < 1.0:
                state['status-class'] = "status-warning"
            else:
                state['status-class'] = "status-good"
        else:
            state['status-class'] = "status-warning"

        return state

    def get_queue_status(self, queue, server):
        state = {}
        try:
            logger.info("Getting status for queue %s on %s" % (queue, server))
            qurl = '%s%s' % (server['prefix'], queue)
            # app.logger.info("GET: %s" % qurl)
            r = requests.get(qurl, timeout=5)
            state['details'] = r.json()
            state['count'] = "{:0,}".format(state['details']['messages'])
            if 'error' in state['details']:
                state['status'] = "ERROR"
                state['status-class'] = "status-alert"
                state['error'] = state['details']['reason']
            elif state['details']['consumers'] == 0:
                state['status'] = "BECALMED"
                state['status-class'] = "status-oos"
                state['error'] = 'No consumers!'
            else:
                state['status'] = state['details']['messages']
                state['status-class'] = "status-good"
        except Exception as e:
            state['status'] = "DOWN"
            state['status-class'] = "status-alert"
            logger.exception(e)

        return state

    def get_http_status(self, url):
        state = {}
        try:
            logger.info("Getting status for %s" % (url))
            r = requests.get(url, allow_redirects=False, timeout=10)
            state['status'] = "%s" % r.status_code
            if r.status_code // 100 == 2 or r.status_code // 100 == 3:
                state['status'] = "%.3fs" % r.elapsed.total_seconds()
                state['status-class'] = "status-good"
            else:
                state['status-class'] = "status-warning"
        except:
            state['status'] = "DOWN"
            state['status-class'] = "status-alert"

        return state

    def get_hdfs_status(self, hdfs):
        state = {}
        try:
            logger.info("Getting status for hdfs %s" % (hdfs))
            r = requests.get(hdfs['url'], timeout=5)
            state['status'] = "%s" % r.status_code
            if r.status_code // 100 == 2:
                state['status-class'] = "status-good"
                tree = etree.fromstring(r.text, etree.HTMLParser())
                percent = tree.xpath("//div[@id='dfstable']//tr[5]/td[3]")[0].text
                percent = percent.replace(" ", "")
                state['percent'] = percent
                state['remaining'] = tree.xpath("//div[@id='dfstable']//tr[4]/td[3]")[0].text.replace(" ", "")
                underr = int(tree.xpath("//div[@id='dfstable']//tr[10]/td[3]")[0].text)
                if underr != 0:
                    state['status'] = "HDFS has %i under-replicated blocks!" % underr
                    state['status-class'] = "status-warning"
            else:
                state['status-class'] = "status-warning"
        except Exception as e:
            logger.exception(e)
            state['status'] = "DOWN"
            state['status-class'] = "status-alert"

        return state

    def load_as_json(self, filename):
        script_dir = os.path.dirname(__file__)
        file_path = os.path.join(script_dir, filename)
        with open(file_path, 'r') as fi:
            return json.load(fi)

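# Hedged aside on the output() above: DateMinuteParameter.date_format is the
# serialization pattern ('%Y-%m-%dT%H%M' in recent luigi), so the state file is
# suffixed like 'checkstatus.2018-02-01T1842'.
import datetime
import luigi

suffix = datetime.datetime(2018, 2, 1, 18, 42).strftime(luigi.DateMinuteParameter.date_format)
print(suffix)  # -> 2018-02-01T1842
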
class TrainXGBoost(luigi.Task):
    """Train an XGBoost model between `start` and `stop` dates to predict bike
    availability at each station in `city`

    Attributes
    ----------
    city : luigi.Parameter
        City of interest, *e.g.* Bordeaux or Lyon
    start : luigi.DateParameter
        Training start date
    stop : luigi.DateParameter
        Training stop date upper bound (actually the end date is computed with
        `validation`)
    validation : luigi.DateMinuteParameter
        Date that bounds the training set and the validation set during the
        XGBoost model training
    frequency : DateOffset, timedelta or str
        Indicates the prediction frequency
    """
    city = luigi.Parameter()
    start = luigi.DateParameter(default=yesterday())
    stop = luigi.DateParameter(default=date.today())
    validation = luigi.DateMinuteParameter(default=dt.now() - timedelta(hours=1))
    frequency = luigi.Parameter(default="30T")

    def outputpath(self):
        fname = "{}-to-{}-at-{}-freq-{}.model".format(
            self.start, self.stop, self.validation.isoformat(), self.frequency)
        return os.path.join(DATADIR, self.city, 'xgboost-model', fname)

    def output(self):
        return luigi.LocalTarget(self.outputpath(), format=MixedUnicodeBytes)

    def run(self):
        query = ("SELECT DISTINCT id AS station_id, timestamp AS ts, "
                 "available_bikes AS nb_bikes, available_stands AS nb_stands, "
                 "available_bikes::float / (available_bikes::float "
                 "+ available_stands::float) AS probability "
                 "FROM {schema}.{tablename} "
                 "WHERE timestamp >= %(start)s "
                 "AND timestamp < %(stop)s "
                 "AND (available_bikes > 0 OR available_stands > 0) "
                 "AND (status = 'open') "
                 "ORDER BY id, timestamp"
                 ";").format(schema=self.city, tablename='timeseries')
        eng = db()
        df = pd.io.sql.read_sql_query(query, eng,
                                      params={"start": self.start, "stop": self.stop})
        df.station_id = df.station_id.astype(int)
        if df.empty:
            raise Exception("There is no data to process in the DataFrame. "
                            "Please check the dates.")
        prediction_model = train_prediction_model(df, self.validation, self.frequency)
        self.output().makedirs()
        prediction_model.save_model(self.output().path)

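# Hedged aside on `frequency="30T"` above: "30T" is a pandas offset alias for
# 30 minutes ("30min" in newer pandas), i.e. predictions fall on a half-hour grid.
import pandas as pd

print(pd.date_range("2017-06-01 16:00", periods=3, freq="30T"))
# DatetimeIndex(['2017-06-01 16:00:00', '2017-06-01 16:30:00', '2017-06-01 17:00:00'], ...)
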
def testDateWithMinuteInterval(self):
    p = luigi.DateMinuteParameter(config_path=dict(section="foo", name="bar"), interval=2)
    self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))

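# Hedged sketch of the configuration behind config_path in the test above: a
# luigi.cfg entry along these lines (exact fixture value assumed) backs the lookup,
# and with interval=2 the 04:30 value already sits on a 2-minute boundary, so it
# is returned unchanged:
#
#   [foo]
#   bar=2001-02-03T0430
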
class Bar(RunOnceTask):
    time = luigi.DateMinuteParameter()

def test_parse_padding_zero(self):
    dh = luigi.DateMinuteParameter().parse('2013-01-01T18H07')
    self.assertEqual(dh, datetime.datetime(2013, 1, 1, 18, 7, 0))

class CmvMin15Generator(CmvBaseTask):
    start_time = luigi.DateMinuteParameter()
    end_time = luigi.DateMinuteParameter()
    wario_target_table_name = luigi.Parameter(default='min15', significant=False)
    appserver_app_name = luigi.Parameter(default='', significant=False)
    appserver_app_type = luigi.Parameter(default='', significant=False)
    hdfs_dir_set = set()
    provider_list_str = None
    connect_args = dict()
    column_formats = dict()
    pcode_tz_dict = dict()
    row_col_dict = dict()
    row_col_dict['target_id'] = None

    def task_init(self):
        logging.info('Initializing task params: {cn_args}, {tgt_id}'.format(
            cn_args=self.connect_args, tgt_id=self.task_id))
        self.connect_args['user'] = self.wario_target_db_user
        self.connect_args['password'] = self.wario_target_db_password
        self.connect_args['host'] = self.wario_target_db_host
        self.connect_args['database'] = self.wario_target_db_name
        self.connect_args['table'] = self.wario_target_table_name
        self.row_col_dict['target_id'] = self.task_id
        self.column_formats = {'ptz_dict': "column_create('ptz_items', %s)"}

    def process_config_tmpl(self, tmpl_file):
        pcode_tz_list = Helios.get_providers_from_helios()
        self.pcode_tz_dict = dict(pcode_tz_list)
        hdfs_dirs = [hdfs_dir.path.rsplit('/', 1)[0] for hdfs_dir in self.input()]
        tmpl_subst_params = {
            "start_time": CmvLib.date_to_cmvformat(self.start_time),
            "end_time": CmvLib.date_to_cmvformat(self.end_time),
            "key_space": self.cassandra_keyspace,
            "name_space": self.cassandra_namespace,
            "cassandra_seeds": self.cassandra_seeds.split(','),
            "pcode_dict": CmvLib.prepare_ptz(pcode_tz_list, hdfs_dirs)
        }
        with open(tmpl_file) as json_file:
            json_data = json.load(json_file)
        CmvLib.replace_config_params(json_data, tmpl_subst_params)
        return json_data

    def requires(self):
        CmvLib.validate_min15_time(self.start_time)
        CmvLib.validate_min15_time(self.end_time)
        cube_timeranges = set()
        now = self.start_time
        logging.info("start_time = %s, end_time = %s", self.start_time, self.end_time)
        while now < self.end_time:
            cube_timeranges.add(now)
            now += timedelta(minutes=15)
        return [InputSessionFile(cube_time=cube_time) for cube_time in cube_timeranges]

    def run(self):
        config_json = self.process_config_tmpl(
            CmvLib.get_template_path('resources/cmv_template.json'))
        with open('new_config.json', 'w') as outfile:
            json.dump(config_json, outfile, indent=4)
        datadog_start_time = time.time()
        appserver_jobsubmit_url = CmvLib.get_appserver_job_submit_url(
            self.appserver_host_port, self.appserver_app_name, self.appserver_app_type)
        rslt_json = CmvLib.submit_config_to_appserver(config_json, appserver_jobsubmit_url)
        job_id = rslt_json['payload']['jobId']
        appserver_jobstatus_url = CmvLib.get_appserver_job_status_url(
            self.appserver_host_port, self.appserver_app_name, job_id)
        appserver_resp = CmvLib.poll_appserver_job_status(appserver_jobstatus_url)
        DataDogClient.gauge_this_metric('min15_delay', (time.time() - datadog_start_time))
        if appserver_resp['payload']['status'] != 'Finished':
            logging.error("AppServer responded with an error. AppServer Response: %s",
                          appserver_resp['payload']['result'])
            raise Exception('Error in Appserver Response.')
        else:
            provider_list_str = appserver_resp['payload']['result']['result']['providers']
            if provider_list_str is not None:
                pcode_list = provider_list_str.replace('Set', '')[1:len(provider_list_str) - 4].split(',')
                ptz_list = []
                for pcode in pcode_list:
                    ptz_dict_item = dict()
                    if not pcode or str(pcode).lstrip() == 'unknown':
                        continue
                    ptz_dict_item['pcode'] = str(pcode).lstrip()
                    ptz_dict_item['timezone'] = self.pcode_tz_dict[str(pcode).lstrip()]
                    ptz_list.append(ptz_dict_item)
                DataDogClient.gauge_this_metric('min15_provider_count', len(ptz_list))
                self.row_col_dict['target_id'] = self.task_id
                self.row_col_dict['ptz_dict'] = json.dumps(ptz_list)
        self.output().touch()

    def output(self):
        self.task_init()
        return CmvMysqlTarget(self.connect_args, self.row_col_dict,
                              column_formats=self.column_formats)

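# Hedged instantiation sketch for the task above, assuming the remaining
# CmvBaseTask settings (db hosts, appserver endpoints, ...) come from configuration;
# both bounds must sit on 15-minute boundaries, and requires() enumerates one
# InputSessionFile per 15-minute cube in [start_time, end_time).
from datetime import datetime

task = CmvMin15Generator(
    start_time=datetime(2016, 3, 21, 12, 0),
    end_time=datetime(2016, 3, 21, 13, 0))  # cubes at 12:00, 12:15, 12:30, 12:45
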
def test_serialize_padding_zero(self):
    dh = luigi.DateMinuteParameter().serialize(datetime.datetime(2013, 1, 1, 18, 7, 0))
    self.assertEqual(dh, '2013-01-01T18H07')

def test_serialize_padding_zero(self):
    dm = luigi.DateMinuteParameter().serialize(datetime.datetime(2013, 2, 1, 18, 7, 0))
    self.assertEqual(dm, '2013-02-01T1807')

def testDateMinuteDeprecated(self):
    p = luigi.DateMinuteParameter(config_path=dict(section="foo", name="bar"))
    self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))

class CheckStatus(luigi.Task):
    """
    """
    task_namespace = 'monitor'
    date = luigi.DateMinuteParameter(default=datetime.datetime.today())

    servers = os.path.join(os.path.dirname(__file__), 'servers.json')
    services = os.path.join(os.path.dirname(__file__), 'services.json')

    def output(self):
        return luigi.LocalTarget('%s/monitor/checkstatus.%s' % (
            STATE_FOLDER, self.date.strftime(luigi.DateMinuteParameter.date_format)))

    def run(self):
        services = self.load_as_json(self.services)
        services['timestamp'] = datetime.datetime.utcnow().isoformat()

        pool = Pool(20)

        # Parallel check for H3 job status:
        argsv = []
        for job in services.get('jobs', []):
            server = services['servers'][services['jobs'][job]['server']]
            server_url = server['url']
            server_user = server['user']
            server_pass = os.environ['HERITRIX_PASSWORD']
            # app.logger.info(json.dumps(server, indent=4))
            services['jobs'][job]['url'] = server_url
            argsv.append((services['jobs'][job]['name'], job, server_url, server_user, server_pass))
        # Wait for all...
        results = pool.map(get_h3_status, argsv)
        for job, state in results:
            services['jobs'][job]['state'] = state

        # Parallel check for queue statuses:
        argsv = []
        for queue in services.get('queues', []):
            server_prefix = services['servers'][services['queues'][queue]['server']]['prefix']
            services['queues'][queue]['prefix'] = server_prefix
            queue_name = services['queues'][queue]['name']
            argsv.append((queue_name, queue, server_prefix))
        # Wait for all...
        results = pool.map(get_queue_status, argsv)
        for queue, state in results:
            services['queues'][queue]['state'] = state

        # Parallel check for HTTP status:
        argsv = []
        for http in services.get('http', []):
            argsv.append((http, services['http'][http]['url']))
        # Wait for all...
        results = pool.map(get_http_status, argsv)
        for http, state in results:
            services['http'][http]['state'] = state

        argsv = []
        for hdfs in services.get('hdfs', []):
            argsv.append((hdfs, services['hdfs'][hdfs]['url']))
        # Wait for all...
        results = pool.map(get_hdfs_status, argsv)
        for hdfs, state in results:
            services['hdfs'][hdfs]['state'] = state

        # And then write to a file
        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(services, indent=4)))

    def load_as_json(self, filename):
        script_dir = os.path.dirname(__file__)
        file_path = os.path.join(script_dir, filename)
        with open(file_path, 'r') as fi:
            return json.load(fi)