Example #1
 def test_parse_deprecated(self):
     dm = luigi.DateMinuteParameter().parse('2013-02-01T18H42')
     self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 42, 0))
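A minimal round-trip sketch (assuming a recent luigi release, where the 'T18H42' style is the deprecated input format and serialization emits the plain 'T1842' form):

# Sketch, not part of the scraped examples: both input formats parse to
# the same datetime; serialize() emits the non-deprecated format.
import datetime
import luigi

p = luigi.DateMinuteParameter()
assert p.parse('2013-02-01T1842') == datetime.datetime(2013, 2, 1, 18, 42)
assert p.parse('2013-02-01T18H42') == datetime.datetime(2013, 2, 1, 18, 42)
assert p.serialize(datetime.datetime(2013, 2, 1, 18, 42)) == '2013-02-01T1842'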
Example #2
 def test_serialize(self):
     dm = luigi.DateMinuteParameter().serialize(
         datetime.datetime(2013, 2, 1, 18, 42, 0))
     self.assertEqual(dm, '2013-02-01T1842')
Example #3
class DateMinuteTaskOk(luigi.Task):
    minute = luigi.DateMinuteParameter()

    def complete(self):
        # test against 2000.03.01T02H03
        return self.minute in [
            datetime.datetime(2000, 3, 1, 2, 0),
            datetime.datetime(2000, 3, 1, 2, 3),
            datetime.datetime(2000, 3, 1, 2, 4),
        ]
Example #4
 def test_parse_padding_zero(self):
     dm = luigi.DateMinuteParameter().parse('2013-02-01T1807')
     self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 7, 0))
Example #5
 def test_serialize(self):
     dh = luigi.DateMinuteParameter().serialize(
         datetime.datetime(2013, 1, 1, 18, 42, 0))
     self.assertEqual(dh, '2013-01-01T18H42')
Example #6
class DateMinuteTask(luigi.Task):
    dh = luigi.DateMinuteParameter()
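A hypothetical usage sketch for the class above ('mymodule' is an assumed module name, not from the original): the parameter is given on the command line in serialized form, or as a datetime.datetime when building programmatically.

# Command-line form (hypothetical module name):
#     luigi --module mymodule DateMinuteTask --dh 2013-01-01T1842
# Programmatic equivalent:
import datetime
import luigi

luigi.build([DateMinuteTask(dh=datetime.datetime(2013, 1, 1, 18, 42))],
            local_scheduler=True)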
Example #7
class AvailabilityToCSV(luigi.Task):
    """Turn real-time bike availability to CSV files
    """
    city = luigi.Parameter()
    timestamp = luigi.DateMinuteParameter(default=dt.now(), interval=5)

    @property
    def path(self):
        return os.path.join(DATADIR, self.city, '{year}', '{month:02d}',
                            '{day:02d}', '{ts}.csv')

    def requires(self):
        return BikeAvailability(self.city, self.timestamp)

    def output(self):
        triple = lambda x: (x.year, x.month, x.day)
        year, month, day = triple(self.timestamp)
        ts = self.timestamp.strftime('%HH%M')  # 16H35
        # `format=UTF8` is a LocalTarget argument, not a str.format field
        return luigi.LocalTarget(
            self.path.format(year=year,
                             month=month,
                             day=day,
                             ts=ts),
            format=UTF8)

    def run(self):
        with self.input().open() as fobj:
            if self.city == 'bordeaux':
                tree = etree.parse(fobj)
                wfs_ns = '{http://www.opengis.net/wfs/2.0}'
                bm_ns = '{http://data.bordeaux-metropole.fr/wfs}'
                elements = (node.find(bm_ns + 'CI_VCUB_P')
                            for node in tree.findall(wfs_ns + 'member'))
                data = []
                for node in elements:
                    data.append(extract_xml_feature(node))
                df = pd.DataFrame([dict(x) for x in data])
                status_key = config[self.city]['feature_status']
                df[status_key] = df[status_key].apply(
                    lambda x: 'open' if x == 'CONNECTEE' else 'closed')
            elif self.city == 'lyon':
                data = json.load(fobj)
                df = pd.DataFrame(data['values'], columns=data['fields'])
                status_key = config[self.city]['feature_status']
                df[status_key] = df[status_key].apply(
                    lambda x: 'open' if x == 'OPEN' else 'closed')
            else:
                raise ValueError("{} is an unknown city.".format(self.city))
        df = df[[
            config[self.city]['feature_avl_id'],
            config[self.city]['feature_timestamp'],
            config[self.city]['feature_avl_stands'],
            config[self.city]['feature_avl_bikes'],
            config[self.city]['feature_status']
        ]]
        df.columns = [
            "id", "timestamp", "available_stands", "available_bikes", "status"
        ]
        df = df.sort_values(by="id")
        with self.output().open('w') as fobj:
            df.to_csv(fobj, index=False)
Example #8
 def test_parse(self):
     dh = luigi.DateMinuteParameter().parse('2013-01-01T18H42')
     self.assertEqual(dh, datetime.datetime(2013, 1, 1, 18, 42, 0))
Example #9
 def _format_datetime(self, dt):
     return luigi.DateMinuteParameter().serialize(dt)
Example #10
class UpdateCollectionsSolr(luigi.Task):
    task_namespace = 'discovery'
    date = luigi.DateMinuteParameter(default=datetime.datetime.now())
    solr_endpoint = luigi.Parameter(default='http://localhost:8983/solr/collections')

    def requires(self):
        return [TargetList(self.date), CollectionList(self.date), SubjectList(self.date)]

    def output(self):
        return state_file(self.date, 'access-data', 'updated-collections-solr.json')

    @staticmethod
    def add_collection(s, targets_by_id, col, parent_id):
        if col['publish']:
            print("Publishing...", col['name'])

            # add a document to the Solr index
            s.add([
                {
                    "id": col["id"],
                    "type": "collection",
                    "name": col["name"],
                    "description": col["description"],
                    "parentId": parent_id
                }
            ], commit=False)

            # Look up all Targets within this Collection and add them.
            for tid in col.get('target_ids',[]):
                target = targets_by_id.get(tid, None)
                if not target:
                    logger.error("Warning! Could not find target %i" % tid)
                    continue

                # Skip items with no URLs:
                if len(target.get('urls',[])) == 0:
                    continue

                # Determine license status:
                licenses = []
                if target.get('isOA', False):
                    licenses = target.get("license_ids",[])
                    # Use a special value to indicate an inherited license:
                    if len(licenses) == 0:
                        licenses = ['1000']

                # add a document to the Solr index
                s.add([{
                    "id": "cid:%i-tid:%i" % (col['id'], target['id']),
                    "type": "target",
                    "parentId": col['id'],
                    "title": target["title"],
                    "description": target["description"],
                    "url": target["urls"][0],
                    "additionalUrl": target["urls"][1:],
                    "language": target["language"],
                    "startDate": target["crawl_start_date"],
                    "endDate": target["crawl_end_date"],
                    "licenses": licenses
                }], commit=False)

            # Add child collections
            for cc in col["children"]:
                UpdateCollectionsSolr.add_collection(s, targets_by_id, cc, col['id'])
        else:
            print("Skipping...", col['name'])

        return

    def run(self):
        targets = json.load(self.input()[0].open())
        collections = json.load(self.input()[1].open())
        subjects = json.load(self.input()[2].open())

        # build look-up table for Target IDs
        targets_by_id = {}
        target_count = 0
        for target in targets:
            tid = target['id']
            targets_by_id[tid] = target
            target_count += 1
        logger.info("Found %i targets..." % target_count)

        s = pysolr.Solr(self.solr_endpoint, timeout=30)

        # First, we delete everything (!)
        s.delete(q="*:*", commit=False)

        # Update the collections:
        for col in collections:
            UpdateCollectionsSolr.add_collection(s, targets_by_id, col, None)

        # Now commit all changes:
        s.commit()

        # Record that we have completed this task successfully:
        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(collections, indent=4)))
Example #11
class RangeByMinutesBase(RangeBase):
    """
    Produces a contiguous completed range of a recurring task separated by a specified number of minutes.
    """
    start = luigi.DateMinuteParameter(
        default=None,
        description=
        "beginning date-hour-minute, inclusive. Default: None - work backward forever (requires reverse=True)"
    )
    stop = luigi.DateMinuteParameter(
        default=None,
        description=
        "ending date-hour-minute, exclusive. Default: None - work forward forever"
    )
    minutes_back = luigi.IntParameter(
        default=60 * 24,  # one day
        description=("extent to which contiguousness is to be assured into "
                     "past, in minutes from current time. Prevents infinite "
                     "loop when start is none. If the dataset has limited "
                     "retention (i.e. old outputs get removed), this should "
                     "be set shorter to that, too, to prevent the oldest "
                     "outputs flapping. Increase freely if you intend to "
                     "process old dates - worker's memory is the limit"))
    minutes_forward = luigi.IntParameter(
        default=0,
        description=
        "extent to which contiguousness is to be assured into future, "
        "in minutes from current time. Prevents infinite loop when stop is none"
    )

    minutes_interval = luigi.IntParameter(
        default=1,
        description=
        "separation between events in minutes. It must evenly divide 60")

    def datetime_to_parameter(self, dt):
        return dt

    def parameter_to_datetime(self, p):
        return p

    def datetime_to_parameters(self, dt):
        """
        Given a date-time, will produce a dictionary of `of`-params combined with the ranged task parameter
        """
        return self._task_parameters(dt)

    def parameters_to_datetime(self, p):
        """
        Given a dictionary of parameters, will extract the ranged task parameter value
        """
        dt = p[self._param_name]
        return datetime(dt.year, dt.month, dt.day, dt.hour, dt.minute)

    def moving_start(self, now):
        return now - timedelta(minutes=self.minutes_back)

    def moving_stop(self, now):
        return now + timedelta(minutes=self.minutes_forward)

    def finite_datetimes(self, finite_start, finite_stop):
        """
        Simply returns the points in time that correspond to a whole number of minutes intervals.
        """
        # Validate that minutes_interval is greater than 0, less than 60, and evenly divides 60
        if not (0 < self.minutes_interval < 60):
            raise ParameterException('minutes-interval must be within 0..60')
        if 60 % self.minutes_interval != 0:
            raise ParameterException(
                'minutes-interval does not evenly divide 60')
        # start of a complete interval, e.g. 20:13 and the interval is 5 -> 20:10
        start_minute = (finite_start.minute //
                        self.minutes_interval) * self.minutes_interval
        datehour_start = datetime(year=finite_start.year,
                                  month=finite_start.month,
                                  day=finite_start.day,
                                  hour=finite_start.hour,
                                  minute=start_minute)
        datehours = []
        for i in itertools.count():
            t = datehour_start + timedelta(minutes=i * self.minutes_interval)
            if t >= finite_stop:
                return datehours
            if t >= finite_start:
                datehours.append(t)

    def _format_datetime(self, dt):
        return luigi.DateMinuteParameter().serialize(dt)
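A standalone sketch (not part of the example) of the snapping step in finite_datetimes above: the start time is rounded down to a whole multiple of minutes_interval before the range is enumerated.

from datetime import datetime

# With minutes_interval=5, a start of 20:13 snaps down to 20:10,
# matching the comment in finite_datetimes.
start, interval = datetime(2000, 1, 1, 20, 13), 5
snapped = start.replace(minute=(start.minute // interval) * interval)
assert snapped == datetime(2000, 1, 1, 20, 10)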
Example #12
class UrlMin15Generator(CmvBaseTask):
    """Task for url min15 data generation"""
    start_time = luigi.DateMinuteParameter(
        default=datetime(year=2016, month=3, day=21, hour=12, minute=15))
    wario_target_table_name = luigi.Parameter(default='url_min15',
                                              significant=False)
    appserver_app_name = luigi.Parameter(default='urlmin15gen',
                                         significant=False)

    def get_appserver_job_config(self):
        """Returns job config"""
        tmpl_values = {
            "start_time": CmvLib.date_to_cmvformat(self.start_time),
            "end_time": CmvLib.date_to_cmvformat(
                self.start_time + timedelta(minutes=15)),
            "cache_namespace": self.cassandra_namespace,
            "cassandra_seeds": self.cassandra_seeds.split(','),
            "cassandra_keyspace": self.cassandra_keyspace,
            "hdfs_name_node": self.hdfs_namenode,
            "hdfs_session_dirs": self.hdfs_session_dirs.split(','),
            "hdfs_cmv_dir": self.hdfs_cmv_dir
        }
        with open(CmvLib.get_template_path(
                'resources/url_min15_template.json')) as tmpl_file:
            cfg = json.load(tmpl_file)
            CmvLib.replace_config_params(cfg, tmpl_values)
            return cfg

    def requires(self):
        return InputSessionFile(cube_time=self.start_time)

    def run(self):
        job_cfg = self.get_appserver_job_config()
        logging.info('Running url min15 job...')
        datadog_start_time = time.time()
        submission_status = CmvLib.submit_config_to_appserver(
            job_cfg,
            CmvLib.get_appserver_job_submit_url(self.appserver_host_port,
                                                self.appserver_app_name))
        job_id = submission_status['payload']['jobId']
        time.sleep(5)
        appserver_resp = CmvLib.poll_appserver_job_status(
            CmvLib.get_appserver_job_status_url(self.appserver_host_port,
                                                self.appserver_app_name,
                                                job_id))
        elapsed_time = (time.time() - datadog_start_time) / 60
        DataDogClient.gauge_this_metric('url_min15_delay', elapsed_time)
        if appserver_resp['payload']['status'] != 'Finished':
            logging.error(
                "AppServer responded with an error. AppServer Response: %s",
                appserver_resp['payload']['result'])
            raise Exception('Error in AppServer Response.')
        else:
            logging.info("Url min15 job completed successfully.")
        self.output().touch()

    def output(self):
        connect_args = {
            'host': self.wario_target_db_host,
            'user': self.wario_target_db_user,
            'password': self.wario_target_db_password,
            'database': self.wario_target_db_name,
            'table': self.wario_target_table_name
        }
        col_values = {'target_id': self.task_id}
        return CmvMysqlTarget(connect_args, col_values)
Example #13
class Min15AndDailyRollupTrigger(CmvBaseTask):
    start_time = luigi.DateMinuteParameter()
    end_time = luigi.DateMinuteParameter()
    min15_target_table_name = None
    wario_target_table_name = luigi.Parameter(significant=False)
    connect_args = dict()
    row_col_dict = dict()

    def requires(self):
        CmvLib.validate_min15_time(self.start_time)
        CmvLib.validate_min15_time(self.end_time)
        logging.info(
            "Task: Min15AndDailyRollupTrigger, start_time = %s, end_time = %s",
            self.start_time, self.end_time)
        min15_task = CmvMin15Generator(start_time=self.start_time,
                                       end_time=self.end_time)
        self.min15_target_table_name = min15_task.wario_target_table_name
        return min15_task

    def task_init(self):
        self.connect_args['user'] = self.wario_target_db_user
        self.connect_args['password'] = self.wario_target_db_password
        self.connect_args['host'] = self.wario_target_db_host
        self.connect_args['database'] = self.wario_target_db_name
        self.connect_args['table'] = self.wario_target_table_name
        self.row_col_dict['target_id'] = self.task_id
        logging.info('Initializing task params: {cn_args}, {tgt_id}'.format(
            cn_args=self.connect_args, tgt_id=self.task_id))

    def get_ptz_dict_from_db(self):
        connect_args = dict()
        connect_args['user'] = self.wario_target_db_user
        connect_args['password'] = self.wario_target_db_password
        connect_args['host'] = self.wario_target_db_host
        connect_args['database'] = self.wario_target_db_name
        connect_args['table'] = self.min15_target_table_name

        query_string = 'select COLUMN_GET(ptz_dict, {json_item} as char) from {min15_table} where target_id = %s'.\
            format(json_item='\'ptz_items\'', min15_table=self.min15_target_table_name)

        query_values = [
            'CmvMin15Generator(start_time={s}, end_time={e})'.format(
                s=self.start_time.strftime('%Y-%m-%dT%H%M'),
                e=self.end_time.strftime('%Y-%m-%dT%H%M'))
        ]

        rows = CmvMysqlTarget(connect_args=connect_args).query(
            query_string, query_values)

        return json.loads(str(rows[0][0]))

    def run(self):
        ptz_dict_list = self.get_ptz_dict_from_db()
        upstream_rollup_tasks = []
        for ptz_dict in ptz_dict_list:
            rollup_pcode = ptz_dict['pcode']
            rollup_tz = ptz_dict['timezone']
            rollup_day = DateTime.utc_to_any_tz(self.start_time,
                                                rollup_tz).replace(tzinfo=None)
            logging.info(
                'Preparing DailyRollup with params: day = {day}, timezone = {tz}, pcode = {pcode}'
                .format(day=rollup_day, tz=rollup_tz, pcode=rollup_pcode))
            upstream_rollup_tasks.append(
                CmvRollupDailyGenerator(day=rollup_day,
                                        timezone=rollup_tz,
                                        pcode=rollup_pcode))
        logging.info('Triggering upstream rollup tasks')
        yield upstream_rollup_tasks
        self.output().touch()

    def output(self):
        self.task_init()
        return CmvMysqlTarget(self.connect_args, self.row_col_dict)
Example #14
class CheckStatus(luigi.Task):
    """
    """
    task_namespace = 'monitor'
    date = luigi.DateMinuteParameter(default=datetime.datetime.today())

    def output(self):
        return luigi.LocalTarget('%s/monitor/checkstatus.%s' % (state().state_folder, self.date.strftime(luigi.DateMinuteParameter.date_format)))

    def run(self):
        servers = self.load_as_json(systems().servers)
        services = self.load_as_json(systems().services)

        for job in services.get('jobs', []):
            server = servers[services['jobs'][job]['server']]
            # app.logger.info(json.dumps(server, indent=4))
            services['jobs'][job]['state'] = self.get_h3_status(services['jobs'][job]['name'], server)
            services['jobs'][job]['url'] = server['url']

        for queue in services.get('queues', []):
            server = servers[services['queues'][queue]['server']]
            services['queues'][queue]['prefix'] = server['user_prefix']
            services['queues'][queue]['state'] = self.get_queue_status(services['queues'][queue]['name'], server)

        for http in services.get('http', []):
            services['http'][http]['state'] = self.get_http_status(services['http'][http]['url'])

        for hdfs in services.get('hdfs', []):
            services['hdfs'][hdfs]['state'] = self.get_hdfs_status(services['hdfs'][hdfs])

        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(services, indent=4)))

    def get_h3_status(self, job, server):
        # Set up connection to H3:
        h = hapyx.HapyX(server['url'], username=server['user'], password=server['pass'], timeout=5)
        state = {}
        try:
            logger.info("Getting status for job %s on %s" % (job, server))
            info = h.get_job_info(job)
            state['details'] = info
            if 'job' in info:
                state['status'] = info['job'].get("crawlControllerState", None)
                if not state['status']:
                    state['status'] = info['job'].get("statusDescription", None)
                state['status'] = state['status'].upper()
        except Exception as e:
            state['status'] = "DOWN"
            state['error'] = "Could not reach Heritrix! %s" % e
            # app.logger.exception(e)
        # Classify
        if state['status'] == "DOWN":
            state['status-class'] = "status-oos"
        elif state['status'] == "RUNNING":
            # Replacing RUNNING with docs/second rate
            rate = state['details']['job']['rateReport']['currentDocsPerSecond']
            state['rate'] = "%.1f" % float(rate)
            if rate < 1.0:
                state['status-class'] = "status-warning"
            else:
                state['status-class'] = "status-good"
        else:
            state['status-class'] = "status-warning"

        return state

    def get_queue_status(self, queue, server):
        state = {}
        try:
            logger.info("Getting status for queue %s on %s" % (queue, server))
            qurl = '%s%s' % (server['prefix'], queue)
            # app.logger.info("GET: %s" % qurl)
            r = requests.get(qurl, timeout=5)
            state['details'] = r.json()
            state['count'] = "{:0,}".format(state['details']['messages'])
            if 'error' in state['details']:
                state['status'] = "ERROR"
                state['status-class'] = "status-alert"
                state['error'] = state['details']['reason']
            elif state['details']['consumers'] == 0:
                state['status'] = "BECALMED"
                state['status-class'] = "status-oos"
                state['error'] = 'No consumers!'
            else:
                state['status'] = state['details']['messages']
                state['status-class'] = "status-good"
        except Exception as e:
            state['status'] = "DOWN"
            state['status-class'] = "status-alert"
            logger.exception(e)

        return state

    def get_http_status(self, url):
        state = {}
        try:
            logger.info("Getting status for %s" % (url))
            r = requests.get(url, allow_redirects=False, timeout=10)
            state['status'] = "%s" % r.status_code
            if r.status_code // 100 == 2 or r.status_code // 100 == 3:
                state['status'] = "%.3fs" % r.elapsed.total_seconds()
                state['status-class'] = "status-good"
            else:
                state['status-class'] = "status-warning"
        except Exception:
            state['status'] = "DOWN"
            state['status-class'] = "status-alert"

        return state

    def get_hdfs_status(self, hdfs):
        state = {}
        try:
            logger.info("Getting status for hdfs %s" % (hdfs))
            r = requests.get(hdfs['url'], timeout=5)
            state['status'] = "%s" % r.status_code
            if r.status_code // 100 == 2:
                state['status-class'] = "status-good"
                tree = etree.fromstring(r.text, etree.HTMLParser())
                percent = tree.xpath("//div[@id='dfstable']//tr[5]/td[3]")[0].text
                percent = percent.replace(" ", "")
                state['percent'] = percent
                state['remaining'] = tree.xpath("//div[@id='dfstable']//tr[4]/td[3]")[0].text.replace(" ", "")
                underr = int(tree.xpath("//div[@id='dfstable']//tr[10]/td[3]")[0].text)
                if underr != 0:
                    state['status'] = "HDFS has %i under-replicated blocks!" % underr
                    state['status-class'] = "status-warning"
            else:
                state['status-class'] = "status-warning"
        except Exception as e:
            logger.exception(e)
            state['status'] = "DOWN"
            state['status-class'] = "status-alert"

        return state

    def load_as_json(self, filename):
        script_dir = os.path.dirname(__file__)
        file_path = os.path.join(script_dir, filename)
        with open(file_path, 'r') as fi:
            return json.load(fi)
Example #15
class TrainXGBoost(luigi.Task):
    """Train a XGBoost model between `start` and `stop` dates to predict bike
    availability at each station in `city`

    Attributes
    ----------
    city : luigi.Parameter
        City of interest, *e.g.* Bordeaux or Lyon
    start : luigi.DateParameter
        Training start date
    stop : luigi.DateParameter
        Training stop date upper bound (the actual end date is computed with
        `validation`)
    validation : luigi.DateMinuteParameter
        Date that bounds the training set and the validation set during the
        XGBoost model training
    frequency : DateOffset, timedelta or str
        Indicates the prediction frequency
    """
    city = luigi.Parameter()
    start = luigi.DateParameter(default=yesterday())
    stop = luigi.DateParameter(default=date.today())
    validation = luigi.DateMinuteParameter(default=dt.now() -
                                           timedelta(hours=1))
    frequency = luigi.Parameter(default="30T")

    def outputpath(self):
        fname = "{}-to-{}-at-{}-freq-{}.model".format(
            self.start, self.stop, self.validation.isoformat(), self.frequency)
        return os.path.join(DATADIR, self.city, 'xgboost-model', fname)

    def output(self):
        return luigi.LocalTarget(self.outputpath(), format=MixedUnicodeBytes)

    def run(self):
        query = ("SELECT DISTINCT id AS station_id, timestamp AS ts, "
                 "available_bikes AS nb_bikes, available_stands AS nb_stands, "
                 "available_bikes::float / (available_bikes::float "
                 "+ available_stands::float) AS probability "
                 "FROM {schema}.{tablename} "
                 "WHERE timestamp >= %(start)s "
                 "AND timestamp < %(stop)s "
                 "AND (available_bikes > 0 OR available_stands > 0) "
                 "AND (status = 'open')"
                 "ORDER BY id, timestamp"
                 ";").format(schema=self.city, tablename='timeseries')
        eng = db()
        df = pd.io.sql.read_sql_query(query,
                                      eng,
                                      params={
                                          "start": self.start,
                                          "stop": self.stop
                                      })
        df.station_id = df.station_id.astype(int)
        if df.empty:
            raise Exception(
                "There is no data to process in the DataFrame. "
                "Please check the dates.")
        prediction_model = train_prediction_model(df, self.validation,
                                                  self.frequency)
        self.output().makedirs()
        prediction_model.save_model(self.output().path)
Example #16
 def testDateWithMinuteInterval(self):
     p = luigi.DateMinuteParameter(config_path=dict(section="foo",
                                                    name="bar"),
                                   interval=2)
     self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))
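A hedged sketch of the interval behaviour this test relies on (assuming current luigi semantics, where normalize() clamps a value down to the nearest multiple of interval minutes):

import datetime
import luigi

# Sketch only: with interval=2, an odd minute value is clamped down
# to the preceding even minute.
p = luigi.DateMinuteParameter(interval=2)
assert (p.normalize(datetime.datetime(2001, 2, 3, 4, 31)) ==
        datetime.datetime(2001, 2, 3, 4, 30))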
Example #17
 class Bar(RunOnceTask):
     time = luigi.DateMinuteParameter()
Example #18
 def test_parse_padding_zero(self):
     dh = luigi.DateMinuteParameter().parse('2013-01-01T18H07')
     self.assertEqual(dh, datetime.datetime(2013, 1, 1, 18, 7, 0))
Example #19
class CmvMin15Generator(CmvBaseTask):
    start_time = luigi.DateMinuteParameter()
    end_time = luigi.DateMinuteParameter()
    wario_target_table_name = luigi.Parameter(default='min15',
                                              significant=False)
    appserver_app_name = luigi.Parameter(default='', significant=False)
    appserver_app_type = luigi.Parameter(default='', significant=False)
    hdfs_dir_set = set()
    provider_list_str = None
    connect_args = dict()
    column_formats = dict()
    pcode_tz_dict = dict()

    row_col_dict = dict()
    row_col_dict['target_id'] = None

    def task_init(self):
        logging.info('Initializing task params: {cn_args}, {tgt_id}'.format(
            cn_args=self.connect_args, tgt_id=self.task_id))
        self.connect_args['user'] = self.wario_target_db_user
        self.connect_args['password'] = self.wario_target_db_password
        self.connect_args['host'] = self.wario_target_db_host
        self.connect_args['database'] = self.wario_target_db_name
        self.connect_args['table'] = self.wario_target_table_name
        self.row_col_dict['target_id'] = self.task_id
        self.column_formats = {'ptz_dict': "column_create('ptz_items', %s)"}

    def process_config_tmpl(self, tmpl_file):
        pcode_tz_list = Helios.get_providers_from_helios()
        self.pcode_tz_dict = dict(pcode_tz_list)
        hdfs_dirs = [
            hdfs_dir.path.rsplit('/', 1)[0] for hdfs_dir in self.input()
        ]

        tmpl_subst_params = {
            "start_time": CmvLib.date_to_cmvformat(self.start_time),
            "end_time": CmvLib.date_to_cmvformat(self.end_time),
            "key_space": self.cassandra_keyspace,
            "name_space": self.cassandra_namespace,
            "cassandra_seeds": self.cassandra_seeds.split(','),
            "pcode_dict": CmvLib.prepare_ptz(pcode_tz_list, hdfs_dirs)
        }

        with open(tmpl_file) as json_file:
            json_data = json.load(json_file)
            CmvLib.replace_config_params(json_data, tmpl_subst_params)
            return json_data

    def requires(self):
        CmvLib.validate_min15_time(self.start_time)
        CmvLib.validate_min15_time(self.end_time)
        cube_timeranges = set()
        now = self.start_time
        logging.info("start_time = %s, end_time = %s", self.start_time,
                     self.end_time)
        while now < self.end_time:
            cube_timeranges.add(now)
            now += timedelta(minutes=15)
        return [
            InputSessionFile(cube_time=cube_time)
            for cube_time in cube_timeranges
        ]

    def run(self):

        config_json = self.process_config_tmpl(
            CmvLib.get_template_path('resources/cmv_template.json'))
        with open('new_config.json', 'w') as outfile:
            json.dump(config_json, outfile, indent=4)
        datadog_start_time = time.time()
        appserver_jobsubmit_url = CmvLib.get_appserver_job_submit_url(
            self.appserver_host_port, self.appserver_app_name,
            self.appserver_app_type)
        rslt_json = CmvLib.submit_config_to_appserver(config_json,
                                                      appserver_jobsubmit_url)

        job_id = rslt_json['payload']['jobId']
        appserver_jobstatus_url = CmvLib.get_appserver_job_status_url(
            self.appserver_host_port, self.appserver_app_name, job_id)
        appserver_resp = CmvLib.poll_appserver_job_status(
            appserver_jobstatus_url)
        DataDogClient.gauge_this_metric('min15_delay',
                                        (time.time() - datadog_start_time))

        if appserver_resp['payload']['status'] != 'Finished':
            logging.error(
                "AppServer responded with an error. AppServer Response: %s",
                appserver_resp['payload']['result'])
            raise Exception('Error in Appserver Response.')
        else:
            provider_list_str = appserver_resp['payload']['result']['result'][
                'providers']
            pcode_list = []
            if provider_list_str is not None:
                pcode_list = provider_list_str.replace(
                    'Set', '')[1:len(provider_list_str) - 4].split(',')

        ptz_list = []
        for pcode in pcode_list:
            ptz_dict_item = dict()
            if not pcode or str(pcode).lstrip() == 'unknown':
                continue
            ptz_dict_item['pcode'] = str(pcode).lstrip()
            ptz_dict_item['timezone'] = self.pcode_tz_dict[str(pcode).lstrip()]
            ptz_list.append(ptz_dict_item)
        DataDogClient.gauge_this_metric('min15_provider_count', len(ptz_list))
        self.row_col_dict['target_id'] = self.task_id
        self.row_col_dict['ptz_dict'] = json.dumps(ptz_list)
        self.output().touch()

    def output(self):
        self.task_init()
        return CmvMysqlTarget(self.connect_args,
                              self.row_col_dict,
                              column_formats=self.column_formats)
Example #20
 def test_serialize_padding_zero(self):
     dh = luigi.DateMinuteParameter().serialize(
         datetime.datetime(2013, 1, 1, 18, 7, 0))
     self.assertEqual(dh, '2013-01-01T18H07')
Example #21
 def test_serialize_padding_zero(self):
     dm = luigi.DateMinuteParameter().serialize(
         datetime.datetime(2013, 2, 1, 18, 7, 0))
     self.assertEqual(dm, '2013-02-01T1807')
Example #22
 def testDateMinuteDeprecated(self):
     p = luigi.DateMinuteParameter(
         config_path=dict(section="foo", name="bar"))
     self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))
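For reference, config_path=dict(section="foo", name="bar") makes the parameter read its value from the luigi configuration; a hypothetical luigi.cfg entry consistent with the expected datetime(2001, 2, 3, 4, 30) would be:

[foo]
bar: 2001-02-03T0430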
Example #23
class CheckStatus(luigi.Task):
    """
    """
    task_namespace = 'monitor'
    date = luigi.DateMinuteParameter(default=datetime.datetime.today())

    servers = os.path.join(os.path.dirname(__file__), 'servers.json')
    services = os.path.join(os.path.dirname(__file__), 'services.json')

    def output(self):
        return luigi.LocalTarget(
            '%s/monitor/checkstatus.%s' %
            (STATE_FOLDER,
             self.date.strftime(luigi.DateMinuteParameter.date_format)))

    def run(self):
        services = self.load_as_json(self.services)
        services['timestamp'] = datetime.datetime.utcnow().isoformat()

        pool = Pool(20)

        # Parallel check for H3 job status:
        argsv = []
        for job in services.get('jobs', []):
            server = services['servers'][services['jobs'][job]['server']]
            server_url = server['url']
            server_user = server['user']
            server_pass = os.environ['HERITRIX_PASSWORD']
            # app.logger.info(json.dumps(server, indent=4))
            services['jobs'][job]['url'] = server_url
            argsv.append((services['jobs'][job]['name'], job, server_url,
                          server_user, server_pass))
        # Wait for all...
        results = pool.map(get_h3_status, argsv)
        for job, state in results:
            services['jobs'][job]['state'] = state

        # Parallel check for queue statuses:
        argsv = []
        for queue in services.get('queues', []):
            server_prefix = services['servers'][services['queues'][queue]
                                                ['server']]['prefix']
            services['queues'][queue]['prefix'] = server_prefix
            queue_name = services['queues'][queue]['name']
            argsv.append((queue_name, queue, server_prefix))
        # Wait for all...
        results = pool.map(get_queue_status, argsv)
        for queue, state in results:
            services['queues'][queue]['state'] = state

        # Parallel check for HTTP status:
        argsv = []
        for http in services.get('http', []):
            argsv.append((http, services['http'][http]['url']))
        # Wait for all...
        results = pool.map(get_http_status, argsv)
        for http, state in results:
            services['http'][http]['state'] = state

        argsv = []
        for hdfs in services.get('hdfs', []):
            argsv.append((hdfs, services['hdfs'][hdfs]['url']))
        # Wait for all...
        results = pool.map(get_hdfs_status, argsv)
        for hdfs, state in results:
            services['hdfs'][hdfs]['state'] = state

        # And then write to a file
        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(services, indent=4)))

    def load_as_json(self, filename):
        script_dir = os.path.dirname(__file__)
        file_path = os.path.join(script_dir, filename)
        with open(file_path, 'r') as fi:
            return json.load(fi)