Example #1
    def test_non_zero_total_hits_of_min(self, mock_request):
        r1 = mock.Mock()
        r1.json.return_value = {
            "hits": {
                "total": 1,
                "hits": [{
                    "_source": {
                        "fake-field": "fake-1"
                    }
                }]
            }
        }
        r2 = mock.Mock()
        r2.json.return_value = {
            "hits": {
                "hits": [{
                    "_source": {
                        "fake-field": "fake-2"
                    }
                }]
            }
        }
        mock_request.side_effect = [r1, r2]
        self.assertEqual(["fake-1", "fake-2"],
                         utils.get_min_max_timestamps("fake-se", "fake-field"))
        self.assertEqual(2, mock_request.call_count)
        calls = [mock.call("get", "fake-se/_search?size=1",
                           allow_redirects=True,
                           data='{"sort": {"fake-field": {"order": "asc"}}}',
                           params=None),
                 mock.call("get", "fake-se/_search?size=1",
                           allow_redirects=True,
                           data='{"sort": {"fake-field": {"order": "desc"}}}',
                           params=None)]
        mock_request.assert_has_calls(calls)
Example #2
    def test_zero_total_hits_of_min(self, mock_request):
        mock_request.return_value.json.return_value = {"hits": {"total": 0}}
        self.assertEqual([None, None],
                         utils.get_min_max_timestamps("fake-se", "fake-field"))
        mock_request.assert_called_once_with(
            "get", "fake-se/_search?size=1", allow_redirects=True,
            data='{"sort": {"fake-field": {"order": "asc"}}}', params=None)
Example #3
def job():
    started_at = time.time()
    LOG.info("Starting Syncing Job")

    for src in CONF["sources"]:
        backend_url = "%s/ms_health_%s/service" % (CONF["backend"]["elastic"],
                                                   src["region"])
        min_ts, max_ts = utils.get_min_max_timestamps(backend_url, "timestamp")

        driver = _get_driver(src["driver"]["type"])(src["driver"])
        data_generator = driver.fetch(latest_aggregated_ts=max_ts)

        LOG.info("Start syncing %s region", src["region"])

        for i, data_interval in enumerate(data_generator):

            if not data_interval:
                LOG.info("Chunk %s from region %s is already synced.", i,
                         src["region"])
                continue

            req_data = []
            for d in data_interval:
                d["region"] = src["region"]
                # TODO(boris-42): Data is validated only by ES, which is bad
                req_data.append('{"index": {}}')
                req_data.append(json.dumps(d))
            req_data = "\n".join(req_data)
            LOG.info("Sending data from chunk %s to backend", i)

            try:
                r = requests.post("%s/_bulk" % backend_url, data=req_data)
            except requests.exceptions.RequestException:
                LOG.error(
                    "Was unable to store data for %s Stopping current "
                    "job run", data_interval)
                break
            LOG.debug(r.text)

    LOG.info("Syncing job completed in %.3f seconds",
             (time.time() - started_at))
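
The request body assembled for the _bulk endpoint is newline-delimited JSON: an '{"index": {}}' action line precedes each document. A sketch of the payload for two records, with illustrative field values (note that recent Elasticsearch releases also require the bulk body to end with a trailing newline):

import json

# Illustrative bulk body for two records; index and type are taken
# from the URL, so the action lines stay empty.
records = [
    {"timestamp": "2016-01-01T00:00:00", "region": "region-one"},
    {"timestamp": "2016-01-01T00:01:00", "region": "region-one"},
]
req_data = []
for d in records:
    req_data.append('{"index": {}}')
    req_data.append(json.dumps(d))
req_data = "\n".join(req_data)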
Example #4
    def fetch(self, latest_aggregated_ts=None):
        es = self.config["elastic_src"]
        ts_min, ts_max = utils.get_min_max_timestamps(es, "Timestamp")

        if ts_min is ts_max is None:
            LOG.error(
                "Got no timestamps from source es, will skip fetching "
                "data for %s", es)
            return

        if latest_aggregated_ts:
            intervals = utils.incremental_scan(ts_max, latest_aggregated_ts)
        else:
            intervals = utils.incremental_scan(ts_max, ts_min)

        for interval in intervals:
            body = self.get_request(interval)
            try:
                resp = requests.post("%s/_search" % es, data=json.dumps(body))
            except requests.exceptions.RequestException as e:
                LOG.error("Was unable to make a request for interval %s: %s",
                          interval, e)
                # Skip this interval: resp is undefined when the request fails.
                continue

            if not resp.ok:
                LOG.error("Got a non-ok response for interval %s: %s",
                          interval, resp.text)
                continue
            resp = resp.json()

            r = []
            for bucket in resp["aggregations"]["per_minute"]["buckets"]:

                ts = bucket["key_as_string"]
                r.append(self.record_from_bucket(bucket, ts, "all"))

                for service in bucket["services"]["buckets"]:
                    r.append(
                        self.record_from_bucket(service, ts, service["key"]))
            yield r
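
utils.incremental_scan only appears here through its call sites: it takes the newest timestamp in the source and a lower bound (the last aggregated timestamp when one exists, otherwise the oldest source timestamp) and yields the intervals still to be processed. A plausible sketch, assuming ISO-formatted timestamps, a fixed chunk size, and (start, end) tuples as the interval shape, none of which are confirmed by the source:

import datetime

TS_FMT = "%Y-%m-%dT%H:%M:%S"  # assumed timestamp format


def incremental_scan(ts_max, ts_start, step=datetime.timedelta(hours=1)):
    # Hypothetical reconstruction: walk from ts_start up to ts_max in
    # fixed-size chunks, yielding (lower, upper) bounds that
    # get_request() can turn into a range filter.
    start = datetime.datetime.strptime(ts_start, TS_FMT)
    end = datetime.datetime.strptime(ts_max, TS_FMT)
    while start < end:
        upper = min(start + step, end)
        yield (start.strftime(TS_FMT), upper.strftime(TS_FMT))
        start = upper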
Example #5
def main(es, latest_aggregated_ts=None):
    ts_min, ts_max = utils.get_min_max_timestamps(es, "Timestamp")

    if latest_aggregated_ts:
        intervals = utils.incremental_scan(ts_max, latest_aggregated_ts)
    else:
        intervals = utils.incremental_scan(ts_max, ts_min)

    for interval in intervals:
        body = get_request(interval)
        resp = requests.post("%s/_search?search_type=count" % es,
                             data=json.dumps(body)).json()

        r = []
        for bucket in resp["aggregations"]["per_minute"]["buckets"]:

            ts = bucket["key_as_string"]
            r.append(record_from_bucket(bucket, ts, "all"))

            for service in bucket["services"]["buckets"]:
                r.append(record_from_bucket(service, ts, service["key"]))
        yield r
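
Both generators decode the same response shape, which implies what get_request must build: a date histogram named per_minute (hence key_as_string on every bucket) with a nested terms aggregation named services (hence the inner buckets, each carrying a key). A sketch of such a body; the aggregation names and the Timestamp field come from the code, while the range filter and the "service" field name are assumptions:

def get_request(interval):
    # Hypothetical body matching the response shape consumed above.
    start, end = interval
    return {
        "size": 0,  # only aggregations are read from the response
        "query": {
            "range": {"Timestamp": {"gte": start, "lte": end}}
        },
        "aggs": {
            "per_minute": {
                "date_histogram": {"field": "Timestamp", "interval": "1m"},
                "aggs": {
                    "services": {
                        "terms": {"field": "service"}  # assumed field name
                    }
                }
            }
        }
    }

Example #5 gets the aggregations-only behaviour via the legacy search_type=count query parameter instead of "size": 0.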
Example #6
def job():
    started_at = time.time()
    logging.info("Starting Syncing Job")

    backend_url = "%s/ms_health/service" % CONF["backend"]["elastic"]

    min_ts, max_ts = utils.get_min_max_timestamps(backend_url, "timestamp")

    for src in CONF["sources"]:
        # TODO(boris-42): Make this actually pluggable
        data_generator = tcp_driver.main(src["driver"]["elastic_src"],
                                         latest_aggregated_ts=max_ts)

        logging.info("Start syncing %s region" % src["region"])

        for i, data_interval in enumerate(data_generator):

            if not data_interval:
                logging.info("Chunk %s from region %s is already synced.",
                             i, src["region"])
                continue

            req_data = []
            for d in data_interval:
                d["region"] = src["region"]
                # TODO(boris-42): Data is validated only by ES, which is bad
                req_data.append('{"index": {}}')
                req_data.append(json.dumps(d))
            req_data = "\n".join(req_data)
            logging.info("Sending data to elastic %s" % i)
            r = requests.post("%s/_bulk" % backend_url, data=req_data)
            logging.debug(r.json())

        logging.info("Syncing Job: Completed in %.3f seconds"
                     % (time.time() - started_at))
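
Both job() variants read the same configuration shape: a backend Elasticsearch URL plus a list of per-region sources, each carrying a driver section. A sketch of CONF inferred from the lookups in the code; the URLs and the driver type value are placeholders:

CONF = {
    "backend": {
        # Base Elasticsearch URL the job writes bulk data to.
        "elastic": "http://backend-es:9200",
    },
    "sources": [
        {
            "region": "region-one",
            "driver": {
                # Example #3 dispatches on "type" via _get_driver();
                # Example #6 hardcodes tcp_driver and reads only
                # "elastic_src".
                "type": "elastic",
                "elastic_src": "http://source-es:9200",
            },
        },
    ],
}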