def request(url, tries=3):
    """Wrapper around :func:`urlopen` to AWIS call.

    On failure, will retry until *tries* attempts have been made.

    **Args:**
        *url*: the AWIS URL to call

        *tries*: number of failed tries allowed before flagging this
        attempt as a failure

    **Returns:**
        the HTTP response value, or ``None`` if all attempts failed

    """
    failed_requests = 0
    response_value = None

    while failed_requests < tries:
        try:
            log.debug('Request %d of %d: "%s"',
                      (failed_requests + 1), tries, url)
            response = urlopen(url)
            if response.code == 200:
                response_value = response.read()
                break
        except HTTPError as err:
            log.error('Request failed "%s"', err)

        # Count every unsuccessful iteration, including a non-200
        # response that did not raise.  Previously only HTTPError
        # incremented the counter, so a persistent non-200 response
        # looped forever.
        failed_requests += 1

    if failed_requests >= tries:
        log.error('All requests failed')

    return response_value
def flatten_domains(self, max_read_count=None, topic='alexa-results', group_id='default', dry=False):
    """Split the Alexa batched domain results into separate JSON
    equivalents.

    Kwargs:
        *max_read_count*: number of batched domains to read.  ``None``
        reads all offsets associated with the *group_id*

        *topic*: the Kafka topic to consume from

        *group_id*: Kafka managed consumer element that manages the
        messages read from the topic

        *dry*: simulate execution

    Returns:
        total number of records read across all workers

    """
    counter_queue = multiprocessing.Queue()
    worker_args = (counter_queue, max_read_count, topic, group_id)
    domain_intel.utils.threader(self.threads,
                                self.flatten_worker,
                                *worker_args,
                                dry=dry)

    # Drain the per-worker counters into a grand total.
    grand_total = 0
    while not counter_queue.empty():
        grand_total += counter_queue.get()
    log.debug('Flatten workers total records read %d', grand_total)

    return grand_total
def persist(self):
    """Persist flattened (processed) GeoDNS data to ArangoDB.

    :attr:`max_read_count` can limit the number of records read from
    *topic*.  The default action is to read all available messages.

    The default consumer :attr:`topics` is `dns-geodns-parsed`.

    The :attr:`dry` flag will simulate execution.  No records will be
    published.

    Returns:
        total count of records written to the DB across all workers

    """
    counter_queue = multiprocessing.Queue()
    worker_args = (counter_queue,
                   self.max_read_count,
                   self.kafka_consumer_topics[0],
                   self.kafka_consumer_group_id)
    thread_count = domain_intel.common.CONFIG.get('threads', 1)
    domain_intel.utils.threader(thread_count,
                                self.persist_worker,
                                *worker_args,
                                dry=self.dry)

    # Drain the per-worker counters into a grand total.
    grand_total = 0
    while not counter_queue.empty():
        grand_total += counter_queue.get()
    log.debug('Persisted GeoDNS total count %d', grand_total)

    return grand_total
def resolve(self, ipv4, time_epoch=None):
    """Resolve an IPv4 address (and optional point in time) to a geog
    record (see ``ipechelon.geog*`` in aurora).

    For in-depth documentation, look at the compass project.

    This method does not have an 'unparsed' representation, as we
    control both ends.

    Raises:
        CompassServerEmptyResponse: bubbled up from parsing

        CompassServerError: wraps any other failure talking to the
        compass backend

    """
    url = self.url()
    log.debug("compass requesting %s", ipv4)
    res = None
    try:
        res = self.session.post(
            url,
            data=json.dumps({
                "ip": ipv4,
                "time": time_epoch if time_epoch else int(time.time()),
            }))
        # Parse first (detects empty payloads), then surface HTTP errors.
        response = self._parse_results(res.content)
        res.raise_for_status()
        return response
    # bubble this up to caller
    except CompassServerEmptyResponse as exc:
        raise exc
    # wrap generic fatal error
    except Exception as exc:
        # "res" is None when session.post itself raised: previously this
        # handler crashed with an UnboundLocalError in that case.
        content = res.content if res is not None else '<no response>'
        raise_from(
            CompassServerError("couldn't call geog compass backend: %s" %
                               (content)), exc)
def _get_or_error(self, url, headers):
    """GET *url* with *headers*, wrapping any failure.

    Returns:
        the successful response object

    Raises:
        CheckHostNetError: on any request or HTTP-status failure

    """
    try:
        # Lazy %-args: the message is only interpolated when DEBUG
        # logging is actually enabled (the original eagerly formatted
        # with "%" before the call).
        log.debug("requesting url %s", url)
        res = self.session.get(url, headers=headers)
        res.raise_for_status()
        return res
    except Exception as exc:
        raise_from(CheckHostNetError("couldn't call check backend"), exc)
def read_worker(self, queue, max_read_count, topic, group_id, slurp, dry=False):
    """Read all domains from the Kafka partitions.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    The parameter list is as per :meth:`read_domains`.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('Read worker set to read %s messages', max_read_count or 'all')
    with self.producer() as producer:
        with self.consumer(topic, group_id) as consumer:
            domain_batch = []
            # messages_read counts within the current batch window and
            # is reset after each flush; total_messages_read is the
            # overall tally reported via *queue*.
            messages_read = 0
            total_messages_read = 0
            for message in consumer:
                messages_read += 1
                total_messages_read += 1
                domain_batch.append(message.value.rstrip())
                # Flush in batches of 5 (len > 4), or earlier when the
                # read limit is hit.  TODO confirm the batch size of 5
                # is intended rather than an off-by-one for 4.
                if (len(domain_batch) > 4 or
                        (max_read_count is not None and
                         messages_read >= max_read_count)):
                    if slurp:
                        self.slurp_domains(producer, domain_batch, dry=dry)
                    else:
                        log.info('Domains pending: %s', domain_batch)
                    # Clear in place and restart the batch window.
                    del domain_batch[:]
                    messages_read = 0
                if (max_read_count is not None and
                        (total_messages_read >= max_read_count)):
                    break
            # ... and check for laggards.
            if domain_batch:
                log.info('Processing laggards before close')
                if slurp:
                    self.slurp_domains(producer, domain_batch, dry=dry)
                else:
                    log.info('Domains pending: %s', domain_batch)
    queue.put(total_messages_read)
def get_geodns(self):
    """Get all the GeoDNS data.

    Walks :attr:`paths` looking for ``ipv4_resolves`` edges and builds a
    flat keyword dict (``IPV4_ADDR``, ``IPV4_ORG``, ...) for each
    matching IP vertex.

    Returns:
        list of dicts, one per GeoDNS IP vertex found

    """
    all_geodns = []

    for graph_path in self.paths:
        edge = graph_path.get('edges', [])
        # Guard against a missing/None vertices entry.
        vertices = graph_path.get('vertices') or []
        for ipv in ['ipv4']:
            if not (edge and
                    '{}_resolves'.format(ipv) in edge[0].get('_id')):
                continue
            for vertice in vertices:
                # Fixed leftover 'XXX' debug that logged the whole
                # vertices list on every iteration.
                log.debug('GeoDNS vertex %s', vertice)
                # Skip falsy vertices too: previously a None vertice
                # fell through to vertice.get() and raised
                # AttributeError.
                if (not vertice or
                        not re.match('{}/'.format(ipv),
                                     vertice.get('_id'))):
                    continue
                ip_addr = vertice.get('_key')
                dns_org = vertice.get('organisation', {}).get('name', '')
                isp = vertice.get('isp', {}).get('name', '')
                lat = vertice.get('geospatial', {}).get('latitude', '')
                lng = vertice.get('geospatial', {}).get('longitude', '')
                country_code = vertice.get('country', {}).get(
                    'iso3166_code_2', '')
                country_name = vertice.get('country', {}).get('name', '')
                continent_code = vertice.get('continent', {}).get('code', '')
                continent_name = vertice.get('continent', {}).get('name', '')

                # Quote free-text fields so embedded commas survive
                # downstream CSV-style handling.
                if dns_org:
                    dns_org = '"{}"'.format(dns_org)
                if isp:
                    isp = '"{}"'.format(isp)

                token = ipv.upper()
                kwargs = {
                    '{}_ADDR'.format(token): ip_addr,
                    '{}_ORG'.format(token): dns_org,
                    '{}_ISP'.format(token): isp,
                    '{}_LATITUDE'.format(token): lat,
                    '{}_LONGITUDE'.format(token): lng,
                    '{}_COUNTRY_CODE'.format(token): country_code,
                    '{}_COUNTRY'.format(token): country_name,
                    '{}_CONTINENT_CODE'.format(token): continent_code,
                    '{}_CONTINENT'.format(token): continent_name,
                }
                all_geodns.append(kwargs)

    return all_geodns
def slurp_sites_linking_in(self, domain, max_slurps=None, as_json=False, dry=False):
    """Get list of sites linking into *domain*.

    Alexa places an upper limit of 20 on the number of sites that it
    will return per request (or a "slurp").  Subsequent calls must be
    made by incrementing the `Start` request parameter to indicate the
    page to return.

    Since there is no way to know how many pages need to be slurped, we
    must test the current result for a list of titles.  If no titles are
    returned or *max_slurps* is reached (whichever comes first) then we
    exit.

    Returns:
        list of titles slurped.  If *as_json* is set then the resultant
        set is returned as a JSON structure

    """
    if max_slurps is None:
        max_slurps = MAX_SLURPS

    all_titles = []
    for start_index in range(max_slurps):
        # NOTE: the old "start_index >= max_slurps" guard here was dead
        # code -- range(max_slurps) can never produce such a value -- so
        # it has been removed.  The range bound itself enforces the cap.
        log.debug('SitesLinkingIn domain "%s" slurp iteration %d of %d',
                  domain, start_index + 1, max_slurps)
        response = None
        if not dry:
            response = self.api.sites_linking_in(domain,
                                                 start_index * SLI_COUNT)
        parser = domain_intel.awisapi.parser.SitesLinkingIn(response)
        titles = parser.extract_titles()
        if titles:
            all_titles.extend(titles)
        else:
            log.info('SitesLinkingIn slurp iteration %d returned '
                     'zero titles: exiting', start_index + 1)
            break

    unique_titles = SitesLinkingIn.unique_titles(all_titles)
    if as_json:
        unique_titles = json.dumps(unique_titles, sort_keys=True, indent=4)

    return unique_titles
def flatten_worker(self, queue, max_read_count, topic, group_id, dry=False):
    """Read all Alexa TrafficHistory results from the Kafka *topic*.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with a
        ``(read, put)`` count tuple

    """
    log.debug('TrafficHistory flatten worker set to read %s messages',
              max_read_count or 'all')
    log.debug('TrafficHistory flatten worker timeout set to %d',
              self.timeout)

    total_messages_read = 0
    total_messages_put = 0
    with self.producer() as producer:
        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                total_messages_read += 1
                traffic = TrafficHistory.flatten_xml(message.value)
                if traffic is not None:
                    if not dry:
                        total_messages_put += 1
                        producer.send('alexa-traffic-flattened',
                                      traffic.encode('utf-8'))
                # Compare against the running read total.  The original
                # tested a "records_read" counter that was never
                # incremented (and a "continue" also skipped the test),
                # so max_read_count was never honoured.
                if (max_read_count is not None and
                        total_messages_read >= max_read_count):
                    break

    log.info('TrafficHistory flatten worker read|put count %d|%d',
             total_messages_read, total_messages_put)

    queue.put(tuple([total_messages_read, total_messages_put]))
def info(**kwargs):
    """Simple dump to logs/stdout of information related to the topics
    we are currently authorised to access.

    Returns:
        the topics we are authorised to access

    """
    log.info('Attempting get of Kafka topic detail information ...')
    with safe_consumer(None, **kwargs) as consumer:
        topics = consumer.topics()
        topic_count = len(topics)
        for index, topic in enumerate(topics, 1):
            log.debug('Authorised topic %d of %d: %s',
                      index, topic_count, topic)
            partition_ids = consumer.partitions_for_topic(topic)
            partitions = ', '.join(str(part) for part in partition_ids)
            log.info('- Partitions: %s', partitions)

    return topics
def traverse_relationship(self, max_read_count=None, topic='domain-labels', group_id='default', dry=False):
    """Read domain labels from the Kafka topic *topic* and use each as
    the starting vertex of a graph traversal.

    If *max_read_count* is ``None`` then all domains will be read.

    The default Kafka *group_id* name used is `default`.  However, we
    can force a re-read of the topic's messages by overriding *group_id*
    with a unique value.

    The *dry* flag will simulate execution.  No records will be
    published.

    Returns:
        total count of records read

    """
    log.debug('Traverse worker set to read %s messages',
              max_read_count or 'all')

    read_count = 0
    with self.producer() as producer:
        with self.consumer(topic, group_id) as consumer:
            for message in consumer:
                label = message.value.decode('utf-8')
                traversal = self.store.traverse_graph(label)
                # Failed traversals are skipped and do not count.
                if traversal is None:
                    continue
                read_count += 1
                if not dry:
                    producer.send('domain-traversals',
                                  traversal.encode('utf-8'))
                if (max_read_count is not None and
                        read_count >= max_read_count):
                    break

    log.debug('Domains traverser worker records read %d', read_count)

    return read_count
def persist_worker(self, queue, max_read_count, topic, group_id, dry=False):
    """Persist flattened (processed) Alexa domain data to ArangoDB
    worker.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    The parameter list is as per :meth:`persist`.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('Data persist worker set to read %s messages',
              max_read_count or 'all')
    log.debug('Persist worker timeout set to %d', self.timeout)

    read_count = 0
    put_count = 0
    with self.consumer(topic, group_id) as consumer:
        for message in consumer:
            read_count += 1
            self.write_to_store(message.value, dry)
            # TODO: quantify successful insert.
            put_count += 1
            if (max_read_count is not None and
                    read_count >= max_read_count):
                log.info('Maximum read threshold %d breached - exiting',
                         max_read_count)
                break

    log.info('UrlInfo persist worker messages read %d', read_count)

    queue.put((read_count, put_count))
def wide_column_dump(self, max_read_count=None, topic='domain-traversals', group_id='default', dry=False):
    """Dump Domain Intel graph data to a wide-column CSV format suitable
    for ingest into Google BigQuery.

    *max_read_count* can limit the number of records read from *topic*.
    The default action is to read all available messages.

    The default consumer *topic* is `domain-traversal`.

    The *dry* flag will simulate execution.  No records will be
    published.

    Returns:
        tuple structure representing counts for the total number of
        records consumed and the number of domains successfully
        published to the Kafka topic

    """
    counter_queue = multiprocessing.Queue()
    worker_args = (counter_queue, max_read_count, topic, group_id)
    domain_intel.utils.threader(self.threads,
                                self.wide_column_dump_worker,
                                *worker_args,
                                dry=dry)

    # Sum the (read, put) tuples contributed by each worker.
    read_total = put_total = 0
    while not counter_queue.empty():
        read_count, put_count = counter_queue.get()
        read_total += read_count
        put_total += put_count
    log.debug('Wide-column CSV dump read|put count %d|%d',
              read_total, put_total)

    return (read_total, put_total)
def persist(self, max_read_count=None, topic='alexa-traffic-flattened', group_id='default', dry=False):
    """Takes Alexa TrafficHistory records and writes to the persistent
    store.

    *max_read_count* can limit the number of records read from *topic*.
    The default action is to read all available messages.

    The default consumer *topic* is ``alexa-traffic-flattened``.

    The *dry* flag will simulate execution.  No records will be
    published.

    Returns:
        tuple structure representing counts for the total number of
        records consumed and the number of domains successfully
        published to the Kafka topic

    """
    counter_queue = multiprocessing.Queue()
    worker_args = (counter_queue, max_read_count, topic, group_id)
    domain_intel.utils.threader(self.threads,
                                self.persist_worker,
                                *worker_args,
                                dry=dry)

    # Sum the (read, put) tuples contributed by each worker.
    total_read = 0
    total_put = 0
    while not counter_queue.empty():
        read_count, put_count = counter_queue.get()
        total_read += read_count
        total_put += put_count
    log.debug('TrafficHistory persist worker read|put count %d|%d',
              total_read, total_put)

    return (total_read, total_put)
def traverse_graph(self, label, as_json=True):
    """Traverse the :attr:`graph` starting at vertex denoted by *label*.

    Returns:
        the graph structure as a dictionary optionally converted to
        JSON if *as_json* is set, or ``None`` on traversal error

    """
    log.debug('Traversing label "%s"', label)

    traversal = None
    try:
        traversal = self.graph.traverse(label,
                                        direction='any',
                                        max_depth=1)
    except arango.exceptions.GraphTraverseError as err:
        log.error('Label "%s" traverse error: %s', label, err)

    # Only serialise a successful traversal.
    if as_json and traversal is not None:
        return json.dumps(traversal)

    return traversal
def persist(self, max_read_count=None, topic='alexa-flattened', group_id='default', dry=False):
    """Persist flattened (processed) Alexa domain data to ArangoDB
    executor.

    *max_read_count* can limit the number of records read from *topic*.
    The default action is to read all available messages.

    The default consumer *topic* is `alexa-flattened`.

    The *dry* flag will simulate execution.  No records will be
    published.

    Returns:
        total count of records written to the DB across all workers

    """
    counter_queue = multiprocessing.Queue()
    worker_args = (counter_queue, max_read_count, topic, group_id)
    domain_intel.utils.threader(self.threads,
                                self.persist_worker,
                                *worker_args,
                                dry=dry)

    # Sum the (read, put) tuples contributed by each worker.
    reads = 0
    puts = 0
    while not counter_queue.empty():
        read_count, put_count = counter_queue.get()
        reads += read_count
        puts += put_count
    log.debug('UrlInfo persist worker read|put count %d|%d', reads, puts)

    return (reads, puts)
def topic_dump(self, max_read_count=None, topic='wide-column-csv', group_id='default', dry=False):
    """Simple dump of messages from *topic* to standard output.

    *max_read_count* can limit the number of records read from *topic*.
    The default action is to read all available messages.

    The default Kafka *group_id* name used is `default`.  However, we
    can force a re-read of the topic's messages by overriding *group_id*
    with a unique value.

    NOTE(review): the *dry* parameter is accepted but currently unused
    in this method -- confirm whether dump suppression was intended.

    Returns:
        number of messages read

    """
    log.debug('Topic "%s" dump set to read %s messages',
              topic, max_read_count or 'all')
    log.debug('Topic dump timeout set to %d', self.timeout)

    messages_read = 0
    with self.consumer(topic, group_id) as consumer:
        for message in consumer:
            messages_read += 1
            # Raw bytes straight to stdout, then a newline separator.
            sys.stdout.buffer.write(message.value)
            print()
            if (max_read_count is not None and
                    messages_read >= max_read_count):
                log.info('Maximum read threshold %d breached - exiting',
                         max_read_count)
                break

    return messages_read
def wide_column_dump_worker(self, queue, max_read_count, topic, group_id, dry):
    """Wide-column CSV dump worker.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    The parameter list is as per :meth:`wide_column_dump`.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('Wide-column CSV dump worker set to read %s messages',
              max_read_count or 'all')

    read_count = 0
    put_count = 0
    with self.producer() as producer:
        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                traversal = json.loads(message.value.decode('utf-8'))
                reporter = domain_intel.Reporter(data=traversal)
                read_count += 1
                # One traversal can fan out to many CSV lines.
                for line in reporter.dump_wide_column_csv():
                    if not dry:
                        producer.send('wide-column-csv',
                                      line.encode('utf-8'))
                        put_count += 1
                if (max_read_count is not None and
                        read_count >= max_read_count):
                    break

    queue.put((read_count, put_count))
def persist(self, max_read_count=None, topic='analyst-qas', group_id='default', dry=False):
    """Takes Analyst QA records and writes to the persistent store.

    *max_read_count* can limit the number of records read from *topic*.
    The default action is to read all available messages.

    The default consumer *topic* is ``analyst-qas``.

    The *dry* flag will simulate execution.  No records will be
    published.

    Returns:
        tuple structure representing counts for the total number of
        records consumed and the number of domains successfully
        published to the Kafka topic

    """
    log.debug('Analyst QAs persist worker set to read %s messages',
              max_read_count or 'all')
    log.debug('Analyst QAs persist worker timeout set to %d',
              self.timeout)
    log.debug('Analyst QAs persist group_id %s', group_id)

    read_count = 0
    edge_count = 0
    with self.consumer(topic, group_id=group_id) as consumer:
        for message in consumer:
            read_count += 1
            data = json.loads(message.value.decode('utf-8'))
            for domain, value in data.items():
                # Vertex first, then the "marked" edge linking the
                # domain to its QA record.
                self.store.collection_insert('analyst-qas',
                                             {'_key': domain,
                                              'data': value},
                                             dry)
                edge = {
                    '_key': domain,
                    '_from': 'domain/{}'.format(domain),
                    '_to': 'analyst-qas/{}'.format(domain),
                }
                if self.store.edge_insert('marked', edge, dry):
                    edge_count += 1
            if (max_read_count is not None and
                    read_count >= max_read_count):
                log.info('Max read threshold %d breached: exiting',
                         max_read_count)
                break

    log.info('Analyst QAs read|edge put count %d|%d',
             read_count, edge_count)

    return (read_count, edge_count)
def publish(self, payloads):
    """Publish arbitrary data into the producer.

    Use case would be if this is the first stage in a pipeline and
    doesn't read from anywhere.

    Returns:
        the stage metrics counters

    Raises:
        GeoDNSError: when this stage has no producer topics

    """
    if not self.is_producer:
        raise GeoDNSError("cannot publish without > 0 topics")

    self._init_kafka()
    metrics = self.metrics
    for index, payload in enumerate(payloads):
        log.debug("publishing %s", payload)
        for dest_topic in self.kafka_producer_topics:
            if self.dry:
                log.debug("%s: %s", dest_topic, payload)
            else:
                self.kafka_producer.send(dest_topic, value=payload)
            if self.dump:
                self._do_dump(payload, str(index), DUMP_PUBLISH)
            metrics[dest_topic] += 1

    self.kafka_producer.flush()
    return metrics
def persist_worker(self, queue, max_read_count, topic, group_id, dry):
    """Write out the SitesLinkingIn information to a persistent store.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('SitesLinkingIn persist worker set to read %s messages',
              max_read_count or 'all')
    log.debug('Persist worker timeout set to %d', self.timeout)
    log.debug('Persist group_id %s', group_id)

    read_count = 0
    edge_count = 0
    with self.consumer(topic, group_id=group_id) as consumer:
        for message in consumer:
            read_count += 1
            payload = message.value.decode('utf-8')
            edge_count += self.extract_siteslinkingin(payload, dry=dry)
            if (max_read_count is not None and
                    read_count >= max_read_count):
                log.info('Max read threshold %d breached - exiting',
                         max_read_count)
                break

    log.debug('SitesLinkingIn persist worker messages read %d',
              read_count)

    queue.put((read_count, edge_count))
def persist_worker(self, queue, max_read_count, topic, group_id, dry):
    """TrafficHistory persistent store worker.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('TrafficHistory persist worker set to read %s messages',
              max_read_count or 'all')
    log.debug('TrafficHistory persist worker timeout set to %d',
              self.timeout)
    log.debug('TrafficHistory persist group_id %s', group_id)

    read_count = 0
    edge_count = 0
    with self.consumer(topic, group_id=group_id) as consumer:
        for message in consumer:
            read_count += 1
            payload = json.loads(message.value.decode('utf-8'))
            parser = domain_intel.parser.TrafficHistory(payload)
            # Raw record first, then the "visit" edge.
            self.store.collection_insert('traffic',
                                         parser.db_traffichistory_raw(),
                                         dry)
            if self.store.edge_insert('visit', parser.db_visit_edge(), dry):
                edge_count += 1
            if (max_read_count is not None and
                    read_count >= max_read_count):
                log.info('Max read threshold %d breached - exiting',
                         max_read_count)
                break

    log.info('TrafficHistory persist worker messages read %d', read_count)

    queue.put((read_count, edge_count))
def flatten_worker(self, queue, max_read_count, topic, group_id, dry=False):
    """Read all Alexa results from the Kafka partitions.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    The parameter list is as per :meth:`flatten_domains`.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('UrlInfo flatten worker set to read %s messages',
              max_read_count or 'all')
    log.debug('UrlInfo flatten worker timeout set to %d', self.timeout)

    read_count = 0
    with self.producer() as producer:
        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                read_count += 1
                # One batched XML message fans out to many domains.
                for domain in UrlInfo.flatten_batched_xml(message.value):
                    if not dry:
                        producer.send('alexa-flattened',
                                      domain.encode('utf-8'))
                if (max_read_count is not None and
                        read_count >= max_read_count):
                    break

    log.debug('UrlInfo flatten worker records read %d', read_count)

    queue.put(read_count)
def alexa_csv_dump(self, max_read_count=None, topic='alexa-flattened', group_id='custom', dry=False):
    """Simple CSV dump of targetted Alexa data.

    This method skips the read from the persistent store and simply
    reads from the flattened Alexa Kafka topic.  These messages present
    as JSON.

    *max_read_count* can limit the number of records read from *topic*.
    The default action is to read all available messages.

    The *dry* flag will simulate execution.  No output CSV will be
    created.

    Returns:
        number of messages read

    """
    log.debug('Alexa dump worker set to read %s messages',
              max_read_count or 'all')
    log.debug('Alexa dump worker timeout set to %d', self.timeout)

    # Make sure we read files as unicode for both python 2 and 3.
    # delete=dry: the temp CSVs are removed on a dry run but *kept* on a
    # real run -- presumably so the operator can collect them (the file
    # names are logged below).  TODO confirm intended.
    if sys.version_info.major >= 3:
        rank_csv = tempfile.NamedTemporaryFile(mode='w', delete=dry)
        country_rank_csv = tempfile.NamedTemporaryFile(mode='w',
                                                       delete=dry)
    else:
        rank_csv = tempfile.NamedTemporaryFile(delete=dry)
        country_rank_csv = tempfile.NamedTemporaryFile(delete=dry)
    rank_writer = csv.writer(rank_csv)
    country_rank_writer = csv.writer(country_rank_csv)
    with self.consumer(topic, group_id) as consumer:
        messages_read = 0
        for message in consumer:
            messages_read += 1
            flattened_alexa = message.value.decode('utf-8')
            # stats[0] is the global rank row; stats[1] the (possibly
            # empty) per-country rank rows.
            stats = UrlInfo.alexa_flattened_extract(flattened_alexa)
            rank_writer.writerow(stats[0])
            if stats[1]:
                country_rank_writer.writerows(stats[1])
            # Progress marker every 10k domains.
            if messages_read % 10000 == 0:
                log.info('Exported %d domains to CSV', messages_read)
            if (max_read_count is not None and
                    messages_read >= max_read_count):
                log.info('Maximum read threshold %d breached - exiting',
                         max_read_count)
                break
    log.info('Global rank file %s', rank_csv.name)
    log.info('Country rank file %s', country_rank_csv.name)
    log.info('Alexa dump worker domains read %d', messages_read)
    rank_csv.close()
    country_rank_csv.close()

    return messages_read
def _do_dump(self, payload, offset, subdir):
    """Write *payload* bytes to ``<dump dir>/<subdir>/<offset>``."""
    target = "%s/%s/%s" % (self.dump, subdir, offset)
    log.debug("DUMPING TO %s with value: %s", target, payload)
    with open(target, "wb") as _fh:
        _fh.write(payload)
def run(self):
    """Consume messages, run :attr:`worker` over each with retry and a
    signal-based timeout, and publish the (optionally marshalled)
    result to every producer topic.

    Returns:
        the stage metrics counters

    Raises:
        GeoDNSError: when preflight configuration checks fail

    """
    self._init_kafka()
    # preflight checks, since run presumes and input and output side
    # we must validate that we have what we need.
    # this is not done in the constructor to support special case stages
    # i.e. root and final leaf node
    if self.kafka_consumer_group_id is None:
        raise GeoDNSError(
            "will not accept null kafka_consumer_group_id. set one if you are consuming"
        )
    if self.worker is None:
        raise GeoDNSError("need a worker!")
    # NOTE(review): this only rejects consumer-without-producer; the
    # message implies both sides are required -- confirm whether
    # "or" was intended instead of "and".
    if not self.is_producer and self.is_consumer:
        raise GeoDNSError(
            "cannot call run() without input and output topics")
    self.kafka_consumer.subscribe(self.kafka_consumer_topics)
    metrics = self.metrics
    for msg in self.kafka_consumer:
        metrics["messages_received"] += 1
        if self.dump:
            self._do_dump(msg.value, str(metrics["messages_received"]),
                          DUMP_CONSUME)
        last_exc = None
        for retry in range(0, self.retryable_exceptions_count):
            try:
                # enforce process level timeout with signals
                old_alarm_handler = signal.signal(
                    signal.SIGALRM, GeoDNSStage._timeout_handler)
                signal.alarm(self.worker_timeout_seconds)
                res = self.worker(msg.value)
                # Cancel the pending alarm and restore the previous
                # handler on success.
                signal.alarm(0)
                signal.signal(signal.SIGALRM, old_alarm_handler)
                last_exc = None
                break
            except self.retryable_exceptions + (WorkerTimedOut, ) as exc:
                log.error("caught retryable exceptions: %s", str(exc))
                metrics["retryable_exceptions"] += 1
                last_exc = exc
                # Linear backoff: sleep 0, 1, 2, ... seconds per retry.
                time.sleep(retry)
        # last_exc still set means every retry failed.
        if last_exc is not None:
            log.error("exceeded retryable exception count of %d",
                      self.retryable_exceptions_count)
            raise last_exc
        # try marshalling response
        metrics["messages_processed"] += 1
        if hasattr(res, "marshal"):
            res = res.marshal()
            metrics["responses_marshalled"] += 1
        for dest_topic in self.kafka_producer_topics:
            metrics["messages_sent"] += 1
            if not self.dry:
                self.kafka_producer.send(dest_topic, value=res)
            else:
                log.debug("%s: %s", dest_topic, res)
            if self.dump:
                self._do_dump(
                    res, "%d.%d" % (metrics["messages_received"],
                                    metrics["messages_sent"]),
                    DUMP_PUBLISH)
        self.kafka_producer.flush()
        self.kafka_consumer.commit()
        log.debug(metrics)
        if self.max_read_count is not None and metrics[
                "messages_received"] >= self.max_read_count:
            break
    return metrics
def persist_worker(self, queue, max_read_count, topic, group_id, dry=False):
    """Persist flattened (processed) GeoDNS domain data to ArangoDB
    worker.

    As this is a worker that could be part of a set of executing
    threads, the number of messages read is pushed onto the
    :class:`multiprocessing.Queue` *queue*.

    The parameter list is as per :meth:`persist`.

    Returns:
        updated :class:`multiprocessing.Queue` *queue* instance with
        number of records processed

    """
    log.debug('Data persist worker set to read %s messages',
              max_read_count or 'all')
    timeout = domain_intel.common.CONFIG.get('timeout', 10000)
    log.debug('Persist worker timeout set to %d', timeout)

    store = domain_intel.Store()
    kafka_config = domain_intel.common.CONFIG.get('kafka', {})
    consumer_kwargs = {
        'bootstrap_servers': kafka_config.get('bootstrap_servers'),
        'group_id': group_id,
        'consumer_timeout_ms': timeout,
    }
    read_count = 0
    with domain_intel.utils.safe_consumer(topic,
                                          **consumer_kwargs) as consumer:
        for message in consumer:
            read_count += 1
            parser = domain_intel.parser.GeoDNS(
                message.value.decode('utf-8'))
            # Raw record, then the IP vertices and resolve edges.
            store.collection_insert('geodns', parser.db_geodns_raw(), dry)
            for vertex in parser.db_ipv4_vertex:
                store.collection_insert('ipv4', vertex, dry)
            for vertex in parser.db_ipv6_vertex:
                store.collection_insert('ipv6', vertex, dry)
            for edge in parser.db_ipv4_edge:
                store.edge_insert('ipv4_resolves', edge, dry)
            for edge in parser.db_ipv6_edge:
                store.edge_insert('ipv6_resolves', edge, dry)
            if (max_read_count is not None and
                    read_count >= max_read_count):
                log.info('Maximum read threshold %d breached - exiting',
                         max_read_count)
                break

    log.debug('Data persist worker domains read %d', read_count)

    queue.put(read_count)