Example #1
    def version(self):
        """Obtain ArangoDB version.

        Can also be used to verify the health of the system.

        """
        log.info('ArangoDB version: %s ready', self.client.version())
Example #2
    def url_info(self, domains, response_groups=None):
        """Wrapper around the Alexa AWIS UrlInfo action.

        A UrlInfo gets information about pages and sites on the web,
        their traffic, content, and related sites.

        **Args**:
            *domains*: either a string value representing the name of the
            domain to search or a list of domain names.

            *response_groups*: see the `RESPONSE_GROUPS` global for
            the supported sub-sets of groups that build the `url_info`
            action response.  By default, all groups are included in
            the Alexa query

        **Returns:**
            the Alexa API response string

        """
        log.info('Domains to search: "%s"', domains)
        if response_groups is None:
            response_groups = self.__response_groups.get('url_info')

        params = {'Action': 'UrlInfo'}

        params.update(UrlInfo.build_domain_query(domains, response_groups))

        return self.request(self.build_url(params))
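
A hedged usage sketch for the wrapper above.  The class name comes from the
`UrlInfo.build_domain_query` call in the body; the constructor arguments and
credential placeholders are assumptions, not taken from the source.

# Illustrative only: how the class is instantiated is assumed.
api = UrlInfo(access_id='<AWS access key id>', secret_key='<AWS secret key>')

# *domains* may be a single domain name or a list of them.
response = api.url_info('abc.com')
response = api.url_info(['abc.com', 'watchseriesonline.io'])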
Example #3
    def fin():
        """Tear down.
        """
        cmd.down(options)
        log.info('Deleting dangling docker data containers')
        client = docker.from_env()
        client.volumes.prune()
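
The argument-free `fin` above reads like a pytest fixture finalizer that tears
down a docker-compose stack.  A minimal sketch of how such a finalizer is
typically registered; the fixture name, scope and the docker-compose invocation
are assumptions standing in for whatever `cmd` and `options` are in the source
project.

import subprocess

import docker
import pytest


@pytest.fixture(scope='session')
def docker_stack(request):
    # Hypothetical fixture: bring the stack up, then register the
    # teardown so it runs once the test session finishes.
    subprocess.check_call(['docker-compose', 'up', '-d'])

    def fin():
        """Tear down."""
        subprocess.check_call(['docker-compose', 'down'])
        client = docker.from_env()
        client.volumes.prune()

    request.addfinalizer(fin)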
Example #4
    def drop_database(self):
        """Remove the ArangoDB database identified by the
        :attr:`database_names`.

        """
        log.info('Deleting database "%s"', self.database_name)
        self.client.delete_database(self.database_name)
Example #5
    def _parse_xml(root):
        """Take the lxml.Element *root* and extract the TrafficHistory
        detail from the source XML.

        Returns:
            flattened JSON variant of the source XML

        """
        xpath = '//a:TrafficHistoryResponse/b:Response/b:TrafficHistoryResult'
        _ns = domain_intel.common.NS
        results = root.xpath(xpath, namespaces=_ns)

        # See if we can find a DataUrl element to display.
        url_xpath = './b:Alexa/b:TrafficHistory/b:Site/text()'
        urls = [x.xpath(url_xpath, namespaces=_ns)[0] for x in results]
        log.info('TrafficHistory flattening domain: %s',
                 ', '.join(['"{}"'.format(x) for x in urls]))

        # Extract the historical data.
        data_xpath = './b:Alexa/b:TrafficHistory'
        traffic = results[0].xpath(data_xpath, namespaces=_ns)

        bf_json = xmljson.BadgerFish(dict_type=collections.OrderedDict)
        ns_replace = r'{{{0}}}'.format(domain_intel.common.NS_20050711)
        xml_to_json = json.dumps(bf_json.data(traffic[0]))

        return xml_to_json.replace(ns_replace, '')
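
Both this parser and Example #7 rely on xmljson's BadgerFish convention to turn
lxml elements into JSON-friendly dicts.  A small self-contained illustration:
attributes become `@`-prefixed keys and element text lands under `$`.

import collections
import json

import lxml.etree
import xmljson

# Standalone illustration of the BadgerFish conversion used above.
element = lxml.etree.fromstring('<Country Code="VE"><Rank>29418</Rank></Country>')
bf_json = xmljson.BadgerFish(dict_type=collections.OrderedDict)
print(json.dumps(bf_json.data(element)))
# Roughly: {"Country": {"@Code": "VE", "Rank": {"$": 29418}}}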
Example #6
    def flatten(self,
                max_read_count=None,
                topic='alexa-traffic-results',
                group_id='default',
                dry=False):
        """Takes the Alexa batched domain results and
        split out into separate, JSON equivalents.

        Kwargs:
            *max_read_count*: number of batched domains to read.  `None`
            reads all offsets associated with the *group_id*

            *group_id*: Kafka managed consumer element that manages
            the messages read from the topic

        """
        count_q = multiprocessing.Queue()

        target = self.flatten_worker
        args = (count_q, max_read_count, topic, group_id)
        kwargs = {'dry': dry}
        domain_intel.utils.threader(self.threads, target, *args, **kwargs)

        total_read_count = 0
        total_put_count = 0
        while not count_q.empty():
            counter = count_q.get()
            total_read_count += counter[0]
            total_put_count += counter[1]

        log.info('TrafficHistory flatten read|put count %d|%d',
                 total_read_count, total_put_count)
        read_put_counts = (total_read_count, total_put_count)

        return read_put_counts
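
`flatten` (like `slurp_traffic` in Example #27) fans its worker out through
`domain_intel.utils.threader`, whose implementation is not shown here.  A
plausible minimal stand-in, purely illustrative; the real helper may differ.

import threading


def threader(thread_count, target, *args, **kwargs):
    """Illustrative stand-in for domain_intel.utils.threader: run
    *target* in *thread_count* threads and block until all finish.
    """
    threads = [threading.Thread(target=target, args=args, kwargs=kwargs)
               for _ in range(thread_count)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()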
Example #7
    def flatten_batched_xml(xml):
        """Batched Alexa responses need to be parsed and extracted into
        individual domain components ready for next data flow path.

        We also want to strip off redundant Alexa control XML elements
        that have no value in our problem domain.

        Args:
            *xml*: the source XML to process

        Returns:
            list of domain-based XML

        """
        root = lxml.etree.fromstring(xml)
        xpath = '//a:UrlInfoResponse/b:Response/b:UrlInfoResult'
        _ns = domain_intel.common.NS
        xml_domains = root.xpath(xpath, namespaces=_ns)

        # See if we can find a DataUrl element to display.
        url_xpath = './b:Alexa/b:ContentData/b:DataUrl/text()'
        urls = [x.xpath(url_xpath, namespaces=_ns)[0] for x in xml_domains]
        log.info('Batched URLs sourced: %s',
                 ', '.join(['"{}"'.format(x) for x in urls]))

        bf_json = xmljson.BadgerFish(dict_type=collections.OrderedDict)
        ns_replace = r'{{{0}}}'.format(domain_intel.common.NS_20050711)
        xml_to_json = [json.dumps(bf_json.data(x)) for x in xml_domains]

        return [x.replace(ns_replace, '') for x in xml_to_json]
Example #8
def _safe_producer(**kwargs):
    """See :func:`safe_producer`.  This returns a direct producer
    without context manager.

    """
    caller = inspect.stack()[2][3]
    log.info('Starting producer for %s', caller)
    return kafka.producer.KafkaProducer(**kwargs)
Example #9
    def alexa_flattened_extract(alexa_json):
        """Rules to extract fields from a given *alexa_json* record.

        Extraction takes out:
        * domain rank
        * domain rank by country

        *alexa_json* should be a flattened Alexa record in JSON format.

        Returns:
            A tuple structure of the form::

                (
                    [
                        'watchseriesonline.io',
                        1490187600.0,
                        588080,
                    ],
                    [
                        [
                            'abc.com',
                            1490187600.0,
                            'VE',
                            29418
                        ],
                        ...
                    ]
                )

        """
        epoch = time.time()
        alexa_data = json.loads(alexa_json)
        base = alexa_data['UrlInfoResult']['Alexa']

        traffic_data = base.get('TrafficData')
        data_url = traffic_data.get('DataUrl').get('$')
        log.info('Exporting domain "%s"', data_url)
        rank = traffic_data.get('Rank').get('$')
        rank_stats = [data_url, epoch, rank]

        country_ranks = []
        rank_by_country = traffic_data.get('RankByCountry').get('Country')

        if rank_by_country is not None:
            if not isinstance(rank_by_country, list):
                rank_by_country = [rank_by_country]

            for country in rank_by_country:
                code = country.get('@Code')
                if code == 'O':
                    continue
                rank = country.get('Rank').get('$')
                country_values = [data_url, epoch, code, rank]

                country_ranks.append(country_values)

        return tuple([rank_stats, country_ranks])
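
A hand-built record illustrating the flattened JSON shape the extractor walks;
all values are made up and simply mirror the lookups in the code above (the
function is assumed to be exposed as a staticmethod on its class).

import json

# Hypothetical flattened UrlInfo record matching the access paths above.
sample = json.dumps({
    'UrlInfoResult': {
        'Alexa': {
            'TrafficData': {
                'DataUrl': {'$': 'abc.com'},
                'Rank': {'$': 588080},
                'RankByCountry': {
                    'Country': [
                        {'@Code': 'VE', 'Rank': {'$': 29418}},
                        {'@Code': 'O', 'Rank': {'$': 12345}},  # "O" codes are skipped
                    ],
                },
            },
        },
    },
})

rank_stats, country_ranks = alexa_flattened_extract(sample)
# rank_stats    -> ['abc.com', <epoch>, 588080]
# country_ranks -> [['abc.com', <epoch>, 'VE', 29418]]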
Example #10
    def persist(self,
                max_read_count=None,
                topic='analyst-qas',
                group_id='default',
                dry=False):
        """Takes Analyst QA records and writes to the persistent store.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is ``analyst-qas``.

        The *dry* flag will simulate execution.  No records will be
        published.

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of domains successfully
            published to the Kafka topic

        """
        log.debug('Analyst QAs persist worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('Analyst QAs persist worker timeout set to %d', self.timeout)
        log.debug('Analyst QAs persist group_id %s', group_id)

        total_messages_read = 0
        edge_count = 0

        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                total_messages_read += 1

                data = json.loads(message.value.decode('utf-8'))
                for domain, value in data.items():
                    kwargs = {'_key': domain, 'data': value}
                    self.store.collection_insert('analyst-qas', kwargs, dry)

                    edge_kwargs = {
                        '_key': domain,
                        '_from': 'domain/{}'.format(domain),
                        '_to': 'analyst-qas/{}'.format(domain),
                    }
                    if self.store.edge_insert('marked', edge_kwargs, dry):
                        edge_count += 1

                if (max_read_count is not None
                        and total_messages_read >= max_read_count):
                    log.info('Max read threshold %d breached: exiting',
                             max_read_count)
                    break

        log.info('Analyst QAs read|edge put count %d|%d', total_messages_read,
                 edge_count)

        return (total_messages_read, edge_count)
Example #11
def stabilise_partitions(topics, **kwargs):
    """In addition to topic creation, we need to pause and wait for
    the *topics* partitions to be created.

    """
    log.info('Stabilising Kafka topic partitions ...')
    with safe_producer(**kwargs) as producer:
        for topic in topics:
            log.info('Waiting for topic "%s" partitions ...', topic)
            producer.partitions_for(topic)
Example #12
    def slurp_sites(self,
                    max_read_count=None,
                    topic='sli-domains',
                    group_id='default',
                    dry=False):
        """Slurp SitesLinkingIn detail from Alexa based on *domain*
        and then publish the results to *producer*.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is `sli-domains`.

        The default Kafka *group_id* name used is `default`.  However,
        we can force a re-read of the topic's messages by overriding
        *group_id* with a unique value.

        If the *dry* flag is set then only report, don't run.

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of domains successfully
            published to the Kafka topics

        """
        total_messages_read = total_messages_put = 0

        while True:
            domain = self._get_message(topic, group_id)
            if not domain:
                break

            total_messages_read += 1
            results = self.slurp_sites_linking_in(domain=domain, dry=dry)
            if results:
                sites = {
                    'domain': domain,
                    'urls': results,
                }
                message = json.dumps(sites).encode('utf-8')
                if not dry:
                    with self.producer() as producer:
                        producer.send('alexa-sli-results', message)
                    total_messages_put += 1

            if (max_read_count is not None and
                    (total_messages_read >= max_read_count)):
                log.info('Maximum read threshold %d breached - exiting',
                         max_read_count)
                break

        log.info('SitesLinkingIn read|put count %d|%d',
                 total_messages_read, total_messages_put)

        return tuple([total_messages_read, total_messages_put])
Example #13
    def slurp_sites_linking_in(self,
                               domain,
                               max_slurps=None,
                               as_json=False,
                               dry=False):
        """Get list of sites linking into *domain*.

        Alexa places an upper limit of 20 on the number of sites that it
        will return per request (or "slurp").  Subsequent calls must be
        made by incrementing the `Start` request parameter to indicate the
        page to return.  Since there is no way to know how many pages
        need to be slurped, we must test the current result for a list of
        titles.  If no titles are returned or *max_slurps* is breached
        (whichever comes first) then we exit.

        Returns:
            list of titles slurped.  If *as_json* is set then the resultant
            set is returned as a JSON structure

        """
        if max_slurps is None:
            max_slurps = MAX_SLURPS

        all_titles = []
        for start_index in range(max_slurps):
            if start_index >= max_slurps:
                log.debug('SitesLinkingIn domain "%s" threshold breached',
                          domain)
                break
            log.debug('SitesLinkingIn domain "%s" slurp iteration %d of %d',
                      domain, start_index + 1, max_slurps)

            response = None
            if not dry:
                response = self.api.sites_linking_in(domain,
                                                     start_index*SLI_COUNT)
            parser = domain_intel.awisapi.parser.SitesLinkingIn(response)
            titles = parser.extract_titles()

            if titles:
                all_titles.extend(titles)
            else:
                log.info('SitesLinkingIn slurp iteration %d returned '
                         'zero titles: exiting', start_index + 1)
                break

        unique_titles = SitesLinkingIn.unique_titles(all_titles)
        if as_json:
            unique_titles = json.dumps(unique_titles,
                                       sort_keys=True,
                                       indent=4)

        return unique_titles
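
Each slurp advances the `Start` request parameter by `SLI_COUNT`, so the paging
arithmetic looks like the following; the value 20 is an assumption matching the
per-request cap noted in the docstring.

SLI_COUNT = 20  # assumed to equal Alexa's 20-sites-per-request cap

for start_index in range(3):
    print('slurp {}: Start={}'.format(start_index + 1,
                                      start_index * SLI_COUNT))
# slurp 1: Start=0
# slurp 2: Start=20
# slurp 3: Start=40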
Example #14
def safe_consumer(topic, **kwargs):
    """Obtain a Kafka producer safely.  Waits until all brokers
    and topics are online.

    """
    caller = inspect.stack()[2][3]
    consumer = _safe_consumer(topic, **kwargs)
    yield consumer

    log.info('Closing consumer for %s ...', caller)
    consumer.close(10)
    log.info('Consumer closed for caller %s', caller)
Example #15
def safe_producer(**kwargs):
    """Obtain a Kafka producer safely.  Waits until all brokers
    and topics are online.

    """
    caller = inspect.stack()[2][3]
    producer = _safe_producer(**kwargs)
    yield producer

    log.info('Closing producer for %s ...', caller)
    producer.flush()
    producer.close(10)
    log.info('Producer closed for caller %s', caller)
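
`safe_producer` and `safe_consumer` are written as generators yet are used in
`with` statements in Examples #11 and #17, so in the source module they
presumably carry the `contextlib.contextmanager` decorator (not shown here).
A usage sketch under that assumption; the broker address is made up.

# Assumes @contextlib.contextmanager wraps both helpers in the source.
with safe_producer(bootstrap_servers='localhost:9092') as producer:
    producer.send('alexa-traffic-results', b'payload')

with safe_consumer('alexa-traffic-results',
                   bootstrap_servers='localhost:9092') as consumer:
    for message in consumer:
        print(message.value)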
Example #16
    def flatten_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry=False):
        """Read all Alexa TrafficHistory results from the Kafka *topic*.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`flatten`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('TrafficHistory flatten worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('TrafficHistory flatten worker timeout set to %d',
                  self.timeout)

        total_messages_read = 0
        total_messages_put = 0

        with self.producer() as producer:
            with self.consumer(topic, group_id=group_id) as consumer:
                for message in consumer:
                    total_messages_read += 1
                    traffic = TrafficHistory.flatten_xml(message.value)
                    if traffic is None:
                        continue

                    if not dry:
                        total_messages_put += 1
                        producer.send('alexa-traffic-flattened',
                                      traffic.encode('utf-8'))

                    if (max_read_count is not None
                            and (total_messages_read >= max_read_count)):
                        break

        log.info('TrafficHistory flatten worker read|put count %d|%d',
                 total_messages_read, total_messages_put)

        queue.put(tuple([total_messages_read, total_messages_put]))
Example #17
def info(**kwargs):
    """Simple dump to logs/stout of information related to the topics
    we are currently authorised to access.

    """
    log.info('Attempting get of Kafka topic detail information ...')
    with safe_consumer(None, **kwargs) as consumer:
        topics = consumer.topics()
        topic_count = len(topics)
        for index, topic in enumerate(topics, 1):
            log.debug('Authorised topic %d of %d: %s',
                      index, topic_count, topic)
            partitions = [str(x) for x in consumer.partitions_for_topic(topic)]
            log.info('- Partitions: %s', ', '.join(partitions))

    return topics
Example #18
    def initialise(self):
        """Initialise an ArangoDB database.

        """
        dbs_created = []
        log.info('Attempting to create DB: "%s"', self.database_name)
        try:
            db_obj = self.client.create_database(self.database_name)
            dbs_created.append(db_obj.name)
        except arango.exceptions.DatabaseCreateError as err:
            log.warning('Database "%s" create error: %s', self.database_name,
                        err)

        log.info('Databases created: %s', dbs_created)

        return dbs_created
Example #19
    def edge_insert(self, edge_name, kwargs, dry=False):
        """Manage an ArangoDB edge insert.

        """
        persist_status = False

        edge = self.graph.edge_collection(edge_name)
        log.info('Inserting key: "%s" into edge %s', kwargs.get('_key'),
                 edge_name)
        if not dry:
            try:
                edge.insert(kwargs)
                persist_status = True
            except arango.ArangoError as err:
                log.error('%s: %s', err, kwargs)

        return persist_status
Example #20
    def export_ids(self, collection_name):
        """Dump all of the `_id` column values from *collection_name*

        Returns:
            generator object that references the label names taken
            from the *collection_name* collection

        """
        log.info('Dumping collection "%s" labels', collection_name)

        database = self.client.db(self.database_name)
        collection = database.collection(collection_name)
        cursor = collection.export(flush=True, filter_fields=['_id'])

        while True:
            try:
                value = cursor.next()
            except StopIteration:
                # Cursor exhausted: end the generator cleanly (PEP 479).
                break
            yield value.get('_id')
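
`export_ids` is a generator, so nothing is fetched until it is iterated.  A
hedged usage sketch; the `store` instance name and the `domain` collection are
assumptions drawn from the other examples.

# Hypothetical: "store" is an instance of the class defining export_ids.
for doc_id in store.export_ids('domain'):
    print(doc_id)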
Example #21
    def traffic_history(self, domain):
        """Wrapper around the Alexa AWIS TrafficHistory action.

        The TrafficHistory action returns the daily Alexa Traffic Rank,
        Reach per Million Users and Unique Page Views per Million Users
        for each day going back 4 years.  Sites with a rank in excess of
        1,000,000 are not included.

        Returns:
            the Alexa API response string

        """
        params = {'Action': 'TrafficHistory'}
        params.update(TrafficHistory.build_query(domain))
        log.info('Alexa TrafficHistory request for domain: "%s"', domain)

        return self.request(self.build_url(params))
Example #22
    def slurp_traffic_worker(self,
                             queue,
                             max_read_count,
                             topic,
                             group_id,
                             dry=False):
        """Slurp TrafficHistory worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The remaining parameter list is as per :meth:`slurp_traffic`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        total_messages_read = 0
        total_messages_put = 0

        with self.producer() as producer:
            with self.consumer(topic, group_id=group_id) as consumer:
                for message in consumer:
                    domain = message.value.decode('utf-8')

                    total_messages_read += 1
                    if not dry:
                        result = self.api.traffic_history(domain=domain)
                        if result is not None:
                            producer.send('alexa-traffic-results', result)
                            total_messages_put += 1

                    if (max_read_count is not None
                            and (total_messages_read >= max_read_count)):
                        log.info(
                            'Maximum read threshold %d breached - exiting',
                            max_read_count)
                        break

        log.info('TrafficHistory worker read|put count %d|%d',
                 total_messages_read, total_messages_put)

        queue.put(tuple([total_messages_read, total_messages_put]))
Example #23
    def persist_worker(self, queue, max_read_count, topic, group_id, dry):
        """TrafficHistory persistent store worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('TrafficHistory persist worker set to read %s messages',
                  max_read_count or 'all')
        log.debug('TrafficHistory persist worker timeout set to %d',
                  self.timeout)
        log.debug('TrafficHistory persist group_id %s', group_id)

        total_messages_read = 0
        edge_count = 0

        with self.consumer(topic, group_id=group_id) as consumer:
            for message in consumer:
                total_messages_read += 1

                data = json.loads(message.value.decode('utf-8'))
                parser = domain_intel.parser.TrafficHistory(data)
                self.store.collection_insert('traffic',
                                             parser.db_traffichistory_raw(),
                                             dry)

                if self.store.edge_insert('visit', parser.db_visit_edge(),
                                          dry):
                    edge_count += 1

                if (max_read_count is not None
                        and total_messages_read >= max_read_count):
                    log.info('Max read threshold %d breached - exiting',
                             max_read_count)
                    break

            log.info('TrafficHistory persist worker messages read %d',
                     total_messages_read)

        queue.put((total_messages_read, edge_count))
Example #24
    def persist_worker(self,
                       queue,
                       max_read_count,
                       topic,
                       group_id,
                       dry=False):
        """Persist flattened (processed) Alexa domain data to ArangoDB
        worker.

        As this is a worker that could be part of a set of executing
        threads, the number of messages read is pushed onto the
        :class:`multiprocessing.Queue` *queue*.

        The parameter list is as per :meth:`persist`.

        Returns:
            updated :class:`multiprocessing.Queue` *queue* instance
            with number of records processed

        """
        log.debug('Data persist worker set to read %s messages', max_read_count
                  or 'all')
        log.debug('Persist worker timeout set to %d', self.timeout)

        total_messages_read = 0
        put_count = 0

        with self.consumer(topic, group_id) as consumer:
            for message in consumer:
                total_messages_read += 1

                self.write_to_store(message.value, dry)
                # TODO: quantify successful insert.
                put_count += 1

                if (max_read_count is not None
                        and total_messages_read >= max_read_count):
                    log.info('Maximum read threshold %d breached - exiting',
                             max_read_count)
                    break

        log.info('UrlInfo persist worker messages read %d',
                 total_messages_read)

        queue.put((total_messages_read, put_count))
Example #25
def _safe_consumer(topic, **kwargs):
    """See :func:`safe_consumer`.  This returns a direct consumer
    without context manager.

    """
    caller = inspect.stack()[2][3]
    log.info('Starting consumer for %s', caller)

    default_kwargs = {
        'auto_offset_reset': 'earliest',
        'enable_auto_commit': True,
        'consumer_timeout_ms': 10000,
    }
    default_kwargs.update(dict(kwargs))
    consumer = kafka.consumer.KafkaConsumer(**default_kwargs)
    if topic is not None:
        consumer.subscribe(topic)
    return consumer
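
Because `default_kwargs.update(dict(kwargs))` runs after the defaults are set,
caller-supplied values take precedence.  A hedged example call; the broker
address is made up.

# Caller-supplied kwargs override the module defaults shown above.
consumer = _safe_consumer(
    'alexa-traffic-results',
    bootstrap_servers='localhost:9092',
    consumer_timeout_ms=30000,  # overrides the 10 second default
    group_id='default',
)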
Example #26
def load_domains_from_file(filename, **kwargs):
    """Load GTR domains into 'dns-domains' Kafka topic"""

    stage = GeoDNSStage(kafka_producer_topics=["dns-domains"], **kwargs)

    log.info("loading domains to %s from file %s", stage.kafka_producer_topics,
             filename)

    def _chomped_lines(_filename):
        with open(_filename, "rb") as _fh:
            for line in _fh:
                yield line.rstrip()

    metrics = stage.publish(_chomped_lines(filename))
    log.info("finished loading domains from %s to %s with %s",
             stage.kafka_producer_topics, filename, metrics)

    return metrics
Example #27
    def slurp_traffic(self,
                      max_read_count=None,
                      topic='traffic-domains',
                      group_id='default',
                      dry=False):
        """Slurp TrafficHistory detail from Alexa based on *domain*
        and then publish the results to *producer*.

        *max_read_count* can limit the number of records read from *topic*.
        The default action is to read all available messages.

        The default consumer *topic* is `traffic-domains`.

        The default Kafka *group_id* name used is `default`.  However,
        we can force a re-read of the topic's messages by overriding
        *group_id* with a unique value.

        If the *dry* flag is set then only report, don't run.

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of domains successfully
            published to the Kafka topics

        """
        count_q = multiprocessing.Queue()

        target = self.slurp_traffic_worker
        args = (count_q, max_read_count, topic, group_id)
        kwargs = {'dry': dry}
        domain_intel.utils.threader(self.threads, target, *args, **kwargs)

        total_read_count = 0
        total_put_count = 0
        while not count_q.empty():
            counter = count_q.get()
            total_read_count += counter[0]
            total_put_count += counter[1]

        log.info('TrafficHistory read|put count %d|%d', total_read_count,
                 total_put_count)
        read_put_counts = (total_read_count, total_put_count)

        return read_put_counts
Example #28
def analyst_xls_to_json(xls_file, dry=False):
    """Convert *xls_file* into a JSON file.
    Conversion will attempt to create the JSON file variant in the same
    directory as *xls_file*.

    Unless *dry* is ``True``, the converted JSON is written out to a
    filename based on *xls_file* with the ``xls`` extension replaced
    by ``json``.

    Returns:
        converted XLS content as JSON

    """
    log.info('Attempting to convert xls file: "%s" to JSON', xls_file)

    filename = os.path.splitext(xls_file)[0]

    workbook = xlrd.open_workbook(xls_file)
    sheet = workbook.sheet_by_index(1)

    data = collections.OrderedDict()
    for rownum in range(1, sheet.nrows):
        row_values = sheet.row_values(rownum)
        if row_values:
            domain = row_values[0]
            data.setdefault(domain, {})
            data[domain]['p2p_magnet_links'] = row_values[1]
            data[domain]['links_to_torrents'] = row_values[2]
            data[domain]['links_to_osp'] = row_values[3]
            data[domain]['search_feature'] = row_values[4]
            data[domain]['domain_down_or_parked'] = row_values[5]
            data[domain]['has_rss_feed'] = row_values[6]
            data[domain]['requires_login'] = row_values[7]
            data[domain]['has_forum_or_comments'] = row_values[8]

    if not dry:
        target_file = '{}.json'.format(filename)
        file_h = io.open(target_file, 'w', encoding='utf-8')
        file_h.write(json.dumps(data, indent=2))
        file_h.close()

    for key, value in data.items():
        yield json.dumps({key: value})
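
`analyst_xls_to_json` is itself a generator, so the spreadsheet is only read
(and the JSON file only written) once the result is iterated.  A usage sketch
with a hypothetical filename.

# Hypothetical filename; dry=True skips writing the .json sidecar file.
for record in analyst_xls_to_json('analyst-qas.xls', dry=True):
    print(record)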
Example #29
    def collection_insert(self, collection_name, kwargs, dry=False):
        """Insert *kwargs* into *collection_name*.

        Returns:
            Boolean ``True`` on success, ``False`` otherwise

        """
        persist_status = False
        collection = self.graph.vertex_collection(collection_name)
        log.info('Inserting key: "%s" into collection %s', kwargs.get('_key'),
                 collection_name)
        if not dry:
            try:
                collection.insert(kwargs)
                persist_status = True
            except arango.exceptions.DocumentInsertError as err:
                log.error('%s: %s', err, kwargs)

        return persist_status
Example #30
    def parse_raw_siteslinkingin(self,
                                 file_h,
                                 max_read_count=None,
                                 topic='alexa-sli-results',
                                 dry=False):
        """Re-load raw Alexa SitesLinkingIn action (as JSON) back
        into the Kafka *topic*.

        Alexa SitesLinkingIn action (as JSON) structure is as per::

            {
                "domain": "allmp3s.xyz",
                "urls": [
                    {"title": ...}
                ]
            }

        Returns:
            tuple structure representing counts for the total number of
            records consumed and the number of records successfully
            published to the Kafka topic

        """
        file_h = domain_intel.utils.standardise_file_handle(file_h)

        records_read = records_put = 0

        with self.producer() as producer:
            for raw_line in file_h:
                records_read += 1

                if not dry:
                    producer.send(topic,
                                  raw_line.rstrip().encode('utf-8'))
                    records_put += 1

                if (max_read_count is not None and
                        records_read >= max_read_count):
                    log.info('Maximum read threshold %d breached: exiting',
                             max_read_count)
                    break

        return tuple([records_read, records_put])