class CrawlScheduler(object):
    def __init__(self):
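        # The direct-Kafka branch below is switched off ("if False"); the
        # ZooKeeper-based ZSimpleConsumer in the else branch is what is
        # actually used.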
        if False:
            self.kafka = KafkaClient(*KAFKA_SERVER)
            self.consumer = SimpleConsumer(self.kafka, "crawl", "wiki-links",
                                           driver_type=KAFKA_THREAD_DRIVER,
                                           auto_commit=False)
        else:
            self.kafka = None
            self.consumer = ZSimpleConsumer(ZKHOSTS, "crawl", "wiki-links",
                                            driver_type=KAFKA_THREAD_DRIVER,
                                            manage_offsets=True,
                                            auto_commit=False)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception as ex:
                logging.warn('submission failed (%s), retrying after 30s',
                             ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
class CrawlScheduler(object):
    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(self.kafka,
                                       KAFKA_CONSUMER_GROUP,
                                       KAFKA_TOPIC,
                                       auto_commit=True,
                                       max_buffer_size=1024 * 1024)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception as ex:
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
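# --- Hypothetical driver loop (not part of the original examples) ---
# The schedulers above only define submit(); the sketch below shows one way
# the Kafka-backed variant could be driven. It assumes the legacy kafka-python
# SimpleConsumer API (iterating the consumer yields OffsetAndMessage objects
# whose message.value carries the payload) and assumes each payload is a bare
# URL string; the {'u': url} dict mirrors the CrawlURI format passed to
# HeadquarterSubmitter elsewhere in these examples.
def run(scheduler, batch_size=500):
    batch = []
    for offmsg in scheduler.consumer:
        url = offmsg.message.value.strip()
        if not url:
            scheduler.stats['discarded'] += 1
            continue
        scheduler.stats['fetched'] += 1
        batch.append({'u': url})
        if len(batch) >= batch_size:
            scheduler.submit(batch)   # submit() also commits consumer offsets
            batch = []
    if batch:
        scheduler.submit(batch)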
class CrawlScheduler(object):
    def __init__(self, config):
        self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))

        self.submitter = HeadquarterSubmitter(
            config['output']['hq']['base_url'],
            config['output']['hq']['job']
        )
        self.eventrouter = EventRouter(config['input'], self)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)

        self.curls = []

    def shutdown(self):
        pass

    def submit(self, curls):
        self._log.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    self._log.info('submission retry succeeded')
                break
            except Exception as ex:
                self._log.warn('submission failed (%s), retrying after 30s',
                               ex)
                time.sleep(30.0)
        #self.consumer.commit()
        self.stats['scheduled'] += len(curls)
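# --- Hypothetical configuration for the config-driven variant above ---
# Only the output.hq keys are read directly by CrawlScheduler; the shape of
# the input section is whatever EventRouter expects, which is not shown in
# these examples. All values are placeholders.
config = {
    'input': {
        # consumed by EventRouter; keys are project-specific
    },
    'output': {
        'hq': {
            'base_url': 'http://localhost:8080/hq',   # placeholder
            'job': 'wikipedia',                       # placeholder
        },
    },
}

scheduler = CrawlScheduler(config)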
class FeedScheduler(object):
    def __init__(self,
                 feed_url,
                 hqbase,
                 hqjob,
                 datadir='data',
                 timeout=20,
                 check_interval=-1):
        self.log = logging.getLogger(
            'gdelt.{0.__name__}'.format(FeedScheduler))
        self.feed_url = feed_url
        self.hqbase = hqbase
        self.hqjob = hqjob
        self.datadir = datadir
        self.timeout = int(timeout)
        self.check_interval = int(check_interval)

        assert os.path.isdir(self.datadir)

        self.deduper = Deduper(self.datadir)
        self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)

        rfiles = [
            fn for fn in os.listdir(self.datadir)
            if re.match(r'feed-\d{14}$', fn)
        ]
        if rfiles:
            self.log.debug('last=%s', max(rfiles))
            # time.strptime() returns time tuple without timezone. make it
            # UTC with timegm() and gmtime()
            self.last_time = time.gmtime(
                timegm(time.strptime(max(rfiles)[-14:], '%Y%m%d%H%M%S')))
        else:
            self.last_time = None

    def process(self):
        while True:
            t = time.time()
            try:
                self.process1()
            except KeyboardInterrupt as ex:
                raise
            except Exception as ex:
                self.log.error('process1 failed', exc_info=1)
            if self.check_interval < 0:
                self.log.debug('exiting because check_interval < 0')
                break
            if test_mode:
                self.log.debug('exiting because test_mode=True')
                break
            dt = t + self.check_interval - time.time()
            if dt >= 1.0:
                self.log.debug('sleeping %ds until next cycle', int(dt))
                time.sleep(dt)

    def process1(self):
        # file name is in UTC.
        rid = time.strftime('%Y%m%d%H%M%S', time.gmtime())
        rfile = os.path.join(self.datadir, 'feed-{}'.format(rid))
        try:
            req = urllib2.Request(self.feed_url)
            if self.last_time:
                self.log.debug('last_time=%s', httpdate(self.last_time))
                req.add_header('If-Modified-Since', httpdate(self.last_time))
            f = urllib2.urlopen(req, timeout=self.timeout)
            try:
                with open(rfile, 'wb') as w:
                    while True:
                        d = f.read(16 * 1024)
                        if not d: break
                        w.write(d)
                    self.log.info('downloaded %d bytes in %s', w.tell(), rfile)
            except KeyboardInterrupt as ex:
                if os.path.exists(rfile):
                    os.remove(rfile)
                raise
        except urllib2.HTTPError as ex:
            if ex.code == 304:
                # Not Modified
                self.log.debug('feed %s not modified since %s', self.feed_url,
                               httpdate(self.last_time))
                return
            self.log.warn('%s %s %s', self.feed_url, ex.code, ex.reason)
            return
        except (urllib2.URLError, socket.error) as ex:
            self.log.warn('%s %s', self.feed_url, ex)
            return

        self.last_time = time.gmtime()

        urlcount = 0
        slfile = os.path.join(self.datadir, 'sche-{}'.format(rid))
        with open(slfile, 'wb') as sl:
            with open(rfile, 'rb') as f:
                reader = FeedReader(f)
                for urls in batchup(crawluri(self.deduper.dedup(reader)), 500):
                    self.log.debug('submitting %s URLs...', len(urls))
                    if not test_mode:
                        self.hqclient.put(urls)
                    for curl in urls:
                        sl.write(curl['u'])
                        sl.write('\n')
                    urlcount += len(urls)
        self.log.info('submitted total %s URLs (see %s)', urlcount,
                      os.path.basename(slfile))

        self.deduper.step()
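# --- Hypothetical entry point for FeedScheduler (not part of the original) ---
# Based only on the constructor signature and process() loop above. The feed
# URL and HQ parameters are placeholders, and the module-level test_mode flag
# referenced in process()/process1() is assumed to be defined elsewhere
# (False for a real run). With check_interval < 0 the scheduler does a single
# fetch-and-submit cycle and exits; a positive value makes it poll.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    scheduler = FeedScheduler(
        'http://example.org/gdelt/export-feed',  # placeholder feed URL
        'http://localhost:8080/hq',              # placeholder HQ base URL
        'gdelt-crawl',                           # placeholder HQ job name
        datadir='data',                          # must already exist
        timeout=20,
        check_interval=300)                      # poll every 5 minutes
    scheduler.process()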