class CrawlScheduler(object):
    """Pulls discovered wiki links from a message queue and schedules
    them for crawling in Headquarter (HQ)."""

    def __init__(self):
        # Direct-Kafka consumption is disabled; messages are consumed
        # through the ZooKeeper-managed ZSimpleConsumer with manual
        # offset commits (committed only after HQ accepts a batch).
        if False:
            self.kafka = KafkaClient(*KAFKA_SERVER)
            self.consumer = SimpleConsumer(self.kafka, "crawl", "wiki-links",
                                           driver_type=KAFKA_THREAD_DRIVER,
                                           auto_commit=False)
        else:
            self.kafka = None
            self.consumer = ZSimpleConsumer(ZKHOSTS, "crawl", "wiki-links",
                                            driver_type=KAFKA_THREAD_DRIVER,
                                            manage_offsets=True,
                                            auto_commit=False)
        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
        # running counters for periodic stats reporting
        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        """Close the Kafka connection if one was opened."""
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        """Submit ``curls`` to HQ, retrying every 30s until accepted.

        Commits the consumer offset only after a successful submission,
        so a crash before commit re-delivers the batch.

        :param curls: list of crawl-URL dicts to schedule.
        """
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception as ex:  # fixed Py2-only "except Exception, ex"
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
class CrawlScheduler(object):
    """Consumes crawl URLs from Kafka and schedules them in Headquarter."""

    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        # auto-commit offsets; allow fetch buffers up to 1 MiB
        self.consumer = SimpleConsumer(
            self.kafka, KAFKA_CONSUMER_GROUP, KAFKA_TOPIC,
            auto_commit=True, max_buffer_size=1024*1024)
        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
        # running counters for periodic stats reporting
        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        """Close the Kafka connection."""
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        """Submit ``curls`` to HQ, retrying every 30s until accepted.

        :param curls: list of crawl-URL dicts to schedule.
        """
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception as ex:  # fixed Py2-only "except Exception, ex"
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
class CrawlScheduler(object):
    """Consumes crawl URLs from Kafka and schedules them in Headquarter."""

    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        # auto-commit offsets; cap fetch buffer at 1 MiB
        self.consumer = SimpleConsumer(self.kafka, KAFKA_CONSUMER_GROUP,
                                       KAFKA_TOPIC, auto_commit=True,
                                       max_buffer_size=1024 * 1024)
        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
        # running counters for periodic stats reporting
        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        """Close the Kafka connection."""
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        """Submit ``curls`` to HQ, retrying every 30s until accepted.

        :param curls: list of crawl-URL dicts to schedule.
        """
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception as ex:  # fixed Py2-only "except Exception, ex"
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
class CrawlScheduler(object):
    """Routes configured input events into crawl URLs and submits them
    to Headquarter."""

    def __init__(self, config):
        """:param config: dict with an 'input' section (passed to
        EventRouter) and output.hq.{base_url, job} for the submitter."""
        self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))
        self.submitter = HeadquarterSubmitter(
            config['output']['hq']['base_url'],
            config['output']['hq']['job']
        )
        self.eventrouter = EventRouter(config['input'], self)
        # running counters for periodic stats reporting
        self.stats = dict(fetched=0, scheduled=0, discarded=0)
        # buffer of crawl URLs awaiting submission
        self.curls = []

    def shutdown(self):
        """Nothing to release; input lifecycle is managed by EventRouter."""
        pass

    def submit(self, curls):
        """Submit ``curls`` to HQ, retrying every 30s until accepted.

        :param curls: list of crawl-URL dicts to schedule.
        """
        self._log.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    self._log.info('submission retry succeeded')
                break
            except Exception as ex:  # fixed Py2-only "except Exception, ex"
                self._log.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        # NOTE(review): offset commit from the Kafka-based variant is
        # intentionally absent here — this scheduler has no consumer.
        #self.consumer.commit()
        self.stats['scheduled'] += len(curls)
class CrawlScheduler(object):
    """Routes configured input events into crawl URLs and submits them
    to Headquarter."""

    def __init__(self, config):
        """:param config: dict with an 'input' section (passed to
        EventRouter) and output.hq.{base_url, job} for the submitter."""
        self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))
        self.submitter = HeadquarterSubmitter(
            config['output']['hq']['base_url'],
            config['output']['hq']['job'])
        self.eventrouter = EventRouter(config['input'], self)
        # running counters for periodic stats reporting
        self.stats = dict(fetched=0, scheduled=0, discarded=0)
        # buffer of crawl URLs awaiting submission
        self.curls = []

    def shutdown(self):
        """Nothing to release; input lifecycle is managed by EventRouter."""
        pass

    def submit(self, curls):
        """Submit ``curls`` to HQ, retrying every 30s until accepted.

        :param curls: list of crawl-URL dicts to schedule.
        """
        self._log.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    self._log.info('submission retry succeeded')
                break
            except Exception as ex:  # fixed Py2-only "except Exception, ex"
                self._log.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        # NOTE(review): offset commit from the Kafka-based variant is
        # intentionally absent here — this scheduler has no consumer.
        #self.consumer.commit()
        self.stats['scheduled'] += len(curls)
def __init__(self, feed_url, hqbase, hqjob, datadir='data',
             timeout=20, check_interval=-1):
    """Set up feed polling state, restoring the last fetch time from disk.

    :param feed_url: URL of the feed to poll.
    :param hqbase: Headquarter base URL.
    :param hqjob: Headquarter job name.
    :param datadir: existing directory holding downloaded feed snapshots.
    :param timeout: feed download timeout in seconds.
    :param check_interval: seconds between polls; negative means run once.
    """
    self.log = logging.getLogger(
        'gdelt.{0.__name__}'.format(FeedScheduler))
    self.feed_url = feed_url
    self.hqbase = hqbase
    self.hqjob = hqjob
    self.datadir = datadir
    self.timeout = int(timeout)
    self.check_interval = int(check_interval)
    assert os.path.isdir(self.datadir)
    self.deduper = Deduper(self.datadir)
    self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)
    # Recover the newest previously-downloaded snapshot, if any, so the
    # next poll can send If-Modified-Since.
    snapshot_pat = re.compile(r'feed-\d{14}$')
    snapshots = [fn for fn in os.listdir(self.datadir)
                 if snapshot_pat.match(fn)]
    if not snapshots:
        self.last_time = None
        return
    newest = max(snapshots)
    self.log.debug('last=%s', newest)
    # time.strptime() returns time tuple without timezone. make it
    # UTC with timegm() and gmtime()
    stamp = time.strptime(newest[-14:], '%Y%m%d%H%M%S')
    self.last_time = time.gmtime(timegm(stamp))
def __init__(self, config):
    """Wire up the HQ submitter and input event routing from ``config``.

    :param config: dict with output.hq.{base_url, job} and an 'input'
        section consumed by EventRouter.
    """
    self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))
    hq_conf = config['output']['hq']
    self.submitter = HeadquarterSubmitter(hq_conf['base_url'],
                                          hq_conf['job'])
    self.eventrouter = EventRouter(config['input'], self)
    # counters: fetched from input, scheduled to HQ, discarded otherwise
    self.stats = {'fetched': 0, 'scheduled': 0, 'discarded': 0}
    # buffer of crawl URLs awaiting submission
    self.curls = []
def __init__(self):
    """Connect to Kafka and prepare the Headquarter submitter."""
    self.kafka = KafkaClient(hosts=KAFKA_SERVER)
    # auto-commit offsets; allow messages up to 1 MiB
    buffer_limit = 1 << 20
    self.consumer = SimpleConsumer(self.kafka, KAFKA_CONSUMER_GROUP,
                                   KAFKA_TOPIC, auto_commit=True,
                                   max_buffer_size=buffer_limit)
    self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
    # counters for periodic stats reporting
    self.stats = {'fetched': 0, 'scheduled': 0, 'discarded': 0}
def __init__(self, feed_url, hqbase, hqjob, datadir='data',
             timeout=20, check_interval=-1):
    """Initialize the scheduler and restore last-download state.

    :param feed_url: URL of the feed to poll.
    :param hqbase: Headquarter base URL.
    :param hqjob: Headquarter job name.
    :param datadir: existing directory for feed snapshots.
    :param timeout: download timeout in seconds.
    :param check_interval: poll period in seconds; negative = single run.
    """
    self.log = logging.getLogger(
        'gdelt.{0.__name__}'.format(FeedScheduler))
    self.feed_url = feed_url
    self.hqbase = hqbase
    self.hqjob = hqjob
    self.datadir = datadir
    self.timeout = int(timeout)
    self.check_interval = int(check_interval)
    assert os.path.isdir(self.datadir)
    self.deduper = Deduper(self.datadir)
    self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)
    self.last_time = None
    # Look for feed-YYYYMMDDHHMMSS snapshots left by earlier runs; the
    # lexicographically largest name is the most recent download.
    existing = sorted(
        fn for fn in os.listdir(self.datadir)
        if re.match(r'feed-\d{14}$', fn))
    if existing:
        latest = existing[-1]
        self.log.debug('last=%s', latest)
        # time.strptime() returns time tuple without timezone. make it
        # UTC with timegm() and gmtime()
        self.last_time = time.gmtime(
            timegm(time.strptime(latest[-14:], '%Y%m%d%H%M%S')))
def __init__(self):
    """Set up the Kafka consumer and Headquarter submitter."""
    self.kafka = KafkaClient(hosts=KAFKA_SERVER)
    self.consumer = SimpleConsumer(
        self.kafka,
        KAFKA_CONSUMER_GROUP,
        KAFKA_TOPIC,
        auto_commit=True,             # commit offsets automatically
        max_buffer_size=1024 * 1024)  # 1 MiB cap per fetch
    self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
    # counters for periodic stats reporting
    self.stats = {'fetched': 0, 'scheduled': 0, 'discarded': 0}
def __init__(self, config):
    """Create a scheduler driven by ``config`` (input routing + HQ output).

    :param config: dict with output.hq.{base_url, job} and an 'input'
        section consumed by EventRouter.
    """
    self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))
    output = config['output']
    self.submitter = HeadquarterSubmitter(
        output['hq']['base_url'],
        output['hq']['job'])
    self.eventrouter = EventRouter(config['input'], self)
    # counters for periodic stats reporting (ints are immutable, so
    # fromkeys is safe here)
    self.stats = dict.fromkeys(('fetched', 'scheduled', 'discarded'), 0)
    # buffer of crawl URLs awaiting submission
    self.curls = []
def __init__(self):
    """Build the message consumer and the Headquarter submitter."""
    # The plain-Kafka consumption path is intentionally disabled (dead
    # `if False:` toggle kept for reference); messages come through the
    # ZooKeeper-managed consumer with manual offset commits.
    if False:
        self.kafka = KafkaClient(*KAFKA_SERVER)
        self.consumer = SimpleConsumer(
            self.kafka, "crawl", "wiki-links",
            driver_type=KAFKA_THREAD_DRIVER, auto_commit=False)
    else:
        self.kafka = None
        self.consumer = ZSimpleConsumer(
            ZKHOSTS, "crawl", "wiki-links",
            driver_type=KAFKA_THREAD_DRIVER,
            manage_offsets=True, auto_commit=False)
    self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
    # counters for periodic stats reporting
    self.stats = dict(fetched=0, scheduled=0, discarded=0)
class FeedScheduler(object):
    # Periodically downloads the GDELT feed, dedupes the URLs in it and
    # schedules previously-unseen ones in crawl Headquarter (HQ).

    def __init__(self, feed_url, hqbase, hqjob, datadir='data',
                 timeout=20, check_interval=-1):
        """Prepare deduper/HQ client and restore last-fetch time from datadir.

        :param feed_url: URL of the feed to download.
        :param hqbase: HQ base URL.
        :param hqjob: HQ job name.
        :param datadir: existing directory holding feed-* / sche-* files.
        :param timeout: feed download timeout in seconds.
        :param check_interval: seconds between polls; negative = single run.
        """
        self.log = logging.getLogger(
            'gdelt.{0.__name__}'.format(FeedScheduler))
        self.feed_url = feed_url
        self.hqbase = hqbase
        self.hqjob = hqjob
        self.datadir = datadir
        self.timeout = int(timeout)
        self.check_interval = int(check_interval)
        assert os.path.isdir(self.datadir)
        self.deduper = Deduper(self.datadir)
        self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)
        # snapshots are named feed-YYYYMMDDHHMMSS (UTC); the newest one
        # tells us when the feed was last downloaded.
        rfiles = [fn for fn in os.listdir(self.datadir)
                  if re.match(r'feed-\d{14}$', fn)]
        if rfiles:
            self.log.debug('last=%s', max(rfiles))
            # time.strptime() returns time tuple without timezone. make it
            # UTC with timegm() and gmtime()
            self.last_time = time.gmtime(timegm(
                time.strptime(max(rfiles)[-14:], '%Y%m%d%H%M%S')))
        else:
            self.last_time = None

    def process(self):
        """Run process1() repeatedly, every check_interval seconds.

        KeyboardInterrupt propagates; any other error is logged and the
        loop continues. Exits after one pass when check_interval < 0 or
        when test_mode is set.
        """
        while True:
            t = time.time()
            try:
                self.process1()
            except KeyboardInterrupt as ex:
                raise
            except Exception as ex:
                self.log.error('process1 failed', exc_info=1)
            if self.check_interval < 0:
                self.log.debug('exiting because check_interval < 0')
                break
            if test_mode:
                self.log.debug('exiting because test_mode=True')
                break
            # sleep only the remainder of the interval after processing time
            dt = t + self.check_interval - time.time()
            if dt >= 1.0:
                self.log.debug('sleeping %ds until next cycle', int(dt))
                time.sleep(dt)

    def process1(self):
        """Download the feed once and schedule its new URLs in HQ."""
        # file name is in UTC.
        rid = time.strftime('%Y%m%d%H%M%S', time.gmtime())
        rfile = os.path.join(self.datadir, 'feed-{}'.format(rid))
        try:
            req = urllib2.Request(self.feed_url)
            if self.last_time:
                # conditional GET: skip download if feed is unchanged
                self.log.debug('last_time=%s', httpdate(self.last_time))
                req.add_header('If-Modified-Since', httpdate(self.last_time))
            f = urllib2.urlopen(req, timeout=self.timeout)
            try:
                # stream the response to rfile in 16 KiB chunks
                with open(rfile, 'wb') as w:
                    while True:
                        d = f.read(16*1024)
                        if not d:
                            break
                        w.write(d)
                    self.log.info('downloaded %d bytes in %s', w.tell(),
                                  rfile)
            except KeyboardInterrupt as ex:
                # don't leave a truncated snapshot behind
                if os.path.exists(rfile):
                    os.remove(rfile)
                raise
        except urllib2.HTTPError as ex:
            if ex.code == 304:
                # Not Modified
                self.log.debug('feed %s not modified since %s',
                               self.feed_url, httpdate(self.last_time))
                return
            self.log.warn('%s %s %s', self.feed_url, ex.code, ex.reason)
            return
        except (urllib2.URLError, socket.error) as ex:
            self.log.warn('%s %s', self.feed_url, ex)
            return
        self.last_time = time.gmtime()
        urlcount = 0
        # sche-<rid> records every URL actually submitted this cycle
        slfile = os.path.join(self.datadir, 'sche-{}'.format(rid))
        with open(slfile, 'wb') as sl:
            with open(rfile, 'rb') as f:
                reader = FeedReader(f)
                # submit deduplicated URLs to HQ in batches of 500
                for urls in batchup(crawluri(self.deduper.dedup(reader)),
                                    500):
                    self.log.debug('submitting %s URLs...', len(urls))
                    if not test_mode:
                        self.hqclient.put(urls)
                    for curl in urls:
                        sl.write(curl['u'])
                        sl.write('\n')
                    urlcount += len(urls)
        self.log.info('submitted total %s URLs (see %s)', urlcount,
                      os.path.basename(slfile))
        self.deduper.step()
class FeedScheduler(object):
    # Poll-dedupe-schedule loop: downloads the GDELT feed on an interval
    # and hands new URLs to crawl Headquarter (HQ).

    def __init__(self, feed_url, hqbase, hqjob, datadir='data',
                 timeout=20, check_interval=-1):
        """Prepare deduper/HQ client; recover last download time from datadir.

        :param feed_url: URL of the feed to download.
        :param hqbase: HQ base URL.
        :param hqjob: HQ job name.
        :param datadir: existing directory holding feed-* / sche-* files.
        :param timeout: feed download timeout in seconds.
        :param check_interval: seconds between polls; negative = single run.
        """
        self.log = logging.getLogger(
            'gdelt.{0.__name__}'.format(FeedScheduler))
        self.feed_url = feed_url
        self.hqbase = hqbase
        self.hqjob = hqjob
        self.datadir = datadir
        self.timeout = int(timeout)
        self.check_interval = int(check_interval)
        assert os.path.isdir(self.datadir)
        self.deduper = Deduper(self.datadir)
        self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)
        # feed-YYYYMMDDHHMMSS (UTC) snapshot names; the max() is the
        # most recent download.
        rfiles = [
            fn for fn in os.listdir(self.datadir)
            if re.match(r'feed-\d{14}$', fn)
        ]
        if rfiles:
            self.log.debug('last=%s', max(rfiles))
            # time.strptime() returns time tuple without timezone. make it
            # UTC with timegm() and gmtime()
            self.last_time = time.gmtime(
                timegm(time.strptime(max(rfiles)[-14:], '%Y%m%d%H%M%S')))
        else:
            self.last_time = None

    def process(self):
        """Loop forever calling process1() every check_interval seconds.

        KeyboardInterrupt propagates; other exceptions are logged and
        the loop continues. A negative check_interval or test_mode
        limits the loop to a single pass.
        """
        while True:
            t = time.time()
            try:
                self.process1()
            except KeyboardInterrupt as ex:
                raise
            except Exception as ex:
                self.log.error('process1 failed', exc_info=1)
            if self.check_interval < 0:
                self.log.debug('exiting because check_interval < 0')
                break
            if test_mode:
                self.log.debug('exiting because test_mode=True')
                break
            # account for time already spent processing this cycle
            dt = t + self.check_interval - time.time()
            if dt >= 1.0:
                self.log.debug('sleeping %ds until next cycle', int(dt))
                time.sleep(dt)

    def process1(self):
        """Perform one poll: download the feed, dedupe, submit to HQ."""
        # file name is in UTC.
        rid = time.strftime('%Y%m%d%H%M%S', time.gmtime())
        rfile = os.path.join(self.datadir, 'feed-{}'.format(rid))
        try:
            req = urllib2.Request(self.feed_url)
            if self.last_time:
                # conditional GET so an unchanged feed is not re-downloaded
                self.log.debug('last_time=%s', httpdate(self.last_time))
                req.add_header('If-Modified-Since', httpdate(self.last_time))
            f = urllib2.urlopen(req, timeout=self.timeout)
            try:
                # copy the response to rfile in 16 KiB chunks
                with open(rfile, 'wb') as w:
                    while True:
                        d = f.read(16 * 1024)
                        if not d:
                            break
                        w.write(d)
                    self.log.info('downloaded %d bytes in %s', w.tell(),
                                  rfile)
            except KeyboardInterrupt as ex:
                # remove the partial snapshot before propagating
                if os.path.exists(rfile):
                    os.remove(rfile)
                raise
        except urllib2.HTTPError as ex:
            if ex.code == 304:
                # Not Modified
                self.log.debug('feed %s not modified since %s',
                               self.feed_url, httpdate(self.last_time))
                return
            self.log.warn('%s %s %s', self.feed_url, ex.code, ex.reason)
            return
        except (urllib2.URLError, socket.error) as ex:
            self.log.warn('%s %s', self.feed_url, ex)
            return
        self.last_time = time.gmtime()
        urlcount = 0
        # sche-<rid> logs every URL submitted during this cycle
        slfile = os.path.join(self.datadir, 'sche-{}'.format(rid))
        with open(slfile, 'wb') as sl:
            with open(rfile, 'rb') as f:
                reader = FeedReader(f)
                # batches of 500 deduplicated URLs per HQ request
                for urls in batchup(crawluri(self.deduper.dedup(reader)),
                                    500):
                    self.log.debug('submitting %s URLs...', len(urls))
                    if not test_mode:
                        self.hqclient.put(urls)
                    for curl in urls:
                        sl.write(curl['u'])
                        sl.write('\n')
                    urlcount += len(urls)
        self.log.info('submitted total %s URLs (see %s)', urlcount,
                      os.path.basename(slfile))
        self.deduper.step()