# Module-level imports these constructors rely on; Deduper,
# HeadquarterSubmitter and EventRouter are project-local classes.
import logging
import os
import re
import time
from calendar import timegm

from kafka import KafkaClient, SimpleConsumer  # legacy kafka-python API


def __init__(self, feed_url, hqbase, hqjob,
             datadir='data', timeout=20, check_interval=-1):
    self.log = logging.getLogger(
        'gdelt.{0.__name__}'.format(FeedScheduler))
    self.feed_url = feed_url
    self.hqbase = hqbase
    self.hqjob = hqjob
    self.datadir = datadir
    self.timeout = int(timeout)
    self.check_interval = int(check_interval)

    assert os.path.isdir(self.datadir)
    self.deduper = Deduper(self.datadir)
    self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)

    # Resume from the most recently retrieved feed file, if any.
    rfiles = [fn for fn in os.listdir(self.datadir)
              if re.match(r'feed-\d{14}$', fn)]
    if rfiles:
        self.log.debug('last=%s', max(rfiles))
        # time.strptime() returns a time tuple without a timezone; make
        # it UTC by round-tripping through timegm() and gmtime().
        self.last_time = time.gmtime(
            timegm(time.strptime(max(rfiles)[-14:], '%Y%m%d%H%M%S')))
    else:
        self.last_time = None
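
# A minimal usage sketch, assuming the surrounding class is named
# FeedScheduler as the logger name suggests; the feed URL, HQ endpoint
# and job name below are hypothetical placeholders, not values taken
# from this codebase:
#
#   scheduler = FeedScheduler(
#       'http://example.org/gdelt/index.html',  # feed_url (hypothetical)
#       'http://localhost:8080/hq',             # hqbase (hypothetical)
#       'gdelt',                                # hqjob (hypothetical)
#       datadir='data', timeout=20, check_interval=300)
#   # On a fresh datadir, scheduler.last_time is None; otherwise it is the
#   # UTC struct_time parsed from the newest feed-YYYYMMDDHHMMSS file.
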
def __init__(self, config):
    self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))
    # HQ endpoint to which discovered URLs are submitted.
    self.submitter = HeadquarterSubmitter(
        config['output']['hq']['base_url'],
        config['output']['hq']['job'])
    self.eventrouter = EventRouter(config['input'], self)
    self.stats = dict(fetched=0, scheduled=0, discarded=0)
    self.curls = []
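
# The subscripting above implies the following config shape; a minimal
# sketch with hypothetical values (the 'input' section is whatever
# EventRouter expects):
#
#   config = {
#       'input': {...},
#       'output': {
#           'hq': {
#               'base_url': 'http://localhost:8080/hq',  # hypothetical
#               'job': 'wide-crawl',                     # hypothetical
#           },
#       },
#   }
#   scheduler = CrawlScheduler(config)
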
def __init__(self):
    # Consume crawl requests from Kafka, committing offsets automatically.
    self.kafka = KafkaClient(hosts=KAFKA_SERVER)
    self.consumer = SimpleConsumer(
        self.kafka, KAFKA_CONSUMER_GROUP, KAFKA_TOPIC,
        auto_commit=True,
        max_buffer_size=1024 * 1024)
    self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)
    self.stats = dict(fetched=0, scheduled=0, discarded=0)
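
# This constructor reads module-level constants. A sketch of the kind of
# values it expects (the names come from the code above; the values are
# hypothetical):
#
#   KAFKA_SERVER = 'localhost:9092'
#   KAFKA_CONSUMER_GROUP = 'crawl-scheduler'
#   KAFKA_TOPIC = 'crawl-urls'
#   HQ_BASE_URL = 'http://localhost:8080/hq'
#   HQ_JOB = 'wide-crawl'
#
# KafkaClient/SimpleConsumer are the legacy kafka-python consumer API;
# auto_commit=True makes the consumer commit offsets periodically, and
# max_buffer_size caps the consumer's fetch buffer at 1 MiB.
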