    # module-level imports assumed: logging, os, re, time; from calendar import timegm
    def __init__(self,
                 feed_url,
                 hqbase,
                 hqjob,
                 datadir='data',
                 timeout=20,
                 check_interval=-1):
        self.log = logging.getLogger(
            'gdelt.{0.__name__}'.format(FeedScheduler))
        self.feed_url = feed_url
        self.hqbase = hqbase
        self.hqjob = hqjob
        self.datadir = datadir
        self.timeout = int(timeout)
        self.check_interval = int(check_interval)

        assert os.path.isdir(self.datadir)

        self.deduper = Deduper(self.datadir)
        self.hqclient = HeadquarterSubmitter(self.hqbase, self.hqjob)

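        # recover the timestamp of the most recent feed-YYYYMMDDHHMMSS file, if any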
        rfiles = [
            fn for fn in os.listdir(self.datadir)
            if re.match(r'feed-\d{14}$', fn)
        ]
        if rfiles:
            self.log.debug('last=%s', max(rfiles))
            # time.strptime() returns time tuple without timezone. make it
            # UTC with timegm() and gmtime()
            self.last_time = time.gmtime(
                timegm(time.strptime(max(rfiles)[-14:], '%Y%m%d%H%M%S')))
        else:
            self.last_time = None
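
The strptime()/timegm()/gmtime() round-trip above is worth isolating. A minimal sketch, using a hypothetical file name and only the standard library:

    import time
    from calendar import timegm

    fn = 'feed-20240101123000'  # hypothetical name matching feed-\d{14}
    parsed = time.strptime(fn[-14:], '%Y%m%d%H%M%S')  # timezone-naive struct_time
    utc = time.gmtime(timegm(parsed))  # timegm() reads it as UTC; gmtime() converts back
    assert utc.tm_hour == 12 and utc.tm_min == 30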

Example #2
    def __init__(self, config):
        self._log = logging.getLogger('{0.__name__}'.format(CrawlScheduler))

        self.submitter = HeadquarterSubmitter(
            config['output']['hq']['base_url'], config['output']['hq']['job'])
        self.eventrouter = EventRouter(config['input'], self)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)

        self.curls = []
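
CrawlScheduler pulls its settings from nested dictionary keys. A minimal sketch of the configuration shape those lookups imply; all values are placeholders, and the 'input' sub-section is whatever EventRouter expects:

    config = {
        'output': {
            'hq': {
                'base_url': 'http://localhost:8080/hq',  # placeholder URL
                'job': 'wide-crawl',                     # placeholder job name
            },
        },
        'input': {},  # consumed by EventRouter; structure not shown above
    }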

Example #3
    # module-level imports assumed: from kafka import KafkaClient, SimpleConsumer
    def __init__(self):
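        # kafka-python (pre-1.0) API: one client connection, plus a
        # SimpleConsumer that reads KAFKA_TOPIC and auto-commits its
        # offsets under KAFKA_CONSUMER_GROUP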
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(self.kafka,
                                       KAFKA_CONSUMER_GROUP,
                                       KAFKA_TOPIC,
                                       auto_commit=True,
                                       max_buffer_size=1024 * 1024)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)
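
The snippet stops before any messages are consumed. A minimal consumption-loop sketch follows: SimpleConsumer is iterable and yields OffsetAndMessage tuples in the pre-1.0 kafka-python API, while the submitter's put() method and the one-URL-per-message payload are assumptions, not confirmed by the code above:

    def run(self):
        batch = []
        for offmsg in self.consumer:  # OffsetAndMessage namedtuple
            self.stats['fetched'] += 1
            url = offmsg.message.value.strip()
            if not url:
                self.stats['discarded'] += 1
                continue
            batch.append(url)
            if len(batch) >= 100:
                self.submitter.put(batch)  # assumed HeadquarterSubmitter API
                self.stats['scheduled'] += len(batch)
                batch = []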