def main():
    """ Main function. Nothing to see here. Move along.
    """
    parser = optparse.OptionParser(usage='%prog [options]',
                                   version=USER_AGENT)
    parser.add_option('--settings',
      help='Python path to settings module. If this isn\'t provided, ' \
           'the DJANGO_SETTINGS_MODULE environment variable will be used.')
    parser.add_option('-f', '--feed', action='append', type='int',
      help='A feed id to be updated. This option can be given multiple ' \
           'times to update several feeds at the same time ' \
           '(-f 1 -f 4 -f 7).')
    parser.add_option('-s', '--site', type='int',
      help='A site id to update.')
    parser.add_option('-v', '--verbose', action='store_true',
      dest='verbose', default=False, help='Verbose output.')
    parser.add_option('-t', '--timeout', type='int', default=10,
      help='Wait timeout in seconds when connecting to feeds.')
    parser.add_option('-w', '--workerthreads', type='int', default=10,
      help='Worker threads that will fetch feeds in parallel.')
    options = parser.parse_args()[0]
    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings


    from feedjack import models, fjcache

    # setting socket timeout (default: 10 seconds)
    socket.setdefaulttimeout(options.timeout)

    # our job dispatcher
    disp = Dispatcher(options, options.workerthreads)
    
    prints('* BEGIN: %s' % (unicode(datetime.datetime.now()),))

    if options.feed:
        feeds = models.Feed.objects.filter(id__in=options.feed)
        known_ids = []
        for feed in feeds:
            known_ids.append(feed.id)
            disp.add_job(feed)
        for feed in options.feed:
            if feed not in known_ids:
                prints('! Unknown feed id: %d' % (feed,))
    elif options.site:
        try:
            site = models.Site.objects.get(pk=int(options.site))
        except models.Site.DoesNotExist:
            site = None
            prints('! Unknown site id: %d' % (options.site,))
        if site:
            feeds = [sub.feed for sub in site.subscriber_set.all()]
            for feed in feeds:
                disp.add_job(feed)
    else:
        for feed in models.Feed.objects.filter(is_active=True):
            disp.add_job(feed)

    disp.poll()

    # Remove the cached data for all sites; this only works with
    # the memcached, db and file backends.
    for site in models.Site.objects.all():
        fjcache.cache_delsite(site.id)

    if threadpool:
        tcom = u'%d threads' % (options.workerthreads,)
    else:
        tcom = u'no threadpool module available, no parallel fetching'

    prints('* END: %s (%s)' % (unicode(datetime.datetime.now()), tcom))
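
The Dispatcher class instantiated above is defined elsewhere in this module and is not part of the listing. A minimal sketch of the interface Example 1 relies on (add_job, poll, and a fallback to serial fetching when the third-party threadpool module is missing) could look like the following; the process_feed placeholder stands in for the real fetch-and-store logic and is an assumption, not feedjack code.

try:
    import threadpool
except ImportError:
    threadpool = None

class Dispatcher(object):
    """Hypothetical sketch: queue feeds, fetch them via the threadpool
    module when it is available, serially otherwise."""

    def __init__(self, options, num_threads):
        self.options, self.jobs = options, []
        self.pool = threadpool.ThreadPool(num_threads) if threadpool else None

    def add_job(self, feed):
        self.jobs.append(feed)

    def poll(self):
        if self.pool:
            # each request calls process_feed(feed) on a worker thread
            for req in threadpool.makeRequests(self.process_feed, self.jobs):
                self.pool.putRequest(req)
            self.pool.wait()
        else:
            for feed in self.jobs:
                self.process_feed(feed)

    def process_feed(self, feed):
        pass # placeholder for the real fetch-and-store logic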
Example #2
def main():
    """ Main function. Nothing to see here. Move along.
    """
    parser = optparse.OptionParser(usage='%prog [options]', version=USER_AGENT)
    parser.add_option('--settings',
      help='Python path to settings module. If this isn\'t provided, ' \
           'the DJANGO_SETTINGS_MODULE environment variable will be used.')
    parser.add_option('-f', '--feed', action='append', type='int',
      help='A feed id to be updated. This option can be given multiple ' \
           'times to update several feeds at the same time ' \
           '(-f 1 -f 4 -f 7).')
    parser.add_option('-s', '--site', type='int', help='A site id to update.')
    parser.add_option('-v',
                      '--verbose',
                      action='store_true',
                      dest='verbose',
                      default=False,
                      help='Verbose output.')
    parser.add_option('-t',
                      '--timeout',
                      type='int',
                      default=10,
                      help='Wait timeout in seconds when connecting to feeds.')
    parser.add_option('-w',
                      '--workerthreads',
                      type='int',
                      default=0,
                      help='Worker threads that will fetch feeds in parallel.')
    options = parser.parse_args()[0]
    if options.settings:
        os.environ["DJANGO_SETTINGS_MODULE"] = options.settings

    from feedjack import models, fjcache

    # setting socket timeout (default: 10 seconds)
    socket.setdefaulttimeout(options.timeout)

    # our job dispatcher
    if options.workerthreads:
        disp = ThreadPoolDispatcher(options, options.workerthreads)
    else:
        disp = BaseDispatcher(options)

    prints('* BEGIN: %s' % (unicode(datetime.datetime.now()), ))

    if options.feed:
        feeds = models.Feed.objects.filter(id__in=options.feed)
        known_ids = []
        for feed in feeds:
            known_ids.append(feed.id)
            disp.add_job(feed)
        for feed in options.feed:
            if feed not in known_ids:
                prints('! Unknown feed id: %d' % (feed, ))
    elif options.site:
        try:
            site = models.Site.objects.get(pk=int(options.site))
        except models.Site.DoesNotExist:
            site = None
            prints('! Unknown site id: %d' % (options.site, ))
        if site:
            feeds = [sub.feed for sub in site.subscriber_set.all()]
            for feed in feeds:
                disp.add_job(feed)
    else:
        for feed in models.Feed.objects.filter(is_active=True):
            disp.add_job(feed)

    disp.poll()

    # Remove the cached data for all sites; this only works with
    # the memcached, db and file backends.
    for site in models.Site.objects.all():
        fjcache.cache_delsite(site.id)

    if options.workerthreads:
        tcom = u'%d worker threads' % (options.workerthreads, )
    else:
        tcom = u'no worker threads, no parallel fetching'

    prints('* END: %s (%s)' % (unicode(datetime.datetime.now()), tcom))
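
Neither BaseDispatcher nor ThreadPoolDispatcher is shown in the listing either; the -w option now selects between them instead of probing for the threadpool module at poll time. A plausible sketch of that split, under the same assumptions as the sketch above:

class BaseDispatcher(object):
    'Hypothetical serial dispatcher: processes queued feeds one by one.'

    def __init__(self, options):
        self.options, self.jobs = options, []

    def add_job(self, feed):
        self.jobs.append(feed)

    def poll(self):
        for feed in self.jobs:
            self.process_feed(feed)

    def process_feed(self, feed):
        pass # placeholder for the real fetch-and-store logic

class ThreadPoolDispatcher(BaseDispatcher):
    'Hypothetical parallel dispatcher built on the threadpool module.'

    def __init__(self, options, num_threads):
        BaseDispatcher.__init__(self, options)
        self.pool = threadpool.ThreadPool(num_threads)

    def poll(self):
        for req in threadpool.makeRequests(self.process_feed, self.jobs):
            self.pool.putRequest(req)
        self.pool.wait()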
Example #3
def bulk_update(opts):
	global _exc_feed_id # updated to be available on uncaught errors

	from feedjack.models import Feed, Site
	from feedjack import fjcache

	import socket
	socket.setdefaulttimeout(opts.timeout)


	affected_feeds = set() # for post-transaction signals
	Site.signal_updated.connect(
		lambda sender, instance, **kwz: fjcache.cache_delsite(instance.id) )

	def transaction_commit():
		log.debug('Committing db transaction')
		transaction_signaled_commit()
		for feed in affected_feeds: feed.signal_updated_dispatch(sender=FeedProcessor)
		for site in Site.objects.filter(subscriber__feed__in=affected_feeds):
			site.signal_updated_dispatch(sender=FeedProcessor)
		transaction_signaled_commit() # in case of any immediate changes from signals


	if not opts.feed and not opts.site: # fetches even unbound feeds
		feeds = Feed.objects.filter(is_active=True)
	else:
		feeds = set()
		if opts.feed: # no is_active check if specified explicitly
			feeds.update(Feed.objects.get_by_string(spec) for spec in opts.feed)
		if opts.site:
			sites = list(Site.objects.get_by_string(unicode(spec)) for spec in opts.site)
			for site in sites: feeds.update(site.active_feeds)

	feeds = list(feeds)
	time_delta_global = time_delta_commit = timezone.now()
	log.info( '* BEGIN: {0}, feeds to process: {1}'\
		.format(time_delta_global, len(feeds)) )

	feed_stats, entry_stats = defaultdict(int), defaultdict(int)
	for feed in feeds:
		_exc_feed_id = feed.id
		log.info('[{0}] Processing feed: {1}'.format(feed.id, feed.feed_url))

		# Check if feed has to be fetched
		if opts.adaptive_interval:
			check_opts = opts.interval_parameters.copy()
			check_clc = check_opts.pop('consider_last_check') or False
			if feed.last_checked:
				check_interval, check_interval_ts =\
					fjcache.feed_interval_get(feed.id, check_opts)
				if check_interval is None: # calculate and cache it
					check_interval = feed.calculate_check_interval(**check_opts)
					fjcache.feed_interval_set( feed.id,
						check_opts, check_interval, check_interval_ts )
				# With "consider_last_check", interval to feed.last_checked is added to average
				time_delta = timedelta( 0,
					feed.calculate_check_interval(
						ewma=check_interval, ewma_ts=check_interval_ts,
						add_partial=feed.last_checked, **check_opts )\
					if check_clc else check_interval )
				if not check_interval_ts:
					# Cache miss, legacy case or first post on the feed
					# Normally, it should be set after any feed update
					check_interval_ts = feed.last_checked
				time_delta_chk = (timezone.now() - time_delta) - check_interval_ts
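				# Negative value: the minimal interval since check_interval_ts
				#  has not elapsed yet, so this feed is skipped for now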
				if time_delta_chk < timedelta(0):
					log.info(
						( '[{0}] Skipping check for feed (url: {1}) due to adaptive interval setting.'
							' Minimal time until next check {2} (calculated min interval: {3}).' )\
						.format(feed.id, feed.feed_url, abs(time_delta_chk), abs(time_delta)) )
					continue
			else: check_interval, check_interval_ts = 0, None

		# Fetch new/updated stuff from the feed to db
		time_delta = timezone.now()
		if not opts.dry_run:
			ret_feed, ret_entries = FeedProcessor(feed, opts).process()
		else:
			log.debug('[{0}] Not fetching feed, because dry-run flag is set'.format(feed.id))
			ret_feed, ret_entries = FEED_SAME, dict()
		time_delta = timezone.now() - time_delta
		# FEED_SAME or errors don't invalidate cache or generate "updated" signals
		if ret_feed == FEED_OK: affected_feeds.add(feed)

		# Update check_interval ewma if feed had updates
		if opts.adaptive_interval and any(it.imap(
				ret_entries.get, [ENTRY_NEW, ENTRY_UPDATED, ENTRY_ERR] )):
			if not check_interval_ts:
				assert feed.last_checked
				check_interval_ts = feed.last_checked
			check_interval = feed.calculate_check_interval(
				ewma=check_interval, ewma_ts=check_interval_ts, **check_opts )
			fjcache.feed_interval_set(feed.id, check_opts, check_interval, check_interval_ts)

		# Feedback, stats, commit, delay
		log.info('[{0}] Processed {1} in {2}s [{3}] [{4}]{5}'.format(
			feed.id, feed.feed_url, time_delta, feed_keys_dict[ret_feed],
			' '.join('{0}={1}'.format( label,
				ret_entries.get(key, 0) ) for key,label in entry_keys),
			' (SLOW FEED!)' if time_delta.seconds > SLOWFEED_WARNING else '' ))

		feed_stats[ret_feed] += 1
		for k,v in ret_entries.iteritems(): entry_stats[k] += v

		if opts.commit_interval:
			if isinstance(opts.commit_interval, timedelta):
				ts = timezone.now()
				if ts - time_delta_commit > opts.commit_interval:
					transaction_commit()
					time_delta_commit = ts
			elif sum(feed_stats.itervalues()) % opts.commit_interval == 0: transaction_commit()

		if opts.delay:
			log.debug('Waiting for {0}s (delay option)'.format(opts.delay))
			sleep(opts.delay)

	_exc_feed_id = None

	time_delta_global = timezone.now() - time_delta_global
	log.info('* END: {0} (delta: {1}s), entries: {2}, feeds: {3}'.format(
		timezone.now(), time_delta_global,
		' '.join('{0}={1}'.format(label, entry_stats[key]) for key,label in entry_keys),
		' '.join('{0}={1}'.format(label, feed_stats[key]) for key,label in feed_keys) ))

	transaction_commit()
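
The adaptive-interval logic in Example 3 leans on Feed.calculate_check_interval, which, judging by its ewma/ewma_ts parameters, maintains an exponentially weighted moving average over the gaps between a feed's posts; the method itself is not reproduced here. A simplified stand-alone sketch of such an EWMA (function and parameter names are illustrative, not feedjack's):

def ewma_interval(post_timestamps, alpha=0.3):
	'Hypothetical EWMA over the gaps between consecutive post timestamps.'
	# assumes at least two timestamps, sorted oldest-first
	gaps = list( (b - a).total_seconds()
		for a, b in zip(post_timestamps, post_timestamps[1:]) )
	ewma = gaps[0]
	for gap in gaps[1:]:
		ewma = alpha * gap + (1 - alpha) * ewma
	return ewma # seconds; sparse feeds get a longer wait before the next check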
Example #4
def bulk_update(optz):
    import socket
    socket.setdefaulttimeout(optz.timeout)


    from feedjack.models import Feed, Site
    affected_sites = set() # to drop cache

    if optz.feed:
        feeds = list(Feed.objects.filter(pk__in=optz.feed)) # no is_active check
        for feed_id in set(optz.feed).difference(it.imap(op.attrgetter('id'), feeds)):
            log.warn('Unknown feed id: {0}'.format(feed_id))
        affected_sites.update(Site.objects.filter(
            subscriber__feed__in=feeds ).values_list('id', flat=True))

    if optz.site:
        feeds = Feed.objects.filter( is_active=True,
            subscriber__site__pk__in=optz.site )
        sites = Site.objects.filter(pk__in=optz.site).values_list('id', flat=True)
        for site_id in set(optz.site).difference(sites):
            log.warn('Unknown site id: {0}'.format(site_id))
        affected_sites.update(sites)

    if not optz.feed and not optz.site: # fetches even unbound feeds
        feeds = Feed.objects.filter(is_active=True)
        affected_sites = Site.objects.all().values_list('id', flat=True)


    feeds, time_delta_global = list(feeds), datetime.now()
    log.info( '* BEGIN: {0}, feeds to process: {1}'\
        .format(time_delta_global, len(feeds)) )

    feed_stats, entry_stats = defaultdict(int), defaultdict(int)
    for feed in feeds:
        time_delta = datetime.now()
        ret_feed, ret_entries = FeedProcessor(feed, optz).process()
        time_delta = datetime.now() - time_delta

        log.info('[{0}] Processed {1} in {2}s [{3}] [{4}]{5}'.format(
            feed.id, feed.feed_url, time_delta, feed_keys_dict[ret_feed],
            ' '.join('{0}={1}'.format( label,
                ret_entries.get(key, 0) ) for key,label in entry_keys),
            ' (SLOW FEED!)' if time_delta.seconds > SLOWFEED_WARNING else '' ))

        feed_stats[ret_feed] += 1
        for k,v in ret_entries.iteritems(): entry_stats[k] += v

        if optz.delay: sleep(optz.delay)
    
    time_delta_global = datetime.now() - time_delta_global
    log.info('* END: {0} (delta: {1}s), entries: {2}, feeds: {3}'.format(
        datetime.now(), time_delta_global,
        ' '.join('{0}={1}'.format(label, entry_stats[key]) for key,label in entry_keys),
        ' '.join('{0}={1}'.format(label, feed_stats[key]) for key,label in feed_keys) ))

    # Remove the cached data for all affected sites; this only works
    #  with the memcached, db and file backends
    # TODO: make it work by "magic" through model signals
    from feedjack import fjcache
    for site_id in affected_sites: fjcache.cache_delsite(site_id)

    transaction.commit()
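
None of the variants above show how this last bulk_update gets its options. A hypothetical driver, using optparse as the earlier examples do, with option names mirroring the attributes the function actually reads (timeout, feed, site, delay):

import optparse

parser = optparse.OptionParser(usage='%prog [options]')
parser.add_option('-f', '--feed', action='append', type='int', default=[])
parser.add_option('-s', '--site', action='append', type='int', default=[])
parser.add_option('-t', '--timeout', type='int', default=10)
parser.add_option('--delay', type='float', default=0)
optz = parser.parse_args()[0]
bulk_update(optz)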