def handle(self, *args, **options):
    """Rebuild Category objects from all (or the given) tags.

    For each tag the tagged podcasts are weighted by subscriber count
    and merged into the tag's category.  Labels listed in
    DIRECTORY_EXCLUDED_TAGS are skipped (or deleted if the category
    already exists).
    """
    # couchdbkit doesn't preserve microseconds
    start_time = datetime.utcnow().replace(microsecond=0)

    excluded_tags = settings.DIRECTORY_EXCLUDED_TAGS

    tags = args or Tag.all()

    for n, tag in enumerate(tags):
        if not isinstance(tag, basestring):
            tag = str(tag)

        label = utils.remove_control_chars(tag.strip())
        if not label:
            continue

        # NOTE(review): the Tag is built from the raw ``tag``, not the
        # cleaned ``label`` — confirm this is intentional
        tag_obj = Tag(tag)
        podcast_ids, weights = utils.unzip(list(tag_obj.get_podcasts()))
        podcast_objs = Podcast.get_multi(podcast_ids)

        podcasts = []
        for podcast, weight in zip(podcast_objs, weights):
            e = CategoryEntry()
            e.podcast = podcast.get_id()
            e.weight = float(weight * podcast.subscriber_count())
            podcasts.append(e)

        category = Category.for_tag(label)

        if not category:
            if not label or label in excluded_tags:
                continue

            category = Category()
            category.label = label
            category.spellings = []

        # delete if it has been excluded after it has been created
        if label in excluded_tags:
            category.delete()
            continue

        # we overwrite previous data
        if category.updated != start_time:
            category.podcasts = []

        category.merge_podcasts(podcasts)

        category.updated = start_time

        if 'weight' in category:
            del category['weight']

        category.save()

        try:
            utils.progress(n % 1000, 1000, category.label.encode('utf-8'))
        except Exception:
            # progress output is best-effort only; never abort the update
            # because of a display problem.  (Was a bare ``except``, which
            # would also have swallowed KeyboardInterrupt/SystemExit.)
            pass
def add_user_recursive(self, user, docs): """ adds a user and all the podcast and episodes it references """ # User docs.add(user._id) # Suggestions suggestions = suggestions_for_user(user) docs.add(suggestions._id) progress(0, len(docs), '', stream=sys.stderr) # Podcast States for p_state in podcast_states_for_user(user): self.add_podcast_state(p_state, docs) progress(0, len(docs), p_state, stream=sys.stderr) # Podcast podcast = podcast_by_id(p_state.podcast) self.add_podcast(podcast, docs) progress(0, len(docs), podcast, stream=sys.stderr) # Episodes for episode in episodes_for_podcast(podcast): self.add_episode(episode, docs) progress(0, len(docs), episode, stream=sys.stderr) e_state = episode_state_for_user_episode(user, episode) self.add_episode_state(e_state, docs) progress(0, len(docs), e_state, stream=sys.stderr)
def handle(self, *args, **options):
    """Collect doc ids for the selected users/podcasts (recursively) and
    dump them from the main database to stdout."""
    docs = set()

    progress(0, len(docs), '', stream=sys.stderr)

    for username in options.get('users', []):
        user = User.get_user(username)
        self.add_user_recursive(user, docs)

    if options.get('toplist', False):
        toplist = PodcastToplist()
        for n, podcast in toplist[:25]:
            self.add_podcast_recursive(podcast, docs)

    # ``podcasts`` may be absent or None; don't iterate over None
    for podcast_url in options.get('podcasts') or []:
        # NOTE(review): podcast_for_url is passed ``docs`` as second
        # argument — confirm the helper actually accepts it
        podcast = podcast_for_url(podcast_url, docs)

        if not podcast:
            logger.warn('podcast not found for URL "%s"', podcast_url)

        else:
            self.add_podcast_recursive(podcast, docs)

    db = get_main_database()

    docs = sorted(docs)
    self.dump(docs, db)
def print_status(self, seq, actions):
    """Print a progress line for the changes feed.

    The database's total ``update_seq`` is only refreshed every 1000
    calls, since ``db.info()`` is comparatively expensive.
    """
    n = getattr(self, 'counter', 0)
    if not n % 1000:
        self.total = self.db.info()['update_seq']
    self.counter = n + 1

    # NOTE(review): the ``actions`` parameter is ignored; ``self.actions``
    # is used instead — confirm that is intended
    summary = ', '.join('%s: %d' % item for item in self.actions.items())
    progress(seq, self.total, summary)
def print_status(self, seq, actions):
    """Emit a progress line, refreshing the cached total ``update_seq``
    once every 1000 invocations."""
    call_no = getattr(self, "counter", 0)
    if call_no % 1000 == 0:
        self.total = self.db.info()["update_seq"]
    self.counter = call_no + 1

    # NOTE(review): uses ``self.actions`` rather than the parameter —
    # confirm the ``actions`` argument is intentionally unused
    parts = ["%s: %d" % pair for pair in self.actions.items()]
    progress(seq, self.total, ", ".join(parts))
def handle(self, *args, **options):
    """Assign slugs to podcasts referenced by the changes feed since the
    last run, tracking the run in a CommandRunStatus."""
    db = get_main_database()
    status = self.get_cmd_status()
    since = self.get_since(status, options)
    objects = self.get_objects(db, since)
    actions = Counter()

    # create unfinished command run status
    run_status = CommandRunStatus()
    run_status.timestamp_started = datetime.utcnow()
    run_status.start_seq = since
    # add it to existing one (if any)
    status.runs.append(run_status)
    status.save()

    total = db.info()['update_seq']

    has_slug = lambda x: bool(x.slug)

    for seq, obj in objects:
        # refresh the total; the feed may grow while we process it
        total = db.info()['update_seq']

        if isinstance(obj, PodcastGroup):
            podcasts = filter(has_slug, obj.podcasts)

        elif isinstance(obj, Podcast):
            podcasts = filter(has_slug, [obj])

        elif isinstance(obj, Episode):
            if has_slug(obj):
                continue
            podcast = podcast_by_id(obj.podcast)
            if not podcast:
                continue
            podcasts = filter(has_slug, [podcast])

        else:
            # unknown doc type: previously ``podcasts`` kept its value
            # from the prior iteration (or was undefined on the first,
            # raising NameError); skip such objects explicitly
            continue

        updated = self.handle_podcasts(podcasts)
        actions['updated'] += updated

        if not options['silent']:
            status_str = ', '.join('%s: %d' % x for x in actions.items())
            progress(seq, total, status_str)

    # finish command run status
    run_status.timestamp_finished = datetime.utcnow()
    run_status.end_seq = total
    run_status.status_counter = dict(actions)
    # and overwrite existing one (we could keep a longer log here)
    status.runs = [run_status]
    status.save()
def handle(self, *args, **options):
    """Queue an asynchronous subscriber-count update for every podcast."""
    silent = options.get("silent")

    queryset = Podcast.objects.all()
    num_podcasts = queryset.count()

    for idx, podcast in enumerate(queryset):
        update_podcast_subscribers.delay(podcast.get_id())
        if not silent:
            progress(idx, num_podcasts)
def handle(self, *args, **options):
    """Queue an asynchronous subscriber-count update for every podcast."""
    silent = options.get('silent')

    queryset = Podcast.objects.all()
    # NOTE(review): count_fast() is not a stock queryset method —
    # presumably a cheaper estimate from a custom manager; verify
    num_podcasts = queryset.count_fast()

    for idx, podcast in enumerate(queryset):
        update_podcast_subscribers.delay(podcast.get_id())
        if not silent:
            progress(idx, num_podcasts)
def handle(self, *args, **options):
    """Merge duplicate EpisodeUserState docs (same user/episode pair).

    Walks all states via the by_user_episode view; for the n-th state it
    fetches every duplicate of its (user, episode) key, merges them into
    the first and bulk-deletes the rest.
    """
    skip = options.get('skip')

    # limit=0 fetches no rows but still reports total_rows
    total = EpisodeUserState.view('episode_states/by_user_episode',
            limit=0,
        ).total_rows

    db = get_main_database()

    actions = Counter()
    actions['merged'] = 0

    for n in count(skip):
        first = EpisodeUserState.view('episode_states/by_user_episode',
                skip         = n,
                include_docs = True,
                limit        = 1,
            )
        first = list(first)
        if not first:
            break
        first = first[0]

        # all states for the same (user, episode) pair, incl. ``first``
        states = EpisodeUserState.view('episode_states/by_user_episode',
                key          = [first.user, first.episode],
                include_docs = True,
            )
        states = list(states)
        l1 = len(states)

        # we don't want to delete this one
        states.remove(first)
        assert len(states) == l1-1

        if states:
            updater = get_updater(states)

            # save the merged survivor and delete the duplicates in one
            # bulk operation (retried on conflict)
            obj_funs = [(first, updater)] + [(state, do_delete) for state in states]

            bulk_save_retry(db, obj_funs)

            # NOTE(review): this counts one fewer than the number of
            # deleted duplicates — confirm whether len(states) was meant
            merged = len(states)-1
            actions['merged'] += merged
            total -= merged

        status_str = ', '.join('%s: %d' % x for x in actions.items())
        progress(n+1, total, status_str)
def handle(self, *args, **options):
    """Delete the sanitizing rules identified by the given slugs."""
    if not args:
        print >> sys.stderr, "Usage: ./manage.py delete-sanitizing-rules <slug> [<slug2> ...]"
        return

    num_slugs = len(args)
    for idx, slug in enumerate(args):
        rule = SanitizingRule.for_slug(slug)
        if rule:
            self.delete_rule(rule=rule)
        progress(idx + 1, num_slugs)
def handle(self, *args, **options): users = deleted_users() total = deleted_user_count() for n, user in enumerate(users): if user.is_active or not user.deleted: print 'skipping', user.username print 'deleting', user.username, user.delete() progress(n+1, total)
def handle(self, *args, **options):
    """Recalculate and store the subscriber count of every podcast."""
    silent = options.get('silent')

    # couchdbkit doesn't preserve microseconds
    started = datetime.utcnow().replace(microsecond=0)

    all_podcasts = Podcast.all_podcasts()
    total = Podcast.view('podcasts/by_oldid', limit=0).total_rows

    for idx, podcast in enumerate(all_podcasts):
        num_subscribers = self.get_subscriber_count(podcast)
        self.update(podcast=podcast, started=started,
                    subscriber_count=num_subscribers)

        if not silent:
            progress(idx, total)
def handle(self, *args, **options): users = User.view("users/deleted", include_docs=True, reduce=False) total = User.view("users/deleted", reduce=True) total = list(total)[0]["value"] if total else 0 for n, user in enumerate(users): if user.is_active or not user.deleted: print "skipping", user.username print "deleting", user.username, user.delete() progress(n + 1, total)
def handle(self, *args, **options):
    """Create or update sanitizing rules from the given config files.

    Each section slug of every file becomes (or updates) one rule.
    """
    if not args:
        print >> sys.stderr, "Usage: ./manage.py sync-sanitizing-rules <filename> [<filename2> ...]"
        return

    for filename in args:
        config = ConfigParser.ConfigParser()
        config.read(filename)

        sections = config.sections()
        num_sections = len(sections)

        for idx, slug in enumerate(sections):
            rule = SanitizingRule.for_slug(slug) or SanitizingRule()
            self.update_rule(rule=rule, config=config, slug=slug)
            progress(idx + 1, num_sections, filename)
def handle(self, *args, **options):
    """Refresh subscriber data for every podcast, counting how many
    were updated vs. skipped."""
    total = Podcast.count()
    actions = Counter()

    for idx, podcast in enumerate(Podcast.all_podcasts()):
        data = PodcastSubscriberData.for_podcast(podcast.get_id())
        changed = self.update_subscriber_data(podcast, data=data)
        self.update_podcast(podcast=podcast)

        actions['updated' if changed else 'skipped'] += 1

        summary = ', '.join('%s: %d' % pair for pair in actions.items())
        progress(idx + 1, total, summary)
def handle(self, *args, **options):
    """Compute and store up to ``max`` related podcasts per podcast."""
    max_related = options.get('max')

    total = podcast_count()

    for idx, podcast in enumerate(all_podcasts()):
        similar = calc_similar_podcasts(podcast)[:max_related]
        # each entry is a (podcast, score) pair; keep only the podcast
        related = [entry[0] for entry in similar]
        update_related_podcasts(podcast, related)
        progress(idx + 1, total)
def handle(self, *args, **options):
    """Assign slugs to podcast groups missing one, then to podcasts
    missing one that have enough subscribers."""
    missing_groups = PodcastGroupsMissingSlugs()
    num_groups = len(missing_groups)
    for idx, group in enumerate(missing_groups):
        assign_slug(group, PodcastGroupSlug)
        progress(idx + 1, num_groups)

    # only consider podcasts that have enough subscribers
    min_subscribers = settings.PODCAST_SLUG_SUBSCRIBER_LIMIT

    def enough_subscribers(p):
        return p.subscriber_count() >= min_subscribers

    missing_podcasts = PodcastsMissingSlugs()
    num_podcasts = len(missing_podcasts)

    # the podcasts are ordered by subscribers, so stop at the first one
    # below the limit
    for idx, podcast in enumerate(takewhile(enough_subscribers,
                                            missing_podcasts)):
        assign_slug(podcast, PodcastSlug)
        progress(idx + 1, num_podcasts)
def dump(self, docs, db):
    """Write the given doc ids as a CouchDB multipart dump to stdout.

    Attachments are emitted as separate MIME parts; progress is
    reported on stderr.
    """
    output = sys.stdout

    boundary = None
    envelope = write_multipart(output, boundary=boundary)
    total = len(docs)

    for n, docid in enumerate(docs):
        if not docid:
            continue

        doc = db.get(docid, attachments=True)
        attachments = doc.pop('_attachments', {})
        jsondoc = json.encode(doc)

        if attachments:
            # docs with attachments become a nested multipart: the JSON
            # body first, then one part per attachment
            parts = envelope.open({
                'Content-ID': doc['_id'],
                'ETag': '"%s"' % doc['_rev']
            })
            parts.add('application/json', jsondoc)

            for name, info in attachments.items():
                content_type = info.get('content_type')
                if content_type is None: # CouchDB < 0.8
                    content_type = info.get('content-type')

                parts.add(content_type, b64decode(info['data']), {
                    'Content-ID': name
                })

            parts.close()

        else:
            # plain doc: a single JSON part
            envelope.add('application/json', jsondoc, {
                'Content-ID': doc['_id'],
                'ETag': '"%s"' % doc['_rev']
            })

        progress(n+1, total, docid, stream=sys.stderr)

    envelope.close()
def handle(self, *args, **options):
    """Recalculate podcast suggestions for the selected users.

    Suggestions are the related podcasts of the user's subscriptions,
    minus blacklisted ids, ranked by frequency of occurrence.
    """
    max_suggestions = options.get('max')

    if options.get('username'):
        users = [User.get_user(options.get('username'))]

    else:
        users = User.all_users()
        users = filter(lambda u: u.is_active, users)

        if options.get('outdated'):
            users = filter(lambda u: not u.suggestions_up_to_date, users)

    if options.get('max_users'):
        users = users[:int(options.get('max_users'))]

    total = len(users)

    for n, user in enumerate(users):
        suggestion = Suggestions.for_user(user)

        subscribed_podcasts = list(set(user.get_subscribed_podcasts()))
        # drop None entries (the original filtered twice; once suffices)
        subscribed_podcasts = filter(None, subscribed_podcasts)

        related = chain.from_iterable([p.related_podcasts for p in
            subscribed_podcasts])
        related = filter(lambda pid: not pid in suggestion.blacklist, related)

        counter = Counter(related)

        get_podcast_id = itemgetter(0)
        suggested = map(get_podcast_id, counter.most_common(max_suggestions))
        suggestion.podcasts = suggested
        suggestion.save()

        _update_user(user=user)

        progress(n+1, total)
def handle(self, *args, **options):
    """Calculate related podcasts for every podcast and persist them,
    retrying each save on CouchDB conflicts."""
    max_related = options.get('max')

    total = Podcast.view('podcasts/by_id', limit=0).total_rows

    for idx, podcast in enumerate(Podcast.all_podcasts()):
        similar = calc_similar_podcasts(podcast)[:max_related]
        # entries are (podcast, score) pairs; keep only the podcast
        related = map(itemgetter(0), similar)

        @repeat_on_conflict(['podcast'])
        def _update(podcast, related):
            podcast.related_podcasts = related
            podcast.save()

        _update(podcast=podcast, related=related)

        progress(idx + 1, total)
def handle(self, *args, **options):
    """Merge duplicate episode states (same user/episode pair).

    Walks all states by index; for each one, fetches every duplicate of
    its (user, episode) pair, merges them into the first and deletes the
    rest in one bulk operation.
    """
    skip = options.get('skip')

    total = episode_states_count()

    actions = Counter()
    actions['merged'] = 0

    for n in count(skip):
        first = get_nth_episode_state(n)
        if first is None:
            break

        # all states for the same pair, including ``first``
        states = get_duplicate_episode_states(first.user, first.episode)
        l1 = len(states)

        # we don't want to delete this one
        states.remove(first)
        assert len(states) == l1-1

        if states:
            updater = get_updater(states)

            # save the merged survivor and delete the duplicates in one
            # bulk operation (retried on conflict)
            obj_funs = [(first, updater)] + [(state, do_delete) for state in states]

            udb = get_userdata_database()
            bulk_save_retry(obj_funs, udb)

            # NOTE(review): this counts one fewer than the number of
            # deleted duplicates — confirm whether len(states) was meant
            merged = len(states)-1
            actions['merged'] += merged
            total -= merged

        status_str = ', '.join('%s: %d' % x for x in actions.items())
        progress(n+1, total, status_str)
def add_podcast_recursive(self, podcast, docs):
    """Add a podcast with all its states, episodes and episode states
    to the dump set, reporting progress on stderr."""
    self.add_podcast(podcast, docs)
    progress(0, len(docs), podcast, stream=sys.stderr)

    # Podcast States
    for podcast_state in all_podcast_states(podcast):
        self.add_podcast_state(podcast_state, docs)
        progress(0, len(docs), podcast_state, stream=sys.stderr)

    # Episodes
    for episode in episodes_for_podcast(podcast.get_podcast()):
        self.add_episode(episode, docs)
        progress(0, len(docs), episode, stream=sys.stderr)

        # Episode States
        for episode_state in all_episode_states(episode):
            self.add_episode_state(episode_state, docs)
            progress(0, len(docs), episode_state, stream=sys.stderr)
def import_file(self, user_id, filename):
    """Import episode actions for the given user from a JSON file.

    The actions are applied in batches of 100, with progress reported
    per batch.
    """
    progress(0, 100, filename)

    with open(filename, 'r') as f:
        actions = json.load(f)

    progress(0, len(actions), filename)

    user = User.get(user_id)

    now = datetime.now()

    batch_size = 100
    # (removed unused ``count = len(actions) / batch_size`` local, which
    # also shadowed any module-level ``count`` such as itertools.count)

    for low in range(0, len(actions), batch_size):
        high = low+batch_size
        batch = actions[low:high]
        update_episodes(user, batch, now, None)
        progress(high, len(actions), filename)
# last option - merge podcasts try: if not dry_run: rewrite_podcasts(p, su_podcast) p.delete() p_stats["merged"] += 1 except Exception, e: log("error rewriting podcast %s: %s" % (p.id, e)) print "error rewriting podcast %s: %s" % (p.id, e) p_stats["error"] += 1 continue progress(n + 1, num_podcasts, str(p.id)) print "finished %s podcasts" % (n + 1) print "%(unchanged)d unchanged, %(merged)d merged, %(updated)d updated, %(deleted)d deleted, %(error)d error" % p_stats print "Hits" for _, r in podcast_rules: print "% 30s: %d" % (r.slug, getattr(r, "hits", 0) if hasattr(r, "hits") else 0) def rewrite_podcasts(p_old, p_new): log('merging podcast %s "%s" to correct podcast %s "%s"' % (p_old.id, p_old.url, p_new.id, p_new.url)) rewrite_newpodcast(p_old, p_new)