def extract_topics_from_url(db):
    """Collect the topics and subtopics encoded in stored article URLs.

    Every document returned by ``db.articles.find()`` is inspected.  A URL
    contributes when it contains a path fragment of the form
    ``/<digits>/<topic>/`` optionally followed by ``<subtopic>/``, where
    topic and subtopic consist only of lowercase ASCII letters.  Only the
    first such fragment in each URL is used.

    Args:
        db: Object exposing an ``articles`` collection whose ``find()``
            yields mapping-like documents with a ``'url'`` entry
            (presumably a pymongo database -- confirm against the caller).

    Returns:
        The ``TopicsSet`` built during the scan.  A ``Topic`` is added for
        every topic name not already contained in the set, and each
        subtopic found is registered on the stored topic via
        ``add_topic``.
    """
    # Raw string: '\/' and '\d' are invalid escape sequences in a normal
    # string literal (SyntaxWarning on Python 3.12+).
    # The subtopic and its trailing slash are optional *together*; the
    # previous pattern made only the name optional and still demanded the
    # slash, so topic-only URLs matched only when they contained '//'.
    topic_re = re.compile(
        r'/\d+/(?P<topic>[a-z]+)/(?:(?P<subtopic>[a-z]+)/)?'
    )
    topics = TopicsSet()

    for article in db.articles.find():
        # Documents without a usable URL carry no topic information; skip
        # them instead of aborting the whole scan.
        url = article.get('url')
        if not url:
            continue

        match = topic_re.search(url)
        if match is None:
            continue

        topic = Topic(match.group('topic'))
        if topic in topics:
            # Reuse the stored instance so subtopics accumulate on it
            # rather than on the throwaway object created above.
            topic = topics.getelement(topic)
        else:
            topics.add(topic)

        subtopic = match.group('subtopic')
        if subtopic:
            topic.add_topic(subtopic)

    return topics