def categories_in_query(title, content):
    """Return how many links in *content* resolve to a known category.

    Each link target is normalized with ``make_wiki_title`` before being
    checked against the module-level ``category_set``.  *title* is accepted
    for signature parity with the other ``*_query`` helpers but unused.
    """
    return sum(
        1
        for target, _ in extract_links(content)
        if make_wiki_title(target) in category_set
    )
def subcategories_of_query(title, content):
    """Return how many links in *content* point at tracked subcategories.

    Link targets are normalized with ``make_wiki_title`` and tested for
    membership in the module-level ``subcategories`` mapping.  *title* is
    unused; it keeps the signature uniform with the sibling query helpers.
    """
    return sum(
        1
        for target, _ in extract_links(content)
        if make_wiki_title(target) in subcategories
    )
def category_occurences_query(title, content):
    """Count occurrences of the module-level *phrase* in linked category names.

    Only link targets beginning with ``category_prefix`` are considered;
    the prefix is stripped and the remainder is searched case-insensitively.
    *title* is unused (kept for signature parity with the other queries).
    """
    total = 0
    for target, _ in extract_links(content):
        if not target.startswith(category_prefix):
            continue
        category_name = target[len(category_prefix):]
        total += category_name.lower().count(phrase)
    return total
def subcategory_depth_of_query(title, content):
    """Return the minimum subcategory depth among the links in *content*.

    Every link target is normalized with ``make_wiki_title`` and looked up
    in the module-level ``subcategories`` mapping (title -> depth).  Links
    not present in the mapping are ignored.  Returns ``float('inf')`` when
    no linked subcategory is found — including for content with no links.
    *title* is unused; it keeps the signature uniform with the other
    ``*_query`` helpers.
    """
    # dict.get with an inf sentinel replaces the original per-link
    # try/except KeyError: same result, but no exception raised for the
    # (common) case of a link that is not a tracked subcategory.
    missing = float('inf')
    return min(
        (subcategories.get(make_wiki_title(target), missing)
         for target, _ in extract_links(content)),
        default=missing,
    )
def _article_links(lang, article_title):
    """Return the set of link targets appearing in the given article.

    Fetches the article's wikitext via ``_article_text`` and collects the
    first element (the target title) of every link ``extract_links`` yields.
    """
    text = _article_text(lang, article_title)
    targets = set()
    for target, _ in extract_links(text):
        targets.add(target)
    return targets
print('category: [subcategories] pairs and writes it to a pickle file.') else: _, dump_fname, categories_fname = sys.argv logging.info('extracting categories from %s', dump_fname) # filter_namespaces = ('14',) extracts only category pages category_pages = ((title, content, pageid) for title, content, pageid in extract_pages(bz2.open(dump_fname, 'r'), filter_namespaces = ('14',)) if content != '') categories = defaultdict(list) for title, content, pageid in category_pages: title = make_wiki_title(title) category_prefix = title[:title.index(':') + 1] # Make entry for this category with no subcategories if it # doesn't exist if title not in categories: categories[title] = [] # Look for supercategories of this category for link_article, link_text in extract_links(content): link_article = make_wiki_title(link_article) if link_article.startswith(category_prefix): categories[link_article].append(title) with open(categories_fname, 'wb') as categories_file: logging.info('saving categories to %s', categories_fname) pickle.dump(categories, categories_file)
len(subcategories), category) root_name_occurences = defaultdict(int) num_articles = defaultdict(int) has_main_article = defaultdict(bool) # Iterate through corpus to determine number of times root category # name appears in subcategories, number of articles in each # subcategory, and whether each subcategory has a main article. logging.info('analyzing subcategories using articles from %s', dump_fname) with bz2.open(dump_fname, 'r') as dump_file: for title, content, pageid in \ wikicorpus.extract_pages(dump_file, filter_namespaces=('0',)): for article_title, link_text in extract_links(content): if article_title in subcategories: subcategory = article_title root_name_occurences[subcategory] += \ content.lower().count(category_name) num_articles[subcategory] += 1 subcategory_name = subcategory[subcategory.index(':') + 1:] if subcategory_name == make_wiki_title(title): has_main_article[subcategory] = True # Write results with open(results_fname, 'w') as results_file: results = csv.writer(results_file) results.writerow(['Subcategory', 'Depth below {}'.format(category),