def categories_in_query(title, content):
    """Count the links in ``content`` whose target is in ``category_set``.

    Every link target yielded by ``extract_links(content)`` is passed
    through ``make_wiki_title`` before the membership test.  A target
    that is linked several times is counted once per link.  ``title``
    is accepted but not used by this query.
    """
    return sum(
        1
        for target, _anchor in extract_links(content)
        if make_wiki_title(target) in category_set
    )
 def subcategories_of_query(title, content):
     """Count the links in ``content`` whose target is in ``subcategories``.

     Link targets come from ``extract_links(content)`` and are passed
     through ``make_wiki_title`` before being tested for membership.
     Repeated links to the same target are each counted.  ``title`` is
     accepted but not used by this query.
     """
     # Lazily normalise each link target; the anchor text is ignored.
     targets = (make_wiki_title(target)
                for target, _anchor in extract_links(content))
     return sum(1 for target in targets if target in subcategories)
 def category_occurences_query(title, content):
     """Count occurrences of ``phrase`` in the category links of ``content``.

     Only links whose raw target starts with ``category_prefix`` are
     considered.  The prefix is stripped, the remainder is lower-cased,
     and the non-overlapping occurrences of ``phrase`` in it are added
     to the total.  ``title`` is accepted but not used by this query.
     """
     return sum(
         target[len(category_prefix):].lower().count(phrase)
         for target, _anchor in extract_links(content)
         if target.startswith(category_prefix)
     )
 def subcategory_depth_of_query(title, content):
     """Return the smallest ``subcategories`` value among the links.

     Each link target from ``extract_links(content)`` is passed through
     ``make_wiki_title`` and looked up in ``subcategories``; targets
     that raise ``KeyError`` are ignored.  If no target is found the
     result is ``float('inf')``.  ``title`` is accepted but not used.
     """
     shallowest = float('inf')
     for target, _anchor in extract_links(content):
         key = make_wiki_title(target)
         try:
             found = subcategories[key]
         except KeyError:
             # Not a known subcategory; it cannot affect the minimum.
             continue
         shallowest = min(shallowest, found)
     return shallowest
Exemple #5
0
def _article_links(lang, article_title):
    """Return the set of link targets found in an article's text.

    The text is obtained from ``_article_text(lang, article_title)``
    and scanned with ``extract_links``; only the first element of each
    yielded pair is kept, so duplicate targets collapse to one entry.
    """
    text = _article_text(lang, article_title)
    targets = set()
    for target, _anchor in extract_links(text):
        targets.add(target)
    return targets
Exemple #6
0
        print('category: [subcategories] pairs and writes it to a pickle file.')
    else:
        _, dump_fname, categories_fname = sys.argv

        logging.info('extracting categories from %s', dump_fname)

        # Maps a category title to the titles of the category pages that
        # link to it with a same-namespace link (its subcategories).
        categories = defaultdict(list)

        # Read the dump inside a context manager so the bz2 handle is
        # closed once all pages have been consumed, or if an error occurs.
        # The page generator is lazy, so every use of it must stay inside
        # this block.
        with bz2.open(dump_fname, 'r') as dump_file:
            # filter_namespaces = ('14',) extracts only category pages
            category_pages = (
                (title, content, pageid)
                for title, content, pageid
                in extract_pages(dump_file, filter_namespaces=('14',))
                if content != '')

            for title, content, pageid in category_pages:
                title = make_wiki_title(title)
                # Namespace prefix of this page: everything up to and
                # including the first ':' of its title.
                category_prefix = title[:title.index(':') + 1]
                # Make entry for this category with no subcategories if it
                # doesn't exist
                if title not in categories:
                    categories[title] = []

                # Look for supercategories of this category
                for link_article, link_text in extract_links(content):
                    link_article = make_wiki_title(link_article)
                    if link_article.startswith(category_prefix):
                        categories[link_article].append(title)

        with open(categories_fname, 'wb') as categories_file:
            logging.info('saving categories to %s', categories_fname)
            pickle.dump(categories, categories_file)
              len(subcategories), category)
 
 # Per-subcategory tallies, keyed by the link target exactly as it
 # appears in ``subcategories``.
 root_name_occurences = defaultdict(int)
 num_articles = defaultdict(int)
 has_main_article = defaultdict(bool)

 # Walk every namespace-0 page of the dump.  For each link whose target
 # is a known subcategory: add the number of times ``category_name``
 # occurs in the lower-cased page text, count the link towards the
 # subcategory's article total, and flag the subcategory when the page's
 # own title matches the subcategory name.
 logging.info('analyzing subcategories using articles from %s',
              dump_fname)
 with bz2.open(dump_fname, 'r') as dump_file:
     article_pages = wikicorpus.extract_pages(dump_file,
                                              filter_namespaces=('0',))
     for title, content, pageid in article_pages:
         for article_title, link_text in extract_links(content):
             if article_title not in subcategories:
                 continue

             subcategory = article_title
             root_name_occurences[subcategory] += (
                 content.lower().count(category_name))
             num_articles[subcategory] += 1

             # The subcategory name is whatever follows the first ':'.
             name_start = subcategory.index(':') + 1
             subcategory_name = subcategory[name_start:]
             if subcategory_name == make_wiki_title(title):
                 has_main_article[subcategory] = True
                         
 # Write results
 with open(results_fname, 'w') as results_file:
     results = csv.writer(results_file)
     results.writerow(['Subcategory', 'Depth below {}'.format(category),