def categories_in_query(title, content):
    # Count how many links in the article point to one of the target
    # categories (category_set is built by the enclosing search-query setup,
    # cf. categories_in below)
    num_categories = 0
    for link_article, link_text in extract_links(content):
        link_article = make_wiki_title(link_article)
        if link_article in category_set:
            num_categories += 1
    return num_categories

def subcategories_of_query(title, content):
    # Count how many links point to known subcategories of the query category
    num_subcategories = 0
    for link_article, link_text in extract_links(content):
        link_article = make_wiki_title(link_article)
        if link_article in subcategories:
            num_subcategories += 1
    return num_subcategories

def subcategory_depth_of_query(title, content):
    # Minimum depth below the root category over all linked subcategories;
    # returns infinity if no linked subcategory is found
    min_depth = float('inf')
    for link_article, link_text in extract_links(content):
        link_article = make_wiki_title(link_article)
        try:
            depth = subcategories[link_article]
            if depth < min_depth:
                min_depth = depth
        except KeyError:
            pass
    return min_depth
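The three query helpers above close over names from an enclosing scope: category_set (built as in categories_in below) and subcategories, a mapping from subcategory title to its depth below a root category, produced by get_subcategories (used in the last example of this collection). That function lives in biases.wiki.categories and is not shown here; a minimal sketch of what it might do, assuming a breadth-first walk over the category -> [subcategories] mapping, is:

from collections import deque

# Hedged sketch of get_subcategories: returns {subcategory title: depth below
# the root category}, assuming categories maps each category title to a list
# of its direct subcategories. The real implementation may differ.
def get_subcategories(categories, root):
    depths = {root: 0}
    queue = deque([root])
    while queue:
        current = queue.popleft()
        for child in categories.get(current, []):
            # Skip already-seen titles; category graphs can contain cycles
            if child not in depths:
                depths[child] = depths[current] + 1
                queue.append(child)
    return depths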
def categories_in(corpus, dict, categories, category_list):
    category_set = {make_wiki_title(category) for category in category_list}
    
    def categories_in_query(title, content):
        num_categories = 0
        for link_article, link_text in extract_links(content):
            link_article = make_wiki_title(link_article)
            if link_article in category_set:
                num_categories += 1
        return num_categories
    
    return SearchQuery(categories_in_query)
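categories_in hands its inner query function to SearchQuery, which is not shown in these examples. Judging from how the prepared queries are used later (the evaluated query object is simply called as search_query_func(title, content) on every page), a minimal sketch of such a wrapper, offered only as an assumption, is:

# Hedged sketch of SearchQuery: a thin callable wrapper around a
# (title, content) -> value function. The real class is not shown here and
# may carry additional behavior (e.g. combining queries).
class SearchQuery:
    def __init__(self, query_func):
        self.query_func = query_func

    def __call__(self, title, content):
        return self.query_func(title, content)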
Example #5
def extract_links(text):
    """Given a selection of MediaWiki text, returns a generator over the links
    in the text as tuples (article, link text)."""

    for match in MEDIAWIKI_LINK_RE.finditer(text):
        link = match.groups()
        # Some links only contain a page title ([[Title]] with no display
        # text); in that case reuse the title as the link text
        if len(link) == 1 or link[1] is None:
            title = link[0]
            link_text = title
        else:
            title, link_text = link

        yield (make_wiki_title(title), link_text)
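extract_links depends on MEDIAWIKI_LINK_RE, which is defined elsewhere in the module. A plausible pattern for [[Title]] and [[Title|display text]] links, shown only to make the example self-contained and not necessarily the original, is:

import re

# Assumed pattern: group 1 is the page title, group 2 the optional display
# text of a MediaWiki [[Title]] or [[Title|display text]] link.
MEDIAWIKI_LINK_RE = re.compile(r'\[\[([^\[\]|]+)(?:\|([^\[\]]*))?\]\]')

# Example (hypothetical input):
#   list(extract_links('See [[Cold War]] and [[Berlin Wall|the Wall]].'))
# yields one (article, link text) tuple for each of the two links.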
Example #6
def dbscan_single(lang, seed_titles, min_pts, eps, dictionary):
    """
    Finds a single DBSCAN cluster on Wikipedia with a given set of seed
    articles and parameters min_pts and epsilon. For DBSCAN, "neighbors" are
    considered to be articles that are linked to and that have tfidf cosine
    similarity greater than eps. dictionary should be a gensim dictionary
    that can be used for computing tfidf values. Returns a set of article
    titles.
    """

    global _dict, _tfidf
    _dict = dictionary
    _tfidf = TfidfModel(dictionary=dictionary)

    frontier = {make_wiki_title(title) for title in seed_titles}
    cluster = set()
    visited = set()

    while len(frontier) > 0:
        article_title = next(iter(frontier))
        frontier.remove(article_title)
        visited.add(article_title)

        # Get neighbors: linked articles whose tfidf cosine similarity to the
        # current article is at least eps
        linked_titles = _article_links(lang, article_title)
        article_tfidf = _article_tfidf(lang, article_title)
        neighbors = []
        for linked_title in linked_titles:
            linked_tfidf = _article_tfidf(lang, linked_title)
            if linked_tfidf is not None:
                # Also fetch the neighbor's links (return value unused here)
                _article_links(lang, linked_title)
                cosine_sim = cosine_similarity(article_tfidf, linked_tfidf)
                if cosine_sim >= eps:
                    neighbors.append(linked_title)

        if len(neighbors) >= min_pts:
            # This is a core point: expand the cluster through its
            # unvisited neighbors
            for neighbor_title in neighbors:
                if neighbor_title not in visited:
                    frontier.add(neighbor_title)

        cluster.add(article_title)

    return cluster
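dbscan_single relies on a cosine_similarity helper that is not shown in this excerpt. Assuming the tfidf vectors are gensim-style sparse vectors (lists of (term_id, weight) pairs), gensim's matutils.cossim already computes cosine similarity between them, so a minimal sketch of the helper could be:

from gensim import matutils

# Sketch of cosine_similarity for gensim sparse vectors; returns 0.0 for
# empty vectors so missing articles never count as neighbors.
def cosine_similarity(vec_a, vec_b):
    if not vec_a or not vec_b:
        return 0.0
    return matutils.cossim(vec_a, vec_b)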
def is_category_main_article_query(title, content):
    # 1 if the article's title, prefixed with category_prefix, names a known
    # category (i.e. the article is the main article of that category), else 0;
    # category_prefix and categories come from the enclosing scope
    title = make_wiki_title(title)
    return 1 if (category_prefix + title) in categories else 0
        # corresponding versions in the other languages
        with open(langlinks_fname, 'r') as langlinks_file:
            langlinks_csv = csv.reader(langlinks_file)
            langs = next(langlinks_csv)
            langlinks_articles = {row[0] for row in langlinks_csv}
        # If most corpus titles have langlinks (i.e. the corpus titles are in
        # the same language as the langlinks file), restrict the corpus to
        # those titles; otherwise keep the full corpus
        if len(corpus & langlinks_articles) > 0.5 * len(corpus):
            corpus = corpus & langlinks_articles

        logging.info('loaded corpus with %d articles (%d with langlinks)',
                     corpus_size, len(corpus))

        chunked_articles = {}
        total_chunks = 0
        with bz2.open(dump_fname, 'rt') as dump_file:
            for title, content, pageid in \
                    wikicorpus.extract_pages(dump_file):
                title = make_wiki_title(title)
                if title in corpus:
                    chunks = chunk_article(title, content)
                    chunked_articles[title] = chunks
                    total_chunks += len(chunks)

        logging.info('split %d articles into %d chunks', len(chunked_articles),
                     total_chunks)

        logging.info('writing chunks to %s', chunks_fname)
        with open(chunks_fname, 'wb') as chunks_file:
            pickle.dump(chunked_articles, chunks_file)
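chunk_article is not defined in this fragment. A minimal sketch, assuming it simply splits an article's wikitext into fixed-size word chunks (the real function may instead split on sections or sentences), is:

# Hypothetical stand-in for chunk_article: split the article text into
# chunks of roughly chunk_size words each.
def chunk_article(title, content, chunk_size=500):
    words = content.split()
    return [' '.join(words[i:i + chunk_size])
            for i in range(0, len(words), chunk_size)]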
                        metavar='search-query',
                        help='search query (described more below)')

    args = parser.parse_args()

    # Load MM corpus and dictionary
    corpus = load_mm_corpus(args.mm_fname)
    dict = gensim.corpora.Dictionary.load(args.dict_fname)
    with open(args.categories_fname, 'rb') as categories_file:
        categories = pickle.load(categories_file)

    prepared_query_funcs = {}
    for name, search_query in ALL_SEARCH_QUERIES.items():
        prepared_query_funcs[name] = search_query(corpus, dict, categories)
    search_query_func = eval(args.search_query, prepared_query_funcs)

    num_hits = 0

    logging.info('Searching %s with query %s', args.wiki_dump_fname,
                 args.search_query)
    with bz2.open(args.wiki_dump_fname, 'rt') as wiki_dump_file:
        with open(args.titles_out_fname, 'w') as titles_out_file:
            for title, content, pageid in \
                    wikicorpus.extract_pages(wiki_dump_file,
                                             filter_namespaces=('0',)):
                if search_query_func(title, content):
                    titles_out_file.write(make_wiki_title(title) + '\n')
                    num_hits += 1

    logging.info('Found %d matches', num_hits)
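The command-line query is evaluated with prepared_query_funcs as its globals, so the expression can refer to the prepared search queries by name. The snippet below only illustrates that general eval pattern with a hypothetical query name; the real names come from ALL_SEARCH_QUERIES, which is not listed in this excerpt.

# Illustration of evaluating a user-supplied expression against a dict of
# prepared functions; 'mentions_cold_war' is a made-up query name.
prepared = {
    'mentions_cold_war': lambda title, content: 'cold war' in content.lower(),
}
query_func = eval('mentions_cold_war', prepared)
print(query_func('Some title', 'An article about the Cold War.'))  # True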
Example #10
    else:
        dump_fname = sys.argv[1]
        mm_fname = sys.argv[2]
        csv_fname = sys.argv[3]
        langs = sys.argv[4:]

        from_lang = langs[0]
        to_langs = langs[1:]

        logging.info('Reading langlink data from %s', dump_fname)
        langlinks = read_langlinks_from_dump(dump_fname, to_langs)

        metadata_fname = mm_fname[:-4] + '.metadata.cpickle'
        logging.info('Reading article titles from %s', metadata_fname)
        with open(metadata_fname, 'rb') as metadata_file:
            metadata = pickle.load(metadata_file)

        logging.info('Saving langlinks to %s', csv_fname)
        with open(csv_fname, 'w') as csv_file:
            out = csv.writer(csv_file)
            out.writerow(langs)
            for article_id, article_title in metadata.values():
                article_id = int(article_id)
                if article_id in langlinks:
                    row = [make_wiki_title(article_title)]
                    row += [
                        make_wiki_title(title) if title else ''
                        for title in langlinks[article_id]
                    ]
                    out.writerow(row)
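read_langlinks_from_dump is not shown here. Judging from how its result is used (langlinks[article_id] yields one title slot per target language, possibly empty), it plausibly parses the MediaWiki langlinks SQL dump; the sketch below makes that assumption, presumes a gzip-compressed dump, and ignores SQL escaping details.

import gzip
import re
from collections import defaultdict

# Matches one (ll_from, ll_lang, ll_title) tuple inside an INSERT statement;
# escaped quotes inside titles are not handled by this sketch.
LANGLINK_ROW_RE = re.compile(r"\((\d+),'([^']*)','([^']*)'\)")

def read_langlinks_from_dump(dump_fname, to_langs):
    """Sketch: map article id -> list of titles, one slot per language in
    to_langs (None where no langlink exists)."""
    lang_index = {lang: i for i, lang in enumerate(to_langs)}
    langlinks = defaultdict(lambda: [None] * len(to_langs))
    with gzip.open(dump_fname, 'rt', encoding='utf-8', errors='replace') as f:
        for line in f:
            if not line.startswith('INSERT INTO'):
                continue
            for article_id, lang, title in LANGLINK_ROW_RE.findall(line):
                if lang in lang_index:
                    langlinks[int(article_id)][lang_index[lang]] = title
    return dict(langlinks)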
Example #11
        print('Extract all categories from a Wikipedia database dump as a dictionary of')
        print('category: [subcategories] pairs and write it to a pickle file.')
    else:
        _, dump_fname, categories_fname = sys.argv

        logging.info('extracting categories from %s', dump_fname)
        # filter_namespaces=('14',) extracts only category pages
        category_pages = ((title, content, pageid) for title, content, pageid
                          in extract_pages(bz2.open(dump_fname, 'r'),
                                           filter_namespaces=('14',))
                          if content != '')

        categories = defaultdict(list)

        for title, content, pageid in category_pages:
            title = make_wiki_title(title)
            category_prefix = title[:title.index(':') + 1]
            # Make an entry for this category with no subcategories if it
            # doesn't exist yet
            if title not in categories:
                categories[title] = []

            # Look for supercategories of this category: a link from this
            # category page to another category records this category as a
            # subcategory of the linked (parent) category
            for link_article, link_text in extract_links(content):
                link_article = make_wiki_title(link_article)
                if link_article.startswith(category_prefix):
                    categories[link_article].append(title)

        with open(categories_fname, 'wb') as categories_file:
            logging.info('saving categories to %s', categories_fname)
            pickle.dump(categories, categories_file)
from gensim.corpora import wikicorpus

from biases.wiki.titles import make_wiki_title
from biases.wiki.categories import get_subcategories
from biases.wiki.text import extract_links

if __name__ == '__main__':
    if len(sys.argv) != 5:
        print('Usage: python3 analyze_subcategories.py wiki-pages-articles.xml.bz2 categories.pickle results.csv "Category:name"')
        print('Analyze the subcategories of the given category for properties such as the depth')
        print('below the root and the number of times the root category name is mentioned.')
    else:
        _, dump_fname, categories_fname, results_fname, category = sys.argv
        
        # If category is 'Category:Cold_War', category_name is 'cold war'
        category = make_wiki_title(category)
        category_name = category[category.index(':') + 1:].replace('_', ' ') \
                                                          .lower()
        
        logging.info('loading categories from %s', categories_fname)
        with open(categories_fname, 'rb') as categories_file:
            categories = pickle.load(categories_file)
            
        logging.info('determining subcategories of "%s"', category)
        subcategories = get_subcategories(categories, category)
        logging.info('found %d subcategories of "%s"',
                     len(subcategories), category)
        
        root_name_occurences = defaultdict(int)
        num_articles = defaultdict(int)
        has_main_article = defaultdict(bool)