def categories_in_query(title, content):
    num_categories = 0
    for link_article, link_text in extract_links(content):
        link_article = make_wiki_title(link_article)
        if link_article in category_set:
            num_categories += 1
    return num_categories
def subcategories_of_query(title, content):
    num_subcategories = 0
    for link_article, link_text in extract_links(content):
        link_article = make_wiki_title(link_article)
        if link_article in subcategories:
            num_subcategories += 1
    return num_subcategories
def subcategory_depth_of_query(title, content):
    min_depth = float('inf')
    for link_article, link_text in extract_links(content):
        link_article = make_wiki_title(link_article)
        try:
            depth = subcategories[link_article]
            if depth < min_depth:
                min_depth = depth
        except KeyError:
            pass
    return min_depth
def categories_in(corpus, dict, categories, category_list):
    category_set = {make_wiki_title(category) for category in category_list}

    def categories_in_query(title, content):
        num_categories = 0
        for link_article, link_text in extract_links(content):
            link_article = make_wiki_title(link_article)
            if link_article in category_set:
                num_categories += 1
        return num_categories

    return SearchQuery(categories_in_query)
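# The factory above follows a prepare-once/query-many closure pattern:
# category_set is normalized a single time, and the returned query function
# closes over it.  A minimal self-contained sketch of the same pattern
# (names and data below are illustrative, not part of the project's API):
def _count_matches_factory(wanted_titles):
    wanted_set = set(wanted_titles)

    def _query(linked_titles):
        return sum(1 for title in linked_titles if title in wanted_set)

    return _query

# _q = _count_matches_factory(['Category:Cold_War'])
# _q(['Category:Cold_War', 'Berlin'])  # -> 1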
def extract_links(text):
    """Given a selection of MediaWiki text, returns a generator over the
    links in the text as tuples (article, link text)."""
    for match in MEDIAWIKI_LINK_RE.finditer(text):
        link = match.groups()
        # Some links only contain a page title
        if len(link) == 1:
            title, = link
            link_text, = link
        else:
            title, link_text = link
        yield (make_wiki_title(title), link_text)
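# MEDIAWIKI_LINK_RE is defined elsewhere in this module.  A plausible
# pattern (an assumption, not necessarily the project's exact regex) that
# matches both [[Title]] and [[Title|display text]] links:
import re

MEDIAWIKI_LINK_RE_SKETCH = re.compile(r'\[\[([^|\]]+)(?:\|([^\]]*))?\]\]')

# For '[[Berlin]] and [[Cold War|the war]]' this yields groups
# ('Berlin', None) and ('Cold War', 'the war').  Note that with a fixed
# two-group pattern like this, match.groups() always has length 2, so the
# len(link) == 1 branch above implies the project's real pattern is built
# differently.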
def dbscan_single(lang, seed_titles, min_pts, eps, dictionary):
    """
    Finds a single DBSCAN cluster on Wikipedia with a given set of seed
    articles and parameters min_pts and epsilon.

    For DBSCAN, "neighbors" are considered to be articles that are linked
    to and that have tfidf cosine similarity greater than eps.  dictionary
    should be a gensim dictionary that can be used for computing tfidf
    values.

    Returns a set of article titles.
    """
    global _dict, _tfidf
    _dict = dictionary
    _tfidf = TfidfModel(dictionary=dictionary)

    frontier = {make_wiki_title(title) for title in seed_titles}
    cluster = set()
    visited = set()
    while len(frontier) > 0:
        # Pop an arbitrary article from the frontier
        article_title = next(iter(frontier))
        frontier.remove(article_title)
        visited.add(article_title)

        # Get neighbors
        linked_titles = _article_links(lang, article_title)
        sys.stdout.flush()
        article_tfidf = _article_tfidf(lang, article_title)
        neighbors = []
        for linked_title in linked_titles:
            linked_tfidf = _article_tfidf(lang, linked_title)
            if linked_tfidf is not None:
                # Result discarded; presumably primes the link cache for
                # the neighbor before it is visited
                _article_links(lang, linked_title)
                cosine_sim = cosine_similarity(article_tfidf, linked_tfidf)
                sys.stdout.flush()
                if cosine_sim >= eps:
                    neighbors.append(linked_title)
        sys.stdout.flush()

        # Core points (enough dense neighbors) expand the cluster
        if len(neighbors) >= min_pts:
            # Add all neighbors to frontier
            for neighbor_title in neighbors:
                if neighbor_title not in visited:
                    frontier.add(neighbor_title)
            cluster.add(article_title)
    return cluster
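# cosine_similarity is assumed to take gensim-style sparse tfidf vectors,
# i.e. lists of (term_id, weight) pairs as returned by TfidfModel.  A
# minimal sketch of such a helper (an assumption; the project's actual
# implementation may differ):
import math

def cosine_similarity_sketch(vec_a, vec_b):
    """Cosine similarity between two sparse (term_id, weight) vectors."""
    weights_a = dict(vec_a)
    dot = sum(weight * weights_a.get(term_id, 0.0)
              for term_id, weight in vec_b)
    norm_a = math.sqrt(sum(w * w for _, w in vec_a))
    norm_b = math.sqrt(sum(w * w for _, w in vec_b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

# cosine_similarity_sketch([(0, 1.0), (1, 1.0)], [(1, 1.0)])  # -> ~0.707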
def is_category_main_article_query(title, content):
    title = make_wiki_title(title)
    return 1 if (category_prefix + title) in categories else 0
# corresponding versions in the other languages
with open(langlinks_fname, 'r') as langlinks_file:
    langlinks_csv = csv.reader(langlinks_file)
    langs = next(langlinks_csv)
    langlinks_articles = {row[0] for row in langlinks_csv}

corpus_size = len(corpus)  # corpus size before langlinks filtering

# If the corpus titles mostly appear in the langlinks file (i.e. they are
# likely in the same language), restrict the corpus to articles that have
# langlinks; otherwise leave it unfiltered
if len(corpus & langlinks_articles) > 0.5 * len(corpus):
    corpus = corpus & langlinks_articles
logging.info('loaded corpus with %d articles (%d with langlinks)',
             corpus_size, len(corpus))

chunked_articles = {}
total_chunks = 0
with bz2.open(dump_fname, 'rt') as dump_file:
    for title, content, pageid in \
            wikicorpus.extract_pages(dump_file):
        title = make_wiki_title(title)
        if title in corpus:
            chunks = chunk_article(title, content)
            chunked_articles[title] = chunks
            total_chunks += len(chunks)
logging.info('split %d articles into %d chunks',
             len(chunked_articles), total_chunks)

logging.info('writing chunks to %s', chunks_fname)
with open(chunks_fname, 'wb') as chunks_file:
    pickle.dump(chunked_articles, chunks_file)
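# Downstream consumers can read the pickle back into the same
# {title: [chunk, ...]} mapping; a minimal sketch (the helper name is
# illustrative, not part of the project):
def load_chunked_articles_sketch(chunks_fname):
    """Read back the chunked-article mapping written above."""
    import pickle
    with open(chunks_fname, 'rb') as chunks_file:
        return pickle.load(chunks_file)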
                    metavar='search-query',
                    help='search query (described more below)')
args = parser.parse_args()

# Load MM corpus and dictionary
corpus = load_mm_corpus(args.mm_fname)
dict = gensim.corpora.Dictionary.load(args.dict_fname)
with open(args.categories_fname, 'rb') as categories_file:
    categories = pickle.load(categories_file)

prepared_query_funcs = {}
for name, search_query in ALL_SEARCH_QUERIES.items():
    prepared_query_funcs[name] = search_query(corpus, dict, categories)
search_query_func = eval(args.search_query, prepared_query_funcs)

num_hits = 0
logging.info('Searching %s with query %s',
             args.wiki_dump_fname, args.search_query)
with bz2.open(args.wiki_dump_fname, 'rt') as wiki_dump_file:
    with open(args.titles_out_fname, 'w') as titles_out_file:
        for title, content, pageid in \
                wikicorpus.extract_pages(wiki_dump_file,
                                         filter_namespaces=('0',)):
            if search_query_func(title, content):
                titles_out_file.write(make_wiki_title(title) + '\n')
                num_hits += 1
logging.info('Found %d matches', num_hits)
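# A minimal sketch of the eval-with-namespace mechanism used above: the
# query string is evaluated with the prepared query names as its globals
# and must evaluate to a (title, content) predicate.  The name below is
# illustrative, not one of the project's registered queries.
_ns_sketch = {'always_match': lambda: (lambda title, content: True)}
_func_sketch = eval('always_match()', _ns_sketch)
# _func_sketch('Some title', 'some wiki text')  # -> True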
else:
    dump_fname = sys.argv[1]
    mm_fname = sys.argv[2]
    csv_fname = sys.argv[3]
    langs = sys.argv[4:]
    from_lang = langs[0]
    to_langs = langs[1:]

    logging.info('Reading langlink data from %s', dump_fname)
    langlinks = read_langlinks_from_dump(dump_fname, to_langs)

    metadata_fname = mm_fname[:-4] + '.metadata.cpickle'
    logging.info('Reading article titles from %s', metadata_fname)
    with open(metadata_fname, 'rb') as metadata_file:
        metadata = pickle.load(metadata_file)

    logging.info('Saving langlinks to %s', csv_fname)
    with open(csv_fname, 'w') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(langs)
        for article_id, article_title in metadata.values():
            article_id = int(article_id)
            if article_id in langlinks:
                row = [make_wiki_title(article_title)]
                row += [
                    make_wiki_title(title) if title else ''
                    for title in langlinks[article_id]
                ]
                out.writerow(row)
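# The resulting CSV contains one header row of language codes followed by
# one row per article that has langlinks, with an empty cell where a
# translation is missing, e.g. (titles illustrative):
#
#     en,de,fr
#     Cold_War,Kalter_Krieg,Guerre_froide
#     Berlin_Blockade,Berlin-Blockade,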
    print('Extracts all categories from a Wikipedia database dump as a dictionary of')
    print('category: [subcategories] pairs and writes it to a pickle file.')
else:
    _, dump_fname, categories_fname = sys.argv

    logging.info('extracting categories from %s', dump_fname)
    # filter_namespaces = ('14',) extracts only category pages
    category_pages = ((title, content, pageid)
                      for title, content, pageid
                      in extract_pages(bz2.open(dump_fname, 'r'),
                                       filter_namespaces=('14',))
                      if content != '')

    categories = defaultdict(list)
    for title, content, pageid in category_pages:
        title = make_wiki_title(title)
        # e.g. 'Category:' (or its localized equivalent)
        category_prefix = title[:title.index(':') + 1]

        # Make entry for this category with no subcategories if it
        # doesn't exist
        if title not in categories:
            categories[title] = []

        # Look for supercategories of this category
        for link_article, link_text in extract_links(content):
            link_article = make_wiki_title(link_article)
            if link_article.startswith(category_prefix):
                categories[link_article].append(title)

    with open(categories_fname, 'wb') as categories_file:
        logging.info('saving categories to %s', categories_fname)
        pickle.dump(categories, categories_file)
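# The pickled mapping relates each category title to its direct
# subcategories, so it can be walked recursively, e.g. (titles
# illustrative, not taken from a real dump):
#
#     categories['Category:Cold_War']
#     # -> ['Category:Cold_War_films', 'Category:Origins_of_the_Cold_War']
#     categories['Category:Cold_War_films']
#     # -> []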
import sys
import logging
import pickle
from collections import defaultdict

from gensim.corpora import wikicorpus

from biases.wiki.titles import make_wiki_title
from biases.wiki.categories import get_subcategories
from biases.wiki.text import extract_links

if __name__ == '__main__':
    if len(sys.argv) != 5:
        print('Usage: python3 analyze_subcategories.py '
              'wiki-pages-articles.xml.bz2 categories.pickle results.csv '
              '"Category:name"')
        print('Analyze the subcategories of the given category for '
              'properties such as the depth below the root and the number '
              'of times the root category name is mentioned.')
    else:
        _, dump_fname, categories_fname, results_fname, category = sys.argv

        # If category is 'Category:Cold_War', category_name is 'cold war'
        category = make_wiki_title(category)
        category_name = category[category.index(':') + 1:] \
            .replace('_', ' ').lower()

        logging.info('loading categories from %s', categories_fname)
        with open(categories_fname, 'rb') as categories_file:
            categories = pickle.load(categories_file)

        logging.info('determining subcategories of "%s"', category)
        subcategories = get_subcategories(categories, category)
        logging.info('found %d subcategories of "%s"',
                     len(subcategories), category)

        root_name_occurences = defaultdict(int)
        num_articles = defaultdict(int)
        has_main_article = defaultdict(bool)
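# get_subcategories is imported from biases.wiki.categories and not shown
# here.  Based on how its result is used elsewhere (membership tests, and
# subcategories[title] read as a depth in subcategory_depth_of_query), it
# plausibly maps each subcategory title to its depth below the root.  A
# minimal BFS sketch of that behaviour (an assumption, not the project's
# implementation):
from collections import deque

def get_subcategories_sketch(categories, root):
    """Walk the category: [subcategories] mapping breadth-first and return
    {category title: depth below root}, with the root itself at depth 0."""
    depths = {root: 0}
    queue = deque([root])
    while queue:
        cat = queue.popleft()
        for subcat in categories.get(cat, []):
            if subcat not in depths:
                depths[subcat] = depths[cat] + 1
                queue.append(subcat)
    return depths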