def parse_category_page(page, output, depth, visited):
    """Walk the members of a category page and write out its algorithm pages.

    Every member title not already in ``visited`` is added to ``visited`` and
    then handled according to its kind:

    * category titles are fetched with ``wiki.categorypage`` and walked
      recursively, but only while ``depth < MAX_CATEGORY_DEPTH``;
    * other titles are fetched with ``get_wiki_page`` and, when
      ``is_algorithm_page`` accepts them, passed to
      ``write_output_from_page``.

    Args:
        page: Category page object; its ``title`` and ``categorymembers``
            attributes are read.
        output: Handle forwarded unchanged to ``write_output_from_page``.
        depth: Nesting level of ``page``; recursive calls use ``depth + 1``.
        visited: Set of member titles already seen. Mutated in place.
    """
    # Single-expression print calls produce the same text on Python 2 and 3.
    print('looking at page: %s' % (page.title,))
    for member in page.categorymembers:
        print('%s %s' % (member, depth))
        if member in visited:
            print('visited')
            continue
        # Marked before processing, so a subcategory skipped for being too
        # deep is still treated as visited afterwards.
        visited.add(member)

        if is_category_title(member):
            # subcategory page
            if depth >= MAX_CATEGORY_DEPTH:
                continue
            subcategory = wiki.categorypage(member)
            if subcategory is None:
                print('-> subcategory not found')
                continue
            print('-> subcategory')
            parse_category_page(subcategory, output, depth + 1, visited)
            continue

        # member page
        member_page = get_wiki_page(member)
        if member_page is None:
            print('-> member page not found')
            continue
        if is_algorithm_page(member_page):
            print('-> algorithm page')
            write_output_from_page(output, member_page)
        else:
            print('-> member page of other stuff')
def index_wiki_page(title, depth, visited):
    """Index one wiki title, recursing into it when it is a category.

    Args:
        title: Title of the page or category to index.
        depth: Nesting level of ``title``; members are indexed at
            ``depth + 1``.
        visited: Collection of already-processed titles, passed on to
            ``mark_visited`` and ``index_wiki_algorithm_entry``.

    Returns:
        A ``(algo_id, cate_id)`` tuple in which ``-1`` means "nothing was
        indexed for this slot". For an already-visited title (when
        ``UPDATING_WIKI`` is false) the result of
        ``get_ids_of_visited_wiki_page`` is returned as-is -- presumably a
        tuple of the same shape; confirm against that helper.
    """
    print('looking at page %s, at depth %d:' % (title, depth))

    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print('visited')
        return get_ids_of_visited_wiki_page(title)

    algo_id, cate_id = -1, -1

    if not pw.is_category_title(title):
        # is member page
        article = pw.get_wiki_page(title)
        if article is None:
            print('-> member page not found')
        elif pw.is_algorithm_page(article):
            print('-> algorithm page')
            # add this algorithm to algorithm table
            algo_id = index_wiki_algorithm_entry(article, title, visited)
        else:
            print('-> member page of other stuff')
    elif depth < pw.MAX_CATEGORY_DEPTH:
        # is category page, and still shallow enough to expand
        category = wiki.categorypage(title)
        if category is None:
            print('-> category not found')
        else:
            print('-> category')
            algo_children = []
            cate_children = []
            for member in category.categorymembers:
                member_algo_id, member_cate_id = index_wiki_page(
                    member, depth + 1, visited)
                if member_algo_id != -1:
                    algo_children.append(member_algo_id)
                if member_cate_id != -1:
                    cate_children.append(member_cate_id)
            # a category with no indexed children is not an algorithm
            # category: ignore it and leave both ids at -1
            if algo_children or cate_children:
                # add self to category table, and update cate_id
                cate_id = index_wiki_category_entry(
                    category, algo_children, cate_children)

    # Every path that actually processed the title ends here.
    mark_visited(title, visited)
    return (algo_id, cate_id)
def parse_category(category):
    """Crawl a wiki category, resuming from ``wiki_algo_category.json``.

    The ``'title'`` field of every JSON line already in
    ``wiki_algo_category.json`` seeds the visited set, so those titles are
    skipped. The same file is then opened in append mode and handed to
    ``parse_category_page`` as the output for the crawl.

    Args:
        category: Category name passed to ``wiki.categorypage`` to obtain the
            root page of the crawl.

    Raises:
        ValueError: If a non-blank line of the existing file is not valid
            JSON.
        KeyError: If a parsed line has no ``'title'`` entry.
    """
    visited = set()
    try:
        with open('wiki_algo_category.json') as previous:
            for line in previous:
                # Tolerate blank lines (e.g. a trailing newline) in the file.
                if not line.strip():
                    continue
                visited.add(json.loads(line)['title'])
    except IOError:
        # No readable earlier output: start with an empty visited set.
        pass

    # ``with`` closes the output even when the crawl raises part-way through.
    with open('wiki_algo_category.json', 'a') as output:
        print('start parsing...')
        root = wiki.categorypage(category)
        if root is None:
            # parse_category_page reads root.title, so it cannot take None.
            print('-> category not found')
            return
        parse_category_page(root, output, 0, visited)