Example #1
0
def parse_category_page(page, output, depth, visited):
    print 'looking at page:', page.title
    for member in page.categorymembers:
        print member, depth
        if member in visited:
            print 'visited'
            continue
        visited.add(member)
        if is_category_title(member):        # subcategory page
            if depth < MAX_CATEGORY_DEPTH:
                page = wiki.categorypage(member)
                if page is None:
                    print '-> subcategory not found'
                    continue
                print '-> subcategory'
                parse_category_page(page, output, depth + 1, visited)
        else:                                # member page
            page = get_wiki_page(member)
            if page is None:
                print '-> member page not found'
                continue
            if is_algorithm_page(page):
                print '-> algorithm page'
                write_output_from_page(output, page)
            else:
                print '-> member page of other stuff'
Example #2
0
def parse_category_page(page, output, depth, visited):
    print 'looking at page:', page.title
    for member in page.categorymembers:
        print member, depth
        if member in visited:
            print 'visited'
            continue
        visited.add(member)
        if is_category_title(member):  # subcategory page
            if depth < MAX_CATEGORY_DEPTH:
                page = wiki.categorypage(member)
                if page is None:
                    print '-> subcategory not found'
                    continue
                print '-> subcategory'
                parse_category_page(page, output, depth + 1, visited)
        else:  # member page
            page = get_wiki_page(member)
            if page is None:
                print '-> member page not found'
                continue
            if is_algorithm_page(page):
                print '-> algorithm page'
                write_output_from_page(output, page)
            else:
                print '-> member page of other stuff'
Example #3
0
def index_wiki_page(title, depth, visited):
    print 'looking at page %s, at depth %d:' % (title, depth)

    algo_id = -1
    cate_id = -1

    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print 'visited'
        return get_ids_of_visited_wiki_page(title)

    if pw.is_category_title(title):  # is category page
        if depth < pw.MAX_CATEGORY_DEPTH:
            page = wiki.categorypage(title)
            if page is None:
                print '-> category not found'
                mark_visited(title, visited)
                return (algo_id, cate_id)
            print '-> category'
            child_algo_ids = list()
            child_cate_ids = list()
            for member in page.categorymembers:
                (child_algo_id,
                 child_cate_id) = index_wiki_page(member, depth + 1, visited)
                if child_algo_id != -1:
                    child_algo_ids.append(child_algo_id)
                if child_cate_id != -1:
                    child_cate_ids.append(child_cate_id)

            if len(child_algo_ids) == 0 and len(child_cate_ids) == 0:
                # if not algorithm category, igore
                mark_visited(title, visited)
                return (-1, -1)
            # add self to category table, and update cate_id
            cate_id = index_wiki_category_entry(page, child_algo_ids,
                                                child_cate_ids)
    else:  # is member page
        page = pw.get_wiki_page(title)
        if page is None:
            print '-> member page not found'
            mark_visited(title, visited)
            return (algo_id, cate_id)
        if pw.is_algorithm_page(page):
            print '-> algorithm page'
            # add this algorithm to algorithm table
            algo_id = index_wiki_algorithm_entry(page, title, visited)
        else:
            print '-> member page of other stuff'

    mark_visited(title, visited)
    return (algo_id, cate_id)
def index_wiki_page(title, depth, visited):
    print 'looking at page %s, at depth %d:' % (title, depth)

    algo_id = -1
    cate_id = -1

    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print 'visited'
        return get_ids_of_visited_wiki_page(title)

    if pw.is_category_title(title):                    # is category page
        if depth < pw.MAX_CATEGORY_DEPTH:
            page = wiki.categorypage(title)
            if page is None:
                print '-> category not found'
                mark_visited(title, visited)
                return (algo_id, cate_id)
            print '-> category'
            child_algo_ids = list()
            child_cate_ids = list()
            for member in page.categorymembers:
                (child_algo_id, child_cate_id) = index_wiki_page(member,
                    depth + 1, visited)
                if child_algo_id != -1:
                    child_algo_ids.append(child_algo_id)
                if child_cate_id != -1:
                    child_cate_ids.append(child_cate_id)

            if len(child_algo_ids) == 0 and len(child_cate_ids) == 0:
                # if not algorithm category, igore
                mark_visited(title, visited)
                return (-1, -1)
            # add self to category table, and update cate_id
            cate_id = index_wiki_category_entry(page,
                child_algo_ids, child_cate_ids)
    else:                                               # is member page
        page = pw.get_wiki_page(title)
        if page is None:
            print '-> member page not found'
            mark_visited(title, visited)
            return (algo_id, cate_id)
        if pw.is_algorithm_page(page):
            print '-> algorithm page'
            # add this algorithm to algorithm table
            algo_id = index_wiki_algorithm_entry(page, title, visited)
        else:
            print '-> member page of other stuff'

    mark_visited(title, visited)
    return (algo_id, cate_id)
Example #5
0
def parse_category(category):
    visited = set()
    try:
        input = open('wiki_algo_category.json')
        for line in input:
            visited.add(json.loads(line)['title'])
        input.close()
    except IOError:
        pass

    output = open('wiki_algo_category.json', 'a')
    # csv_writer = csv.writer(output)

    print 'start parsing...'
    parse_category_page(wiki.categorypage(category), output, 0, visited)
    output.close()
Example #6
0
def parse_category(category):
    visited = set()
    try:
        input = open('wiki_algo_category.json')
        for line in input:
            visited.add(json.loads(line)['title'])
        input.close()
    except IOError:
        pass

    output = open('wiki_algo_category.json', 'a')
    # csv_writer = csv.writer(output)

    print 'start parsing...'
    parse_category_page(wiki.categorypage(category), output, 0, visited)
    output.close()