Code example #1
def get_corres_wikipedia_algo_id(page):
    wikilinks = [linktitle
        for (linksite, linktitle) in list(page.iwlinks())
        if linksite == 'wp']

    if len(wikilinks) == 0:
        # no wiki links at all
        rd.sadd('rosetta-mapping-error-no-wiki-links', page.page_title)
        return None

    # first, try wikilinks whose titles are similar to the task name;
    # these links are sorted by fuzzy-matching confidence
    for link in get_sorted_similar_links(page.page_title, wikilinks):
        # check if indexed
        id = get_id_of_corresponding_algorithm(link, page.page_title)
        if id is None:
            # try to index this algorithm
            wikipage = get_wiki_page(link)
            if wikipage is None:
                continue
            id = index_corresponding_algorithm(wikipage, link, page.page_title)
            if id is None:
                continue

        rd.hset('rosetta-mapping-success', page.page_title,
            json.dumps([id]))
        rd.sadd('rosetta-mapping-similars-success', page.page_title)
        return [id]

    # then, if none of the links is similar to the task name:
    # 1. store the task description
    # 2. relate the implementation to ALL wiki algorithm pages
    #    mentioned in the description
    ids = list()
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is not None and is_algorithm_page(wikipage):
            # check if indexed
            id = get_id_of_corresponding_algorithm(link, page.page_title)
            if id is None:
                # try to index this algorithm (wikipage was already fetched above)
                id = index_corresponding_algorithm(wikipage, link,
                    page.page_title)
                if id is None:
                    continue
            ids.append(id)
    if len(ids) > 0:
        rd.hset('rosetta-mapping-success', page.page_title,
            json.dumps(ids))
        return ids

    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    return None
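
The helper get_sorted_similar_links is referenced but not shown in these examples. A minimal sketch of what it might look like, assuming difflib's SequenceMatcher ratio is used as the fuzzy-matching confidence (the threshold value is an assumption):

import difflib

def get_sorted_similar_links(task_name, wikilinks, threshold=0.6):
    # hypothetical sketch: score every link title against the task name
    # and keep only reasonably similar ones, most confident first
    scored = [(difflib.SequenceMatcher(None, task_name.lower(),
                                       link.lower()).ratio(), link)
              for link in wikilinks]
    return [link for (score, link) in sorted(scored, reverse=True)
            if score >= threshold]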
Code example #2
def index_wiki_page(title, depth, visited):
    print 'looking at page %s, at depth %d:' % (title, depth)

    algo_id = -1
    cate_id = -1

    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print 'visited'
        return get_ids_of_visited_wiki_page(title)

    if pw.is_category_title(title):  # is category page
        if depth < pw.MAX_CATEGORY_DEPTH:
            page = wiki.categorypage(title)
            if page is None:
                print '-> category not found'
                mark_visited(title, visited)
                return (algo_id, cate_id)
            print '-> category'
            child_algo_ids = list()
            child_cate_ids = list()
            for member in page.categorymembers:
                (child_algo_id,
                 child_cate_id) = index_wiki_page(member, depth + 1, visited)
                if child_algo_id != -1:
                    child_algo_ids.append(child_algo_id)
                if child_cate_id != -1:
                    child_cate_ids.append(child_cate_id)

            if len(child_algo_ids) == 0 and len(child_cate_ids) == 0:
                # if not an algorithm category, ignore it
                mark_visited(title, visited)
                return (-1, -1)
            # add self to category table, and update cate_id
            cate_id = index_wiki_category_entry(page, child_algo_ids,
                                                child_cate_ids)
    else:  # is member page
        page = pw.get_wiki_page(title)
        if page is None:
            print '-> member page not found'
            mark_visited(title, visited)
            return (algo_id, cate_id)
        if pw.is_algorithm_page(page):
            print '-> algorithm page'
            # add this algorithm to algorithm table
            algo_id = index_wiki_algorithm_entry(page, title, visited)
        else:
            print '-> member page of other stuff'

    mark_visited(title, visited)
    return (algo_id, cate_id)
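
A hypothetical entry point for this recursive crawl; the root category name is an assumption, and the visited collection is assumed to be a set that mark_visited adds titles to:

# hypothetical top-level call: start from a root category page with an
# empty visited set and recurse up to pw.MAX_CATEGORY_DEPTH levels deep
visited = set()
index_wiki_page('Category:Algorithms', 0, visited)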
Code example #3
def index_wiki_category_entry(page, algo_ids, subcate_ids):
    # page title -> category title
    # e.g. 'Category:abcd' -> 'abcd'
    title = page.title[9:]

    result = es.get(index=INDEX_NAME,
                    doc_type='category',
                    id=normalize(title),
                    ignore=404)

    if not result['found']:
        body = {'name': title, 'algorithms': algo_ids, 'children': subcate_ids}

        # try to get category description from corresponding page
        corres_page = pw.get_wiki_page(title)
        if corres_page is None:
            # if there's no corresponding page,
            # get category description from category page itself
            corres_page = pw.get_wiki_page(page.title)
            if corres_page.summary == '':
                try:
                    corres_page = pw.get_wiki_page(corres_page.links[0])
                except KeyError:
                    # if there are no links, a KeyError is raised;
                    # set corres_page to None, because an empty summary
                    # would cause an error later
                    corres_page = None

        if corres_page is not None:
            body['tag_line'] = get_tag_line(corres_page.summary)
            body['description'] = corres_page.summary
        retval = es.index(index=INDEX_NAME,
                          doc_type='category',
                          id=normalize(title),
                          body=body)
        return retval['_id']
    else:
        print page.title, 'updated with new algos/cates!'
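
The helper get_tag_line is not shown either. A minimal sketch of one way it could work, assuming the tag line is simply the first sentence of the page summary (the splitting heuristic is an assumption):

def get_tag_line(summary):
    # hypothetical sketch: use the first sentence of the summary as a short tag line
    first_sentence = summary.strip().split('. ')[0].strip()
    if first_sentence and not first_sentence.endswith('.'):
        first_sentence += '.'
    return first_sentence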
Code example #4
def nel_title_suggest(page_title, auto_suggest=True):
    wikipage = get_wiki_page(page_title, auto_suggest)
    if wikipage is not None:
        # check if indexed
        id = get_id_of_corresponding_algorithm(page_title, page_title)
        if id is None:
            # try to index this algorithm
            id = index_corresponding_algorithm(wikipage, page_title,
                                               page_title)
        if id is not None:
            rd.hset('rosetta-mapping-success', page_title, json.dumps([id]))
            rd.sadd('rosetta-mapping-success-wikipedia-autosuggest',
                    page_title)
            safe_print(id)
            print '--second'
            return [id]
Code example #5
def nel_wikilinks_match_all(wikilinks, page_title):
    ids = list()
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is not None and is_algorithm_page(wikipage):
            # check if indexed
            id = get_id_of_corresponding_algorithm(link, page_title)
            if id is None:
                # try to index this algorithm
                id = index_corresponding_algorithm(wikipage, link, page_title)
                if id is None:
                    continue
            ids.append(id)
    if len(ids) > 0:
        rd.hset('rosetta-mapping-success', page_title, json.dumps(ids))
        rd.sadd('rosetta-mapping-success-all-algo-links', page_title)
        safe_print(ids)
        print '--all-link'

    return ids
Code example #6
def nel_title_crosswikis(page_title):
    query = "SELECT cprob, entity FROM queries WHERE anchor = %s"
    suggested_wikilinks = list(session.execute(query, [page_title]))
    # sort by cprob in descending order so the most confident link comes first
    suggested_wikilinks = sorted(suggested_wikilinks,
                                 key=lambda tup: tup[0], reverse=True)
    if len(suggested_wikilinks) > 0:
        # get the most confident link
        toplink = suggested_wikilinks[0][1]
        wikipage = get_wiki_page(toplink.replace('_', ' '))
        if wikipage is not None:
            # check if indexed
            id = get_id_of_corresponding_algorithm(toplink, page_title)
            if id is None:
                # try to index this algorithm
                id = index_corresponding_algorithm(wikipage, toplink,
                                                   page_title)
            if id is not None:
                rd.hset('rosetta-mapping-success', page_title,
                        json.dumps([id]))
                rd.sadd('rosetta-mapping-success-crosswikis', page_title)
                safe_print(id)
                print '--third'
                return [id]
Code example #7
def nel_wikilinks_fuzzy(wikilinks, page_title):
    if len(wikilinks) == 0:
        # no wiki links at all
        rd.sadd('rosetta-mapping-error-no-wiki-links', page_title)
    else:
        # first, try wikilinks whose titles are similar to the task name;
        # these links are sorted by fuzzy-matching confidence
        for link in get_sorted_similar_links(page_title, wikilinks):
            # check if indexed
            id = get_id_of_corresponding_algorithm(link, page_title)
            if id is None:
                # try to index this algorithm
                wikipage = get_wiki_page(link)
                if wikipage is not None:
                    id = index_corresponding_algorithm(wikipage, link,
                        page_title)
                safe_print(id)
            if id is not None:
                rd.hset('rosetta-mapping-success', page_title,
                    json.dumps([id]))
                rd.sadd('rosetta-mapping-similars-success', page_title)
                safe_print(id)
                print '--first'
                return [id]
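
The '--first', '--second', '--third', and '--all-link' markers printed by these helpers suggest they are tried as a fallback chain. A minimal sketch of such a driver, assuming page exposes page_title and iwlinks() as in code example #1 (the function name is an assumption):

def map_task_to_algorithms(page):
    # hypothetical driver: collect the Wikipedia interwiki links of a task
    # page, then try each linking strategy in turn until one succeeds
    wikilinks = [linktitle
                 for (linksite, linktitle) in list(page.iwlinks())
                 if linksite == 'wp']

    ids = nel_wikilinks_fuzzy(wikilinks, page.page_title)          # '--first'
    if not ids:
        ids = nel_title_suggest(page.page_title)                   # '--second'
    if not ids:
        ids = nel_title_crosswikis(page.page_title)                # '--third'
    if not ids:
        ids = nel_wikilinks_match_all(wikilinks, page.page_title)  # '--all-link'
    return ids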