def get_corres_wikipedia_algo_id(page):
    """Map a Rosetta Code task page to the id(s) of corresponding Wikipedia
    algorithm entries in the index.

    Strategy:
      1. Try interwiki links whose titles fuzzily match the task name
         (highest-confidence match first); on success return a single id.
      2. Otherwise, relate the task to ALL algorithm pages linked from its
         description and return every matched id.

    Outcomes are recorded in Redis ('rosetta-mapping-*' keys) for later audit.
    Returns a list of ids on success, or None when nothing could be linked.

    NOTE(review): `page` is expected to expose `iwlinks()` and `page_title`
    (a pywikibot-like page object) — confirm against the caller.
    """
    # keep only interwiki links that point at Wikipedia ('wp')
    wikilinks = [linktitle for (linksite, linktitle) in list(page.iwlinks())
                 if linksite == 'wp']
    if len(wikilinks) == 0:
        # task has no Wikipedia links at all -> record and give up
        rd.sadd('rosetta-mapping-error-no-wiki-links', page.page_title)
        return None
    # first, try wikilinks whose titles are similar to the task name;
    # these links are sorted by confidence of fuzzy matching
    for link in get_sorted_similar_links(page.page_title, wikilinks):
        # check if already indexed
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # try to index this algorithm now
            wikipage = get_wiki_page(link)
            # BUG FIX: skip links whose page cannot be fetched instead of
            # passing None into index_corresponding_algorithm (the sibling
            # nel_wikilinks_fuzzy already guards this case).
            if wikipage is None:
                continue
            algo_id = index_corresponding_algorithm(wikipage, link,
                                                    page.page_title)
            if algo_id is None:
                continue
        rd.hset('rosetta-mapping-success', page.page_title,
                json.dumps([algo_id]))
        rd.sadd('rosetta-mapping-similars-success', page.page_title)
        return [algo_id]
    # then, if none of the links is similar to the task name:
    #   1. store the task description
    #   2. relate the implementation with ALL wiki algorithm pages
    #      mentioned in the description
    ids = list()
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is not None and is_algorithm_page(wikipage):
            # check if already indexed
            algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
            if algo_id is None:
                # try to index this algorithm (reuse the page fetched above
                # instead of re-fetching it)
                algo_id = index_corresponding_algorithm(wikipage, link,
                                                        page.page_title)
                if algo_id is None:
                    continue
            ids.append(algo_id)
    if len(ids) > 0:
        rd.hset('rosetta-mapping-success', page.page_title, json.dumps(ids))
        return ids
    # links existed but none resolved to an indexed algorithm
    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    return None
def index_wiki_page(title, depth, visited):
    """Recursively index a Wikipedia page (category or member page).

    Returns a pair (algo_id, cate_id); -1 marks "not applicable" for either
    slot. Category pages recurse into their members up to
    pw.MAX_CATEGORY_DEPTH; member pages recognized as algorithm pages are
    added to the algorithm table.

    NOTE(review): relies on module-level UPDATING_WIKI, wiki, pw,
    mark_visited and the index_* helpers defined elsewhere in this file.
    """
    print 'looking at page %s, at depth %d:' % (title, depth)
    # -1 acts as the "no id produced" sentinel for both return slots
    algo_id = -1
    cate_id = -1
    if title in visited and not UPDATING_WIKI:
        # don't need to revisit any page if we are not updating for new algos
        print 'visited'
        return get_ids_of_visited_wiki_page(title)
    if pw.is_category_title(title):
        # is a category page; only descend while within the depth budget
        if depth < pw.MAX_CATEGORY_DEPTH:
            page = wiki.categorypage(title)
            if page is None:
                print '-> category not found'
                mark_visited(title, visited)
                return (algo_id, cate_id)
            print '-> category'
            child_algo_ids = list()
            child_cate_ids = list()
            # recurse into every member; collect the ids of algorithm pages
            # and algorithm sub-categories found below this node
            for member in page.categorymembers:
                (child_algo_id, child_cate_id) = index_wiki_page(member, depth + 1, visited)
                if child_algo_id != -1:
                    child_algo_ids.append(child_algo_id)
                if child_cate_id != -1:
                    child_cate_ids.append(child_cate_id)
            if len(child_algo_ids) == 0 and len(child_cate_ids) == 0:
                # no algorithm content anywhere below -> not an algorithm
                # category, ignore it
                mark_visited(title, visited)
                return (-1, -1)
            # add self to category table, and update cate_id
            cate_id = index_wiki_category_entry(page, child_algo_ids, child_cate_ids)
    else:
        # is a member (article) page
        page = pw.get_wiki_page(title)
        if page is None:
            print '-> member page not found'
            mark_visited(title, visited)
            return (algo_id, cate_id)
        if pw.is_algorithm_page(page):
            print '-> algorithm page'
            # add this algorithm to the algorithm table
            algo_id = index_wiki_algorithm_entry(page, title, visited)
        else:
            print '-> member page of other stuff'
    mark_visited(title, visited)
    return (algo_id, cate_id)
def index_wiki_category_entry(page, algo_ids, subcate_ids):
    """Index a Wikipedia category as a 'category' document in Elasticsearch.

    Stores the category name, its algorithm member ids and sub-category ids;
    tries to attach a tag line and description taken from a related article's
    summary. Returns the Elasticsearch document id for a newly created entry;
    returns None (implicitly) when the document already exists.

    NOTE(review): this function is defined TWICE in this file with identical
    logic — at import time the later definition silently shadows this one;
    consider removing one copy.
    """
    # page title -> category title
    # e.g. 'Category:abcd' -> 'abcd' (strip the 9-char 'Category:' prefix)
    title = page.title[9:]
    # ignore=404 makes a missing document return found=False instead of raising
    result = es.get(index=INDEX_NAME, doc_type='category', id=normalize(title), ignore=404)
    if not result['found']:
        body = {'name': title, 'algorithms': algo_ids, 'children': subcate_ids}
        # try to get category description from the corresponding article page
        corres_page = pw.get_wiki_page(title)
        if corres_page is None:
            # if there's no corresponding article page,
            # get category description from the category page itself
            corres_page = pw.get_wiki_page(page.title)
            if corres_page.summary == '':
                # category page has no summary of its own; fall back to the
                # first page it links to
                try:
                    corres_page = pw.get_wiki_page(corres_page.links[0])
                except KeyError:
                    # if there are no links, KeyError is raised;
                    # set corres_page to None,
                    # because an empty summary would cause an error later
                    corres_page = None
        if corres_page is not None:
            body['tag_line'] = get_tag_line(corres_page.summary)
            body['description'] = corres_page.summary
        retval = es.index(index=INDEX_NAME, doc_type='category', id=normalize(title), body=body)
        return retval['_id']
    else:
        # already indexed; NOTE(review): this only logs — no actual update of
        # the stored algo/cate lists happens, and None is returned
        print page.title, 'updated with new algos/cates!'
def index_wiki_category_entry(page, algo_ids, subcate_ids):
    """Index a Wikipedia category as a 'category' document in Elasticsearch.

    Stores the category name, its algorithm member ids and sub-category ids;
    tries to attach a tag line and description taken from a related article's
    summary. Returns the Elasticsearch document id for a newly created entry;
    returns None (implicitly) when the document already exists.

    NOTE(review): this is the SECOND definition of this function in the file
    (same logic as the earlier copy); it shadows the first at import time —
    consider deleting one of them.
    """
    # page title -> category title
    # e.g. 'Category:abcd' -> 'abcd' (strip the 9-char 'Category:' prefix)
    title = page.title[9:]
    # ignore=404 makes a missing document return found=False instead of raising
    result = es.get(index=INDEX_NAME, doc_type='category', id=normalize(title), ignore=404)
    if not result['found']:
        body = {
            'name': title,
            'algorithms': algo_ids,
            'children': subcate_ids
        }
        # try to get category description from the corresponding article page
        corres_page = pw.get_wiki_page(title)
        if corres_page is None:
            # if there's no corresponding article page,
            # get category description from the category page itself
            corres_page = pw.get_wiki_page(page.title)
            if corres_page.summary == '':
                # category page has no summary of its own; fall back to the
                # first page it links to
                try:
                    corres_page = pw.get_wiki_page(corres_page.links[0])
                except KeyError:
                    # if there are no links, KeyError is raised;
                    # set corres_page to None,
                    # because an empty summary would cause an error later
                    corres_page = None
        if corres_page is not None:
            body['tag_line'] = get_tag_line(corres_page.summary)
            body['description'] = corres_page.summary
        retval = es.index(index=INDEX_NAME, doc_type='category', id=normalize(title), body=body)
        return retval['_id']
    else:
        # already indexed; NOTE(review): this only logs — no actual update of
        # the stored algo/cate lists happens, and None is returned
        print page.title, 'updated with new algos/cates!'
def nel_title_suggest(page_title, auto_suggest=True): wikipage = get_wiki_page(page_title, auto_suggest) if wikipage is not None: # check if indexed id = get_id_of_corresponding_algorithm(page_title, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, page_title, page_title) if id is not None: rd.hset('rosetta-mapping-success', page_title, json.dumps([id])) rd.sadd('rosetta-mapping-success-wikipedia-autosuggest', page_title) safe_print(id) print '--second' return [id]
def nel_wikilinks_match_all(wikilinks, page_title): ids = list() for link in wikilinks: wikipage = get_wiki_page(link) if wikipage is not None and is_algorithm_page(wikipage): # check if indexed id = get_id_of_corresponding_algorithm(link, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, link, page_title) if id is None: continue ids.append(id) if len(ids) > 0: rd.hset('rosetta-mapping-success', page_title, json.dumps(ids)) rd.sadd('rosetta-mapping-success-all-algo-links', page_title) safe_print(ids) print '--all-link' return ids
def nel_title_crosswikis(page_title): query = "SELECT cprob, entity FROM queries WHERE anchor = %s" suggested_wikilinks = list(session.execute(query, [page_title])) suggested_wikilinks = sorted(suggested_wikilinks, key=lambda tup: tup[0]) if len(suggested_wikilinks) > 0: # get the most confident link toplink = suggested_wikilinks[0][1] wikipage = get_wiki_page(toplink.replace('_', ' ')) if wikipage is not None: # check if indexed id = get_id_of_corresponding_algorithm(toplink, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, toplink, page_title) if id is not None: rd.hset('rosetta-mapping-success', page_title, json.dumps([id])) rd.sadd('rosetta-mapping-success-crosswikis', page_title) safe_print(id) print '--third' return [id]
def nel_wikilinks_fuzzy(wikilinks, page_title):
    """Entity-link a task via wikilinks fuzzily matched to the task name.

    Candidates come from get_sorted_similar_links, ordered by fuzzy-match
    confidence; the first one that resolves to (or can be indexed as) an
    algorithm entry wins. Records the outcome in Redis. Returns [id] on
    success, None (implicitly) otherwise.
    """
    if len(wikilinks) == 0:
        # no wiki links at all -> record the failure and fall through (None)
        rd.sadd('rosetta-mapping-error-no-wiki-links', page_title)
    else:
        # first, try wikilinks that have titles similar to the task name;
        # these links are sorted by confidence of fuzzy matching
        for link in get_sorted_similar_links(page_title, wikilinks):
            # check if already indexed
            id = get_id_of_corresponding_algorithm(link, page_title)
            if id is None:
                # try to index this algorithm (skip unfetchable pages)
                wikipage = get_wiki_page(link)
                if wikipage is not None:
                    id = index_corresponding_algorithm(wikipage, link, page_title)
                    safe_print(id)
            if id is not None:
                # success: persist the mapping and stop at the best match
                rd.hset('rosetta-mapping-success', page_title, json.dumps([id]))
                rd.sadd('rosetta-mapping-similars-success', page_title)
                safe_print(id)
                print '--first'
                return [id]