def get_id_of_corresponding_algorithm(linktitle, page_title, fuzzy=False):
    """Return the id of the indexed algorithm matching ``linktitle``.

    Tries an exact id lookup first; when ``fuzzy`` is True, falls back to
    an Elasticsearch fuzzy match on the algorithm name.  Records the task
    page title in redis and returns None when nothing is found.
    """
    algo_id = convert_to_id(linktitle)
    result = es.get(index=INDEX_NAME, doc_type='algorithm', id=algo_id,
                    ignore=404)
    if result['found']:
        return algo_id
    if fuzzy:
        # BUG FIX: the fuzzy query used to be the hard-coded string
        # "merge sort"; it must search for the link title we were given.
        body = {
            "query": {
                "match": {
                    "name": {
                        "query": linktitle,
                        "fuzziness": "auto"
                    }
                }
            }
        }
        r = es.search(index=INDEX_NAME, doc_type='algorithm', body=body,
                      size=1)
        if r['hits']['total'] > 0:
            # BUG FIX: in an Elasticsearch response the hit list lives
            # under hits.hits, not directly under hits.
            return r['hits']['hits'][0]['_id']
    rd.sadd('rosetta-mapping-error-correspage-notfound', page_title)
    return None
def nel_title_elasticsearch(page_title):
    # Intended to link page_title to an indexed algorithm via an
    # Elasticsearch search on name / alt_name, but the search itself is
    # still unimplemented.
    # TODO search on name and alt_name
    # NOTE(review): nothing here assigns `id`, so the name below resolves
    # to the `id` builtin -- the guard is therefore always true and
    # json.dumps([id]) would raise TypeError.  This stub must not be
    # called until the TODO above is implemented.
    if id is not None:
        rd.hset('rosetta-mapping-success', page_title, json.dumps([id]))
        rd.sadd('rosetta-mapping-success-wikipedia-autosuggest', page_title)
        safe_print(id)
        print '--second'
        return [id]
def index_corresponding_algorithm(wikipage, linktitle, page_title):
    """Index ``wikipage`` as a new algorithm entry and return its id.

    When no page is given, records the failure in redis and returns None.
    """
    if wikipage is None:
        rd.sadd('rosetta-mapping-error-indexing-error',
                page_title + ' -> ' + linktitle)
        return None
    # NOTE(review): `visitedwiki` is presumably a module-level cache --
    # confirm it is defined before this is called.
    return index_wiki_algorithm_entry(wikipage, linktitle, visitedwiki)
def get_id_of_corresponding_algorithm(linktitle, page_title):
    """Look up the already-indexed algorithm whose id derives from
    ``linktitle``; return the id, or None after recording the miss."""
    candidate = convert_to_id(linktitle)
    lookup = es.get(index=INDEX_NAME, doc_type='algorithm',
                    id=candidate, ignore=404)
    if lookup['found']:
        return candidate
    rd.sadd('rosetta-mapping-error-correspage-notfound', page_title)
    return None
def get_sorted_similar_links(taskname, links):
    """Return the links fuzzily similar to ``taskname``, best match first.

    Uses fuzzywuzzy's process.extract and keeps only matches whose
    confidence exceeds FUZZY_THRESHOLD.  Encoding failures are recorded
    in redis and yield an empty list.
    """
    taskname = taskname.encode('utf8')
    choices = [link.encode('utf8') for link in links]
    try:
        res = process.extract(taskname, choices)
    except Exception as e:
        rd.sadd('rosetta-mapping-taskname-coding-error', str(e) + taskname)
        return []
    if res is None:
        return []
    # BUG FIX: the old code dropped the confidence in the filter and then
    # sorted the surviving link *strings* by their second character.
    # Keep the (link, confidence) pairs, order by confidence descending,
    # then strip the confidence.
    kept = [(link, confidence) for (link, confidence) in res
            if confidence > FUZZY_THRESHOLD]
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return [link for (link, _) in kept]
def get_sorted_similar_links(taskname, links):
    """Return the links fuzzily similar to ``taskname``, most confident
    first (fuzzywuzzy matches above FUZZY_THRESHOLD only).

    Encoding failures are recorded in redis and yield an empty list.
    """
    taskname = taskname.encode('utf8')
    choices = [link.encode('utf8') for link in links]
    try:
        matches = process.extract(taskname, choices)
    except Exception as e:
        rd.sadd('rosetta-mapping-taskname-coding-error', str(e) + taskname)
        return []
    if matches is None:
        return []
    # BUG FIX: previously the confidence was discarded before sorting, so
    # the list was ordered by each link's second character rather than by
    # match confidence.  Sort the pairs by confidence, then project.
    confident = sorted(
        ((link, score) for (link, score) in matches
         if score > FUZZY_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True)
    return [link for (link, _) in confident]
def nel_title_suggest(page_title, auto_suggest=True): wikipage = get_wiki_page(page_title, auto_suggest) if wikipage is not None: # check if indexed id = get_id_of_corresponding_algorithm(page_title, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, page_title, page_title) if id is not None: rd.hset('rosetta-mapping-success', page_title, json.dumps([id])) rd.sadd('rosetta-mapping-success-wikipedia-autosuggest', page_title) safe_print(id) print '--second' return [id]
def nel_wikilinks_match_all(wikilinks, page_title): ids = list() for link in wikilinks: wikipage = get_wiki_page(link) if wikipage is not None and is_algorithm_page(wikipage): # check if indexed id = get_id_of_corresponding_algorithm(link, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, link, page_title) if id is None: continue ids.append(id) if len(ids) > 0: rd.hset('rosetta-mapping-success', page_title, json.dumps(ids)) rd.sadd('rosetta-mapping-success-all-algo-links', page_title) safe_print(ids) print '--all-link' return ids
def get_corres_wikipedia_algo_id(page): wikilinks = [ linktitle for (linksite, linktitle) in list(page.iwlinks()) if linksite == 'wp' ] # first, try wikilinks that has titles similar to the task name, # these links are sorted by confidence of fuzzy matching id = nel_wikilinks_fuzzy(wikilinks, page.page_title) if id is not None: return [id] # then use wikipedia api's auto-suggest to find corresponding # wikipedia page id = nel_title_suggest(page.page_title, False) if id is not None: return [id] # # then use elasticsearch fuzzy match task to indexed algorithm # # check if indexed # id = nel_title_elasticsearch(page.page_title) # if id is not None: # return id # then, use crosswikis dictionary to get the most possible wiki link id = nel_title_crosswikis(page.page_title) if id is not None: return [id] # # finally, if none of the links is similar to the task name, # # 1, store the task description # # 2, relate the implementation with ALL wiki algorithms pages # # mentioned in description # ids = nel_wikilinks_match_all(wikilinks, page.page_title) # if len(ids) > 0: # return ids rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title) print '' return None
def get_corres_wikipedia_algo_id(page): wikilinks = [linktitle for (linksite, linktitle) in list(page.iwlinks()) if linksite == 'wp'] # first, try wikilinks that has titles similar to the task name, # these links are sorted by confidence of fuzzy matching id = nel_wikilinks_fuzzy(wikilinks, page.page_title) if id is not None: return [id] # then use wikipedia api's auto-suggest to find corresponding # wikipedia page id = nel_title_suggest(page.page_title, False) if id is not None: return [id] # # then use elasticsearch fuzzy match task to indexed algorithm # # check if indexed # id = nel_title_elasticsearch(page.page_title) # if id is not None: # return id # then, use crosswikis dictionary to get the most possible wiki link id = nel_title_crosswikis(page.page_title) if id is not None: return [id] # # finally, if none of the links is similar to the task name, # # 1, store the task description # # 2, relate the implementation with ALL wiki algorithms pages # # mentioned in description # ids = nel_wikilinks_match_all(wikilinks, page.page_title) # if len(ids) > 0: # return ids rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title) print '' return None
def nel_title_crosswikis(page_title): query = "SELECT cprob, entity FROM queries WHERE anchor = %s" suggested_wikilinks = list(session.execute(query, [page_title])) suggested_wikilinks = sorted(suggested_wikilinks, key=lambda tup: tup[0]) if len(suggested_wikilinks) > 0: # get the most confident link toplink = suggested_wikilinks[0][1] wikipage = get_wiki_page(toplink.replace('_', ' ')) if wikipage is not None: # check if indexed id = get_id_of_corresponding_algorithm(toplink, page_title) if id is None: # try to index this algorithm id = index_corresponding_algorithm(wikipage, toplink, page_title) if id is not None: rd.hset('rosetta-mapping-success', page_title, json.dumps([id])) rd.sadd('rosetta-mapping-success-crosswikis', page_title) safe_print(id) print '--third' return [id]
def nel_wikilinks_fuzzy(wikilinks, page_title): if len(wikilinks) == 0: # no any wiki links rd.sadd('rosetta-mapping-error-no-wiki-links', page_title) else: # first, try wikilinks that has titles similar to the task name, # these links are sorted by confidence of fuzzy matching for link in get_sorted_similar_links(page_title, wikilinks): # check if indexed id = get_id_of_corresponding_algorithm(link, page_title) if id is None: # try to index this algorithm wikipage = get_wiki_page(link) if wikipage is not None: id = index_corresponding_algorithm(wikipage, link, page_title) safe_print(id) if id is not None: rd.hset('rosetta-mapping-success', page_title, json.dumps([id])) rd.sadd('rosetta-mapping-similars-success', page_title) safe_print(id) print '--first' return [id]
def get_corres_wikipedia_algo_id(page):
    """Resolve the rosetta task page to indexed wikipedia algorithm ids.

    Strategy: (1) return the first wiki link whose title fuzzily matches
    the task name; (2) otherwise relate the task to ALL algorithm pages
    mentioned in its description.  Returns a list of ids, or None.
    """
    wikilinks = [linktitle
                 for (linksite, linktitle) in list(page.iwlinks())
                 if linksite == 'wp']
    if not wikilinks:
        # no wiki links at all
        rd.sadd('rosetta-mapping-error-no-wiki-links', page.page_title)
        return None
    # first, try wikilinks whose titles are similar to the task name,
    # ordered by fuzzy-match confidence
    for link in get_sorted_similar_links(page.page_title, wikilinks):
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # not indexed yet -- try to index this algorithm
            wikipage = get_wiki_page(link)
            algo_id = index_corresponding_algorithm(wikipage, link,
                                                    page.page_title)
        if algo_id is None:
            continue
        rd.hset('rosetta-mapping-success', page.page_title,
                json.dumps([algo_id]))
        rd.sadd('rosetta-mapping-similars-success', page.page_title)
        return [algo_id]
    # then, if none of the links resembles the task name, relate the
    # implementation with ALL wiki algorithm pages in the description
    ids = []
    for link in wikilinks:
        wikipage = get_wiki_page(link)
        if wikipage is None or not is_algorithm_page(wikipage):
            continue
        algo_id = get_id_of_corresponding_algorithm(link, page.page_title)
        if algo_id is None:
            # BUG FIX: the old code fetched the wiki page a second time
            # here; reuse the page fetched at the top of the loop.
            algo_id = index_corresponding_algorithm(wikipage, link,
                                                    page.page_title)
        if algo_id is not None:
            ids.append(algo_id)
    if ids:
        rd.hset('rosetta-mapping-success', page.page_title, json.dumps(ids))
        return ids
    rd.sadd('rosetta-mapping-error-undefinable-wikilinks', page.page_title)
    return None