def get_most_popular_articles(source, campaign=''):
    days = configuration.get_config_int('popular_pageviews', 'days')
    date_format = configuration.get_config_value('popular_pageviews',
                                                 'date_format')
    query = configuration.get_config_value('popular_pageviews', 'query')
    date = (datetime.datetime.utcnow() -
            datetime.timedelta(days=days)).strftime(date_format)
    query = query.format(source=source, date=date)
    try:
        data = get(query)
    except ValueError:
        log.info('pageview query failed')
        return []
    if 'items' not in data or len(
            data['items']) < 1 or 'articles' not in data['items'][0]:
        log.info('pageview data is not in a known format')
        return []
    articles = []
    for article in data['items'][0]['articles']:
        articles.append({
            'title': article['article'],
            'pageviews': article['views']
        })
    return articles

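# Usage sketch for get_most_popular_articles (illustrative values only; the
# actual query template and day offset come from the 'popular_pageviews'
# section of the service configuration):
#
#     articles = get_most_popular_articles('en')
#     # -> [{'title': 'Earth', 'pageviews': 123456}, ...] on success,
#     #    [] on a failed query or an unrecognized response format
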
def home():
    s = request.args.get('s', '')
    t = request.args.get('t', '')
    seed = request.args.get('seed', '')
    search = request.args.get('search', '')
    rank_method = request.args.get('rank_method', '')
    campaign = request.args.get('campaign', '')
    campaign_info_file = ''
    pairs = language_pairs.get_language_pairs()

    # WikiGapFinder specific settings. TODO: these should be in a config file.
    if campaign == 'WikiGapFinder':
        s = s or 'en'
        t = t or 'sv'
        campaign_info_file = 'gf-wikigapfinder-campaign-info.tag'

    return render_template(
        'index.html',
        language_pairs=json.dumps(pairs),
        language_to_domain_mapping=json.dumps(
            language_pairs.get_language_to_domain_mapping()),
        s=urllib.parse.quote(s),
        t=urllib.parse.quote(t),
        seed=urllib.parse.quote(seed),
        search=urllib.parse.quote(search),
        rank_method=urllib.parse.quote(rank_method),
        campaign=urllib.parse.quote(campaign),
        campaign_info_file=campaign_info_file,
        event_logger_url=configuration.get_config_value(
            'endpoints', 'event_logger'),
        default_search=configuration.get_config_value(
            'gapfinder', 'default_search'))

def initialize_logging():
    logging.basicConfig(
        format=configuration.get_config_value('logging', 'format'),
        level=logging.WARNING)
    log = logging.getLogger(recommendation.__name__)
    log.setLevel(logging.getLevelName(
        configuration.get_config_value('logging', 'level')))

def initialize_embedding(optimize=True):
    global _embedding
    embedding_path = configuration.get_config_value(
        'related_articles', 'embedding_path', fallback='')
    embedding_package = configuration.get_config_value(
        'related_articles', 'embedding_package', fallback='')
    embedding_name = configuration.get_config_value(
        'related_articles', 'embedding_name', fallback='')
    optimized_embedding_path = configuration.get_config_value(
        'related_articles', 'optimized_embedding_path')
    minimum_similarity = configuration.get_config_float(
        'related_articles', 'minimum_similarity')
    _embedding = WikiEmbedding(minimum_similarity)
    _embedding.initialize(embedding_path, embedding_package, embedding_name,
                          optimize, optimized_embedding_path)

def test_correct_endpoints_are_used(client, query_url):
    client.get(query_url)
    called_urls = [r.request.url for r in responses.calls]
    expected_urls = [
        configuration.get_config_value('endpoints', 'language_pairs'),
        configuration.get_config_value('endpoints', 'event_logger'),
        configuration.get_config_value('endpoints', 'related_articles')
    ]
    for expected_url in expected_urls:
        assert any(expected_url in url for url in called_urls)
    assert 3 == len(responses.calls)

def get_sections_in_article(title):
    endpoint = configuration.get_config_value('endpoints', 'restbase')
    path = configuration.get_config_value('sections_query', 'path')
    endpoint = endpoint.format(source='en')
    path = path.format(title=title)
    url = endpoint + path
    try:
        result = fetcher.get(url)
    except ValueError:
        return {title: []}
    sections = {title: [item['line'].upper()
                        for item in result.get('sections', [])
                        if 'line' in item]}
    return sections

def log_api_request(source, target, seed=None, search=None, user_agent=None,
                    **kwargs):
    event = dict(timestamp=int(time.time()),
                 sourceLanguage=source,
                 targetLanguage=target)
    if seed:
        event['seed'] = seed
    if search:
        event['searchAlgorithm'] = search
    payload = dict(schema='TranslationRecommendationAPIRequests',
                   revision=16261139,
                   wiki='metawiki',
                   event=event)
    url = configuration.get_config_value('endpoints', 'event_logger')
    url += '?' + urllib.parse.quote_plus(json.dumps(payload))
    log.info('Logging event: %s', json.dumps(payload))
    headers = {}
    if user_agent is not None:
        headers['User-Agent'] = user_agent
    try:
        requests.get(url, headers=headers)
    except requests.exceptions.RequestException:
        pass

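# Sketch of what log_api_request sends (values illustrative; the endpoint is
# read from configuration). The payload is JSON-serialized and percent-encoded
# into the query string, and delivery failures are deliberately swallowed so
# logging never breaks a request:
#
#     log_api_request('en', 'sv', seed='Earth', user_agent='my-client/1.0')
#     # GET <event_logger>?<urlencoded JSON:
#     #     {"schema": "TranslationRecommendationAPIRequests", ...}>
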
def get_wikidata_sitelinks(source, target, titles):
    """
    Returns a dictionary mapping from titles to wikidata ids
    for the articles in source missing in target
    """
    endpoint = configuration.get_config_value('endpoints', 'wikidata')
    params = configuration.get_config_dict('wikidata_params')
    params['sites'] = params['sites'].format(source=source)
    params['titles'] = '|'.join(titles)
    title_id_dict = {}
    try:
        data = post(endpoint, data=params)
    except ValueError:
        log.info('Bad Wikidata API response')
        return title_id_dict
    source_wiki = '{}wiki'.format(source)
    target_wiki = '{}wiki'.format(target)
    if 'entities' not in data:
        log.info('None of the titles have a Wikidata Item')
        return title_id_dict
    for wikidata_id, v in data['entities'].items():
        sitelinks = v.get('sitelinks', None)
        if sitelinks:
            if source_wiki in sitelinks and target_wiki not in sitelinks:
                title = sitelinks[source_wiki]['title'].replace(' ', '_')
                title_id_dict[title] = wikidata_id
    if len(title_id_dict) == 0:
        log.info('None of the source articles missing in the target')
    return title_id_dict

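# A trimmed wbgetentities response that get_wikidata_sitelinks can consume
# (hypothetical data): 'Earth' has an enwiki sitelink but no svwiki one, so
# it is reported as missing in the target.
#
#     {'entities': {'Q2': {'sitelinks': {'enwiki': {'site': 'enwiki',
#                                                   'title': 'Earth'}}}}}
#     # get_wikidata_sitelinks('en', 'sv', ['Earth']) -> {'Earth': 'Q2'}
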
def setup_function(function):
    language_pairs._language_pairs = None
    responses.add(responses.GET,
                  configuration.get_config_value('endpoints',
                                                 'language_pairs'),
                  json=LANGUAGE_PAIRS,
                  status=200)

def get_related_articles(source, seed):
    endpoint = configuration.get_config_value('endpoints',
                                              'related_articles')
    try:
        response = get(endpoint, dict(source=source, seed=seed, count=500))
    except ValueError:
        return []
    return response

def get_pageview_query_url(source, title):
    start_days = configuration.get_config_int('single_article_pageviews',
                                              'start_days')
    end_days = configuration.get_config_int('single_article_pageviews',
                                            'end_days')
    query = configuration.get_config_value('single_article_pageviews',
                                           'query')
    start = get_relative_timestamp(start_days)
    end = get_relative_timestamp(end_days)
    query = query.format(source=source, title=title, start=start, end=end)
    return query

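# Assuming the configured 'single_article_pageviews' query template follows
# the Wikimedia pageviews REST API (an assumption; the real template lives in
# the config file), the built URL would resemble:
#
#     https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/
#         en.wikipedia/all-access/user/Earth/daily/<start>/<end>
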
def test_getter_queries_correct_url():
    add_response()
    run_getter()
    assert 1 == len(responses.calls)
    assert configuration.get_config_value(
        'endpoints', 'pageviews') in responses.calls[0].request.url
    assert data_fetcher.get_pageview_query_url(
        SOURCE, TITLE) == responses.calls[0].request.url

def build_wiki_search(source, seed, count, morelike):
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('wiki_search_params')
    params['srlimit'] = count
    if morelike:
        seed = 'morelike:' + seed
    params['srsearch'] = seed
    return endpoint, params

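# 'morelike:' is CirrusSearch syntax for "articles similar to this one", so
# build_wiki_search('en', 'Earth', 10, morelike=True) returns the English
# Wikipedia API endpoint plus params whose 'srsearch' is 'morelike:Earth' and
# whose 'srlimit' is 10 (the remaining search params come from configuration).
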
def set_related_articles_response():
    related_articles_endpoint = configuration.get_config_value(
        'endpoints', 'related_articles')
    responses.add(responses.GET,
                  re.compile(r'{}.'.format(related_articles_endpoint)),
                  body=json.dumps(RELATED_ARTICLE_RESPONSE),
                  status=200,
                  content_type='application/json')

def get_entities(params):
    endpoint = configuration.get_config_value('endpoints', 'wikidata')
    try:
        data = fetcher.post(endpoint, data=params)
        if 'warnings' in data:
            raise ValueError()
    except ValueError:
        log.info('Bad Wikidata API response')
        return {}
    return data.get('entities', {})

def initialize_language_pairs():
    global _language_pairs
    if _language_pairs is None:
        language_pairs_endpoint = configuration.get_config_value(
            'endpoints', 'language_pairs')
        try:
            result = requests.get(language_pairs_endpoint)
            result.raise_for_status()
            pairs = result.json()
        except requests.exceptions.RequestException as e:
            raise ConnectionError('Unable to load data from {}. {}'.format(
                language_pairs_endpoint, e))
        _language_pairs = pairs

def query_pageviews(self, s):
    """Query the pageview API and parse the results"""
    days = configuration.get_config_int('popular_pageviews', 'days')
    date_format = configuration.get_config_value('popular_pageviews',
                                                 'date_format')
    query = configuration.get_config_value('popular_pageviews', 'query')
    date = (datetime.datetime.utcnow() -
            datetime.timedelta(days=days)).strftime(date_format)
    query = query.format(source=s, date=date)
    try:
        data = data_fetcher.get(query)
    except ValueError:
        return []
    article_pv_tuples = []
    try:
        for d in data['items'][0]['articles']:
            article_pv_tuples.append((d['article'], d['views']))
    # Narrowed from a bare 'except': a malformed response surfaces as one of
    # these lookup errors.
    except (KeyError, IndexError, TypeError):
        log.info('Could not get most popular articles for %s from pageview '
                 'API. Try using a seed article.', s)
    return article_pv_tuples

def home():
    s = request.args.get('s')
    t = request.args.get('t')
    seed = request.args.get('seed')
    pairs = language_pairs.get_language_pairs()
    return render_template(
        'index.html',
        language_pairs=json.dumps(pairs),
        language_to_domain_mapping=json.dumps(
            language_pairs.get_language_to_domain_mapping()),
        s=s,
        t=t,
        seed=seed,
        event_logger_url=configuration.get_config_value(
            'endpoints', 'event_logger'))

def get_disambiguation_pages(source, titles):
    """
    Returns the subset of titles that are disambiguation pages
    """
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('disambiguation_params')
    params['titles'] = '|'.join(titles)
    try:
        data = post(endpoint, data=params)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []
    pages = data.get('query', {}).get('pages', {}).values()
    return list(set(page['title'].replace(' ', '_')
                    for page in pages
                    if 'disambiguation' in page.get('pageprops', {})))

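# The 'disambiguation_params' config dict presumably resembles the MediaWiki
# query below (an assumption; pageprops/disambiguation is the standard way to
# detect disambiguation pages, and matches the 'pageprops' check above):
#
#     {'action': 'query', 'prop': 'pageprops',
#      'ppprop': 'disambiguation', 'format': 'json'}
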
def get_categories_for_article(title):
    endpoint = configuration.get_config_value('endpoints', 'wikipedia')
    params = {
        'action': 'query',
        'prop': 'categories',
        'format': 'json',
        'titles': title
    }
    endpoint = endpoint.format(source='en')
    try:
        result = fetcher.get(endpoint, params=params)
    except ValueError:
        return {title: []}
    items = list(result.get('query', {}).get('pages', {}).values())
    if len(items) != 1:
        return {title: []}
    categories = {title: [item['title'].replace(' ', '_')
                          for item in items[0].get('categories', [])
                          if 'title' in item]}
    return categories

def initialize_language_pairs():
    global _language_pairs
    if _language_pairs is None:
        language_pairs_endpoint = configuration.get_config_value(
            'endpoints', 'language_pairs')
        try:
            result = requests.get(language_pairs_endpoint)
            result.raise_for_status()
            pairs = result.json()
            # The symmetric difference is truthy unless the keys are exactly
            # {'source', 'target'}.
            if {'source', 'target'} ^ set(pairs.keys()):
                raise ValueError()
            if not all(isinstance(v, list) for v in pairs.values()):
                raise ValueError()
            _language_pairs = pairs
        except requests.exceptions.RequestException as e:
            log.warning('Unable to load data from {}. {}'.format(
                language_pairs_endpoint, e))
        except (AttributeError, ValueError):
            log.warning('language pairs were invalid')

def get_category_members(source, category):
    log.debug(category)
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('category_search_params')
    params['cmtitle'] = category
    members = dict(pages=set(), subcats=set())
    try:
        response = get(endpoint, params=params)
    except ValueError:
        # Return the (empty) members dict rather than a list so callers
        # always receive the same shape.
        return members
    results = response.get('query', {}).get('categorymembers', [])
    for member in results:
        if member.get('type', None) == 'page':
            members['pages'].add(member.get('title'))
        if member.get('type', None) == 'subcat':
            members['subcats'].add(member.get('title'))
    return members

def log_api_request(source, target, seed=None, search=None, **kwargs):
    event = dict(timestamp=int(time.time()),
                 sourceLanguage=source,
                 targetLanguage=target)
    if seed:
        event['seed'] = seed
    if search:
        event['searchAlgorithm'] = search
    payload = dict(schema='TranslationRecommendationAPIRequests',
                   revision=15405506,
                   wiki='metawiki',
                   event=event)
    url = configuration.get_config_value('endpoints', 'event_logger')
    url += '?' + urllib.parse.quote_plus(json.dumps(payload))
    log.info('Logging event: %s', json.dumps(payload))
    try:
        requests.get(url)
    except requests.exceptions.RequestException:
        pass

def test_configuration():
    assert recommendation.__name__ == configuration.get_config_value(
        'related_articles', 'embedding_package')

def get_expected_endpoint(the_filter):
    if the_filter is filters.filter_by_missing:
        return configuration.get_config_value('endpoints', 'wikidata')
    if the_filter is filters.filter_by_disambiguation:
        return configuration.get_config_value(
            'endpoints', 'wikipedia').format(source=SOURCE)

def get_relative_timestamp(relative_days):
    date_format = configuration.get_config_value('single_article_pageviews',
                                                 'date_format')
    return (datetime.datetime.utcnow() +
            datetime.timedelta(days=relative_days)).strftime(date_format)

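# Example: with a date_format of '%Y%m%d00' (hypothetical; the real format is
# in config), get_relative_timestamp(-7) returns the timestamp for seven days
# ago, e.g. '2024010100', while positive offsets point into the future.
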
def resource(filename):
    return send_from_directory(
        configuration.get_config_value('gapfinder', 'resource_path'),
        filename=filename)

def test_correct_endpoint_is_requested():
    responses.add(responses.GET, re.compile('.'), body='', status=200)
    event_logger.log_api_request('a', 'b')
    assert 1 == len(responses.calls)
    assert configuration.get_config_value(
        'endpoints', 'event_logger') in responses.calls[0].request.url

def test_language_pairs_when_fetch_is_invalid(json_value):
    responses.reset()
    responses.add(responses.GET,
                  configuration.get_config_value('endpoints',
                                                 'language_pairs'),
                  json=json_value,
                  status=200)
    assert None is language_pairs.get_language_pairs()
    assert True is language_pairs.is_valid_language_pair('any', 'combination')
