def article_downloads(table_id, from_date, to_date, cached=False, only_cached=False):
    """Return article download data, either from the on-disk cache or by querying Google.

    Args:
        table_id: GA table identifier, passed through to the query builder.
        from_date: start of the date range.
        to_date: end of the date range.
        cached: when true, and a results file already exists at the expected
            output path, that file is read instead of querying.
        only_cached: when true and no cached file was used, no query is made
            and the result is computed from an empty row set.

    Returns:
        An empty dict when `valid_downloads_dt_pair` rejects the date range,
        otherwise whatever the picked module's `event_counts` returns for the
        "rows" of the raw data.
    """
    if not valid_downloads_dt_pair((from_date, to_date)):
        LOG.warning(
            "given date range %r for downloads is older than known inception %r, skipping",
            (ymd(from_date), ymd(to_date)),
            DOWNLOADS_INCEPTION,
        )
        return {}

    path = output_path("downloads", from_date, to_date)
    module = module_picker(from_date, to_date)
    if cached and os.path.exists(path):
        # use a context manager so the cache file handle is always closed
        with open(path, "r") as fh:
            raw_data = json.load(fh)
    elif only_cached:
        # no cache exists and we've been told to only use cache.
        # no results found.
        raw_data = {}
    else:
        # talk to google
        query_map = module.event_counts_query(table_id, from_date, to_date)
        raw_data, actual_path = query_ga_write_results(query_map)
        # the file written by the query must be the one a later cached read would look for
        assert path == actual_path, "the expected output path (%s) doesn't match the path actually written to (%s)" % (
            path,
            actual_path,
        )
    return module.event_counts(raw_data.get("rows", []))
def output_path(results_type, from_date, to_date):
    """Generate the path of the results file for the given type and date range.

    Args:
        results_type: either 'views' or 'downloads'.
        from_date: start date, as a 'YYYY-MM-DD' string or a datetime object.
        to_date: end date, in the same form as `from_date`.

    Returns:
        A path under `output_dir()` of the form
        `<results_type>/<date>.json` (single day) or
        `<results_type>/<from>_<to>.json` (range), with a '.partial' suffix
        when the range ends today or later.

    Raises:
        AssertionError: if `results_type` is not 'views' or 'downloads'.
        ValueError: if string dates are not in 'YYYY-MM-DD' format.
    """
    assert results_type in ["views", "downloads"], "results type must be either 'views' or 'downloads'"
    if isinstance(from_date, str):
        # given strings. `from_date` is parsed only to validate its format.
        datetime.strptime(from_date, "%Y-%m-%d")
        to_date_dt = datetime.strptime(to_date, "%Y-%m-%d")
    else:
        # given dt objects
        to_date_dt = to_date
        from_date, to_date = ymd(from_date), ymd(to_date)

    # take a single snapshot of 'now' so the string and datetime forms
    # can never disagree (e.g. when called right at midnight)
    now_dt = datetime.now()
    now = ymd(now_dt)

    # different formatting if two different dates are provided
    if from_date == to_date:
        dt_str = to_date
    else:
        dt_str = "%s_%s" % (from_date, to_date)

    partial = ""
    if to_date == now or to_date_dt >= now_dt:
        # anything gathered today or for the future (month ranges)
        # will only ever be partial. when run again on a future day
        # there will be cache miss and the full results downloaded
        partial = ".partial"

    # ll: output/downloads/2014-04-01.json
    # ll: output/views/2014-01-01_2014-01-31.json.partial
    return join(output_dir(), results_type, dt_str + ".json" + partial)
def path_counts_query(table_id, from_date, to_date):
    """Return a GA query object that, when executed, returns raw results
    for article page views between the two given dates.

    Args:
        table_id: GA table identifier, used as the query's 'ids' value.
        from_date: datetime marking the start of the range.
        to_date: datetime marking the end of the range.

    Returns:
        A dict of query parameters requesting 'ga:pageviews' per
        'ga:pagePath', filtered to article content paths.

    Raises:
        AssertionError: if either date is not a datetime object.
    """
    assert isinstance(from_date, datetime), "'from' date must be a datetime object. received %r" % from_date
    assert isinstance(to_date, datetime), "'to' date must be a datetime object. received %r" % to_date

    # regular expression suffixes (escape special chars).
    # raw strings keep the backslashes literal without relying on
    # invalid string escape sequences such as '\.'
    suffix_list = [
        r'\.full',
        r'\.abstract',
        r'\.short',
        r'/abstract-1',
        r'/abstract-2',
    ]
    # wrap each suffix in a zero-or-one group. ll: ['(\.full)?', '(\.abstract)?', ...]
    suffix_list = ['(%s)?' % suffix for suffix in suffix_list]
    # pipe-delimit the suffix list. ll: '(\.full)?|(\.abstract)?|...'
    suffix_str = '|'.join(suffix_list)

    return {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': ymd(from_date),
        'end_date': ymd(to_date),
        'metrics': 'ga:pageviews',
        'dimensions': 'ga:pagePath',
        'sort': 'ga:pagePath',
        'filters': ','.join([
            # these filters are OR'ed
            r'ga:pagePath=~^/content/.*/e[0-9]{5}(%s)$' % suffix_str,
            r'ga:pagePath=~^/content/.*/elife\.[0-9]{5}$',
        ]),
    }
def total_traffic_monthly_query(table_id, from_date=None, to_date=None):
    """Return a GA query dict requesting 'ga:pageviews' broken down by
    'ga:year' and 'ga:month'.

    Args:
        table_id: GA table identifier, used as the query's 'ids' value.
        from_date: start of the range; falls back to `core.VIEWS_INCEPTION`
            when not given (or falsy).
        to_date: end of the range; falls back to `datetime.now()` when not
            given (or falsy).

    The requested range is widened to the first element of the first pair and
    the second element of the last pair returned by `utils.dt_month_range`
    (presumably whole-month boundaries - confirm against that helper).
    """
    if not from_date:
        from_date = core.VIEWS_INCEPTION
    if not to_date:
        to_date = datetime.now()

    months = utils.dt_month_range(from_date, to_date)
    range_start = months[0][0]
    range_end = months[-1][1]

    query = {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': ymd(range_start),
        'end_date': ymd(range_end),
        'metrics': 'ga:pageviews',
        'dimensions': 'ga:year,ga:month',
    }
    return query
def event_counts_query(table_id, from_date, to_date):
    """Build the GA query for PDF download events between the two given dates.

    Args:
        table_id: GA table identifier, used as the query's 'ids' value.
        from_date: datetime marking the start of the range.
        to_date: datetime marking the end of the range.

    Returns:
        A dict of query parameters requesting 'ga:totalEvents' per
        'ga:eventLabel'. The query itself is not executed here.

    Raises:
        AssertionError: if either date is not a datetime object.
    """
    assert isinstance(from_date, datetime), "'from' date must be a datetime object. received %r" % from_date
    assert isinstance(to_date, datetime), "'to' date must be a datetime object. received %r" % to_date

    start_date = ymd(from_date)
    end_date = ymd(to_date)

    # ';' separates 'AND' expressions, ',' separates 'OR' expressions
    event_filter = r'ga:eventAction==Download;ga:eventCategory==Article;ga:eventLabel=~pdf-article'

    query = {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': start_date,
        'end_date': end_date,
        'metrics': 'ga:totalEvents',
        'dimensions': 'ga:eventLabel',
        'sort': 'ga:eventLabel',
        'filters': event_filter,
    }
    return query
def path_counts_query(table_id, from_date, to_date):
    """Return a GA query object that, when executed, returns raw results
    for article page views between the two given dates.

    Args:
        table_id: GA table identifier, used as the query's 'ids' value.
        from_date: datetime marking the start of the range.
        to_date: datetime marking the end of the range.

    Returns:
        A dict of query parameters requesting 'ga:pageviews' per
        'ga:pagePath', filtered to article content paths.

    Raises:
        AssertionError: if either date is not a datetime object.
    """
    assert isinstance(
        from_date, datetime
    ), "'from' date must be a datetime object. received %r" % from_date
    assert isinstance(
        to_date, datetime
    ), "'to' date must be a datetime object. received %r" % to_date

    # regular expression suffixes (escape special chars).
    # written as raw strings: '\.' in a normal string literal is an invalid
    # escape sequence, even though it currently yields the same characters
    suffix_list = [
        r'\.full',
        r'\.abstract',
        r'\.short',
        r'/abstract-1',
        r'/abstract-2',
    ]

    # wrap each suffix in a zero-or-one group and pipe-delimit the result.
    # ll: '(\.full)?|(\.abstract)?|...'
    suffix_str = '|'.join('(%s)?' % suffix for suffix in suffix_list)

    return {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': ymd(from_date),
        'end_date': ymd(to_date),
        'metrics': 'ga:pageviews',
        'dimensions': 'ga:pagePath',
        'sort': 'ga:pagePath',
        'filters': ','.join([
            # these filters are OR'ed
            r'ga:pagePath=~^/content/.*/e[0-9]{5}(%s)$' % suffix_str,
            r'ga:pagePath=~^/content/.*/elife\.[0-9]{5}$',
        ]),
    }