Example #1
0
def article_downloads(table_id, from_date, to_date, cached=False, only_cached=False):
    """returns article download data either from the cache or from talking to google.

    `table_id`, `from_date` and `to_date` are handed to the query builder of the
    module chosen by `module_picker(from_date, to_date)`.

    `cached`: when true and a results file already exists at the expected
    output path, that file is read instead of querying google.

    `only_cached`: when true and the cache was not used, google is not queried
    and the raw results are treated as empty.

    returns an empty dict when the date pair is rejected by
    `valid_downloads_dt_pair`, otherwise whatever the chosen module's
    `event_counts` returns for the raw result rows.

    raises AssertionError if the path the fresh results were written to
    differs from the expected output path."""
    if not valid_downloads_dt_pair((from_date, to_date)):
        LOG.warning(
            "given date range %r for downloads is older than known inception %r, skipping",
            (ymd(from_date), ymd(to_date)),
            DOWNLOADS_INCEPTION,
        )
        return {}

    path = output_path("downloads", from_date, to_date)
    module = module_picker(from_date, to_date)

    if cached and os.path.exists(path):
        # read the cached results, making sure the file handle is closed again
        with open(path, "r") as fh:
            raw_data = json.load(fh)
    elif only_cached:
        # no cache exists and we've been told to only use cache.
        # no results found.
        raw_data = {}
    else:
        # talk to google
        query_map = module.event_counts_query(table_id, from_date, to_date)
        raw_data, actual_path = query_ga_write_results(query_map)
        # the cache lookup above relies on results landing where `output_path` says they will
        assert path == actual_path, "the expected output path (%s) doesn't match the path actually written to (%s)" % (
            path,
            actual_path,
        )

    # results without a "rows" key are treated as having no rows
    return module.event_counts(raw_data.get("rows", []))
Example #2
0
def output_path(results_type, from_date, to_date):
    """generates a path for results of the given type.

    `results_type` must be either 'views' or 'downloads'.

    `from_date` and `to_date` are either both 'YYYY-MM-DD' strings or both
    datetime objects. the type of `from_date` decides how both are handled.

    the path ends in '.json', or in '.json.partial' when the range ends
    today or later.

    raises AssertionError when `results_type` is not a known type."""
    assert results_type in ["views", "downloads"], "results type must be either 'views' or 'downloads'"
    if isinstance(from_date, str):  # given strings
        from_date_dt = datetime.strptime(from_date, "%Y-%m-%d")
        to_date_dt = datetime.strptime(to_date, "%Y-%m-%d")
    else:  # given dt objects
        from_date_dt, to_date_dt = from_date, to_date
        from_date, to_date = ymd(from_date), ymd(to_date)

    # read the clock once so the string and datetime forms of 'now' always
    # describe the same instant, even when called right around midnight
    now_dt = datetime.now()
    now = ymd(now_dt)

    # different formatting if two different dates are provided
    if from_date == to_date:
        dt_str = to_date
    else:
        dt_str = "%s_%s" % (from_date, to_date)

    partial = ""
    if to_date == now or to_date_dt >= now_dt:
        # anything gathered today or for the future (month ranges)
        # will only ever be partial. when run again on a future day
        # there will be cache miss and the full results downloaded
        partial = ".partial"

    # ll: output/downloads/2014-04-01.json
    # ll: output/views/2014-01-01_2014-01-31.json.partial
    return join(output_dir(), results_type, dt_str + ".json" + partial)
def path_counts_query(table_id, from_date, to_date):
    """returns a GA query object that, when executed, returns raw
    results for article page views between the two given dates.

    `table_id` is passed through as the 'ids' value of the query.

    `from_date` and `to_date` must be datetime objects. they are formatted
    with `ymd` for the 'start_date' and 'end_date' values.

    raises AssertionError if either date is not a datetime object."""
    # the offending value is wrapped in a 1-tuple so that a tuple argument is
    # reported via %r instead of breaking the string formatting itself
    assert isinstance(from_date, datetime), "'from' date must be a datetime object. received %r" % (from_date,)
    assert isinstance(to_date, datetime), "'to' date must be a datetime object. received %r" % (to_date,)

    # regular expression suffixes (escape special chars)
    # raw strings keep the backslashes without relying on invalid escape sequences
    suffix_list = [
        r'\.full',
        r'\.abstract',
        r'\.short',
        r'/abstract-1',
        r'/abstract-2',
    ]
    # wrap each suffix in a zero-or-one group. ll: ['(\.full)?', '(\.abstract)?', ...]
    suffix_list = ['(%s)?' % suffix for suffix in suffix_list]

    # pipe-delimit the suffix list. ll: '(\.full)?|(\.abstract)?|...'
    suffix_str = '|'.join(suffix_list)

    return {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': ymd(from_date),
        'end_date': ymd(to_date),
        'metrics': 'ga:pageviews',
        'dimensions': 'ga:pagePath',
        'sort': 'ga:pagePath',
        'filters': ','.join([
            # these filters are OR'ed
            r'ga:pagePath=~^/content/.*/e[0-9]{5}(%s)$' % suffix_str,
            r'ga:pagePath=~^/content/.*/elife\.[0-9]{5}$',
        ]),
    }
def total_traffic_monthly_query(table_id, from_date=None, to_date=None):
    """returns a GA query object for page views broken down by year and month.

    `table_id` is passed through as the 'ids' value of the query.

    a missing `from_date` falls back to `core.VIEWS_INCEPTION` and a missing
    `to_date` falls back to the current time.

    the dates actually queried are taken from `utils.dt_month_range`: the first
    value of its first entry and the second value of its last entry."""
    if not from_date:
        from_date = core.VIEWS_INCEPTION
    if not to_date:
        to_date = datetime.now()

    months = utils.dt_month_range(from_date, to_date)
    # presumably the start of the first month and the end of the last month.
    # TODO confirm against `utils.dt_month_range`
    range_start = months[0][0]
    range_end = months[-1][1]

    query = {}
    query['ids'] = table_id
    query['max_results'] = 10000  # 10,000 is the max GA will ever return
    query['start_date'] = ymd(range_start)
    query['end_date'] = ymd(range_end)
    query['metrics'] = 'ga:pageviews'
    query['dimensions'] = 'ga:year,ga:month'
    return query
def event_counts_query(table_id, from_date, to_date):
    """returns a GA query object that, when executed, returns raw
    results for PDF downloads between the two given dates.

    `table_id` is passed through as the 'ids' value of the query.

    `from_date` and `to_date` must be datetime objects. they are formatted
    with `ymd` for the 'start_date' and 'end_date' values.

    raises AssertionError if either date is not a datetime object."""
    # the offending value is wrapped in a 1-tuple so that a tuple argument is
    # reported via %r instead of breaking the string formatting itself
    assert isinstance(from_date, datetime), "'from' date must be a datetime object. received %r" % (from_date,)
    assert isinstance(to_date, datetime), "'to' date must be a datetime object. received %r" % (to_date,)
    return {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': ymd(from_date),
        'end_date': ymd(to_date),
        'metrics': 'ga:totalEvents',
        'dimensions': 'ga:eventLabel',
        'sort': 'ga:eventLabel',
        # ';' separates 'AND' expressions, ',' separates 'OR' expressions
        'filters': r'ga:eventAction==Download;ga:eventCategory==Article;ga:eventLabel=~pdf-article',
    }
def path_counts_query(table_id, from_date, to_date):
    """returns a GA query object that, when executed, returns raw
    results for article page views between the two given dates.

    `table_id` is passed through as the 'ids' value of the query.

    `from_date` and `to_date` must be datetime objects. they are formatted
    with `ymd` for the 'start_date' and 'end_date' values.

    raises AssertionError if either date is not a datetime object."""
    # the offending value is wrapped in a 1-tuple so that a tuple argument is
    # reported via %r instead of breaking the string formatting itself
    assert isinstance(
        from_date, datetime
    ), "'from' date must be a datetime object. received %r" % (from_date,)
    assert isinstance(
        to_date, datetime
    ), "'to' date must be a datetime object. received %r" % (to_date,)

    # regular expression suffixes (escape special chars)
    # raw strings keep the backslashes without relying on invalid escape sequences
    suffixes = (
        r'\.full',
        r'\.abstract',
        r'\.short',
        r'/abstract-1',
        r'/abstract-2',
    )

    # wrap each suffix in a zero-or-one group and pipe-delimit the result.
    # ll: '(\.full)?|(\.abstract)?|...'
    suffix_str = '|'.join('(%s)?' % suffix for suffix in suffixes)

    # these filters are OR'ed (',' separates 'OR' expressions)
    filters = ','.join([
        r'ga:pagePath=~^/content/.*/e[0-9]{5}(%s)$' % suffix_str,
        r'ga:pagePath=~^/content/.*/elife\.[0-9]{5}$',
    ])

    return {
        'ids': table_id,
        'max_results': 10000,  # 10,000 is the max GA will ever return
        'start_date': ymd(from_date),
        'end_date': ymd(to_date),
        'metrics': 'ga:pageviews',
        'dimensions': 'ga:pagePath',
        'sort': 'ga:pagePath',
        'filters': filters,
    }