class DownloadAnalytics(object):
    '''Downloads and stores analytics info'''

    def __init__(self, service=None, token=None, profile_id=None,
                 delete_first=False, stat=None, print_progress=False):
        self.period = config['ga-report.period']
        self.service = service
        self.profile_id = profile_id
        self.delete_first = delete_first
        self.stat = stat
        self.token = token
        self.print_progress = print_progress

    def specific_month(self, date):
        import calendar

        first_of_this_month = datetime.datetime(date.year, date.month, 1)
        _, last_day_of_month = calendar.monthrange(int(date.year),
                                                   int(date.month))
        last_of_this_month = datetime.datetime(date.year, date.month,
                                               last_day_of_month)
        # if this is the latest month, note that it is only up until today
        now = datetime.datetime.now()
        if now.year == date.year and now.month == date.month:
            last_day_of_month = now.day
            last_of_this_month = now
        periods = ((date.strftime(FORMAT_MONTH), last_day_of_month,
                    first_of_this_month, last_of_this_month),)
        self.download_and_store(periods)

    def latest(self):
        if self.period == 'monthly':
            # from first of this month to today
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            periods = ((now.strftime(FORMAT_MONTH), now.day,
                        first_of_this_month, now),)
        else:
            raise NotImplementedError
        self.download_and_store(periods)

    def for_date(self, for_date):
        assert isinstance(for_date, datetime.datetime)
        periods = []  # (period_name, period_complete_day, start_date, end_date)
        if self.period == 'monthly':
            year = for_date.year
            month = for_date.month
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            while True:
                first_of_the_month = datetime.datetime(year, month, 1)
                if first_of_the_month == first_of_this_month:
                    periods.append((now.strftime(FORMAT_MONTH), now.day,
                                    first_of_this_month, now))
                    break
                elif first_of_the_month < first_of_this_month:
                    in_the_next_month = first_of_the_month + datetime.timedelta(40)
                    last_of_the_month = datetime.datetime(in_the_next_month.year,
                                                          in_the_next_month.month, 1) \
                        - datetime.timedelta(1)
                    periods.append((now.strftime(FORMAT_MONTH), 0,
                                    first_of_the_month, last_of_the_month))
                else:
                    # first_of_the_month has got to the future somehow
                    break
                month += 1
                if month > 12:
                    year += 1
                    month = 1
        else:
            raise NotImplementedError
        self.download_and_store(periods)

    @staticmethod
    def get_full_period_name(period_name, period_complete_day):
        if period_complete_day:
            return period_name + ' (up to %ith)' % period_complete_day
        else:
            return period_name

    def download_and_store(self, periods):
        for period_name, period_complete_day, start_date, end_date in periods:
            log.info('Period "%s" (%s - %s)',
                     self.get_full_period_name(period_name,
                                               period_complete_day),
                     start_date.strftime('%Y-%m-%d'),
                     end_date.strftime('%Y-%m-%d'))
            if self.delete_first:
                log.info('Deleting existing Analytics for this period "%s"',
                         period_name)
                ga_model.delete(period_name)

            if self.stat in (None, 'url'):
                # Clean out old url data before storing the new
                ga_model.pre_update_url_stats(period_name)

                accountName = config.get('googleanalytics.account')
                path_prefix = '~'  # i.e. it is a regex
                # Possibly there is a domain in the path.
                # I'm not sure why, but on the data.gov.uk property we see
                # the domain gets added to the GA path. e.g.
                #   '/data.gov.uk/data/search'
                #   '/co-prod2.dh.bytemark.co.uk/apps/test-app'
                # but on other properties we don't. e.g.
                #   '/data/search'
                path_prefix += '(/%s)?' % accountName

                log.info('Downloading analytics for dataset views')
                data = self.download(start_date, end_date,
                                     path_prefix + '/dataset/[a-z0-9-_]+')
                log.info('Storing dataset views (%i rows)',
                         len(data.get('url')))
                self.store(period_name, period_complete_day, data)

                log.info('Downloading analytics for publisher views')
                data = self.download(start_date, end_date,
                                     path_prefix + '/publisher/[a-z0-9-_]+')
                log.info('Storing publisher views (%i rows)',
                         len(data.get('url')))
                self.store(period_name, period_complete_day, data)

                # Create the All records
                ga_model.post_update_url_stats()

                log.info('Associating datasets with their publisher')
                ga_model.update_publisher_stats(period_name)  # about 30 seconds

            if self.stat == 'url-all':
                # This stat is split off just for test purposes
                ga_model.post_update_url_stats()

            if self.stat in (None, 'sitewide'):
                # Clean out old ga_stats data before storing the new
                ga_model.pre_update_sitewide_stats(period_name)

                log.info('Downloading and storing analytics for site-wide stats')
                self.sitewide_stats(period_name, period_complete_day)

            if self.stat in (None, 'social'):
                # Clean out old ga_stats data before storing the new
                ga_model.pre_update_social_stats(period_name)

                log.info('Downloading and storing analytics for social networks')
                self.update_social_info(period_name, start_date, end_date)

    def update_social_info(self, period_name, start_date, end_date):
        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        query = 'ga:hasSocialSourceReferral=~Yes$'
        metrics = 'ga:entrances'
        sort = '-ga:entrances'

        try:
            args = dict(ids='ga:' + self.profile_id,
                        filters=query,
                        metrics=metrics,
                        sort=sort,
                        dimensions="ga:landingPagePath,ga:socialNetwork",
                        max_results=10000)
            args['start-date'] = start_date
            args['end-date'] = end_date
            results = self._get_ga_data(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        data = collections.defaultdict(list)
        rows = results.get('rows', [])
        for row in rows:
            url = strip_off_host_prefix(row[0])
            data[url].append((row[1], int(row[2])))
        ga_model.update_social(period_name, data)
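# NOTE: a minimal sketch of the strip_off_host_prefix() helper that
# update_social_info() above relies on; the real helper is defined elsewhere
# in this extension and may differ. It assumes, per the comment in
# download_and_store(), that GA sometimes prefixes the landing page path with
# the property's domain (e.g. '/data.gov.uk/data/search') and that such a
# prefix can be recognised by a dot in the first path segment.
def strip_off_host_prefix(path):
    '''Return the GA landing page path with any leading host segment removed.

    '/data.gov.uk/data/search' -> '/data/search'
    '/data/search'             -> '/data/search'
    '''
    segments = path.lstrip('/').split('/')
    if segments and '.' in segments[0]:
        # First segment looks like a hostname - drop it
        return '/' + '/'.join(segments[1:])
    return '/' + '/'.join(segments)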
class DownloadAnalytics(object):
    '''Downloads and stores analytics info'''

    def __init__(self, service=None, token=None, profile_id=None,
                 delete_first=False, skip_url_stats=False):
        self.period = config['ga-report.period']
        self.service = service
        self.profile_id = profile_id
        self.delete_first = delete_first
        self.skip_url_stats = skip_url_stats
        self.token = token

    def specific_month(self, date):
        import calendar

        first_of_this_month = datetime.datetime(date.year, date.month, 1)
        _, last_day_of_month = calendar.monthrange(int(date.year),
                                                   int(date.month))
        last_of_this_month = datetime.datetime(date.year, date.month,
                                               last_day_of_month)
        # if this is the latest month, note that it is only up until today
        now = datetime.datetime.now()
        if now.year == date.year and now.month == date.month:
            last_day_of_month = now.day
            last_of_this_month = now
        periods = ((date.strftime(FORMAT_MONTH), last_day_of_month,
                    first_of_this_month, last_of_this_month),)
        self.download_and_store(periods)

    def latest(self):
        if self.period == 'monthly':
            # from first of this month to today
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            periods = ((now.strftime(FORMAT_MONTH), now.day,
                        first_of_this_month, now),)
        else:
            raise NotImplementedError
        self.download_and_store(periods)

    def for_date(self, for_date):
        assert isinstance(for_date, datetime.datetime)
        periods = []  # (period_name, period_complete_day, start_date, end_date)
        if self.period == 'monthly':
            first_of_the_months_until_now = []
            year = for_date.year
            month = for_date.month
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            while True:
                first_of_the_month = datetime.datetime(year, month, 1)
                if first_of_the_month == first_of_this_month:
                    periods.append((now.strftime(FORMAT_MONTH), now.day,
                                    first_of_this_month, now))
                    break
                elif first_of_the_month < first_of_this_month:
                    in_the_next_month = first_of_the_month + datetime.timedelta(40)
                    last_of_the_month = datetime.datetime(in_the_next_month.year,
                                                          in_the_next_month.month, 1) \
                        - datetime.timedelta(1)
                    periods.append((now.strftime(FORMAT_MONTH), 0,
                                    first_of_the_month, last_of_the_month))
                else:
                    # first_of_the_month has got to the future somehow
                    break
                month += 1
                if month > 12:
                    year += 1
                    month = 1
        else:
            raise NotImplementedError
        self.download_and_store(periods)

    @staticmethod
    def get_full_period_name(period_name, period_complete_day):
        if period_complete_day:
            return period_name + ' (up to %ith)' % period_complete_day
        else:
            return period_name

    def download_and_store(self, periods):
        for period_name, period_complete_day, start_date, end_date in periods:
            log.info('Period "%s" (%s - %s)',
                     self.get_full_period_name(period_name,
                                               period_complete_day),
                     start_date.strftime('%Y-%m-%d'),
                     end_date.strftime('%Y-%m-%d'))
            if self.delete_first:
                log.info('Deleting existing Analytics for this period "%s"',
                         period_name)
                ga_model.delete(period_name)

            if not self.skip_url_stats:
                # Clean out old url data before storing the new
                ga_model.pre_update_url_stats(period_name)

                accountName = config.get('googleanalytics.account')

                log.info('Downloading analytics for dataset views')
                data = self.download(start_date, end_date,
                                     '~/%s/dataset/[a-z0-9-_]+' % accountName)
                log.info('Storing dataset views (%i rows)',
                         len(data.get('url')))
                self.store(period_name, period_complete_day, data)

                log.info('Downloading analytics for publisher views')
                data = self.download(start_date, end_date,
                                     '~/%s/publisher/[a-z0-9-_]+' % accountName)
                log.info('Storing publisher views (%i rows)',
                         len(data.get('url')))
                self.store(period_name, period_complete_day, data)

                # Make sure the All records are correct.
                ga_model.post_update_url_stats()

                log.info('Associating datasets with their publisher')
                ga_model.update_publisher_stats(period_name)  # about 30 seconds

            log.info('Downloading and storing analytics for site-wide stats')
            self.sitewide_stats(period_name, period_complete_day)

            log.info('Downloading and storing analytics for social networks')
            self.update_social_info(period_name, start_date, end_date)

    def update_social_info(self, period_name, start_date, end_date):
        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        query = 'ga:hasSocialSourceReferral=~Yes$'
        metrics = 'ga:entrances'
        sort = '-ga:entrances'

        try:
            # Because of issues of invalid responses, we are going to make
            # these requests ourselves.
            headers = {'authorization': 'Bearer ' + self.token}

            args = dict(ids='ga:' + self.profile_id,
                        filters=query,
                        metrics=metrics,
                        sort=sort,
                        dimensions="ga:landingPagePath,ga:socialNetwork",
                        max_results=10000)
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception, e:
            log.exception(e)
            results = dict(url=[])

        data = collections.defaultdict(list)
        rows = results.get('rows', [])
        for row in rows:
            url = _normalize_url('http:/' + row[0])
            data[url].append((row[1], int(row[2])))
        ga_model.update_social(period_name, data)
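# NOTE: a hedged usage sketch, not part of the class above. It assumes a
# GA API service object, OAuth token and profile id have already been
# obtained by whatever credential set-up the surrounding extension provides;
# the parameter names below are placeholders for those prerequisites.
def run_monthly_download(service, token, profile_id):
    downloader = DownloadAnalytics(service=service,
                                   token=token,
                                   profile_id=profile_id,
                                   delete_first=False)
    # Collect stats for the current (partial) month, e.g. a nightly run ...
    downloader.latest()
    # ... or backfill every month from January 2013 up to today.
    downloader.for_date(datetime.datetime(2013, 1, 1))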