Esempio n. 1
0
class DownloadAnalytics(object):
    '''Downloads and stores analytics info'''
    def __init__(self, service=None, token=None, profile_id=None,
                 delete_first=False, stat=None, print_progress=False):
        '''Remember the download settings on the instance.

        :param service: analytics service object, stored as-is.
        :param token: API access token, stored as-is.
        :param profile_id: analytics profile id; update_social_info prefixes
            it with "ga:" when building a query.
        :param delete_first: when true, download_and_store deletes the
            existing analytics of a period before refreshing it.
        :param stat: restricts download_and_store to one group of stats
            ('url', 'url-all', 'sitewide' or 'social'); None runs the
            'url', 'sitewide' and 'social' groups.
        :param print_progress: stored as-is.
        :raises KeyError: if 'ga-report.period' is missing from the config.
        '''
        # The reporting period comes from configuration, not from the caller
        self.period = config['ga-report.period']

        # Connection details
        self.service = service
        self.token = token
        self.profile_id = profile_id

        # Behaviour switches
        self.delete_first = delete_first
        self.stat = stat
        self.print_progress = print_progress

    def specific_month(self, date):
        '''Download and store the analytics for the calendar month of `date`.

        The period runs from the 1st of that month to its last day. When
        `date` falls in the current month the period instead ends now, and
        the "complete day" is today's day of the month.

        :param date: any object with `year` and `month` attributes and a
            `strftime` method (e.g. a datetime.datetime).
        '''
        import calendar

        month_start = datetime.datetime(date.year, date.month, 1)
        complete_day = calendar.monthrange(int(date.year),
                                           int(date.month))[1]
        month_end = datetime.datetime(date.year, date.month, complete_day)

        now = datetime.datetime.now()
        if (now.year, now.month) == (date.year, date.month):
            # The month is still in progress: data only exists up to today
            complete_day, month_end = now.day, now

        period = (date.strftime(FORMAT_MONTH), complete_day,
                  month_start, month_end)
        self.download_and_store((period, ))

    def latest(self):
        '''Download and store the analytics for the current month so far.

        The single period runs from the first of this month until now.

        :raises NotImplementedError: if the configured period is not
            'monthly'.
        '''
        if self.period != 'monthly':
            raise NotImplementedError

        today = datetime.datetime.now()
        month_start = datetime.datetime(today.year, today.month, 1)
        current_period = (today.strftime(FORMAT_MONTH), today.day,
                          month_start, today)
        self.download_and_store((current_period, ))

    def for_date(self, for_date):
        '''Download and store analytics for every month from `for_date`
        up to and including the current month.

        Each past month becomes a complete period (period_complete_day of
        0) named after that month; the current month becomes a partial
        period ending now. If `for_date` lies in a future month no periods
        are generated.

        :param for_date: datetime.datetime inside the first month wanted.
        :raises NotImplementedError: if the configured period is not
            'monthly'.
        '''
        assert isinstance(for_date, datetime.datetime)
        if self.period != 'monthly':
            raise NotImplementedError

        # Each entry: (period_name, period_complete_day, start_date, end_date)
        periods = []
        now = datetime.datetime.now()
        first_of_this_month = datetime.datetime(now.year, now.month, 1)

        first_of_the_month = datetime.datetime(for_date.year,
                                               for_date.month, 1)
        while first_of_the_month < first_of_this_month:
            # Work out the first of the following month, rolling the year
            # over after December.
            if first_of_the_month.month == 12:
                first_of_next_month = datetime.datetime(
                    first_of_the_month.year + 1, 1, 1)
            else:
                first_of_next_month = datetime.datetime(
                    first_of_the_month.year, first_of_the_month.month + 1, 1)
            last_of_the_month = first_of_next_month - datetime.timedelta(1)
            # Name the period after the month it covers (not after the
            # current month), otherwise every past month would be stored
            # under the same name.
            periods.append((first_of_the_month.strftime(FORMAT_MONTH), 0,
                            first_of_the_month, last_of_the_month))
            first_of_the_month = first_of_next_month

        if first_of_the_month == first_of_this_month:
            # The current month is only complete up to today
            periods.append((now.strftime(FORMAT_MONTH), now.day,
                            first_of_this_month, now))
        self.download_and_store(periods)

    @staticmethod
    def get_full_period_name(period_name, period_complete_day):
        if period_complete_day:
            return period_name + ' (up to %ith)' % period_complete_day
        else:
            return period_name

    def download_and_store(self, periods):
        '''Download the analytics for each period and store them.

        Which groups of stats are refreshed depends on `self.stat`:
        None runs the 'url', 'sitewide' and 'social' groups, otherwise only
        the named group runs ('url-all' only rebuilds the "All" url
        records).

        :param periods: iterable of
            (period_name, period_complete_day, start_date, end_date) tuples,
            where the dates provide `strftime`.
        '''
        day_format = '%Y-%m-%d'
        # One row per kind of page whose views are collected:
        # (download log message, store log message, path regex suffix)
        url_views = (
            ('Downloading analytics for dataset views',
             'Storing dataset views (%i rows)',
             '/dataset/[a-z0-9-_]+'),
            ('Downloading analytics for publisher views',
             'Storing publisher views (%i rows)',
             '/publisher/[a-z0-9-_]+'),
        )

        for period in periods:
            period_name, period_complete_day, start_date, end_date = period
            full_name = self.get_full_period_name(period_name,
                                                  period_complete_day)
            log.info('Period "%s" (%s - %s)', full_name,
                     start_date.strftime(day_format),
                     end_date.strftime(day_format))

            if self.delete_first:
                log.info('Deleting existing Analytics for this period "%s"',
                         period_name)
                ga_model.delete(period_name)

            if self.stat in (None, 'url'):
                # Clean out old url data before storing the new
                ga_model.pre_update_url_stats(period_name)

                accountName = config.get('googleanalytics.account')

                # The leading '~' marks the filter as a regex.
                # Possibly there is a domain in the path.
                # I'm not sure why, but on the data.gov.uk property we see
                # the domain gets added to the GA path. e.g.
                #   '/data.gov.uk/data/search'
                #   '/co-prod2.dh.bytemark.co.uk/apps/test-app'
                # but on other properties we don't. e.g.
                #   '/data/search'
                # so the domain part is made optional.
                path_prefix = '~(/%s)?' % accountName

                for download_msg, store_msg, path_suffix in url_views:
                    log.info(download_msg)
                    data = self.download(start_date, end_date,
                                         path_prefix + path_suffix)

                    log.info(store_msg, len(data.get('url')))
                    self.store(period_name, period_complete_day, data)

                # Create the All records
                ga_model.post_update_url_stats()

                log.info('Associating datasets with their publisher')
                # about 30 seconds.
                ga_model.update_publisher_stats(period_name)

            if self.stat == 'url-all':
                # This stat is split off just for test purposes
                ga_model.post_update_url_stats()

            if self.stat in (None, 'sitewide'):
                # Clean out old ga_stats data before storing the new
                ga_model.pre_update_sitewide_stats(period_name)

                log.info(
                    'Downloading and storing analytics for site-wide stats')
                self.sitewide_stats(period_name, period_complete_day)

            if self.stat in (None, 'social'):
                # Clean out old ga_stats data before storing the new
                ga_model.pre_update_social_stats(period_name)

                log.info(
                    'Downloading and storing analytics for social networks')
                self.update_social_info(period_name, start_date, end_date)

    def update_social_info(self, period_name, start_date, end_date):
        '''Fetch the entrances referred by social networks and store them.

        Queries entrances by landing page and social network for the given
        date range, groups the (network, entrances) pairs by url and passes
        them to ga_model.update_social. The download is best-effort: if it
        fails the error is logged and an empty result set is stored.

        :param period_name: name of the period the figures belong to.
        :param start_date: first day of the range (provides `strftime`).
        :param end_date: last day of the range (provides `strftime`).
        '''
        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        query = 'ga:hasSocialSourceReferral=~Yes$'
        metrics = 'ga:entrances'
        sort = '-ga:entrances'

        try:
            args = dict(ids='ga:' + self.profile_id,
                        filters=query,
                        metrics=metrics,
                        sort=sort,
                        dimensions="ga:landingPagePath,ga:socialNetwork",
                        max_results=10000)

            # Hyphenated keys cannot be passed as keyword arguments
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_ga_data(args)
        except Exception as e:
            # Deliberately broad: a failed download must not abort the run
            log.exception(e)
            results = {}

        data = collections.defaultdict(list)
        # 'rows' is missing after a failed download (and possibly when the
        # query matched nothing), so fall back to an empty list rather
        # than iterating over None.
        rows = results.get('rows') or []
        for row in rows:
            # row is (landing page path, social network, entrances)
            url = strip_off_host_prefix(row[0])
            data[url].append((row[1], int(row[2])))
        ga_model.update_social(period_name, data)
Esempio n. 2
0
class DownloadAnalytics(object):
    '''Downloads and stores analytics info'''

    def __init__(self,
                 service=None,
                 token=None,
                 profile_id=None,
                 delete_first=False,
                 skip_url_stats=False):
        '''Remember the download settings on the instance.

        :param service: analytics service object, stored as-is.
        :param token: API access token, stored as-is.
        :param profile_id: analytics profile id; update_social_info prefixes
            it with "ga:" when building a query.
        :param delete_first: when true, download_and_store deletes the
            existing analytics of a period before refreshing it.
        :param skip_url_stats: when true, download_and_store skips the
            dataset/publisher url stats.
        :raises KeyError: if 'ga-report.period' is missing from the config.
        '''
        # The reporting period comes from configuration, not from the caller
        self.period = config['ga-report.period']

        # Connection details
        self.service = service
        self.token = token
        self.profile_id = profile_id

        # Behaviour switches
        self.delete_first = delete_first
        self.skip_url_stats = skip_url_stats

    def specific_month(self, date):
        '''Download and store the analytics for the calendar month of `date`.

        The period covers the whole month, from the 1st to its last day.
        If `date` is in the current month, the period stops now and the
        "complete day" is today's day of the month.

        :param date: any object with `year` and `month` attributes and a
            `strftime` method (e.g. a datetime.datetime).
        '''
        import calendar

        year, month = date.year, date.month
        start = datetime.datetime(year, month, 1)
        days_in_month = calendar.monthrange(int(year), int(month))[1]
        complete_day = days_in_month
        end = datetime.datetime(year, month, days_in_month)

        now = datetime.datetime.now()
        is_current_month = now.year == year and now.month == month
        if is_current_month:
            # Only part of the month has happened so far
            complete_day = now.day
            end = now

        name = date.strftime(FORMAT_MONTH)
        self.download_and_store(((name, complete_day, start, end),))


    def latest(self):
        '''Download and store the analytics for the month to date.

        Builds one period starting on the first of the current month and
        ending now.

        :raises NotImplementedError: if the configured period is not
            'monthly'.
        '''
        if self.period != 'monthly':
            raise NotImplementedError

        right_now = datetime.datetime.now()
        start_of_month = datetime.datetime(right_now.year, right_now.month, 1)
        month_to_date = (right_now.strftime(FORMAT_MONTH),
                         right_now.day,
                         start_of_month,
                         right_now)
        self.download_and_store((month_to_date,))


    def for_date(self, for_date):
        '''Download and store analytics for every month from `for_date`
        up to and including the current month.

        Past months are added as complete periods (period_complete_day of
        0), each named after the month it covers. The current month is
        added as a partial period ending now. A `for_date` in a future
        month yields no periods.

        :param for_date: datetime.datetime inside the first month wanted.
        :raises NotImplementedError: if the configured period is not
            'monthly'.
        '''
        assert isinstance(for_date, datetime.datetime)
        periods = []  # (period_name, period_complete_day, start_date, end_date)
        if self.period == 'monthly':
            year = for_date.year
            month = for_date.month
            now = datetime.datetime.now()
            first_of_this_month = datetime.datetime(now.year, now.month, 1)
            while True:
                first_of_the_month = datetime.datetime(year, month, 1)
                if first_of_the_month == first_of_this_month:
                    # Reached the current month: it is only complete up to
                    # today, and it is the last period to add.
                    periods.append((now.strftime(FORMAT_MONTH),
                                    now.day,
                                    first_of_this_month, now))
                    break
                elif first_of_the_month < first_of_this_month:
                    # 40 days after the 1st is always in the next month, so
                    # the day before that month's 1st is this month's last.
                    in_the_next_month = (first_of_the_month +
                                         datetime.timedelta(40))
                    last_of_the_month = datetime.datetime(
                        in_the_next_month.year,
                        in_the_next_month.month, 1) - datetime.timedelta(1)
                    # Name the period after the month it covers, not after
                    # the current month.
                    periods.append((first_of_the_month.strftime(FORMAT_MONTH),
                                    0,
                                    first_of_the_month, last_of_the_month))
                else:
                    # first_of_the_month has got to the future somehow
                    break
                month += 1
                if month > 12:
                    year += 1
                    month = 1
        else:
            raise NotImplementedError
        self.download_and_store(periods)

    @staticmethod
    def get_full_period_name(period_name, period_complete_day):
        if period_complete_day:
            return period_name + ' (up to %ith)' % period_complete_day
        else:
            return period_name


    def download_and_store(self, periods):
        '''Download the analytics for each period and store them.

        For every period this optionally deletes the existing data,
        refreshes the dataset/publisher url stats (unless
        `self.skip_url_stats` is set), then the site-wide stats and the
        social network stats.

        :param periods: iterable of
            (period_name, period_complete_day, start_date, end_date) tuples,
            where the dates provide `strftime`.
        '''
        day_format = '%Y-%m-%d'
        # One row per kind of page whose views are collected:
        # (download log message, store log message, path regex template)
        url_views = (
            ('Downloading analytics for dataset views',
             'Storing dataset views (%i rows)',
             '~/%s/dataset/[a-z0-9-_]+'),
            ('Downloading analytics for publisher views',
             'Storing publisher views (%i rows)',
             '~/%s/publisher/[a-z0-9-_]+'),
        )

        for period in periods:
            period_name, period_complete_day, start_date, end_date = period
            full_name = self.get_full_period_name(period_name,
                                                  period_complete_day)
            log.info('Period "%s" (%s - %s)', full_name,
                     start_date.strftime(day_format),
                     end_date.strftime(day_format))

            if self.delete_first:
                log.info('Deleting existing Analytics for this period "%s"',
                         period_name)
                ga_model.delete(period_name)

            if not self.skip_url_stats:
                # Clean out old url data before storing the new
                ga_model.pre_update_url_stats(period_name)

                accountName = config.get('googleanalytics.account')

                for download_msg, store_msg, path_template in url_views:
                    log.info(download_msg)
                    data = self.download(start_date, end_date,
                                         path_template % accountName)

                    log.info(store_msg, len(data.get('url')))
                    self.store(period_name, period_complete_day, data)

                # Make sure the All records are correct.
                ga_model.post_update_url_stats()

                log.info('Associating datasets with their publisher')
                # about 30 seconds.
                ga_model.update_publisher_stats(period_name)

            log.info('Downloading and storing analytics for site-wide stats')
            self.sitewide_stats(period_name, period_complete_day)

            log.info('Downloading and storing analytics for social networks')
            self.update_social_info(period_name, start_date, end_date)


    def update_social_info(self, period_name, start_date, end_date):
        '''Fetch the entrances referred by social networks and store them.

        Queries entrances by landing page and social network for the given
        date range, groups the (network, entrances) pairs by normalized url
        and passes them to ga_model.update_social. The download is
        best-effort: if it fails the error is logged and an empty result
        set is stored.

        :param period_name: name of the period the figures belong to.
        :param start_date: first day of the range (provides `strftime`).
        :param end_date: last day of the range (provides `strftime`).
        '''
        start_date = start_date.strftime('%Y-%m-%d')
        end_date = end_date.strftime('%Y-%m-%d')
        query = 'ga:hasSocialSourceReferral=~Yes$'
        metrics = 'ga:entrances'
        sort = '-ga:entrances'

        try:
            # Because of issues of invalid responses, we are going to make
            # these requests ourselves (via _get_json).
            args = dict(ids='ga:' + self.profile_id,
                        filters=query,
                        metrics=metrics,
                        sort=sort,
                        dimensions="ga:landingPagePath,ga:socialNetwork",
                        max_results=10000)

            # Hyphenated keys cannot be passed as keyword arguments
            args['start-date'] = start_date
            args['end-date'] = end_date

            results = self._get_json(args)
        except Exception as e:
            # Deliberately broad: a failed download must not abort the run
            log.exception(e)
            results = {}

        data = collections.defaultdict(list)
        # 'rows' is missing after a failed download; also tolerate an
        # explicit None value.
        rows = results.get('rows') or []
        for row in rows:
            # row is (landing page path, social network, entrances)
            url = _normalize_url('http:/' + row[0])
            data[url].append((row[1], int(row[2])))
        ga_model.update_social(period_name, data)