Beispiel #1
0
 def save_ga_data(self, packages_data):
     """
     Persist the visit counts held in ``packages_data`` to the database.

     Each key is a tracked URL identifier and each value is a mapping whose
     optional ``'visits'`` entry maps visit dates to counts. Identifiers
     matching ``RESOURCE_URL_REGEX`` are recorded through ``ResourceStats``;
     every other identifier is treated as a package URL and recorded
     through ``PackageStats``. Resources or packages that cannot be found,
     and package names containing a slash, are logged and skipped. The
     session is committed once after all identifiers have been handled.
     """
     for identifier, visits_collection in packages_data.items():
         visits = visits_collection.get('visits', {})
         resource_match = RESOURCE_URL_REGEX.match(identifier)

         if resource_match:
             # Only used for the warning below; the lookup itself goes by
             # the id captured in the first regex group.
             resource_url = identifier[len(self.resource_url_tag):]
             resource_query = model.Session.query(model.Resource).autoflush(True)
             resource = resource_query.filter_by(id=resource_match.group(1)).first()
             if not resource:
                 self.log.warning("Couldn't find resource %s" % resource_url)
                 continue
             for visit_date, count in visits.iteritems():
                 ResourceStats.update_visits(resource.id, visit_date, count)
                 self.log.info("Updated %s with %s visits" % (resource.id, count))
             continue

         # Not a resource URL: everything after the package prefix is the name.
         package_name = identifier[len(PACKAGE_URL):]
         if "/" in package_name:
             self.log.warning("%s not a valid package name" % package_name)
             continue

         item = model.Package.by_name(package_name)
         if not item:
             self.log.warning("Couldn't find package %s" % package_name)
             continue

         for visit_date, count in visits.iteritems():
             PackageStats.update_visits(item.id, visit_date, count)
             self.log.info("Updated %s with %s visits" % (item.id, count))

     model.Session.commit()
    def save_type_package_downloads(self, data):
        """
        Record per-date download counts for every package in ``data``.

        ``data`` maps a package id or name to a mapping of date -> dict
        containing a ``"downloads"`` entry. Packages that
        ``model.Package.get`` cannot resolve are logged and skipped.
        """
        for package_ref, downloads_by_date in data.items():
            package = model.Package.get(package_ref)

            if package:
                for date, value in downloads_by_date.iteritems():
                    PackageStats.update_downloads(package_id=package.id,
                                                  visit_date=date,
                                                  downloads=value["downloads"])
            else:
                self.log.warning("Couldn't find package %s" % package_ref)
    def save_type_package(self, data):
        """
        Record per-date visit and entrance counts for every package in ``data``.

        ``data`` maps a package id or name to a mapping of date -> dict
        containing ``"visits"`` and ``"entrances"`` entries. Packages that
        ``model.Package.get`` cannot resolve are logged and skipped.
        """
        for package_ref, stats_by_date in data.items():
            # Package.get() is a lot slower than by_name(), but accepts
            # either an id or a name.
            item = model.Package.get(package_ref)
            if not item:
                self.log.warning("Couldn't find package %s" % package_ref)
                continue

            for date, value in stats_by_date.iteritems():
                visits = value["visits"]
                entrances = value["entrances"]
                PackageStats.update_visits(item.id, date, visits, entrances)
Beispiel #4
0
def get_visits_count_for_dataset_during_last_year(id):
    """
    Return the number of entries ``PackageStats.get_visits_during_year``
    yields for dataset ``id`` in the previous calendar year.
    """
    from ckanext.googleanalytics.model import PackageStats

    previous_year = datetime.datetime.now().year - 1
    visits = PackageStats.get_visits_during_year(id, previous_year)
    return len(visits)
Beispiel #5
0
    def handle_package(pkg):
        """
        Flatten a package dict into the fields used by the report.

        Copies title, id, maintainer details, creation date and
        ``valid_till`` from ``pkg``, adds organization title/homepage/id when
        the package has an organization, and attaches the ``visits`` and
        ``downloads`` totals from ``PackageStats`` (0 when no stats exist).
        """
        resolved_dict = {
            'title': pkg["title"],
            'id': pkg["id"],
            'maintainer_name': pkg["maintainer"],
            "maintainer_email": pkg["maintainer_email"],
            'metadata_created': pkg["metadata_created"],
            'valid_till': pkg["valid_till"],
        }

        organization = pkg.get('organization')
        if organization:
            resolved_dict['organization_title'] = organization.get("title")
            resolved_dict['organization_homepage'] = organization.get("homepage", None)
            resolved_dict['organization_id'] = organization.get("id")

        # FIXME: Against CKAN best practices to access model directly, should be done through actions
        # https://docs.ckan.org/en/ckan-2.7.0/contributing/architecture.html#always-go-through-the-action-functions
        package_stats = PackageStats.get_total_visits(limit=1, package_id=pkg['id'])

        # An empty result falls back to an empty mapping so both counters default to 0.
        totals = package_stats[0] if package_stats else {}
        resolved_dict['visits'] = totals.get("visits", 0)
        resolved_dict['downloads'] = totals.get("downloads", 0)

        return resolved_dict
Beispiel #6
0
def google_analytics_dataset_report(last):
    '''
    Build the dataset report from the stored Google Analytics data.

    :param last: forwarded to ``PackageStats.get_top`` as its ``limit``

    :returns: dict whose ``'table'`` entry is the ``"packages"`` value of
        the ``get_top`` result
    '''
    top_result = PackageStats.get_top(limit=last)
    packages = top_result.get("packages")
    return {'table': packages}
def googleanalytics_dataset_visits(context=None, data_dict=None):
    """
    Fetch the visit statistics recorded for a dataset.

    The dataset is resolved with ``Package.get(data_dict['id'])`` and its id
    is handed to ``PackageStats.get_all_visits``.

    :param id: Dataset id (read from ``data_dict``)
    :type id: string

    :returns: whatever ``PackageStats.get_all_visits`` returns for the dataset
    """
    # NOTE(review): no handling for a missing data_dict or an unknown
    # dataset; either case raises here. Confirm callers always pass a
    # valid id.
    dataset_ref = data_dict['id']
    return PackageStats.get_all_visits(Package.get(dataset_ref).id)
def google_analytics_dataset_least_popular_report(time):
    '''
    Build the least-popular-datasets report for the last calendar period.

    :param time: forwarded to ``last_calendar_period`` to obtain the
        start and end date of the reporting window

    :returns: dict with ``'table'`` (all rows returned by
        ``PackageStats.get_total_visits``) and ``'top'`` (the first 20 of
        those rows)
    '''
    period_start, period_end = last_calendar_period(time)

    # descending=False with no limit: presumably least visited first.
    ranked_packages = PackageStats.get_total_visits(start_date=period_start,
                                                    end_date=period_end,
                                                    limit=None,
                                                    descending=False)
    first_twenty = ranked_packages[:20]

    return {'table': ranked_packages, 'top': first_twenty}
Beispiel #9
0
def google_analytics_dataset_report(last):
    '''
    Build the dataset report from the stored Google Analytics data.

    Every entry of ``PackageStats.get_top(limit=last)['packages']`` is
    expanded to the full ``package_show`` dict, annotated with the entry's
    ``visits`` and ``visit_date``, and the list is ordered by visits,
    highest first.

    :returns: dict whose ``'table'`` entry is the ordered list of package
        dicts
    '''
    from operator import itemgetter

    result = PackageStats.get_top(limit=last)

    enriched = []
    for entry in result['packages']:
        dataset = toolkit.get_action('package_show')({}, {'id': entry['package_id']})
        for key in ('visits', 'visit_date'):
            dataset[key] = entry[key]
        enriched.append(dataset)

    enriched.sort(key=itemgetter('visits'), reverse=True)
    result['packages'] = enriched

    return {'table': result['packages']}
Beispiel #10
0
def get_visits_for_dataset(id):
    """Return the result of ``PackageStats.get_all_visits`` for dataset ``id``."""
    from ckanext.googleanalytics.model import PackageStats

    all_visits = PackageStats.get_all_visits(id)
    return all_visits
def google_analytics_organizations_with_most_popular_datasets(time):
    """
    Build the report of organizations with the most popular datasets.

    ``time`` is forwarded to ``last_calendar_period`` to obtain the start
    and end date of the reporting window; the rows come from
    ``PackageStats.get_organizations_with_most_popular_datasets``.

    :returns: dict whose ``'table'`` entry holds those rows
    """
    period = last_calendar_period(time)
    start_date, end_date = period
    rows = PackageStats.get_organizations_with_most_popular_datasets(start_date, end_date)
    return {'table': rows}
 def test_queries(self):
     """
     Print entrances, package name and visits for the rows returned by
     ``PackageStats.get_total_visits`` over the previous calendar month
     (``limit=20``).
     """
     # Day before the first of this month is the last day of last month.
     first_of_this_month = datetime.datetime.today().replace(day=1)
     previous_month_end = first_of_this_month - datetime.timedelta(days=1)
     previous_month_start = previous_month_end.replace(day=1)

     for stat in PackageStats.get_total_visits(previous_month_start, previous_month_end, limit=20):
         print(stat['entrances'], stat['package_name'], stat['visits'])
    def parse_and_save(self, args):
        """
        Fetch raw data from Google Analytics and store it in the database.

        ``args`` may hold at most three items; when a third item is present
        it is parsed as a ``YYYY-MM-DD`` start date. For each statistic type
        (package, resource, visitorlocation, package_downloads,
        search_terms) one query is run per date returned by
        ``get_dates_between_update``; the raw results are folded together by
        that type's resolver, passed to its save function and committed.

        Raises ``Exception`` when more than three arguments are given; the
        check runs after the service has been initialised.
        """
        from ga_auth import get_profile_id

        self.init_service(args)
        self.profile_id = get_profile_id(self.service)

        arg_count = len(args)
        if arg_count > 3:
            raise Exception('Too many arguments')

        given_start_date = None
        if arg_count == 3:
            given_start_date = datetime.datetime.strptime(args[2], '%Y-%m-%d').date()

        def dates_since_last_update(stats_class):
            # Date list for one statistic type, based on its latest stored update.
            return self.get_dates_between_update(given_start_date,
                                                 stats_class.get_latest_update_date())

        # Conditions joined with ';' into a single filter expression.
        bot_filter = ";".join([
            'ga:browser!@StatusCake',
            'ga:browser!@Python',
            'ga:sessionDurationBucket!=0',
            'ga:sessionDurationBucket!=1',
            'ga:sessionDurationBucket!=2',
            'ga:sessionDurationBucket!=3',
            'ga:networkDomain!=ua.es',
            'ga:networkDomain!=amazonaws.com',
            'ga:networkDomain!=kcura.com',
            'ga:networkDomain!=relativity.com',
        ])

        # One specification per analytics query that has to be sent.
        queries = [
            {
                'type': 'package',
                'dates': dates_since_last_update(PackageStats),
                'filters': 'ga:pagePath=~%s,ga:pagePath=~%s' % (PACKAGE_URL, self.resource_url_tag),
                'metrics': 'ga:uniquePageviews, ga:entrances',
                'sort': 'ga:date',
                'dimensions': 'ga:pagePath, ga:date',
                'resolver': self.resolver_type_package,
                'save': self.save_type_package,
            },
            {
                'type': 'resource',
                'dates': dates_since_last_update(ResourceStats),
                'filters': 'ga:pagePath=~%s' % self.resource_url_tag,
                'metrics': 'ga:uniquePageviews',
                'sort': 'ga:date',
                'dimensions': 'ga:pagePath, ga:date',
                'resolver': self.resolver_type_resource,
                'save': self.save_type_resource,
            },
            {
                'type': 'visitorlocation',
                'dates': dates_since_last_update(AudienceLocationDate),
                'filters': bot_filter,
                'metrics': 'ga:sessions',
                'sort': 'ga:date',
                'dimensions': 'ga:country, ga:date',
                'resolver': self.resolver_type_visitorlocation,
                'save': self.save_type_visitorlocation,
            },
            {
                'type': 'package_downloads',
                'dates': dates_since_last_update(PackageStats),
                'filters': "ga:eventCategory==Resource;ga:eventAction==Download",
                'metrics': "ga:uniqueEvents",
                'sort': "ga:date",
                'dimensions': "ga:pagePath, ga:date, ga:eventCategory",
                'resolver': self.resolver_type_package_downloads,
                'save': self.save_type_package_downloads,
            },
            {
                'type': 'search_terms',
                'dates': dates_since_last_update(SearchStats),
                'filters': bot_filter,
                'metrics': "ga:searchUniques",
                'sort': "ga:date",
                'dimensions': "ga:searchKeyword, ga:date",
                'resolver': self.resolver_type_search_terms,
                'save': self.save_type_search_terms,
            },
        ]

        # Run every specification, fold its results and store them.
        for spec in queries:
            collected = {}
            range_end = datetime.datetime.now()
            query_type = spec['type']
            self.log.info('performing analytics query of type: %s' % query_type)
            print('Querying type: %s' % query_type)

            for range_start in spec['dates']:
                results = self.ga_query(start_date=range_start,
                                        end_date=range_end,
                                        filters=spec['filters'],
                                        metrics=spec['metrics'],
                                        sort=spec['sort'],
                                        dimensions=spec['dimensions'])
                # The resolver merges this batch into what was gathered so far.
                collected = spec['resolver'](results, collected)
                # The next range ends where this one started.
                range_end = range_start

            print('Saving type: %s' % query_type)
            spec['save'](collected)
            model.Session.commit()
            print('Saving done')
            self.log.info("Successfully saved analytics query of type: %s" % query_type)
Beispiel #14
0
    def get_ga_data(self, start_date=None):
        """
        Collect raw per-day visit counts from Google Analytics.

        The earliest date queried defaults to 2014-01-01, or to
        ``start_date`` when one is given. When
        ``PackageStats.get_latest_update_date()`` returns a date, the
        earliest date becomes two days before it instead, which overrides
        ``start_date``.

        Returns a dictionary like::

           {'<path>': {'visits': {<visit date>: <count>}}}
        """
        now = datetime.datetime.now()

        # With nothing stored yet, everything from the start (i.e. 2014) is fetched.
        floor_date = datetime.date(2014, 1, 1) if start_date is None else start_date

        # At least two days' worth of logs are re-read even when the latest
        # stored date is today.
        latest_date = PackageStats.get_latest_update_date()
        if latest_date is not None:
            floor_date = latest_date - datetime.timedelta(days=2)

        packages = {}
        # NOTE(review): this filter string is built but never handed to
        # ga_query below; the list only drives the inner loop.
        queries = ['ga:pagePath=~%s' % PACKAGE_URL]

        floor_month = datetime.date(floor_date.year, floor_date.month, 1)
        floor_day = datetime.date(floor_date.year, floor_date.month, floor_date.day)
        cursor = datetime.date(now.year, now.month, 1)

        # Range starts, newest first. No backward steps are needed when the
        # floor date lies in the current month. Steps are 30 days, not
        # calendar months.
        dates = []
        if cursor != floor_month:
            while cursor > floor_day:
                dates.append(cursor)
                cursor -= datetime.timedelta(days=30)
        dates.append(floor_date)

        def strip_leading_segment(path):
            # Drop the first path component, keeping the leading slash.
            return '/' + '/'.join(path.split('/')[2:])

        language_prefixes = ('/fi/', '/sv/', '/en/')

        range_end = now
        for range_start in dates:
            for _query in queries:
                results = self.ga_query(start_date=range_start, end_date=range_end)
                if 'rows' not in results:
                    continue

                for row in results.get('rows'):
                    path = row[0]
                    if not path.startswith(PACKAGE_URL):
                        path = strip_leading_segment(path)
                    if path.startswith(language_prefixes):
                        path = strip_leading_segment(path)

                    visit_date = datetime.datetime.strptime(row[1], "%Y%m%d").date()

                    # Different representations of the same dataset
                    # (/mysite.com & /www.mysite.com ...) collapse onto one
                    # path, so counts for the same day are added up.
                    daily_visits = packages.setdefault(path, {}).setdefault("visits", {})
                    daily_visits[visit_date] = int(row[2]) + daily_visits.get(visit_date, 0)
            # The next range ends where this one started.
            range_end = range_start
        return packages