import time

import requests

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines_measure.config import settings

import logging
log = logging.getLogger(__name__)

parameters, datapackage, res_iter = ingest()

# 30 authenticated requests per minute, so wait 3 secs (or use the
# GITHUB_REQUEST_WAIT_INTERVAL setting) before each request
# (https://developer.github.com/v3/search/#rate-limit)
REQUEST_WAIT_INTERVAL = int(settings.get('GITHUB_REQUEST_WAIT_INTERVAL', 3))


def _make_github_request(url):
    """Send a token-authenticated GET request to ``url`` and decode its JSON.

    The token is read from ``settings['GITHUB_API_TOKEN']``. A response
    whose status code is not 200 is reported through the module logger.

    :param url: address to request.
    :raises ValueError: when the response body is not valid JSON; the
        failure is logged and the original exception is re-raised.

    NOTE(review): the code visible here does not return the decoded
    payload; confirm whether callers expect it.
    """
    headers = {
        'Authorization': 'token {}'.format(settings['GITHUB_API_TOKEN'])
    }
    try:
        response = requests.get(url, headers=headers)
        json_response = response.json()
    except ValueError:
        # Every JSON decode error raised by ``Response.json()`` derives
        # from ValueError. The previous clause named ``simplejson``, which
        # this module never imports, so any failure inside the ``try``
        # surfaced as a NameError instead of being logged and re-raised.
        log.error('Expected JSON in response from: {}'.format(url))
        raise
    if response.status_code != 200:
        log.error('Response from Github not successful')
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the forum-categories pipeline and return it.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to the ``measure.add_project_name`` step.
    :param config: must contain ``'discourse-categories'``, an iterable of
        mappings that each provide ``'domain'`` and ``'categories'``.
    :return: the same ``steps`` list that was passed in.
    """
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'forum_categories',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'domain', 'source', 'category']
    }))

    # One resource step for every category of every configured domain.
    for group in config['discourse-categories']:
        steps.extend(
            ('measure.add_discourse_category_resource', {
                'category': category_name,
                'domain': group['domain']
            })
            for category_name in group['categories']
        )

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    field_names = ('domain', 'category', 'new_topics', 'new_posts',
                   'source', 'date')
    steps.append(('concatenate', {
        'target': {
            'name': 'forum-categories',
            'path': 'data/forum-categories.json'
        },
        'fields': {name: [] for name in field_names}
    }))

    column_types = (
        ('domain', 'string'),
        ('category', 'string'),
        ('source', 'string'),
        ('new_topics', 'integer'),
        ('new_posts', 'integer'),
        ('date', 'date'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'forum_categories': {
            'resource-name': 'forum-categories',
            'mode': 'update',
            'update_keys': ['domain', 'category', 'source',
                            'project_id', 'date']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': sql_tables
    }))

    return steps
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the website-analytics pipeline and return it.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to the ``measure.add_project_name`` step.
    :param config: when it has a ``'ga'`` entry, ``config['ga']['domains']``
        yields one ``measure.add_ga_resource`` step per domain.
    :return: the same ``steps`` list that was passed in.
    """
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'websiteanalytics',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'domain', 'source']
    }))

    if 'ga' in config:
        steps.extend(
            ('measure.add_ga_resource', {'domain': ga_domain})
            for ga_domain in config['ga']['domains']
        )

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    field_names = ('domain', 'page_path', 'visitors', 'unique_visitors',
                   'avg_time_spent', 'source', 'date')
    steps.append(('concatenate', {
        'target': {
            'name': 'website-analytics',
            'path': 'data/website-analytics.json'
        },
        'fields': {name: [] for name in field_names}
    }))

    column_types = (
        ('domain', 'string'),
        ('page_path', 'string'),
        ('visitors', 'integer'),
        ('unique_visitors', 'integer'),
        ('avg_time_spent', 'number'),
        ('date', 'date'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'websiteanalytics': {
            'resource-name': 'website-analytics',
            'mode': 'update',
            'update_keys': ['domain', 'page_path', 'source',
                            'project_id', 'date']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': sql_tables
    }))

    return steps
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the email pipeline and return it.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to the ``measure.add_project_name`` step.
    :param config: when it has a ``'mailchimp'`` entry,
        ``config['mailchimp']['lists']`` yields one
        ``measure.add_mailchimp_resource`` step per list id.
    :return: the same ``steps`` list that was passed in.
    """
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'email',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'list_id']
    }))

    if 'mailchimp' in config:
        steps.extend(
            ('measure.add_mailchimp_resource', {'list_id': mailing_list})
            for mailing_list in config['mailchimp']['lists']
        )

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    field_names = ('source', 'list_id', 'date', 'subscribers', 'subs',
                   'unsubs', 'campaigns_sent')
    steps.append(('concatenate', {
        'target': {
            'name': 'email',
            'path': 'data/email.csv'
        },
        'fields': {name: [] for name in field_names}
    }))

    column_types = (
        ('source', 'string'),
        ('list_id', 'string'),
        ('date', 'date'),
        ('subscribers', 'integer'),
        ('subs', 'integer'),
        ('unsubs', 'integer'),
        ('campaigns_sent', 'integer'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'email': {
            'resource-name': 'email',
            'mode': 'update',
            'update_keys': ['date', 'source', 'list_id', 'project_id']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': sql_tables
    }))

    return steps
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the code-packaging pipeline and return it.

    Resource steps are added, in this order, for whichever of the
    ``'npm'``, ``'pypi'``, ``'rubygems'`` and ``'packagist'`` entries are
    present in ``config``. npm and PyPI package names are passed through
    ``slugify``; gem ids and Packagist package names are used as given.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to the ``measure.add_project_name`` step.
    :param config: per-source configuration as described above.
    :return: the same ``steps`` list that was passed in.
    """
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'codepackaging',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'package', 'source']
    }))

    if 'npm' in config:
        steps.extend(
            ('measure.add_npm_resource', {'package': slugify(npm_name)})
            for npm_name in config['npm']['packages']
        )

    if 'pypi' in config:
        steps.extend(
            ('measure.add_pypi_resource', {'package': slugify(pypi_name)})
            for pypi_name in config['pypi']['packages']
        )

    if 'rubygems' in config:
        steps.extend(
            ('measure.add_rubygems_resource', {'gem_id': gem_name})
            for gem_name in config['rubygems']['gems']
        )

    if 'packagist' in config:
        steps.extend(
            ('measure.add_packagist_resource', {'package': php_name})
            for php_name in config['packagist']['packages']
        )

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    field_names = ('date', 'downloads', 'total_downloads', 'source',
                   'package')
    steps.append(('concatenate', {
        'target': {
            'name': 'code-packaging',
            'path': 'data/code-packaging.csv'
        },
        'fields': {name: [] for name in field_names}
    }))

    column_types = (
        ('downloads', 'integer'),
        ('total_downloads', 'integer'),
        ('source', 'string'),
        ('date', 'date'),
        ('package', 'string'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'codepackaging': {
            'resource-name': 'code-packaging',
            'mode': 'update',
            'update_keys': ['project_id', 'date', 'package', 'source']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': sql_tables
    }))

    return steps
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the code-hosting pipeline and return it.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to the ``measure.add_project_name`` step.
    :param config: must contain ``config['github']['repositories']``; each
        repository produces one ``measure.add_github_resource`` step whose
        resource name is ``slugify(repo)``.
    :return: the same ``steps`` list that was passed in.
    """
    repositories = config['github']['repositories']

    steps.extend(
        ('measure.add_github_resource', {
            'name': slugify(repository),
            'repo': repository,
            'map_fields': {
                'repository': 'name',
                'watchers': 'subscribers_count',
                'stars': 'stargazers_count'
            }
        })
        for repository in repositories
    )

    # The concatenate step lists the slugified resource names as sources.
    field_names = ('repository', 'watchers', 'stars', 'source', 'date')
    steps.append(('concatenate', {
        'sources': [slugify(repository) for repository in repositories],
        'target': {
            'name': 'code-hosting',
            'path': 'data/code-hosting.json'
        },
        'fields': {name: [] for name in field_names}
    }))

    column_types = (
        ('repository', 'string'),
        ('watchers', 'integer'),
        ('stars', 'integer'),
        ('date', 'date'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'codehosting': {
            'resource-name': 'code-hosting',
            'mode': 'update',
            'update_keys': ['repository', 'source', 'project_id', 'date']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings['DB_ENGINE'],
        'tables': sql_tables
    }))

    return steps
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the social-media pipeline and return it.

    Twitter resource steps come first, followed by Facebook ones; each
    group is added only when its key is present in ``config``.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to every resource step and to the
        ``measure.add_project_name`` step.
    :param config: may contain ``config['twitter']['entities']`` and
        ``config['facebook']['pages']``.
    :return: the same ``steps`` list that was passed in.
    """
    if 'twitter' in config:
        steps.extend(
            ('measure.add_twitter_resource', {
                'entity': twitter_entity,
                'project_id': project_id
            })
            for twitter_entity in config['twitter']['entities']
        )

    if 'facebook' in config:
        steps.extend(
            ('measure.add_facebook_resource', {
                'entity': facebook_page,
                'project_id': project_id
            })
            for facebook_page in config['facebook']['pages']
        )

    field_names = ('entity', 'entity_type', 'source', 'date', 'followers',
                   'mentions', 'interactions', 'impressions')
    steps.append(('concatenate', {
        'target': {
            'name': 'social-media',
            'path': 'data/social-media.csv'
        },
        'fields': {name: [] for name in field_names}
    }))

    column_types = (
        ('entity', 'string'),
        ('entity_type', 'string'),
        ('source', 'string'),
        ('date', 'date'),
        ('followers', 'integer'),
        ('mentions', 'integer'),
        ('interactions', 'integer'),
        ('impressions', 'integer'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'socialmedia': {
            'resource-name': 'social-media',
            'mode': 'update',
            'update_keys': ['entity', 'entity_type', 'source',
                            'project_id', 'date']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': sql_tables
    }))

    return steps
def add_steps(steps: list, pipeline_id: str, project_id: str,
              config: dict) -> list:
    """Extend ``steps`` with the outputs pipeline and return it.

    :param steps: list of pipeline steps; it is mutated in place.
    :param pipeline_id: used to build the development dump path.
    :param project_id: passed to the ``measure.add_project_name`` step.
    :param config: iterated directly; every item is read with
        ``.get('sheetid')``, ``.get('gid')`` and ``.get('type')``.
        NOTE(review): this implies an iterable of mappings although the
        annotation says ``dict`` - confirm against the caller.
    :return: the same ``steps`` list that was passed in.
    """
    steps.append(('measure.datastore_get_latest', {
        'resource-name': 'latest-project-entries',
        'table': 'outputs',
        'engine': settings.get('DB_ENGINE'),
        'distinct_on': ['project_id', 'source', 'source_id'],
        'sort_date_key': 'source_timestamp'
    }))

    steps.extend(
        ('measure.add_outputs_resource', {
            'sheet_id': sheet.get('sheetid'),
            'gid': sheet.get('gid'),
            'source_type': sheet.get('type')
        })
        for sheet in config
    )

    steps.append(('measure.remove_resource', {
        'name': 'latest-project-entries'
    }))

    field_names = (
        'source_id', 'source_type', 'source', 'source_timestamp',
        'source_email', 'output_title', 'output_type',
        'output_organization', 'output_person', 'output_link',
        'output_additional_information', 'output_date',
    )
    steps.append(('concatenate', {
        'target': {
            'name': 'outputs',
            'path': 'data/outputs.csv'
        },
        'fields': {name: [] for name in field_names}
    }))

    # 'output_type' is concatenated above but gets no explicit type here.
    column_types = (
        ('source_id', 'string'),
        ('source_type', 'string'),
        ('source', 'string'),
        ('source_timestamp', 'datetime'),
        ('source_email', 'string'),
        ('output_title', 'string'),
        ('output_organization', 'string'),
        ('output_person', 'string'),
        ('output_link', 'string'),
        ('output_additional_information', 'string'),
        ('output_date', 'date'),
    )
    steps.append(('set_types', {
        'types': {name: {'type': kind} for name, kind in column_types}
    }))

    # The last two entries are bare processor names (plain strings).
    steps.extend([
        ('measure.add_project_name', {'name': project_id}),
        'measure.add_timestamp',
        'measure.add_uuid',
    ])

    # In development mode the result is also written to a local path.
    if settings.get('DEVELOPMENT', False):
        out_path = '{}/{}'.format(DOWNLOADS_PATH, pipeline_id)
        steps.append(('dump.to_path', {'out-path': out_path}))

    sql_tables = {
        'outputs': {
            'resource-name': 'outputs',
            'mode': 'update',
            'update_keys': ['project_id', 'source', 'source_timestamp',
                            'source_id']
        }
    }
    steps.append(('dump.to_sql', {
        'engine': settings.get('DB_ENGINE'),
        'tables': sql_tables
    }))

    return steps