def update_project_info(project):
    ''' Refresh a project dict with repository details from GitHub.

        Projects with no (or an empty) code_url, and projects whose code_url
        is not hosted on github.com, are handed to
        non_github_project_update_time() and its result is returned.

        For GitHub projects the repository is requested from the GitHub API.
        When a matching Project row with a last_updated value already exists,
        the request is conditional (If-Modified-Since). Complete repository
        details go under the "github_details" key of the project dict.

        Github_details is specifically expected to be used on this page:
        http://opengovhacknight.org/projects.html

        The project dict is modified in place. Returns the project dict, or
        None when:
          - GitHub answers 404 for the repository, or
          - GitHub answers 304 and neither the spreadsheet values nor the
            civic.json changed (the existing row is then flagged keep=True
            and committed).

        When GitHub answers 403, an Error row is committed, the module-level
        github_throttling flag is set and the project dict is returned
        without being refreshed from GitHub. While that flag is set, later
        calls for GitHub projects return early.

        Raises IOError for any other 4xx response.
    '''
    global github_throttling

    if 'code_url' not in project or not project['code_url']:
        project = non_github_project_update_time(project)
        return project

    _, host, path, _, _, _ = urlparse(project['code_url'])

    if host != 'github.com':
        project = non_github_project_update_time(project)
        return project

    # From here on the project is known to be hosted on github.com.
    repo_url = GITHUB_REPOS_API_URL.format(repo_path=path)

    # If we've hit the GitHub rate limit, skip updating projects.
    if github_throttling:
        return project

    # Find an existing project, filtering on code_url, organization_name,
    # and project name (if we know it).
    existing_filter = [
        Project.code_url == project['code_url'],
        Project.organization_name == project['organization_name'],
    ]
    if 'name' in project and project['name']:
        existing_filter.append(Project.name == project['name'])

    spreadsheet_is_updated = False

    existing_project = db.session.query(Project).filter(
        *existing_filter).first()
    if existing_project:
        # Copy 'last_updated' values from the existing project to the dict.
        project['last_updated'] = existing_project.last_updated
        project['last_updated_issues'] = \
            existing_project.last_updated_issues
        project['last_updated_civic_json'] = \
            existing_project.last_updated_civic_json
        project['last_updated_root_files'] = \
            existing_project.last_updated_root_files

        # Check whether any of the org spreadsheet values for the project
        # have changed; blank incoming values inherit the stored value.
        # Only existing keys are reassigned, so iterating the dict is safe.
        for project_key in project:
            check_value = project[project_key]
            existing_value = existing_project.__dict__[project_key]
            if check_value and check_value != existing_value:
                spreadsheet_is_updated = True
            elif not check_value and existing_value:
                project[project_key] = existing_value

        # Request project info from GitHub with the If-Modified-Since header.
        if existing_project.last_updated:
            last_updated = datetime.strftime(
                existing_project.last_updated, "%a, %d %b %Y %H:%M:%S GMT")
            got = get_github_api(
                repo_url, headers={"If-Modified-Since": last_updated})
        else:
            # In rare cases, a project can be saved without a last_updated.
            got = get_github_api(repo_url)
    else:
        got = get_github_api(repo_url)

    # The whole 4xx class, including 499 (range(400, 499) skipped it).
    if 400 <= got.status_code < 500:
        if got.status_code == 404:
            logging.error(repo_url + ' doesn\'t exist.')
            # If its a bad GitHub link, don't return it at all.
            return None

        elif got.status_code == 403:
            logging.error("GitHub Rate Limit Remaining: " +
                          str(got.headers["x-ratelimit-remaining"]))
            error_dict = {
                "error": u'IOError: We done got throttled by GitHub',
                "time": datetime.now()
            }
            new_error = Error(**error_dict)
            db.session.add(new_error)
            # commit the error
            db.session.commit()
            github_throttling = True
            return project

        else:
            raise IOError('GitHub API request for {} failed with status {}'
                          .format(repo_url, got.status_code))

    # If the project has not been modified...
    elif got.status_code == 304:
        logging.info('Project %s has not been modified since last update',
                     repo_url)

        # Populate values from the civic.json if it exists/is updated.
        project, civic_json_is_updated = update_project_from_civic_json(
            project_dict=project, force=spreadsheet_is_updated)

        # If values have changed, copy untouched values from the existing
        # project object and return it. A 304 is only expected after a
        # conditional request, which requires existing_project to be set.
        if spreadsheet_is_updated or civic_json_is_updated:
            logging.info(
                'Project %s has been modified via spreadsheet or civic.json.',
                repo_url)
            project['last_updated'] = datetime.now().strftime(
                "%a, %d %b %Y %H:%M:%S %Z")
            project['github_details'] = existing_project.github_details
            return project

        # Nothing was updated, but make sure we keep the project.
        # :::here (project/true)
        existing_project.keep = True
        db.session.add(existing_project)
        # commit the project
        db.session.commit()
        return None

    # Save last_updated time header for future requests.
    project['last_updated'] = got.headers['Last-Modified']

    all_github_attributes = got.json()
    github_details = {
        field: all_github_attributes[field]
        for field in ('contributors_url', 'created_at', 'forks_count',
                      'homepage', 'html_url', 'id', 'language',
                      'open_issues', 'pushed_at', 'updated_at',
                      'watchers_count', 'name', 'description',
                      'stargazers_count')
    }
    github_details['owner'] = {
        field: all_github_attributes['owner'][field]
        for field in ('avatar_url', 'html_url', 'login', 'type')
    }
    project['github_details'] = github_details

    # Fill in values the spreadsheet left missing or blank.
    for project_key, github_key in (('name', 'name'),
                                    ('description', 'description'),
                                    ('link_url', 'homepage')):
        if project_key not in project or not project[project_key]:
            project[project_key] = all_github_attributes[github_key]

    #
    # Populate project contributors from github_details[contributors_url]
    #
    contributors = []
    github_details['contributors'] = contributors
    owner_login = github_details['owner']['login']
    got = get_github_api(all_github_attributes['contributors_url'])

    # Best effort: the response may not be a JSON list of contributors.
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt and SystemExit still propagate.
    try:
        for contributor in got.json():
            # we don't want people without email addresses?
            # NOTE(review): `break` also drops every contributor listed
            # after this one; confirm `continue` was not intended.
            if contributor['login'] == 'invalid-email-address':
                break

            entry = dict()
            contributors.append(entry)
            for field in ('login', 'url', 'avatar_url', 'html_url',
                          'contributions'):
                entry[field] = contributor[field]

            # flag the owner with a boolean value
            entry['owner'] = bool(contributor['login'] == owner_login)
    except Exception:
        pass

    #
    # Populate project participation from
    # github_details[url] + "/stats/participation"
    # Sometimes GitHub returns a blank dict instead of no participation.
    #
    got = get_github_api(all_github_attributes['url'] +
                         '/stats/participation')
    try:
        github_details['participation'] = got.json()['all']
    except Exception:
        github_details['participation'] = [0] * 50

    #
    # Populate values from the civic.json if it exists/is updated
    #
    project, _ = update_project_from_civic_json(
        project_dict=project, force=spreadsheet_is_updated)

    return project
def update_project_info(project):
    ''' Refresh a project dict with repository details from GitHub.

        Projects with no code_url, and projects whose code_url is not hosted
        on github.com, only get their last_updated timestamp maintained (see
        the nested non_github_project_update_time) and are returned.

        For GitHub projects the repository is requested from the GitHub API.
        When a matching Project row with a last_updated value already exists,
        the request is conditional (If-Modified-Since). Complete repository
        details go under the "github_details" key of the project dict.

        Github_details is specifically expected to be used on this page:
        http://opengovhacknight.org/projects.html

        The project dict is modified in place. Returns the project dict, or
        None when GitHub answers 404 (bad link) or 304 (not modified; an
        existing row is then flagged keep=True and committed).

        When GitHub answers 403, an Error row is committed, the module-level
        github_throttling flag is set and the project dict is returned
        without being refreshed. While that flag is set, later calls for
        GitHub projects return early.

        Raises IOError for any other 4xx response.
    '''
    global github_throttling

    def non_github_project_update_time(project):
        ''' If its a non-github project, we should check if any of the fields
            have been updated, such as the description.

            Set the last_updated timestamp.
        '''
        existing_project = db.session.query(Project).filter(
            Project.name == project['name']).first()

        if existing_project:
            # project gets existing last_updated
            project['last_updated'] = existing_project.last_updated

            # unless one of the fields has been updated
            for field in ('description', 'categories', 'type', 'link_url'):
                if field in project and \
                        project[field] != getattr(existing_project, field):
                    project['last_updated'] = datetime.now().strftime(
                        "%a, %d %b %Y %H:%M:%S %Z")
        else:
            # Set a date when we first see a non-github project
            project['last_updated'] = datetime.now().strftime(
                "%a, %d %b %Y %H:%M:%S %Z")

        return project

    if 'code_url' not in project:
        project = non_github_project_update_time(project)
        return project

    _, host, path, _, _, _ = urlparse(project['code_url'])

    if host != 'github.com':
        project = non_github_project_update_time(project)
        return project

    # From here on the project is known to be hosted on github.com.
    repo_url = 'https://api.github.com/repos' + path

    # If we've hit the GitHub rate limit, skip updating projects.
    if github_throttling:
        return project

    # Find an existing project, filtering on code_url, organization_name,
    # and project name (if we know it).
    existing_filter = [
        Project.code_url == project['code_url'],
        Project.organization_name == project['organization_name'],
    ]
    # .get() returns None for a missing key, which the tuple excludes.
    if project.get('name') not in (u'', None):
        existing_filter.append(Project.name == project['name'])

    existing_project = db.session.query(Project).filter(
        *existing_filter).first()

    if existing_project and existing_project.last_updated:
        last_updated = datetime.strftime(
            existing_project.last_updated, "%a, %d %b %Y %H:%M:%S GMT")
        got = get_github_api(
            repo_url, headers={"If-Modified-Since": last_updated})
    else:
        # Either the project is new or, in rare cases, it was saved
        # without a last_updated.
        got = get_github_api(repo_url)

    # The whole 4xx class, including 499 (range(400, 499) skipped it).
    if 400 <= got.status_code < 500:
        if got.status_code == 404:
            logging.error(repo_url + ' doesn\'t exist.')
            # If its a bad GitHub link, don't return it at all.
            return None

        elif got.status_code == 403:
            logging.error("GitHub Rate Limit Remaining: " +
                          str(got.headers["x-ratelimit-remaining"]))
            error_dict = {
                "error": u'IOError: We done got throttled by GitHub',
                "time": datetime.now()
            }
            new_error = Error(**error_dict)
            db.session.add(new_error)
            # commit the error
            db.session.commit()
            github_throttling = True
            return project

        else:
            raise IOError('GitHub API request for {} failed with status {}'
                          .format(repo_url, got.status_code))

    # If project has not been modified, return
    elif got.status_code == 304:
        logging.info('Project %s has not been modified since last update',
                     repo_url)
        if existing_project:
            # make sure we keep the project
            # :::here (project/true)
            existing_project.keep = True
            db.session.add(existing_project)
            # commit the project
            db.session.commit()
        return None

    # Save last_updated time header for future requests.
    project['last_updated'] = got.headers['Last-Modified']

    all_github_attributes = got.json()
    github_details = {
        field: all_github_attributes[field]
        for field in ('contributors_url', 'created_at', 'forks_count',
                      'homepage', 'html_url', 'id', 'language',
                      'open_issues', 'pushed_at', 'updated_at',
                      'watchers_count', 'name', 'description',
                      'stargazers_count')
    }
    github_details['owner'] = {
        field: all_github_attributes['owner'][field]
        for field in ('avatar_url', 'html_url', 'login', 'type')
    }
    project['github_details'] = github_details

    # Fill in values the spreadsheet left missing or blank.
    for project_key, github_key in (('name', 'name'),
                                    ('description', 'description'),
                                    ('link_url', 'homepage')):
        if project_key not in project or not project[project_key]:
            project[project_key] = all_github_attributes[github_key]

    #
    # Populate project contributors from github_details[contributors_url]
    #
    contributors = []
    github_details['contributors'] = contributors
    owner_login = github_details['owner']['login']
    got = get_github_api(all_github_attributes['contributors_url'])

    # Best effort: the response may not be a JSON list of contributors.
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt and SystemExit still propagate.
    try:
        for contributor in got.json():
            # we don't want people without email addresses?
            # NOTE(review): `break` also drops every contributor listed
            # after this one; confirm `continue` was not intended.
            if contributor['login'] == 'invalid-email-address':
                break

            entry = dict()
            contributors.append(entry)
            for field in ('login', 'url', 'avatar_url', 'html_url',
                          'contributions'):
                entry[field] = contributor[field]

            # flag the owner with a boolean value
            entry['owner'] = bool(contributor['login'] == owner_login)
    except Exception:
        pass

    #
    # Populate project participation from
    # github_details[url] + "/stats/participation"
    # Sometimes GitHub returns a blank dict instead of no participation.
    #
    got = get_github_api(all_github_attributes['url'] +
                         '/stats/participation')
    try:
        github_details['participation'] = got.json()['all']
    except Exception:
        github_details['participation'] = [0] * 50

    return project
def main(org_name=None, org_sources=None):
    ''' Update every organization found in the sources, or only org_name.

        For each organization: flag its stored events, stories, projects and
        its own row as keep=False, re-save whatever is gathered now, then
        delete the rows still flagged keep=False. Organizations with an
        unsafe name are recorded as an Error row and skipped.

        When no org_name is given, organizations that were not seen during
        this run are deleted at the end.

        org_sources falls back to ORG_SOURCES_FILENAME when empty.
        Any exception raised while processing an organization propagates
        out of main() before that organization's final commit.
    '''
    sources = org_sources or ORG_SOURCES_FILENAME

    # Names of every organization seen during this run.
    organization_names = set()

    # Retrieve all organizations and shuffle the list in place.
    orgs_info = get_organizations(sources)
    shuffle(orgs_info)

    if org_name:
        orgs_info = [info for info in orgs_info if info['name'] == org_name]

    # Iterate over organizations and projects, saving them to db.session.
    for org_info in orgs_info:
        current_name = org_info['name']

        if not is_safe_name(current_name):
            db.session.add(Error(
                error=unicode('ValueError: Bad organization name: "%s"'
                              % current_name),
                time=datetime.now()))
            # commit the error
            db.session.commit()
            continue

        # No try/except here: an exception simply propagates, which leaves
        # main() without reaching the final commit for this organization.
        name_matches = Organization.name == current_name
        # The looked-up row itself is not used further down.
        db.session.query(Organization).filter(name_matches).first()
        organization_names.add(current_name)

        # Mark everything associated with this organization for deletion
        # at first.
        # :::here (event/false, story/false, project/false, organization/false)
        for model in (Event, Story, Project):
            db.session.execute(
                db.update(model, values={'keep': False})
                .where(model.organization_name == current_name))
        db.session.execute(
            db.update(Organization, values={'keep': False})
            .where(Organization.name == current_name))
        # commit the false keeps
        db.session.commit()

        # Empty lat longs are okay.
        for coordinate in ('latitude', 'longitude'):
            if coordinate in org_info and not org_info[coordinate]:
                org_info[coordinate] = None

        organization = save_organization_info(db.session, org_info)
        organization_names.add(organization.name)
        # flush the organization
        db.session.flush()

        if organization.rss or organization.website:
            logging.info("Gathering all of %s's stories." % organization.name)
            stories = get_stories(organization)
            if stories:
                for story_info in stories:
                    save_story_info(db.session, story_info)
                # flush the stories
                db.session.flush()

        if organization.projects_list_url:
            logging.info("Gathering all of %s's projects."
                         % organization.name)
            for proj_dict in get_projects(organization):
                save_project_info(db.session, proj_dict)
            # flush the projects
            db.session.flush()

        if organization.events_url:
            # NOTE(review): a missing key is only logged; event gathering
            # below is still attempted. Confirm that is intended.
            if not meetup_key:
                logging.error("No Meetup.com key set.")

            if 'meetup.com' in organization.events_url:
                logging.info("Gathering all of %s's events."
                             % organization.name)
                identifier = get_event_group_identifier(
                    organization.events_url)
                if not identifier:
                    logging.error("%s does not have a valid events url"
                                  % organization.name)
                else:
                    for event in get_meetup_events(organization, identifier):
                        save_event_info(db.session, event)
                    # flush the events
                    db.session.flush()
            else:
                logging.error("Only Meetup.com events work right now.")

        # Get issues for all of the projects
        logging.info("Gathering all of %s's open GitHub issues."
                     % organization.name)
        issues = get_issues(organization.name)
        for issue in issues:
            save_issue(db.session, issue)
        # flush the issues
        db.session.flush()
        for issue in issues:
            save_labels(db.session, issue)
        # commit everything
        db.session.commit()

        # Remove everything marked for deletion.
        # :::here (event/delete, story/delete, project/delete, issue/delete, organization/delete)
        # `== False` is required: it builds the SQL comparison.
        for model in (Event, Story, Issue, Project, Organization):
            db.session.query(model).filter(model.keep == False).delete()  # noqa: E712
        # commit objects deleted for keep=False
        db.session.commit()

        # final commit before moving on to the next organization
        db.session.commit()

    # prune orphaned organizations if no organization name was passed
    if not org_name:
        for bad_org in db.session.query(Organization):
            if bad_org.name not in organization_names:
                # delete orphaned organizations, all other deletions
                # will cascade
                db.session.execute(
                    db.delete(Organization)
                    .where(Organization.name == bad_org.name))
                # commit for deleting orphaned organizations
                db.session.commit()
def main(org_name=None, org_sources=None):
    ''' Run update over all organizations. Optionally, update just one.

        For each organization: flag its stored events, stories, projects and
        its own row as keep=False, re-save whatever is gathered now, then
        delete the rows still flagged keep=False and commit. Organizations
        with an unsafe name are recorded as an Error row and skipped.

        org_name: when given, only that organization is updated and the
        final "delete organizations not found" pass is skipped.
        org_sources: passed straight to get_organizations().

        Any exception raised while processing an organization propagates
        out of main() before that organization's commit.
    '''
    # Keep a set of fresh organization names.
    organization_names = set()

    # Retrieve all organizations and shuffle the list in place.
    orgs_info = get_organizations(org_sources)
    shuffle(orgs_info)

    if org_name:
        orgs_info = [org for org in orgs_info if org['name'] == org_name]

    # Iterate over organizations and projects, saving them to db.session.
    for org_info in orgs_info:

        if not is_safe_name(org_info['name']):
            error_dict = {
                "error": 'ValueError: Bad organization name: "%s"'
                         % org_info['name'],
                "time": datetime.now()
            }
            new_error = Error(**error_dict)
            db.session.add(new_error)
            db.session.commit()
            continue

        # No try/except here: the original `except: raise` was a no-op.
        # An exception propagates out of main() and skips the commit below.
        name_matches = Organization.name == org_info['name']
        # NOTE(review): the looked-up row is never used; the query is kept
        # only to preserve the existing sequence of database calls.
        db.session.query(Organization).filter(name_matches).first()
        organization_names.add(org_info['name'])

        # Mark everything in this organization for deletion at first.
        for model in (Event, Story, Project):
            db.session.execute(
                db.update(model, values={'keep': False})
                .where(model.organization_name == org_info['name']))
        db.session.execute(
            db.update(Organization, values={'keep': False})
            .where(Organization.name == org_info['name']))

        # Empty lat longs are okay.
        for coordinate in ('latitude', 'longitude'):
            if coordinate in org_info and not org_info[coordinate]:
                org_info[coordinate] = None

        organization = save_organization_info(db.session, org_info)
        organization_names.add(organization.name)

        if organization.rss or organization.website:
            logging.info("Gathering all of %s's stories.",
                         organization.name)
            stories = get_stories(organization)
            if stories:
                for story_info in stories:
                    save_story_info(db.session, story_info)

        if organization.projects_list_url:
            logging.info("Gathering all of %s's projects.",
                         organization.name)
            projects = get_projects(organization)
            for proj_info in projects:
                save_project_info(db.session, proj_info)

        if organization.events_url:
            # NOTE(review): a missing key is only logged; event gathering
            # below is still attempted. Confirm that is intended.
            if not meetup_key:
                logging.error("No Meetup.com key set.")
            if 'meetup.com' not in organization.events_url:
                logging.error("Only Meetup.com events work right now.")
            else:
                logging.info("Gathering all of %s's events.",
                             organization.name)
                identifier = get_event_group_identifier(
                    organization.events_url)
                if identifier:
                    for event in get_meetup_events(organization, identifier):
                        save_event_info(db.session, event)
                else:
                    logging.error("%s does not have a valid events url",
                                  organization.name)

        # Get issues for all of the projects
        logging.info("Gathering all of %s's open GitHub issues.",
                     organization.name)
        issues, labels = get_issues(organization.name)
        # labels is indexed in step with issues, as in the original loop.
        for index, issue in enumerate(issues):
            save_issue_info(db.session, issue, labels[index])

        # Remove everything marked for deletion.
        # The filter must be the SQL expression `keep == False`. Python's
        # `not Model.keep` is evaluated immediately and produces a plain
        # constant instead of a column comparison, so flagged rows were
        # never selected. Issues go before projects (child rows before
        # parent rows), as in the sibling main().
        for model in (Event, Story, Issue, Project, Organization):
            db.session.query(model).filter(model.keep == False).delete()  # noqa: E712

        # Commit and move on to the next organization.
        db.session.commit()

    # Stop right here if an org name was specified.
    if org_name:
        return

    # Delete any organization not found on this round.
    for bad_org in db.session.query(Organization):
        if bad_org.name in organization_names:
            continue

        for model in (Event, Story, Project):
            db.session.execute(
                db.delete(model)
                .where(model.organization_name == bad_org.name))
        db.session.execute(
            db.delete(Organization)
            .where(Organization.name == bad_org.name))
        db.session.commit()