Exemple #1
0
def update_project_info(project):
    ''' Update a project dict with details fetched from Github.

        Projects whose code_url is missing, empty or not hosted on github.com
        are handed to non_github_project_update_time() and its result is
        returned.

        For Github projects the dict is modified in-place and also returned.
        Complete repository project details go into extras, for example
        project details from Github can be found under "github_details".

        Github_details is specifically expected to be used on this page:
        http://opengovhacknight.org/projects.html

        Returns the project dict, or None when the repository does not exist
        (404) or when nothing changed since the last update (304); in the
        latter case the existing row is flagged keep=True and committed.

        Raises IOError for any 4xx response other than 403 and 404.
    '''
    # Declared up front: the flag is read and (on a 403) written below.
    global github_throttling

    if 'code_url' not in project or not project['code_url']:
        return non_github_project_update_time(project)

    _, host, path, _, _, _ = urlparse(project['code_url'])
    if host != 'github.com':
        return non_github_project_update_time(project)

    repo_url = GITHUB_REPOS_API_URL.format(repo_path=path)

    # If we've hit the GitHub rate limit, skip updating projects.
    if github_throttling:
        return project

    # find an existing project, filtering on code_url, organization_name,
    # and project name (if we know it)
    existing_filter = [
        Project.code_url == project['code_url'],
        Project.organization_name == project['organization_name'],
    ]
    if 'name' in project and project['name']:
        existing_filter.append(Project.name == project['name'])

    spreadsheet_is_updated = False
    existing_project = db.session.query(Project).filter(
        *existing_filter).first()

    if existing_project:
        # copy 'last_updated' values from the existing project to the dict
        for stamp in ('last_updated', 'last_updated_issues',
                      'last_updated_civic_json', 'last_updated_root_files'):
            project[stamp] = getattr(existing_project, stamp)

        # check whether any of the org spreadsheet values for the project
        # have changed; blank incoming values inherit the stored value.
        # Only values of existing keys are reassigned, so iterating the dict
        # while doing so is safe.
        for project_key in project:
            check_value = project[project_key]
            existing_value = existing_project.__dict__[project_key]
            if check_value and check_value != existing_value:
                spreadsheet_is_updated = True
            elif not check_value and existing_value:
                project[project_key] = existing_value

        if existing_project.last_updated:
            # request project info from GitHub with If-Modified-Since
            last_updated = datetime.strftime(existing_project.last_updated,
                                             "%a, %d %b %Y %H:%M:%S GMT")
            got = get_github_api(
                repo_url, headers={"If-Modified-Since": last_updated})
        else:
            # In rare cases, a project can be saved without a last_updated.
            got = get_github_api(repo_url)
    else:
        got = get_github_api(repo_url)

    # Any client error (400-499 inclusive).
    if 400 <= got.status_code < 500:
        if got.status_code == 404:
            logging.error("%s doesn't exist.", repo_url)
            # If its a bad GitHub link, don't return it at all.
            return None

        if got.status_code == 403:
            logging.error("GitHub Rate Limit Remaining: %s",
                          got.headers["x-ratelimit-remaining"])
            new_error = Error(
                error=u'IOError: We done got throttled by GitHub',
                time=datetime.now())
            db.session.add(new_error)
            # commit the error
            db.session.commit()
            # Remember the throttling so later calls return early.
            github_throttling = True
            return project

        raise IOError('GitHub returned status %s for %s' %
                      (got.status_code, repo_url))

    # If the project has not been modified...
    if got.status_code == 304:
        logging.info('Project %s has not been modified since last update',
                     repo_url)

        # Populate values from the civic.json if it exists/is updated
        project, civic_json_is_updated = update_project_from_civic_json(
            project_dict=project, force=spreadsheet_is_updated)

        # if values have changed, copy untouched values from the existing
        # project object and return it
        if spreadsheet_is_updated or civic_json_is_updated:
            logging.info(
                'Project %s has been modified via spreadsheet or civic.json.',
                repo_url)
            project['last_updated'] = datetime.now().strftime(
                "%a, %d %b %Y %H:%M:%S %Z")
            project['github_details'] = existing_project.github_details
            return project

        # nothing was updated, but make sure we keep the project
        # :::here (project/true)
        existing_project.keep = True
        db.session.add(existing_project)
        # commit the project
        db.session.commit()
        return None

    # Save last_updated time header for future requests
    project['last_updated'] = got.headers['Last-Modified']

    all_github_attributes = got.json()
    github_details = {
        field: all_github_attributes[field]
        for field in ('contributors_url', 'created_at', 'forks_count',
                      'homepage', 'html_url', 'id', 'language', 'open_issues',
                      'pushed_at', 'updated_at', 'watchers_count', 'name',
                      'description', 'stargazers_count')
    }
    github_details['owner'] = {
        field: all_github_attributes['owner'][field]
        for field in ('avatar_url', 'html_url', 'login', 'type')
    }
    project['github_details'] = github_details

    # Fill in blanks in the project from the repository attributes.
    if 'name' not in project or not project['name']:
        project['name'] = all_github_attributes['name']

    if 'description' not in project or not project['description']:
        project['description'] = all_github_attributes['description']

    if 'link_url' not in project or not project['link_url']:
        project['link_url'] = all_github_attributes['homepage']

    #
    # Populate project contributors from github_details[contributors_url]
    #
    contributors = []
    github_details['contributors'] = contributors
    got = get_github_api(all_github_attributes['contributors_url'])

    # Best effort: the response may carry no usable contributor list, in
    # which case whatever was collected so far is kept.
    try:
        owner_login = github_details['owner']['login']
        for contributor in got.json():
            # we don't want people without email addresses?
            # NOTE(review): this stops at the first such entry and skips
            # every contributor after it -- confirm `continue` isn't meant.
            if contributor['login'] == 'invalid-email-address':
                break

            entry = {
                field: contributor[field]
                for field in ('login', 'url', 'avatar_url', 'html_url',
                              'contributions')
            }
            # flag the owner with a boolean value
            entry['owner'] = contributor['login'] == owner_login
            contributors.append(entry)
    except Exception:
        logging.info('Could not read contributors for %s', repo_url)

    #
    # Populate project participation from github_details[url] + "/stats/participation"
    # Sometimes GitHub returns a blank dict instead of no participation.
    #
    got = get_github_api(all_github_attributes['url'] +
                         '/stats/participation')
    try:
        github_details['participation'] = got.json()['all']
    except Exception:
        github_details['participation'] = [0] * 50

    #
    # Populate values from the civic.json if it exists/is updated
    #
    project, _ = update_project_from_civic_json(
        project_dict=project, force=spreadsheet_is_updated)

    return project
Exemple #2
0
def update_project_info(project):
    ''' Update a project dict with details fetched from Github.

        Projects whose code_url is missing, empty or not hosted on github.com
        only get their last_updated timestamp refreshed and are returned.

        For Github projects the dict is modified in-place and also returned.
        Complete repository project details go into extras, for example
        project details from Github can be found under "github_details".

        Github_details is specifically expected to be used on this page:
        http://opengovhacknight.org/projects.html

        Returns the project dict, or None when the repository does not exist
        (404) or when an existing project was not modified since the last
        update (304); in the latter case the existing row is flagged
        keep=True and committed.

        Raises IOError for any 4xx response other than 403 and 404.
    '''
    # Declared up front: the flag is read and (on a 403) written below.
    global github_throttling

    def non_github_project_update_time(project):
        ''' If its a non-github project, we should check if any of the fields
            have been updated, such as the description.

            Set the last_updated timestamp and return the project dict.
        '''
        existing_project = db.session.query(Project).filter(
            Project.name == project['name']).first()

        if not existing_project:
            # Set a date when we first see a non-github project
            project['last_updated'] = datetime.now().strftime(
                "%a, %d %b %Y %H:%M:%S %Z")
            return project

        # project gets existing last_updated
        project['last_updated'] = existing_project.last_updated

        # unless one of the fields has been updated
        for field in ('description', 'categories', 'type', 'link_url'):
            if field in project and \
                    project[field] != getattr(existing_project, field):
                project['last_updated'] = datetime.now().strftime(
                    "%a, %d %b %Y %H:%M:%S %Z")

        return project

    # A missing or blank code_url cannot point at Github.
    if 'code_url' not in project or not project['code_url']:
        return non_github_project_update_time(project)

    _, host, path, _, _, _ = urlparse(project['code_url'])
    if host != 'github.com':
        return non_github_project_update_time(project)

    repo_url = 'https://api.github.com/repos' + path

    # If we've hit the GitHub rate limit, skip updating projects.
    if github_throttling:
        return project

    # find an existing project, filtering on code_url, organization_name,
    # and project name (if we know it)
    existing_filter = [
        Project.code_url == project['code_url'],
        Project.organization_name == project['organization_name'],
    ]
    if 'name' in project and project['name'] not in (u'', None):
        existing_filter.append(Project.name == project['name'])

    existing_project = db.session.query(Project).filter(
        *existing_filter).first()

    if existing_project and existing_project.last_updated:
        # Conditional request: GitHub answers 304 when nothing changed.
        last_updated = datetime.strftime(existing_project.last_updated,
                                         "%a, %d %b %Y %H:%M:%S GMT")
        got = get_github_api(
            repo_url, headers={"If-Modified-Since": last_updated})
    else:
        # New project, or (in rare cases) one saved without a last_updated.
        got = get_github_api(repo_url)

    # Any client error (400-499 inclusive).
    if 400 <= got.status_code < 500:
        if got.status_code == 404:
            logging.error("%s doesn't exist.", repo_url)
            # If its a bad GitHub link, don't return it at all.
            return None

        if got.status_code == 403:
            logging.error("GitHub Rate Limit Remaining: %s",
                          got.headers["x-ratelimit-remaining"])
            new_error = Error(
                error=u'IOError: We done got throttled by GitHub',
                time=datetime.now())
            db.session.add(new_error)
            # commit the error
            db.session.commit()
            # Remember the throttling so later calls return early.
            github_throttling = True
            return project

        raise IOError('GitHub returned status %s for %s' %
                      (got.status_code, repo_url))

    # If project has not been modified, return
    if got.status_code == 304:
        logging.info('Project %s has not been modified since last update',
                     repo_url)
        if existing_project:
            # make sure we keep the project
            # :::here (project/true)
            existing_project.keep = True
            db.session.add(existing_project)
            # commit the project
            db.session.commit()
            return None

    # Save last_updated time header for future requests
    project['last_updated'] = got.headers['Last-Modified']

    all_github_attributes = got.json()
    github_details = {
        field: all_github_attributes[field]
        for field in ('contributors_url', 'created_at', 'forks_count',
                      'homepage', 'html_url', 'id', 'language', 'open_issues',
                      'pushed_at', 'updated_at', 'watchers_count', 'name',
                      'description', 'stargazers_count')
    }
    github_details['owner'] = {
        field: all_github_attributes['owner'][field]
        for field in ('avatar_url', 'html_url', 'login', 'type')
    }
    project['github_details'] = github_details

    # Fill in blanks in the project from the repository attributes.
    if 'name' not in project or not project['name']:
        project['name'] = all_github_attributes['name']

    if 'description' not in project or not project['description']:
        project['description'] = all_github_attributes['description']

    if 'link_url' not in project or not project['link_url']:
        project['link_url'] = all_github_attributes['homepage']

    #
    # Populate project contributors from github_details[contributors_url]
    #
    contributors = []
    github_details['contributors'] = contributors
    got = get_github_api(all_github_attributes['contributors_url'])

    # Best effort: the response may carry no usable contributor list, in
    # which case whatever was collected so far is kept.
    try:
        owner_login = github_details['owner']['login']
        for contributor in got.json():
            # we don't want people without email addresses?
            # NOTE(review): this stops at the first such entry and skips
            # every contributor after it -- confirm `continue` isn't meant.
            if contributor['login'] == 'invalid-email-address':
                break

            entry = {
                field: contributor[field]
                for field in ('login', 'url', 'avatar_url', 'html_url',
                              'contributions')
            }
            # flag the owner with a boolean value
            entry['owner'] = contributor['login'] == owner_login
            contributors.append(entry)
    except Exception:
        logging.info('Could not read contributors for %s', repo_url)

    #
    # Populate project participation from github_details[url] + "/stats/participation"
    # Sometimes GitHub returns a blank dict instead of no participation.
    #
    got = get_github_api(all_github_attributes['url'] +
                         '/stats/participation')
    try:
        github_details['participation'] = got.json()['all']
    except Exception:
        github_details['participation'] = [0] * 50

    return project
Exemple #3
0
def main(org_name=None, org_sources=None):
    ''' Run update over all organizations. Optionally, update just one.

        org_name: when given, only the organization with this exact name is
        processed and orphaned organizations are not pruned.

        org_sources: passed to get_organizations(); falls back to
        ORG_SOURCES_FILENAME when empty.

        For each organization, every related row is first flagged
        keep=False, fresh data is saved, and rows still flagged keep=False
        afterwards are deleted. Any exception propagates out of main().
    '''
    # set org_sources
    org_sources = org_sources or ORG_SOURCES_FILENAME

    # Collect a set of fresh organization names.
    organization_names = set()

    # Retrieve all organizations and shuffle the list in place.
    orgs_info = get_organizations(org_sources)
    shuffle(orgs_info)

    if org_name:
        orgs_info = [org for org in orgs_info if org['name'] == org_name]

    # Iterate over organizations and projects, saving them to db.session.
    for org_info in orgs_info:
        current_name = org_info['name']

        if not is_safe_name(current_name):
            # Record the bad name and skip this organization.
            # unicode() is the Python 2 builtin this snippet was written for.
            new_error = Error(
                error=unicode('ValueError: Bad organization name: "%s"' %
                              current_name),
                time=datetime.now())
            db.session.add(new_error)
            # commit the error
            db.session.commit()
            continue

        organization_names.add(current_name)

        # Mark everything associated with this organization for deletion at first.
        # :::here (event/false, story/false, project/false, organization/false)
        for model in (Event, Story, Project):
            db.session.execute(
                db.update(model, values={
                    'keep': False
                }).where(model.organization_name == current_name))
        db.session.execute(
            db.update(Organization, values={
                'keep': False
            }).where(Organization.name == current_name))
        # commit the false keeps
        db.session.commit()

        # Empty lat longs are okay.
        for coordinate in ('latitude', 'longitude'):
            if coordinate in org_info and not org_info[coordinate]:
                org_info[coordinate] = None

        organization = save_organization_info(db.session, org_info)
        organization_names.add(organization.name)
        # flush the organization
        db.session.flush()

        if organization.rss or organization.website:
            logging.info("Gathering all of %s's stories.", organization.name)
            stories = get_stories(organization)
            if stories:
                for story_info in stories:
                    save_story_info(db.session, story_info)
                # flush the stories
                db.session.flush()

        if organization.projects_list_url:
            logging.info("Gathering all of %s's projects.", organization.name)
            for proj_dict in get_projects(organization):
                save_project_info(db.session, proj_dict)
            # flush the projects
            db.session.flush()

        if organization.events_url:
            # A missing key is only logged; gathering is still attempted.
            if not meetup_key:
                logging.error("No Meetup.com key set.")
            if 'meetup.com' not in organization.events_url:
                logging.error("Only Meetup.com events work right now.")
            else:
                logging.info("Gathering all of %s's events.",
                             organization.name)
                identifier = get_event_group_identifier(
                    organization.events_url)
                if identifier:
                    for event in get_meetup_events(organization, identifier):
                        save_event_info(db.session, event)
                    # flush the events
                    db.session.flush()
                else:
                    logging.error("%s does not have a valid events url",
                                  organization.name)

        # Get issues for all of the projects
        logging.info("Gathering all of %s's open GitHub issues.",
                     organization.name)
        issues = get_issues(organization.name)
        for issue in issues:
            save_issue(db.session, issue)

        # flush the issues before their labels are saved
        db.session.flush()
        for issue in issues:
            save_labels(db.session, issue)

        # commit everything
        db.session.commit()

        # Remove everything marked for deletion.
        # :::here (event/delete, story/delete, project/delete, issue/delete, organization/delete)
        # `== False` is intentional: it builds the SQL comparison.
        for model in (Event, Story, Issue, Project, Organization):
            db.session.query(model).filter(
                model.keep == False).delete()  # noqa: E712
        # commit objects deleted for keep=False
        db.session.commit()

    # prune orphaned organizations if no organization name was passed
    if not org_name:
        # Collect the names first so the session is not modified while the
        # query result is still being iterated.
        orphaned_names = [
            org.name for org in db.session.query(Organization)
            if org.name not in organization_names
        ]
        for orphaned_name in orphaned_names:
            # delete orphaned organizations, all other deletions will cascade
            db.session.execute(
                db.delete(Organization).where(
                    Organization.name == orphaned_name))
            # commit for deleting orphaned organizations
            db.session.commit()
Exemple #4
0
def main(org_name=None, org_sources=None):
    ''' Run update over all organizations. Optionally, update just one.

        org_name: when given, only the organization with this exact name is
        processed and organizations missing from this round are not deleted.

        org_sources: passed unchanged to get_organizations().

        For each organization, every related row is first flagged
        keep=False, fresh data is saved, rows still flagged keep=False are
        deleted, and the whole organization is committed in one transaction.
        Any exception propagates out of main() before that commit.
    '''
    # Keep a set of fresh organization names.
    organization_names = set()

    # Retrieve all organizations and shuffle the list in place.
    orgs_info = get_organizations(org_sources)
    shuffle(orgs_info)

    if org_name:
        orgs_info = [org for org in orgs_info if org['name'] == org_name]

    # Iterate over organizations and projects, saving them to db.session.
    for org_info in orgs_info:
        current_name = org_info['name']

        if not is_safe_name(current_name):
            # Record the bad name and skip this organization.
            new_error = Error(
                error='ValueError: Bad organization name: "%s"' % current_name,
                time=datetime.now())
            db.session.add(new_error)
            db.session.commit()
            continue

        organization_names.add(current_name)

        # Mark everything in this organization for deletion at first.
        for model in (Event, Story, Project):
            db.session.execute(
                db.update(model, values={
                    'keep': False
                }).where(model.organization_name == current_name))
        db.session.execute(
            db.update(Organization, values={
                'keep': False
            }).where(Organization.name == current_name))

        # Empty lat longs are okay.
        for coordinate in ('latitude', 'longitude'):
            if coordinate in org_info and not org_info[coordinate]:
                org_info[coordinate] = None

        organization = save_organization_info(db.session, org_info)
        organization_names.add(organization.name)

        if organization.rss or organization.website:
            logging.info("Gathering all of %s's stories.", organization.name)
            stories = get_stories(organization)
            if stories:
                for story_info in stories:
                    save_story_info(db.session, story_info)

        if organization.projects_list_url:
            logging.info("Gathering all of %s's projects.", organization.name)
            for proj_info in get_projects(organization):
                save_project_info(db.session, proj_info)

        if organization.events_url:
            # A missing key is only logged; gathering is still attempted.
            if not meetup_key:
                logging.error("No Meetup.com key set.")
            if 'meetup.com' not in organization.events_url:
                logging.error("Only Meetup.com events work right now.")
            else:
                logging.info("Gathering all of %s's events.",
                             organization.name)
                identifier = get_event_group_identifier(
                    organization.events_url)
                if identifier:
                    for event in get_meetup_events(organization, identifier):
                        save_event_info(db.session, event)
                else:
                    logging.error("%s does not have a valid events url",
                                  organization.name)

        # Get issues for all of the projects
        logging.info("Gathering all of %s's open GitHub issues.",
                     organization.name)
        issues, labels = get_issues(organization.name)
        for issue, issue_labels in zip(issues, labels):
            save_issue_info(db.session, issue, issue_labels)

        # Remove everything marked for deletion.
        # The comparison must be `== False`: Python's `not` on a column
        # attribute is evaluated immediately and yields a constant instead
        # of the SQL condition `keep = false`.
        # Issues go before Projects, then the Organization last.
        for model in (Event, Story, Issue, Project, Organization):
            db.session.query(model).filter(
                model.keep == False).delete()  # noqa: E712

        # Commit and move on to the next organization.
        db.session.commit()

    # Stop right here if an org name was specified.
    if org_name:
        return

    # Delete any organization not found on this round. Collect the names
    # first so the session is not modified while the query result is still
    # being iterated.
    stale_names = [
        org.name for org in db.session.query(Organization)
        if org.name not in organization_names
    ]
    for stale_name in stale_names:
        for model in (Event, Story, Project):
            db.session.execute(
                db.delete(model).where(
                    model.organization_name == stale_name))
        db.session.execute(
            db.delete(Organization).where(Organization.name == stale_name))
        db.session.commit()