Example #1
import argparse

import cortx_community

def main():
    parser = argparse.ArgumentParser(
        description='Set a value for CORTX Community.',
        add_help=False,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    optional.add_argument('-h',
                          '--help',
                          action='help',
                          help='show this help message and exit')
    required.add_argument('--key',
                          '-k',
                          type=str,
                          help="Which key to set / query",
                          required=True)
    optional.add_argument('--value',
                          '-v',
                          type=str,
                          help="Which value to set",
                          required=False)
    optional.add_argument('--date',
                          '-d',
                          type=str,
                          help='Which date to set',
                          required=False)
    optional.add_argument('--org', '-o', help='Which org', default='Seagate')
    optional.add_argument('--repo', '-r', help='Which repo', default='GLOBAL')
    optional.add_argument('--verbose',
                          '-V',
                          help='Do not compress array values into a number',
                          action='store_true',
                          default=False,
                          required=False)
    args = parser.parse_args()

    repo = args.repo
    org = args.org
    key = args.key
    val = args.value
    date = args.date

    ps = cortx_community.PersistentStats(org_name=org)
    dates = ps.get_dates(repo)

    if date is None:
        date = dates[-1]
        print("Defaulting to use last valid date %s" % date)

    if val is not None:
        ps.add_stat(date=date, repo=repo, stat=key, value=int(val))
        print("Changing %s on %s to be %s" % (repo, date, val))

    for d in dates:
        if args.verbose:
            print(d, key, ps.get_values(repo, key, [d]))
        else:
            print(d, key, ps.get_values_as_numbers(repo, key, [d]))
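
For reference, Example #1 leans entirely on the PersistentStats class from cortx_community. Below is a minimal sketch of just the subset of its interface used above, assuming a dict-of-dicts store keyed by repo and stat name; the method names come from the call sites above, everything else is hypothetical:

class PersistentStatsSketch:
    """Hypothetical stand-in for cortx_community.PersistentStats."""
    def __init__(self, org_name='Seagate'):
        self.stats = {}  # repo -> stat -> {date: value}

    def get_dates(self, repo):
        # every date for which any stat was recorded, sorted ascending
        dates = set()
        for per_stat in self.stats.get(repo, {}).values():
            dates |= set(per_stat.keys())
        return sorted(dates)

    def add_stat(self, date, repo, stat, value):
        self.stats.setdefault(repo, {}).setdefault(stat, {})[date] = value

    def get_values(self, repo, stat, dates):
        return [self.stats.get(repo, {}).get(stat, {}).get(d) for d in dates]

    def get_values_as_numbers(self, repo, stat, dates):
        # compress list/set values into their length so output stays numeric
        return [len(v) if isinstance(v, (list, set)) else v
                for v in self.get_values(repo, stat, dates)]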
Example #2
import cortx_community as cc

def merge_stats(slack_stats):
  ps=cc.PersistentStats()
  repos=ps.get_repos()
  for channel in slack_stats.keys():
    try:
      repo = channel_repo_map[channel] 
    except KeyError:
      repo = channel
    # print the diagnostic before the assert so a failed merge is still reported
    print("Can%s merge channel %s into repo %s" % ('' if repo in repos else 'not', channel, repo))
    assert repo in repos, "Do not have a repo into which to merge %s" % channel
    merge_stat(slack_stats[channel],ps,repo)
  ps.persist()  # probably unnecessary since ps.add_stat does an internal persist
  return ps 
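
Example #2 assumes a channel_repo_map dict and a merge_stat helper defined elsewhere in the module. A minimal sketch of what they might look like (the names come from the call sites above; the mapping entry and the slack_ prefix are made up):

# hypothetical: maps Slack channel names to repo names when the two differ
channel_repo_map = {'general': 'GLOBAL'}

def merge_stat(channel_stats, ps, repo):
  # fold each per-date Slack metric into the persistent stats for this repo
  for date, values in channel_stats.items():
    for stat, value in values.items():
      ps.add_stat(date=date, repo=repo, stat='slack_' + stat, value=value)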
Example #3
import copy
import os
from datetime import datetime

from github import Github

import cortx_community

def collect_stats(update):
  gh = Github(os.environ.get('GH_OATH'))
  avoid_rate_limiting(gh)
  stx = gh.get_organization('Seagate')
  today = datetime.today().strftime('%Y-%m-%d')

  # averages are weird so handle them differently
  ave_age_str='_ave_age_in_s'

  # the shared structure that we use for collecting stats
  global_stats = { 'branches'                      : 0, 
                   'clones_count_14_days'          : 0,
                   'clones_unique_14_days'         : 0,
                   'comments'                      : 0,
                   'commits'                       : 0, 
                   'companies_contributing'        : set(),
                   'companies'                     : set(), 
                   'contributors'                  : set(), 
                   'domains'                       : set(), 
                   'downloads_releases'            : 0,
                   'downloads_vms'                 : 0,
                   'email_addresses'               : set(), 
                   'external_comments'             : 0,
                   'external_email_addresses'      : set(),
                   'forks_external'                : set(),
                   'forks'                         : set(),
                   'logins'                        : set(), 
                   'new_external_activities'       : set(),
                   'new_logins'                    : set(),
                   'pull_requests_external_merged' : 0,
                   'pull_requests_internal_merged' : 0,
                   'pull_requests_merged'          : 0,
                   'seagate_blog_referrer_count'   : 0,
                   'seagate_blog_referrer_uniques' : 0,
                   'seagate_referrer_count'        : 0,
                   'seagate_referrer_uniques'      : 0,
                   'stars_external'                : set(),
                   'stars'                         : set(),
                   'top_paths'                     : [], 
                   'top_referrers'                 : [],
                   'views_count_14_days'           : 0,
                   'views_unique_14_days'          : 0,
                   'watchers_external'             : set(),
                   'watchers'                      : set(),
                    }
  load_actors(global_stats,('mannequin','innersource','external','hackathon','bot','cortx_team','unknown'))
  load_items(global_stats,('issues','pull_requests'),('_external','_internal',''),('','_open','_closed','_open_ave_age_in_s','_closed_ave_age_in_s'))
  global_stats['pull_requests_external_merged'] = 0
  local_stats_template = copy.deepcopy(global_stats)    # save an empty copy of the stats struct to copy for each repo
  author_activity = cortx_community.CortxActivity()     # load up the author activity pickle 
  people = cortx_community.CortxCommunity()             # load up the people pickle
  persistent_stats = cortx_community.PersistentStats()  # load up all the stats

  for repo in stx.get_repos():
    rname = repo.name # fetch this once and reuse it, in case the attribute access triggers a GitHub API call
    if 'cortx' not in rname or rname.endswith('.old') or rname.endswith('-old') or repo.private:
      continue

    local_stats = copy.deepcopy(local_stats_template) # get an empty copy of the stats structure

    # Use update if you just want to add some new data and don't want to wait for the very slow
    # scrape of all activity.  Once you have finished the update, migrate the code out of the update block.
    # Typically we don't use update; only during development.
    # Note that update doesn't work for values that are incremented.
    if update:
      (cached_local_stats,timestamp) = persistent_stats.get_latest(rname)  # load the cached version
      print("Fetched %s data for %s" % (timestamp, repo))
      for k,v in cached_local_stats.items():
        local_stats[k] = v
    else:
      get_top_level_repo_info(local_stats,repo,people=people,author_activity=author_activity,gh=gh,)
      get_issues_and_prs(rname,repo,local_stats,people=people,author_activity=author_activity,gh=gh)
      get_commits(rname,repo,local_stats,people=people,author_activity=author_activity,gh=gh)
      get_contributors(rname,repo,local_stats,people=people,gh=gh)

    # TODO: query when this last ran and pass 'since' to get_commits so we don't rescrape old commits

    # summarize info for this repo and persist the data structures
    summarize_consolidate(local_stats,global_stats,people=people,author_activity=author_activity,ave_age_str=ave_age_str)
    persist_author_activity(author_activity)
    persistent_stats.add_stats(date=today,repo=rname,stats=local_stats)
    persistent_stats.print_repo(rname,local_stats,date=today,verbose=False,csv=False)

  # do a bit of cleaning on global stats
  # print and persist the global consolidated stats

  # treat the 'ave_age_in_s' fields differently 
  # all those fields have consistent names: 'x_ave_age_in_s'
  # also, there will always be a corresponding field x which is the count
  for ave_age in [key for key in global_stats.keys() if ave_age_str in key]:
    item  = ave_age[0:len(ave_age)-len(ave_age_str)]
    try:
      global_stats[ave_age] /= global_stats[item]
    except ZeroDivisionError:
      global_stats[ave_age] = 0

  global_stats['top_referrers'] = consolidate_referrers(global_stats['top_referrers'])

  persistent_stats.print_repo('GLOBAL',global_stats,date=today,verbose=False,csv=False)
  persistent_stats.add_stats(date=today,repo='GLOBAL',stats=global_stats)
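
The average-age consolidation at the end of collect_stats is subtle: summarize_consolidate accumulates weighted totals into the 'x_ave_age_in_s' fields (Example #4 below shows the same pattern explicitly), and the final division by the matching count field recovers the true average. A worked sketch with made-up numbers:

# two repos contribute open issues: (count, average age in seconds)
global_stats = {'issues_open': 0, 'issues_open_ave_age_in_s': 0}
for count, ave in ((10, 100.0), (30, 500.0)):
  global_stats['issues_open'] += count
  global_stats['issues_open_ave_age_in_s'] += count * ave  # weighted total
global_stats['issues_open_ave_age_in_s'] /= global_stats['issues_open']
print(global_stats['issues_open_ave_age_in_s'])  # 400.0, not the naive (100+500)/2 = 300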
Example #4
import argparse

import cortx_community

def main():
    parser = argparse.ArgumentParser(
        description='Print the latest statistics for CORTX Community.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Show all data")
    parser.add_argument('-a',
                        '--all',
                        action='store_true',
                        help="Show all repos (i.e. not just summary)")
    parser.add_argument('-s',
                        '--suppress',
                        action='store_true',
                        help="Don't show interesting fields")
    parser.add_argument('-i',
                        '--individual',
                        type=str,
                        help="Only show data for a single repo")
    parser.add_argument('-c',
                        '--csv',
                        action='store_true',
                        help="Output in csv")
    args = parser.parse_args()

    ps = cortx_community.PersistentStats()
    repos = sorted([repo for repo in ps.get_repos() if repo != 'GLOBAL'])

    # averages are weird so handle them differently
    ave_age_str = '_ave_age_in_s'

    # first build the global stats
    gstats = {}
    gstats['repo_count'] = len(repos)
    timestamp = None
    for repo in repos:
        (rstats, timestamp) = ps.get_latest(repo)
        for k, v in rstats.items():
            if isinstance(v, int) or isinstance(v, float):
                if k not in gstats:
                    gstats[k] = 0
                if ave_age_str in k:
                    # treat averages differently: accumulate the weighted total here,
                    # then adjust by the count below to recover the actual average
                    item = k[0:len(k) - len(ave_age_str)]
                    count = rstats[item]
                    gstats[k] += (v * count)
                else:
                    gstats[k] += v
            elif isinstance(v, set):
                if k not in gstats:
                    gstats[k] = set()
                gstats[k] |= v
            elif isinstance(v, list):
                if k not in gstats:
                    gstats[k] = []
                gstats[k] += v
            else:
                raise TypeError("%s has unknown type %s" % (k, type(v)))

    # top referrers is a bit weird so clean that one up specifically here
    gstats['top_referrers'] = consolidate_referrers(gstats['top_referrers'])

    # fix up the average age fields, which currently hold weighted totals
    # all those fields have consistent names: 'x_ave_age_in_s'
    # also, there will always be a corresponding field x which is the count
    for ave_age in [key for key in gstats.keys() if ave_age_str in key]:
        item = ave_age[0:len(ave_age) - len(ave_age_str)]
        try:
            gstats[ave_age] /= gstats[item]
        except ZeroDivisionError:
            gstats[ave_age] = 0

    # filter out the internal company and bot/tooling vendors so they don't inflate community counts
    for bs_company in ('Seagate', 'Codacy', 'Dependabot'):
        for k in ['companies', 'companies_contributing']:
            try:
                gstats[k].remove(bs_company)
            except KeyError:
                pass

    if args.individual:
        (repo, timestamp) = ps.get_latest(args.individual)
        ps.print_repo(args.individual,
                      repo,
                      timestamp,
                      verbose=args.verbose,
                      csv=args.csv)
        if not args.suppress:
            print_interesting_arrays(args.individual, repo)
    else:
        ps.print_repo('GLOBAL',
                      gstats,
                      timestamp,
                      verbose=args.verbose,
                      csv=args.csv)
        if not args.suppress:
            print_interesting_arrays('GLOBAL', gstats)

    if args.all:
        for repo in repos:
            (rstats, timestamp) = ps.get_latest(repo)
            ps.print_repo(repo,
                          rstats,
                          timestamp,
                          verbose=args.verbose,
                          csv=args.csv)
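
The per-type merge in the loop above is the whole aggregation strategy: ints and floats sum, sets union, and lists concatenate, so per-repo structures fold into one global structure without a fixed schema. A compact illustration with made-up values:

a = {'commits': 3, 'logins': {'alice'}, 'top_paths': [('/a', 5)]}
b = {'commits': 4, 'logins': {'bob'}, 'top_paths': [('/b', 2)]}
merged = {}
for rstats in (a, b):
    for k, v in rstats.items():
        if isinstance(v, (int, float)):
            merged[k] = merged.get(k, 0) + v
        elif isinstance(v, set):
            merged[k] = merged.get(k, set()) | v
        elif isinstance(v, list):
            merged[k] = merged.get(k, []) + v
# merged == {'commits': 7, 'logins': {'alice', 'bob'}, 'top_paths': [('/a', 5), ('/b', 2)]}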
Example #5
import copy
from datetime import datetime

import cortx_community

def collect_stats(gh,org_name,update,prefix,top_only):
  avoid_rate_limiting(gh)
  today = datetime.today().strftime('%Y-%m-%d')

  # populate our persistent data structures from the pickles
  people = cortx_community.CortxCommunity(org_name)             
  author_activity = cortx_community.CortxActivity(org_name)     
  persistent_stats = cortx_community.PersistentStats(org_name)  

  # averages are weird so handle them differently
  ave_age_str='_ave_age_in_s'

  # the shared structure that we use for collecting stats
  global_stats = { 'branches'                      : 0, 
                   'clones_count_14_days'          : 0,
                   'clones_unique_14_days'         : 0,
                   'comments'                      : 0,
                   'commits'                       : 0, 
                   'companies_contributing'        : set(),
                   'companies'                     : set(), 
                   'contributors'                  : set(), 
                   'domains'                       : set(), 
                   'downloads_releases'            : 0,
                   'downloads_vms'                 : 0,
                   'email_addresses'               : set(), 
                   'external_comments'             : 0,
                   'external_email_addresses'      : set(),
                   'forks_external'                : set(),
                   'forks'                         : set(),
                   'logins'                        : set(), 
                   'new_external_activities'       : set(),
                   'new_logins'                    : set(),
                   'pull_requests_external_merged' : 0,
                   'pull_requests_internal_merged' : 0,
                   'pull_requests_merged'          : 0,
                   'seagate_blog_referrer_count'   : 0,
                   'seagate_blog_referrer_uniques' : 0,
                   'seagate_referrer_count'        : 0,
                   'seagate_referrer_uniques'      : 0,
                   'stars_external'                : set(),
                   'stars'                         : set(),
                   'top_paths'                     : [], 
                   'top_referrers'                 : [],
                   'views_count_14_days'           : 0,
                   'views_unique_14_days'          : 0,
                   'watchers_external'             : set(),
                   'watchers'                      : set(),
                    }
  load_actors(global_stats,people)
  load_items(global_stats,('issues','pull_requests'),('_external','_internal',''),('','_open','_closed','_open_ave_age_in_s','_closed_ave_age_in_s'))
  local_stats_template = copy.deepcopy(global_stats)    # save an empty copy of the stats struct to copy for each repo

  for repo in cortx_community.get_repos(org_name=org_name,prefix=prefix): 
    while True: # retry loop: transient GitHub failures are common, so keep retrying this repo rather than losing the whole run
      try:
        local_stats = copy.deepcopy(local_stats_template) # get an empty copy of the stats structure
        rname=repo.name # just in case this requires a github API call, fetch it once and reuse it

        # Use update if you just want to add some new data and don't want to wait for the very slow
        # scrape of all activity.  Once you have finished the update, migrate the code out of the update block.
        # Typically we don't use update; only during development.
        # Note that update doesn't work for values that are incremented.
        if update:
          (cached_local_stats,timestamp) = persistent_stats.get_latest(rname)  # load the cached version
          print("Fetched %s data for %s" % (timestamp, repo))
          for k,v in cached_local_stats.items():
            local_stats[k] = v
        else:
          get_top_level_repo_info(local_stats,repo,people=people,author_activity=author_activity,gh=gh,org_name=org_name)
          get_contributors(rname,repo,local_stats,people=people,gh=gh,org_name=org_name)
          if not top_only:
            get_issues_and_prs(rname,repo,local_stats,people=people,author_activity=author_activity,gh=gh,org_name=org_name)
            get_commits(rname,repo,local_stats,people=people,author_activity=author_activity,gh=gh,org_name=org_name)

        # TODO: query when this last ran and pass 'since' to get_commits so we don't rescrape old commits

        # summarize info for this repo and persist the data structures
        summarize_consolidate(local_stats,global_stats,people=people,author_activity=author_activity,ave_age_str=ave_age_str)
        persist_author_activity(author_activity)
        persistent_stats.add_stats(date=today,repo=rname,stats=local_stats)
        persistent_stats.print_repo(rname,local_stats,date=today,verbose=False,csv=False)
        break
      except Exception as e:
        print("WTF: Failed while getting stats for repo %s" % repo.name, e)
        avoid_rate_limiting(gh,Verbose=True)

  # do a bit of cleaning on global stats
  # print and persist the global consolidated stats

  # treat the 'ave_age_in_s' fields differently 
  # all those fields have consistent names: 'x_ave_age_in_s'
  # also, there will always be a corresponding field x which is the count
  for ave_age in [key for key in global_stats.keys() if ave_age_str in key]:
    item  = ave_age[0:len(ave_age)-len(ave_age_str)]
    try:
      global_stats[ave_age] /= global_stats[item]
    except ZeroDivisionError:
      global_stats[ave_age] = 0

  global_stats['top_referrers'] = consolidate_referrers(global_stats['top_referrers'])

  persistent_stats.print_repo('GLOBAL',global_stats,date=today,verbose=False,csv=False)
  persistent_stats.add_stats(date=today,repo='GLOBAL',stats=global_stats)
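
Examples #3 and #5 both call an avoid_rate_limiting helper that isn't shown. A plausible sketch using PyGithub's rate-limit API (the Verbose keyword matches the call site above; the threshold and padding are assumptions):

import time
from datetime import datetime

def avoid_rate_limiting(gh, Verbose=False):
  core = gh.get_rate_limit().core  # remaining requests in the core API bucket
  if Verbose:
    print("Rate limit: %d remaining, resets at %s" % (core.remaining, core.reset))
  if core.remaining < 100:  # hypothetical threshold
    # core.reset is a naive UTC datetime in older PyGithub releases
    wait = (core.reset - datetime.utcnow()).total_seconds() + 10
    time.sleep(max(wait, 0))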
Example #6
import argparse

import cortx_community

def main():
    parser = argparse.ArgumentParser(
        description='Print the latest statistics for CORTX Community.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Show all data")
    parser.add_argument('-a',
                        '--all',
                        action='store_true',
                        help="Show all repos (i.e. not just summary)")
    parser.add_argument('-s',
                        '--suppress',
                        action='store_true',
                        help="Don't show interesting fields")
    parser.add_argument('-i',
                        '--individual',
                        type=str,
                        help="Only show data for a single repo")
    parser.add_argument('-c',
                        '--csv',
                        action='store_true',
                        help="Output in csv")
    parser.add_argument('-o',
                        '--org',
                        action='store',
                        help='Print the latest statistics for a different org',
                        default='Seagate')
    #parser.add_argument('-k', '--key', type=str, help="Show all values for all dates for a single key") # TODO: would be nice to add this one
    args = parser.parse_args()

    ps = cortx_community.PersistentStats(org_name=args.org)
    repos = sorted([repo for repo in ps.get_repos() if repo != 'GLOBAL'])

    # just use global from the scrape
    (gstats, timestamp) = ps.get_latest('GLOBAL')

    if args.individual:
        (repo, timestamp) = ps.get_latest(args.individual)
        ps.print_repo(args.individual,
                      repo,
                      timestamp,
                      verbose=args.verbose,
                      csv=args.csv)
        if not args.suppress:
            print_interesting_arrays(args.individual, repo)
    else:
        ps.print_repo('GLOBAL',
                      gstats,
                      timestamp,
                      verbose=args.verbose,
                      csv=args.csv)
        if not args.suppress:
            print_interesting_arrays('GLOBAL', gstats)

    if args.all:
        for repo in repos:
            (rstats, timestamp) = ps.get_latest(repo)
            ps.print_repo(repo,
                          rstats,
                          timestamp,
                          verbose=args.verbose,
                          csv=args.csv)
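
The commented-out --key option above is a TODO; a sketch of what it might look like, reusing the get_dates and get_values_as_numbers calls from Example #1 (hypothetical, not part of the script):

# hypothetical: would run inside main(), after args = parser.parse_args()
if args.key:
    repo = args.individual if args.individual else 'GLOBAL'
    for d in ps.get_dates(repo):
        print(d, args.key, ps.get_values_as_numbers(repo, args.key, [d]))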