def main():
    # Begin by parsing options and ensuring existence of input and output paths.
    quickSetup()
    parser = get_parser()
    options = parser.parse_args()

    output_path = options.output_path[0]
    pathlog = log.fields(output_path=output_path)
    if os.path.isdir(output_path):
        pathlog.info("directory OK")
    elif os.path.exists(output_path):
        pathlog.error("is NOT a directory")
        sys.exit(1)
    else:
        pathlog.error("does NOT exist")
        sys.exit(1)

    all_repos_valid = True
    repos = {}  # maps repo path -> git.Repo (was never initialized in the original)
    for repo_path in options.repos:
        repolog = log.fields(repo_path=repo_path)
        try:
            repo = git.Repo(repo_path)
        except git.InvalidGitRepositoryError:
            repolog.error("is NOT a repository")
            all_repos_valid = False
        except git.NoSuchPathError:
            repolog.error("does NOT exist")
            all_repos_valid = False
        else:
            repolog.info("repository OK")
            repos[repo_path] = repo
    if not all_repos_valid:
        sys.exit(1)

    process_all_repos(repos)

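# get_parser() is not shown in these snippets. A minimal argparse sketch,
# consistent with how main() reads the options (output_path arrives as a
# one-element list, repos as a list of paths); the real parser may differ.
import argparse

def get_parser():
    parser = argparse.ArgumentParser(
        description='Generate stats from transcription repositories.')
    parser.add_argument('output_path', nargs=1,
                        help='directory that receives the generated pages')
    parser.add_argument('repos', nargs='+',
                        help='paths of git repositories to process')
    return parser
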
def write_page(path, filename, **kwargs):
    # The index page itself gets no back-link; every other page links home.
    if 'index' not in kwargs:
        kwargs['index_link'] = (
            '<p><a href="index.html">Back to stats home page</a></p>')
    else:
        kwargs['index_link'] = ''
    if 'head_title' not in kwargs:
        kwargs['head_title'] = kwargs['title']
    kwargs['timestamp'] = unicode(datetime.now())

    filename = os.path.join(path, filename)
    log.fields(filename=filename).info('writing')
    with open(filename, 'wb') as f:
        f.write(PAGE_TEMPLATE.format(**kwargs))

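# PAGE_TEMPLATE is assumed above but never shown. A bare-bones stand-in using
# only the keys write_page() guarantees (index_link, head_title, title,
# timestamp); the real template presumably carries the page body and styling.
PAGE_TEMPLATE = u"""\
<html>
<head><title>{head_title}</title></head>
<body>
{index_link}
<h1>{title}</h1>
<p>Generated at {timestamp}</p>
</body>
</html>
"""
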
def pickle_all(filename, log=log):
    log.fields(filename=filename).info('pickling')
    structure = dict(
        authors_map=authors_map,
        review_locks=review_locks,
        snippet_locks=snippet_locks,
        latest_commits=latest_commits,
        repo_infos_by_path=repo_infos_by_path,
        repo_infos_by_name=repo_infos_by_name,
    )
    with open(filename, 'wb') as f:
        pickle.dump(structure, f, -1)

def store_grid_entry(session, grid_spec):
    """
    Add a grid spec to the database and return the grid's unique ID.

    Parameters
    ----------
    session : sqlalchemy.orm.session.Session
    grid_spec : dict

    Returns
    -------
    hash_id : str

    """
    llog = log.fields(secret=grid_spec['secret'])
    llog.debug('storing grid')

    table = models.SecretGrid if grid_spec['secret'] else models.PublicGrid
    new_grid = table(**grid_spec)
    session.add(new_grid)
    session.flush()

    hash_id = encode_grid_id(new_grid.id, grid_spec['secret'])
    llog.fields(grid_id=new_grid.id, hash_id=hash_id).debug('grid stored')

    return hash_id

def get_grid_entry(session, hash_id, secret=False):
    """
    Get a specific grid entry.

    Parameters
    ----------
    session : sqlalchemy.orm.session.Session
    hash_id : str
    secret : bool, optional
        Whether this is a secret grid.

    Returns
    -------
    grid_spec : dict
        Will be None if no matching grid was found.

    """
    grid_id = decode_hash_id(hash_id, secret)
    llog = log.fields(grid_id=grid_id, hash_id=hash_id, secret=secret)

    if not grid_id:
        # couldn't do the conversion from hash to database ID
        llog.debug('cannot decrypt hash')
        return

    llog.debug('pulling grid from database')
    table = models.SecretGrid if secret else models.PublicGrid
    grid_spec = session.query(table).filter(table.id == grid_id).one_or_none()
    return grid_spec

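# encode_grid_id() and decode_hash_id() are used here but defined elsewhere.
# A plausible sketch built on the hashids library, assuming separate salts
# for public and secret grids; the salts and the library choice are guesses.
from hashids import Hashids

_PUBLIC = Hashids(salt='public grids', min_length=6)
_SECRET = Hashids(salt='secret grids', min_length=6)

def encode_grid_id(grid_id, secret):
    hasher = _SECRET if secret else _PUBLIC
    return hasher.encode(grid_id)

def decode_hash_id(hash_id, secret):
    # hashids decode() returns a tuple of ints, empty when the hash doesn't
    # parse, which satisfies the `if not grid_id` check above.
    decoded = (_SECRET if secret else _PUBLIC).decode(hash_id)
    return decoded[0] if decoded else None
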
def process_all_repos():
    """Loop through events in all repositories."""
    for repo_path, repo_info in repo_infos_by_path.iteritems():
        repo = repo_info.repo
        repo_name = repo_info.name
        repolog = log.fields(repo_name=repo_name)
        repolog.info('processing')

        last_locks = {}
        prev_latest_commit = latest_commits.get(repo_name, None)
        if prev_latest_commit is not None:
            repolog.fields(prev_latest_commit=prev_latest_commit).info()
        else:
            repolog.info('new repo')

        latest_commit = repo.commit('master').hexsha

        # Find applicable commits.
        commits = []
        for commit in repo.iter_commits(latest_commit):
            if commit.hexsha == prev_latest_commit:
                # Reached the commit we stopped at last time.
                break
            else:
                commits.append(commit)

        # Process them starting with the eldest first.
        for commit in reversed(commits):
            email = normalize_email(commit.author.email)
            update_authors_map(email, commit)
            last_locks = update_locks(email, repo_name, commit, last_locks)
            update_snippets(email, repo_info, commit)

        # Store the latest commit for next time.
        latest_commits[repo_name] = latest_commit
        repolog.fields(latest_commit=latest_commit).info()

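# normalize_email() is referenced above without a definition. A small sketch
# consistent with the email_maps populated from the --email-map option in
# main(); the normalization rules here are an assumption.
def normalize_email(email):
    email = email.strip().lower()
    # Apply any user-supplied remapping (e.g. merging an author's aliases).
    return email_maps.get(email, email)
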
def unpickle_all(filename, log=log):
    log = log.fields(filename=filename)
    if os.path.isfile(filename):
        log.info('unpickling')
        with open(filename, 'rb') as f:
            # Restore the pickled module-level state saved by pickle_all().
            globals().update(pickle.load(f))
    else:
        log.info('notfound')

def process_all_repos(repos):
    """Loop through events in all repositories."""
    for repo_path, repo in repos.iteritems():
        repo_name = os.path.split(repo_path)[-1]
        repolog = log.fields(repo_name=repo_name)
        repolog.info("processing")

        last_locks = {}
        for commit in reversed(list(repo.iter_commits("master"))):
            update_authors_map(commit)
            last_locks = update_locks(repo_name, commit, last_locks)

def get_grid_entry(hash_id, secret=False):
    """
    Get a specific grid entry.

    Parameters
    ----------
    hash_id : str
    secret : bool, optional
        Whether this is a secret grid.

    Returns
    -------
    grid_spec : dict
        Will be None if no matching grid was found.

    """
    grid_id = decode_hash_id(hash_id, secret)
    llog = log.fields(grid_id=grid_id, hash_id=hash_id, secret=secret)

    if not grid_id:
        # couldn't do the conversion from hash to database ID
        llog.debug('cannot decrypt hash')
        return

    llog.debug('looking for grid')
    mc = get_memcached()
    mc_key = str((grid_id, secret))
    if mc_key in mc:
        llog.debug('pulling grid from memcached')
        return mc[mc_key]

    llog.debug('pulling grid from database')
    table = get_table(secret)
    grid_spec = table.find_one(id=grid_id)

    if grid_spec:
        llog.debug('grid found')
        grid_spec = desqlize_grid_entry(grid_spec)
        mc[mc_key] = grid_spec
    else:
        llog.debug('grid not found')
        return

    return grid_spec

def store_grid_entry(grid_spec):
    """
    Add a grid spec to the database and return the grid's unique ID.

    Parameters
    ----------
    grid_spec : dict

    Returns
    -------
    hash_id : str

    """
    grid_entry = sqlize_grid_spec(grid_spec)

    llog = log.fields(secret=grid_entry['secret'])
    llog.debug('storing grid')

    table = get_table(grid_entry['secret'])
    grid_id = table.insert(grid_entry)
    llog.fields(grid_id=grid_id).debug('grid stored')

    return encode_grid_id(grid_id, grid_entry['secret'])

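# get_table() is assumed in the two functions above. With the dataset
# library, whose insert() returns the new row's primary key and whose
# find_one() matches the usage here, it might look like this; the database
# URL and table names are guesses. A real app would reuse one connection
# rather than reconnect on every call.
import dataset

def get_table(secret):
    db = dataset.connect(os.environ.get('DATABASE_URL', 'sqlite:///grids.db'))
    return db['secret_grids' if secret else 'public_grids']
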
def process_all_authors():
    """Loop through authors to calculate author-specific stats."""
    for author_info in authors_map.itervalues():
        author_log = log.fields(author_name=sorted(author_info.names)[0])
        author_log.info('processing')

        # Reset accumulators.
        author_info.total_actions = 0
        author_info.total_transcriptions = 0
        author_info.time_spent = 0
        author_info.time_spent_transcribing = 0
        author_info.total_bytes_transcribed = 0

        # Process snippets.
        for repo_name, snippet_map in author_info.snippets.iteritems():
            for starting_point, snippet_actions in snippet_map.iteritems():
                author_info.total_actions += len(snippet_actions)

                # Determine if the snippet action was a transcription.
                repo_info = repo_infos_by_name[repo_name]
                first_repo_snippet = repo_info.snippets.get(starting_point)
                if first_repo_snippet:
                    first_repo_snippet_action = first_repo_snippet[0]
                    if first_repo_snippet_action in snippet_actions:
                        author_info.total_transcriptions += 1

        # Process locks.
        for repo_name, locks_map in author_info.locks_created.iteritems():
            for lock in locks_map.itervalues():
                if lock.created_by == lock.destroyed_by:
                    duration = lock.destroyed_at - lock.created_at
                    author_info.time_spent += duration

                    # Determine if the lock was associated with a
                    # transcription, vs. an edit.
                    repo_info = repo_infos_by_name[repo_name]
                    if lock.starting_point in repo_info.snippets:
                        first_repo_snippet_action = \
                            repo_info.snippets[lock.starting_point][0]
                        if first_repo_snippet_action.saved == lock.destroyed_at:
                            author_info.time_spent_transcribing += duration
                            author_info.total_bytes_transcribed += \
                                first_repo_snippet_action.bytes

        if author_info.total_transcriptions and author_info.time_spent_transcribing:
            author_info.average_time_per_transcription = (
                author_info.time_spent_transcribing /
                author_info.total_transcriptions)
            # Words per minute, using the usual five-characters-per-word rule.
            author_info.average_wpm = (
                (author_info.total_bytes_transcribed / 5.0) /
                (author_info.time_spent_transcribing / 60.0))

def get_memcached():
    host = os.environ.get('MC_PORT', '127.0.0.1').replace('tcp://', '')
    log.fields(mc_host=host).debug('connecting to memcached')
    return pylibmc.Client([host], binary=True)

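# Example round trip with the client above. pylibmc clients support the
# mapping-style operations get_grid_entry() relies on; the key and value
# here are made up for illustration.
mc = get_memcached()
mc[str((42, False))] = {'title': 'example grid'}
assert str((42, False)) in mc
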
def main():
    # Parse options and ensure the existence of input and output paths.
    quickSetup()
    parser = get_parser()
    options = parser.parse_args()

    if options.pickle is not None:
        options.pickle = options.pickle[0]  # list -> string
        unpickle_all(options.pickle)
    else:
        # Do nothing; keep the initial state.
        pass

    if options.email_map:
        for email_mapping in options.email_map:
            email_from, email_to = email_mapping.split(':')
            email_maps[email_from] = email_to
            log.fields(email_from=email_from,
                       email_to=email_to).info('email mapping')

    if options.email_ignore:
        email_ignores.update(options.email_ignore)

    output_path = options.output_path[0]
    pathlog = log.fields(output_path=output_path)
    if os.path.isdir(output_path):
        pathlog.info('directory OK')
    elif os.path.exists(output_path):
        pathlog.error('is NOT a directory')
        sys.exit(1)
    else:
        pathlog.error('does NOT exist')
        sys.exit(1)

    all_repos_valid = True
    for repo_path in options.repos:
        repolog = log.fields(repo_path=repo_path)
        try:
            repo = git.Repo(repo_path)
        except git.InvalidGitRepositoryError:
            repolog.error('is NOT a repository')
            all_repos_valid = False
        except git.NoSuchPathError:
            repolog.error('does NOT exist')
            all_repos_valid = False
        else:
            repolog.info('repository OK')
            tree = repo.tree('master')
            transcription_json = load(tree['transcription.json'].data_stream)
            repo_name = os.path.split(repo_path)[-1]
            git_repos_by_name[repo_name] = repo
            if repo_path not in repo_infos_by_path:
                # Create a new RepoInfo structure.
                repo_infos_by_path[repo_path] = \
                    repo_infos_by_name[repo_name] = RepoInfo(
                        name=repo_name,
                        transcription=transcription_json,
                        authors=set(),
                        snippets={},
                    )
            else:
                # Keep the existing RepoInfo structure.
                pass
    if not all_repos_valid:
        sys.exit(1)

    process_all_repos()
    process_all_authors()
    create_all_output(output_path)

    if options.pickle is not None:
        pickle_all(options.pickle)

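# RepoInfo is not defined in these snippets. Judging by the attribute access
# in process_all_repos() and main(), a simple class like this would do; the
# exact definition is a guess, and the repo attribute appears to be attached
# after construction, presumably because pickled state should not carry live
# git.Repo objects.
class RepoInfo(object):
    def __init__(self, name, transcription, authors, snippets, repo=None):
        self.name = name                    # repository directory name
        self.transcription = transcription  # parsed transcription.json
        self.authors = authors              # set of author emails
        self.snippets = snippets            # starting point -> snippet actions
        self.repo = repo                    # git.Repo, set after construction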