def summarize(lines, departed_devs):
    """
    Aggregate the FileData in lines, treating every dev in
    departed_devs as having been hit by a bus.
    """
    aggs = {}
    # Aggregates, in order: by valtype and top-level object (used by
    # the index page); by project for the projects pages; by dev group
    # of 1 or more for the devs pages; by fname for the files pages.
    key_specs = (
        (a_valtype, a_dev),
        (a_valtype, a_project),
        (a_valtype, a_fname),
        (a_project, a_valtype, a_fname),
        (a_project, a_valtype, a_dev),
        (a_dev, a_valtype, a_fname),
        (a_dev, a_valtype, a_project),
        (a_fname, a_valtype, a_dev),
    )
    for spec in key_specs:
        create_agg(aggs, spec)

    for line in lines:
        fd = FileData(line)

        # Risk attributable to departed devs is dropped: that risk has
        # already turned out to be real and the knowledge is gone.
        present_risk, _ = split_out_dev_vals(fd.dev_risk, departed_devs)
        for group, val in present_risk:
            agg_all(aggs, Dat('risk', fd, group, val))

        present_uniq, orphaned_uniq = split_out_dev_vals(fd.dev_uniq,
                                                         departed_devs)
        for group, val in present_uniq:
            agg_all(aggs, Dat('unique knowledge', fd, group, val))
            # hack: to surface the devs with the most shared knowledge
            # on the devs pages, explode the group and aggregate it
            # pairwise under a valtype only those pages consume.
            for first in group:
                for second in group:
                    # only count each pair once
                    if first < second:
                        agg_all(aggs,
                                Dat('shared knowledge (devs still present)',
                                    fd, [first, second], val))

        # Knowledge unique to a group made up entirely of departed devs
        # is orphaned.
        for group, val in orphaned_uniq:
            agg_all(aggs, Dat('orphaned knowledge', fd, group, val))
    return aggs
def sequential(lines, model_args):
    """
    Entry point for the sequential algorithm.  See the description in
    the file docs.

    Yields FileData objects as tsv lines, with dev_uniq and
    tot_knowledge fields filled in.
    """
    # model_args[0] is the knowledge churn constant for the estimator.
    churn_constant = float(model_args[0])
    for raw in lines:
        fd = FileData(raw)
        fd.dev_uniq, fd.tot_knowledge = sequential_estimate_uniq(
            fd, churn_constant)
        yield fd.as_line()
def gen_stats(root, project, interesting, not_interesting, options):
    """
    root: the root svn url of the project we are generating stats for
          (does not need to be the root of the svn repo).  Must be a
          url, not a checkout path.
    project: the project identifier.
    interesting: regular expressions that indicate an interesting path
                 if they match
    not_interesting: regular expressions that trump interesting and
                     indicate a path is not interesting.
    options: currently unused, options from gen_file_stats.py's main.

    Yields FileData objects encoded as tsv lines.  Only the fname,
    dev_experience and cnt_lines fields are filled in.
    """
    client = pysvn.Client()
    # svn ls returns paths relative to the repo root, not our project
    # root, so we need the repo root as well.
    repo_root = client.root_url_from_path(root)

    # First collect every interesting file path, then mine each one.
    candidates = []
    for entry in client.list(root, recurse=True):
        node = entry[0]
        if is_interesting(node.repos_path, interesting, not_interesting) and \
           node.kind == pysvn.node_kind.file:
            candidates.append(node.repos_path)

    for path in candidates:
        experience = parse_dev_experience(path, client, repo_root)
        if not experience:
            continue
        fd = FileData(':'.join([project, path]))
        # skip revisions with 0 lines added and 0 removed (e.g.
        # property-only changes)
        fd.dev_experience = [(dev, added, removed)
                             for dev, added, removed in experience
                             if added or removed]
        fd.cnt_lines = count_lines(path, client, repo_root)
        encoded = fd.as_line()
        if encoded.strip():
            yield encoded
def estimate_file_risks(lines, bus_risks, def_bus_risk):
    """
    Estimate the risk in the file as:

    sum(knowledge unique to a group of 1 or more devs * the probability
    that all devs in the group will be hit by a bus)

    We use a simple joint probability and assume that all bus killings
    are independently likely.

    lines: iterable of FileData tsv lines with dev_uniq filled in.
    bus_risks: per-dev bus probabilities, looked up via get_bus_risk.
    def_bus_risk: fallback probability for devs missing from bus_risks.

    Yields the same FileData objects as tsv lines with dev_risk filled in.
    """
    for line in lines:
        fd = FileData(line)
        dev_risk = []
        for devs, shared in fd.dev_uniq:
            # Coerce once up front instead of re-coercing on every
            # iteration; this also guarantees the recorded risk is
            # numeric even for an empty dev group (previously the
            # unconverted tsv value leaked through in that case).
            risk = float(shared)
            # joint probability that every dev in the group is hit
            for dev in devs:
                risk *= get_bus_risk(dev, bus_risks, def_bus_risk)
            dev_risk.append((devs, risk))
        fd.dev_risk = dev_risk
        yield fd.as_line()
def gen_stats(root, project, interesting, not_interesting, options):
    """
    root: the path a local, git controlled-directory that is the root
          of this project
    project: the name of the project
    interesting: regular expressions that indicate an interesting path
                 if they match
    not_interesting: regular expressions that trump interesting and
                     indicate a path is not interesting.
    options: from gen_file_stats.py's main, currently only uses git_exe.

    Yields FileData objects encoded as tsv lines.  Only the fname,
    dev_experience and cnt_lines fields are filled in.
    """
    git_exe = options.git_exe
    # git only works once you're in a git controlled path, so get into
    # one of those first...
    prepare(root, git_exe)
    for path in git_ls(root, git_exe):
        if not is_interesting(path, interesting, not_interesting):
            continue
        experience = parse_dev_experience(path, git_exe)
        if not experience:
            continue
        fd = FileData(':'.join([project, path]))
        fd.dev_experience = experience
        fd.cnt_lines = count_lines(path)
        encoded = fd.as_line()
        if encoded.strip():
            yield encoded