def ls(project_root):
    """
    Return a list of real, absolute paths to all the git-controlled
    files under the project root.
    """
    git_root = git.find_git_root(project_root)
    # Flags:
    #   --full-tree = allow absolute path for final argument (pathname)
    #   --name-only = don't show the git id for the object, just the
    #                 file name
    #   -r          = recurse into subdirs
    #   -z          = null byte separate listings
    #
    # project_root is appended as its own argv element (not joined into
    # the command string) so a root containing spaces stays one argument.
    cmd = 'ls-tree --full-tree --name-only -r -z HEAD'.split()
    cmd.append(project_root)
    with git.git_cmd(cmd=cmd, cwd=git_root) as out_f:
        listing_z = out_f.read()
    paths = []
    for entry in listing_z.split('\0'):
        # Skip '', which is just the root of the repo.
        if entry:
            paths.append(util.real_abs_path(fname=entry, parent=git_root))
    return paths
def ls(project_root):
    """
    Return a list of real, absolute paths to all the git-controlled
    files under the project root.
    """
    git_root = git.find_git_root(project_root)
    # --full-tree: allow absolute path for final argument (pathname)
    # --name-only: don't show the git id for the object, just the file name
    # -r:          recurse into subdirs
    # -z:          null byte separate listings
    argv = 'ls-tree --full-tree --name-only -r -z HEAD'.split()
    # Append the project root only after the split, in case it contains
    # spaces.
    argv.append(project_root)
    with git.git_cmd(cmd=argv, cwd=git_root) as stream:
        raw = stream.read()
    # Filter out '', which is just the root of the repo.
    names = (name for name in raw.split('\0') if name)
    return [util.real_abs_path(fname=name, parent=git_root) for name in names]
def find_git_root(git_repo_or_subdir):
    """
    Return a real, absolute path to the git root, assuming that
    `git_repo_or_subdir` is a real, absolute path to either a git repo
    or a subdir under it.
    """
    # `git rev-parse --show-toplevel` prints the repo's top-level dir.
    toplevel_cmd = ['rev-parse', '--show-toplevel']
    with git_cmd(toplevel_cmd, cwd=git_repo_or_subdir) as out_f:
        raw_root = out_f.read()
    # Strip the trailing newline git emits before canonicalizing.
    return util.real_abs_path(raw_root.strip())
def excavate(project_dir, log_cache_dir, interesting_fnames_res,
             boring_fnames_res, fact_finders, summarizer, num_procs=1,
             use_cached_logs=True):
    """
    Extract the git logs for all the interesting files in
    `project_dir`, running each file's logs through all the supplied
    `fact_finders` and passing the generated facts into the supplied
    `summarizer`.

    If `summarizer` is None, the facts will be printed to standard out.

    - `project_dir`: the root directory of the project to excavate

    - `log_cache_dir`: the directory where the null terminated git logs
      will be written for later fact finding

    - `interesting_fnames_res`: the regular expressions that will be
      passed to is_interesting_fname to determine whether a given file
      in the project is interesting

    - `boring_fnames_res`: the regular expressions that will be passed
      to is_interesting_fname to determine whether a given file in the
      project is boring, and should therefore be excluded

    - `fact_finders`: a list whose elements are either strs or lists of
      strs, each str representing a fully qualified function name
      (e.g. 'one.two.func'), which must be importable (i.e. somewhere
      in the python path), and each list of strs representing an
      external exe to invoke.

      For each log entry of each interesting file in the project dir, a
      function will be passed (fname, log_entry), where fname is the
      name of the file relative to project dir, and log_entry is a
      git_log.LogEntry named tuple.

      An external exe will receive the fname and fields of the
      log_entry on stdin, separated by null bytes.  The fields will
      appear in the same order they are declared in git_log.LogEntry.

      It is guaranteed that for a given fname, each log entry will be
      passed to the fact finders in chronological order, in the same
      process.

      The fact finders can return anything that can be serialized
      across python processes, but note that if you provide a
      `summarizer`, the summarizer must handle whatever a fact finder
      might return, and if you do not provide a summarizer, whatever
      the fact finders return must be sensibly printable to stdout.

    - `summarizer`: a callable that will be called repeatedly, once for
      each generated fact, or None to print each fact to stdout

    - `num_procs`: how many parallel processes to use when generating
      logs and facts.  Note that the logs are generated with calls to
      'git', and are relatively CPU and disk intensive.  Generally you
      can up this number until you're maxing out your disk, past which
      you won't see performance improvements.

    - `use_cached_logs`: when True, reuse logs already present in
      `log_cache_dir` instead of regenerating them

    Returns `summarizer`.
    """
    pool = multiprocessing.Pool(num_procs)
    try:
        project_dir = util.real_abs_path(project_dir)
        fnames_to_excavate = _interesting_fnames_in_proj(project_dir,
                                                         interesting_fnames_res,
                                                         boring_fnames_res)
        log.info("Found %d interesting fnames", len(fnames_to_excavate))
        log.debug("Interesting fnames: %s", fnames_to_excavate)
        rel_and_log_z_fnames = _extract_logs(pool,
                                             fnames_to_excavate,
                                             project_dir,
                                             log_cache_dir,
                                             use_cached_logs)
        facts_async_results = [
            pool.apply_async(_find_facts,
                             (rel_name, log_z_fname, fact_finders))
            for (rel_name, log_z_fname) in rel_and_log_z_fnames]
        for res in facts_async_results:
            facts = res.get(REALLY_LONG_TIME)
            for fact in facts:
                # Honor the documented contract: with no summarizer the
                # facts go to stdout (the original crashed with a
                # TypeError when summarizer was None).
                if summarizer is None:
                    print(fact)
                else:
                    summarizer(fact)
        pool.close()
    except Exception:
        # Don't leak worker processes if anything above raises.
        pool.terminate()
        raise
    finally:
        pool.join()
    return summarizer
def excavate(project_dir, log_cache_dir, interesting_fnames_res,
             boring_fnames_res, fact_finders, summarizer, num_procs=1,
             use_cached_logs=True):
    """
    Extract the git logs for all the interesting files in
    `project_dir`, running each file's logs through all the supplied
    `fact_finders` and passing the generated facts into the supplied
    `summarizer`.

    If `summarizer` is None, the facts will be printed to standard out.

    - `project_dir`: the root directory of the project to excavate

    - `log_cache_dir`: the directory where the null terminated git logs
      will be written for later fact finding

    - `interesting_fnames_res`: the regular expressions that will be
      passed to is_interesting_fname to determine whether a given file
      in the project is interesting

    - `boring_fnames_res`: the regular expressions that will be passed
      to is_interesting_fname to determine whether a given file in the
      project is boring, and should therefore be excluded

    - `fact_finders`: a list whose elements are either strs or lists of
      strs, each str representing a fully qualified function name
      (e.g. 'one.two.func'), which must be importable (i.e. somewhere
      in the python path), and each list of strs representing an
      external exe to invoke.

      For each log entry of each interesting file in the project dir, a
      function will be passed (fname, log_entry), where fname is the
      name of the file relative to project dir, and log_entry is a
      git_log.LogEntry named tuple.

      An external exe will receive the fname and fields of the
      log_entry on stdin, separated by null bytes.  The fields will
      appear in the same order they are declared in git_log.LogEntry.

      It is guaranteed that for a given fname, each log entry will be
      passed to the fact finders in chronological order, in the same
      process.

      The fact finders can return anything that can be serialized
      across python processes, but note that if you provide a
      `summarizer`, the summarizer must handle whatever a fact finder
      might return, and if you do not provide a summarizer, whatever
      the fact finders return must be sensibly printable to stdout.

    - `summarizer`: a callable that will be called repeatedly, once for
      each generated fact, or None to print each fact to stdout

    - `num_procs`: how many parallel processes to use when generating
      logs and facts.  Note that the logs are generated with calls to
      'git', and are relatively CPU and disk intensive.  Generally you
      can up this number until you're maxing out your disk, past which
      you won't see performance improvements.

    - `use_cached_logs`: when True, reuse logs already present in
      `log_cache_dir` instead of regenerating them

    Returns `summarizer`.
    """
    pool = multiprocessing.Pool(num_procs)
    try:
        project_dir = util.real_abs_path(project_dir)
        fnames_to_excavate = _interesting_fnames_in_proj(project_dir,
                                                         interesting_fnames_res,
                                                         boring_fnames_res)
        log.info("Found %d interesting fnames", len(fnames_to_excavate))
        log.debug("Interesting fnames: %s", fnames_to_excavate)
        rel_and_log_z_fnames = _extract_logs(pool,
                                             fnames_to_excavate,
                                             project_dir,
                                             log_cache_dir,
                                             use_cached_logs)
        pending = [pool.apply_async(_find_facts,
                                    (rel_name, log_z_fname, fact_finders))
                   for (rel_name, log_z_fname) in rel_and_log_z_fnames]
        for result in pending:
            for fact in result.get(REALLY_LONG_TIME):
                # Per the docstring, a missing summarizer means the
                # facts are printed to stdout; the original code called
                # summarizer(fact) unconditionally and crashed on None.
                if summarizer is None:
                    print(fact)
                else:
                    summarizer(fact)
        pool.close()
    except Exception:
        # Tear down workers rather than leaking them on failure.
        pool.terminate()
        raise
    finally:
        pool.join()
    return summarizer