def _parse_log_entry(raw_log_entry):
    """
    Parse one raw git log entry into a LogEntry, returning None
    whenever any required piece (diff, author, commit hash) cannot be
    extracted.
    """
    # A note on encodings: git gives us no reliable way to learn the
    # encoding of files / diffs (short of .gitattributes, which the
    # original producers of the repo would have had to set).  We assume
    # UTF-8, substituting the replacement character for anything else,
    # do all parsing / manipulation on that utf-8 text, and convert the
    # fields to unicode for the returned LogEntry.
    utf8_entry = util.utf8(raw_log_entry)

    # Attempt to split the header from the diff.
    split = _split_entry_header(utf8_entry)
    if split is None:
        return None
    header_lines, diff_lines = split

    diff = '\n'.join(diff_lines)
    if not diff.strip():
        log.debug("Diff appeared to be empty.")
        return None

    author = _parse_header('Author: ', header_lines)
    if not author:
        log.debug("Could not parse author.")
        return None

    parsed_author = parse_name_and_email(author)
    if not parsed_author:
        log.debug("Could not parse author name / email.")
        return None
    author_name, author_email = parsed_author

    commit = _parse_header('commit ', header_lines)
    if not commit:
        log.debug("Could not parse commit.")
        return None

    log_msg = '\n'.join(_parse_log_msg(header_lines))

    return LogEntry(author_name=util.uc(author_name),
                    author_email=util.uc(author_email),
                    commit=util.uc(commit),
                    log_msg=util.uc(log_msg),
                    diff=util.uc(diff),
                    raw_log=raw_log_entry)
def _split_entry_header(entry): """ Parse `entry`, which is a single git log entry as unicode, diff and all, into a (header_lines, diff_lines) tuple, where header_lines is a list of unicodes including everything before the line beginning with 'diff', and diff_lines is a list of unicodes including everything from the diff line on. So header typically includes the commit hash, the date, the author, and the log msg. If there's no diff, no commit hash, or no author, this logs the fact and returns None (these can happen in a number of normal circumstances, including e.g. binary files, which we don't want anyway). """ lines = entry.split('\n') # just in case of \r\n fun lines = [line.rstrip('\r') for line in lines] if not lines or len(lines) < 2: log.debug("Empty entry.") return None if not lines[0].startswith("commit"): log.debug("No commit line.") return None if not lines[1].startswith("Author"): log.debug("No author line.") return None # Start after the author line and look for the diff line. This # should account for features like git notes. ind = 2 lines_len = len(lines) while ind < lines_len and not lines[ind].startswith('diff'): ind += 1 # call everything before the diff line the header, the rest # the diff. return lines[:ind], lines[ind:]
def excavate(project_dir, log_cache_dir, interesting_fnames_res,
             boring_fnames_res, fact_finders, summarizer,
             num_procs=1, use_cached_logs=True):
    """
    Extract the git logs for all the interesting files in
    `project_dir`, running each file's logs through all the supplied
    `fact_finders` and passing the generated facts into the supplied
    `summarizer`.  If `summarizer` is None, the facts will be printed
    to standard out.

    - `project_dir`: the root directory of the project to excavate

    - `log_cache_dir`: the directory where the null terminated git
      logs will be written for later fact finding

    - `interesting_fnames_res`: the regular expressions that will be
      passed to is_interesting_fname to determine whether a given file
      in the project is interesting

    - `boring_fnames_res`: the regular expressions that will be passed
      to is_interesting_fname to determine whether a given file in the
      project is boring, and should be excluded even if it matched an
      interesting pattern

    - `fact_finders`: a list whose elements are either strs or lists
      of strs, each str representing a fully qualified function name
      (e.g. 'one.two.func'), which must be importable (i.e. somewhere
      in the python path), and each list of strs representing an
      external exe to invoke.  For each log entry of each interesting
      file in the project dir, a function will be passed
      (fname, log_entry), where fname is the name of the file relative
      to project dir, and log_entry is a git_log.LogEntry named tuple.
      An external exe will receive the fname and fields of the
      log_entry on stdin, separated by null bytes, in the same order
      they are declared in git_log.LogEntry.  It is guaranteed that
      for a given fname, each log entry will be passed to the fact
      finders in chronological order, in the same process.

      The fact finders can return anything that can be serialized
      across python processes, but note that if you provide a
      `summarizer`, the summarizer must handle whatever a fact finder
      might return, and if you do not provide a summarizer, whatever
      the fact finders return must be sensibly printable to stdout.

    - `summarizer`: a callable that will be called repeatedly, once
      for each generated fact, or None to print facts to stdout

    - `num_procs`: how many parallel processes to use when generating
      logs and facts.  Note that the logs are generated with calls to
      'git', and are relatively CPU and disk intensive.  Generally you
      can up this number until you're maxing out your disk, past which
      you won't see performance improvements.

    Returns `summarizer`.
    """
    pool = multiprocessing.Pool(num_procs)
    project_dir = util.real_abs_path(project_dir)
    fnames_to_excavate = _interesting_fnames_in_proj(project_dir,
                                                     interesting_fnames_res,
                                                     boring_fnames_res)
    log.info("Found %d interesting fnames", len(fnames_to_excavate))
    log.debug("Interesting fnames: %s", fnames_to_excavate)
    rel_and_log_z_fnames = _extract_logs(pool, fnames_to_excavate,
                                         project_dir, log_cache_dir,
                                         use_cached_logs)
    facts_async_results = [
        pool.apply_async(_find_facts, (rel_name, log_z_fname, fact_finders))
        for (rel_name, log_z_fname) in rel_and_log_z_fnames]
    # BUGFIX: the docstring promises stdout printing when summarizer is
    # None, but the old code called summarizer(fact) unconditionally and
    # would raise TypeError.  Fall back to print as documented.
    emit = print if summarizer is None else summarizer
    for res in facts_async_results:
        facts = res.get(REALLY_LONG_TIME)
        for fact in facts:
            emit(fact)
    pool.close()
    pool.join()
    return summarizer
# NOTE(review): this is a byte-for-byte duplicate of the excavate()
# defined earlier in this module; at import time this second definition
# silently shadows the first.  Kept (and fixed identically) to avoid
# breaking anything that relies on module load order, but one of the
# two copies should be deleted.
def excavate(project_dir, log_cache_dir, interesting_fnames_res,
             boring_fnames_res, fact_finders, summarizer,
             num_procs=1, use_cached_logs=True):
    """
    Extract the git logs for all the interesting files in
    `project_dir`, running each file's logs through all the supplied
    `fact_finders` and passing the generated facts into the supplied
    `summarizer`.  If `summarizer` is None, the facts will be printed
    to standard out.

    - `project_dir`: the root directory of the project to excavate

    - `log_cache_dir`: the directory where the null terminated git
      logs will be written for later fact finding

    - `interesting_fnames_res`: the regular expressions that will be
      passed to is_interesting_fname to determine whether a given file
      in the project is interesting

    - `boring_fnames_res`: the regular expressions that will be passed
      to is_interesting_fname to determine whether a given file in the
      project is boring, and should be excluded even if it matched an
      interesting pattern

    - `fact_finders`: a list whose elements are either strs or lists
      of strs, each str representing a fully qualified function name
      (e.g. 'one.two.func'), which must be importable (i.e. somewhere
      in the python path), and each list of strs representing an
      external exe to invoke.  For each log entry of each interesting
      file in the project dir, a function will be passed
      (fname, log_entry), where fname is the name of the file relative
      to project dir, and log_entry is a git_log.LogEntry named tuple.
      An external exe will receive the fname and fields of the
      log_entry on stdin, separated by null bytes, in the same order
      they are declared in git_log.LogEntry.  It is guaranteed that
      for a given fname, each log entry will be passed to the fact
      finders in chronological order, in the same process.

      The fact finders can return anything that can be serialized
      across python processes, but note that if you provide a
      `summarizer`, the summarizer must handle whatever a fact finder
      might return, and if you do not provide a summarizer, whatever
      the fact finders return must be sensibly printable to stdout.

    - `summarizer`: a callable that will be called repeatedly, once
      for each generated fact, or None to print facts to stdout

    - `num_procs`: how many parallel processes to use when generating
      logs and facts.  Note that the logs are generated with calls to
      'git', and are relatively CPU and disk intensive.  Generally you
      can up this number until you're maxing out your disk, past which
      you won't see performance improvements.

    Returns `summarizer`.
    """
    pool = multiprocessing.Pool(num_procs)
    project_dir = util.real_abs_path(project_dir)
    fnames_to_excavate = _interesting_fnames_in_proj(project_dir,
                                                     interesting_fnames_res,
                                                     boring_fnames_res)
    log.info("Found %d interesting fnames", len(fnames_to_excavate))
    log.debug("Interesting fnames: %s", fnames_to_excavate)
    rel_and_log_z_fnames = _extract_logs(pool, fnames_to_excavate,
                                         project_dir, log_cache_dir,
                                         use_cached_logs)
    facts_async_results = [
        pool.apply_async(_find_facts, (rel_name, log_z_fname, fact_finders))
        for (rel_name, log_z_fname) in rel_and_log_z_fnames]
    # BUGFIX: the docstring promises stdout printing when summarizer is
    # None, but the old code called summarizer(fact) unconditionally and
    # would raise TypeError.  Fall back to print as documented.
    emit = print if summarizer is None else summarizer
    for res in facts_async_results:
        facts = res.get(REALLY_LONG_TIME)
        for fact in facts:
            emit(fact)
    pool.close()
    pool.join()
    return summarizer