Example 1
def _parse_log_entry(raw_log_entry):
    """
    Parse a single git log entry into a LogEntry, or return None
    if it can't be parsed.
    """
    # A note on encodings.  Git doesn't give us a way to get at the
    # encodings of the files / diffs (short of .gitattributes, which
    # has to be set by the original producers of the repo).  We assume
    # UTF-8, replace any bytes that don't decode with the Unicode
    # replacement character (U+FFFD) for the parsing / manipulation,
    # and then convert the UTF-8 back to unicode for the returned
    # LogEntry.
    utf8_log_entry = util.utf8(raw_log_entry)

    # attempt to split the header from the diff.
    split_log_entry = _split_entry_header(utf8_log_entry)
    if split_log_entry is None:
        return None
    header_lines, diff_lines = split_log_entry

    diff = '\n'.join(diff_lines)

    if not diff.strip():
        log.debug("Diff appeared to be empty.")
        return None

    author = _parse_header('Author: ', header_lines)
    if not author:
        log.debug("Could not parse author.")
        return None

    parsed_author = parse_name_and_email(author)
    if not parsed_author:
        log.debug("Could not parse author name / email.")
        return None
    author_name, author_email = parsed_author

    commit = _parse_header('commit ', header_lines)
    if not commit:
        log.debug("Could not parse commit.")
        return None

    log_msg = '\n'.join(_parse_log_msg(header_lines))

    return LogEntry(author_name=util.uc(author_name),
                    author_email=util.uc(author_email),
                    commit=util.uc(commit),
                    log_msg=util.uc(log_msg),
                    diff=util.uc(diff),
                    raw_log=raw_log_entry)
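
For orientation, here is a minimal sketch of how this parser might be driven. The sample entry below is invented; it just mimics the shape of a single `git log -p` entry, and `_parse_log_entry` plus its helpers are assumed to be in scope from the module above.

raw_entry = (
    "commit 1a2b3c4d5e6f\n"
    "Author: Ada Lovelace <ada@example.com>\n"
    "Date:   Mon Jan 1 00:00:00 2024 +0000\n"
    "\n"
    "    Fix the frobnicator.\n"
    "\n"
    "diff --git a/foo.py b/foo.py\n"
    "+print('hi')\n")

entry = _parse_log_entry(raw_entry)
if entry is not None:
    # LogEntry is a named tuple, so the fields are attributes.
    print(entry.commit, entry.author_email)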
Example 2
def _split_entry_header(entry):
    """
    Parse `entry`, which is a single git log entry as unicode, diff
    and all, into a (header_lines, diff_lines) tuple, where
    header_lines is a list of unicodes including everything before the
    line beginning with 'diff', and diff_lines is a list of unicodes
    including everything from the diff line on.

    So header typically includes the commit hash, the date, the
    author, and the log msg.

    If there's no diff, no commit hash, or no author, this logs
    the fact and returns None (these can happen in a number of
    normal circumstances, including e.g. binary files, which we
    don't want anyway).
    """
    lines = entry.split('\n')
    # just in case of \r\n fun
    lines = [line.rstrip('\r') for line in lines]
    # str.split always returns at least one element, so a length
    # check alone is enough.
    if len(lines) < 2:
        log.debug("Entry too short.")
        return None
    if not lines[0].startswith("commit"):
        log.debug("No commit line.")
        return None
    if not lines[1].startswith("Author"):
        log.debug("No author line.")
        return None
    # Start after the author line and look for the diff line.  This
    # should account for features like git notes.
    ind = 2
    lines_len = len(lines)
    while ind < lines_len and not lines[ind].startswith('diff'):
        ind += 1

    # call everything before the diff line the header, the rest
    # the diff.
    return lines[:ind], lines[ind:]
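
A quick sketch of the split this produces, using an invented entry (and assuming the same module context as above):

header_lines, diff_lines = _split_entry_header(
    u"commit 1a2b3c\n"
    u"Author: Ada <ada@example.com>\n"
    u"Date:   Mon Jan 1 00:00:00 2024 +0000\n"
    u"\n"
    u"    Add a thing.\n"
    u"diff --git a/f.py b/f.py\n"
    u"+x = 1")

# header_lines -> ['commit 1a2b3c', 'Author: Ada <ada@example.com>',
#                  'Date:   Mon Jan 1 00:00:00 2024 +0000', '',
#                  '    Add a thing.']
# diff_lines   -> ['diff --git a/f.py b/f.py', '+x = 1']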
Example 3
def excavate(project_dir,
             log_cache_dir,
             interesting_fnames_res,
             boring_fnames_res,
             fact_finders,
             summarizer,
             num_procs=1,
             use_cached_logs=True):
    """
    Extract the git logs for all the interesting files in
    `project_dir`, running each file's logs through all the supplied
    `fact_finders` and passing the generated facts into the supplied
    `summarizer`.

    If `summarizer` is None, the facts will be printed to standard
    out.

    - `project_dir`: the root directory of the project to excavate

    - `log_cache_dir`: the directory where the null terminated git logs
      will be written for later fact finding

    - `interesting_fnames_res`: the regular expressions that will be
      passed to is_interesting_fname to determine whether a given file
      in the project is interesting

    - `boring_fnames_res`: the regular expressions that will be passed
      to is_interesting_fname to determine whether a given file in the
      project is boring (and should therefore be skipped)

    - `fact_finders`: a list whose elements are either strs or lists
      of strs, each str representing a fully qualified function name
      (e.g. 'one.two.func'), which must be importable (i.e. somewhere
      in the python path), and each list of strs representing an
      external exe to invoke.

      For each log entry of each interesting file in the project dir,
      a function will be passed (fname, log_entry), where fname is the
      name of the file relative to project dir, and log_entry is a
      git_log.LogEntry named tuple.

      An external exe will receive the fname and fields of the
      log_entry on stdin, separated by null bytes.  The fields will
      appear in the same order they are declared in git_log.LogEntry.

      It is guaranteed that for a given fname, each log entry will be
      passed to the fact finders in chronological order, in the same
      process.

      The fact finders can return anything that can be serialized
      across python processes, but note that if you provide a
      `summarizer`, the summarizer must handle whatever a fact finder
      might return, and if you do not provide a summarizer, whatever
      the fact finders return must be sensibly printable to stdout.

    - `summarizer`: a callable that will be called repeatedly, once
      for each generated fact

    - `num_procs`: how many parallel processes to use when generating
      logs and facts.  Note that the logs are generated with calls to
      'git', and are relatively CPU and disk intensive.  Generally you
      can up this number until you're maxing out your disk, past which
      you won't see performance improvements.
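
    - `use_cached_logs`: if True, reuse logs already written to
      `log_cache_dir` rather than re-running git to regenerate them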

    Returns `summarizer`.
    """
    if summarizer is None:
        # Honor the docstring: with no summarizer, print each fact.
        def summarizer(fact):
            print(fact)

    pool = multiprocessing.Pool(num_procs)

    project_dir = util.real_abs_path(project_dir)

    fnames_to_excavate = _interesting_fnames_in_proj(project_dir,
                                                     interesting_fnames_res,
                                                     boring_fnames_res)
    log.info("Found %d interesting fnames", len(fnames_to_excavate))
    log.debug("Interesting fnames: %s", fnames_to_excavate)

    rel_and_log_z_fnames = _extract_logs(pool, fnames_to_excavate, project_dir,
                                         log_cache_dir, use_cached_logs)

    facts_async_results = []
    for (rel_name, log_z_fname) in rel_and_log_z_fnames:
        facts_async_results.append(
            pool.apply_async(_find_facts,
                             (rel_name, log_z_fname, fact_finders)))

    for res in facts_async_results:
        facts = res.get(REALLY_LONG_TIME)
        for fact in facts:
            summarizer(fact)

    pool.close()
    pool.join()

    return summarizer
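
To round out the picture, a hedged sketch of a caller. Every name below ('myfacts.churn', the paths, the regexes) is invented for illustration; a real fact finder module would have to be importable from the python path.

# Contents of a hypothetical myfacts.py, somewhere on the python path:
#
#     def churn(fname, log_entry):
#         # one fact per entry: (fname, commit, changed line count)
#         return (fname, log_entry.commit,
#                 sum(1 for l in log_entry.diff.split('\n')
#                     if l.startswith(('+', '-'))))

facts = []
excavate(project_dir='/path/to/repo',
         log_cache_dir='/tmp/log_cache',
         interesting_fnames_res=[r'\.py$'],
         boring_fnames_res=[r'(^|/)tests?/'],
         fact_finders=['myfacts.churn'],
         summarizer=facts.append,
         num_procs=4)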