Example #1
0
# Configure the repository and the revision range to analyse.
git.setRepository("/home/wolfgang/git-repos/linux-2.6/.git")
git.setRevisionRange("v2.6.14", "v2.6.33")
#git.setRevisionRange("v2.6.23", "v2.6.26")
#git.setRepository("/home/wolfgang/git-repos/perl/.git")
#git.setRevisionRange("8d063cd8450e", "HEAD")
#git.setSubsysDescription(kerninfo.subsysDescrLinux)

git.extractCommitData()

###################################################
# Persist the populated git object so the (expensive) commit
# extraction does not have to be repeated on the next run.
print("Shelfing the git object")
import shelve
d = shelve.open("/home/wolfgang/linux-14-33")
d["git"] = git
d.close()
#
#print("Same in blue after unshelfing:")
#k = shelve.open("/tmp/git-shelf")
#git2 = k["git"]
#k.close()
###################################################

#res = createCumulativeSeries(git, "__main__")
#res = createCumulativeSeries(git, "block")
res = createSeries(git, subsys="__main__", revrange=["v2.6.23", "v2.6.26"])
print("Obtained a list with {0} commits".format(len(res)))

# Preview the first entries of the series. Slicing (instead of
# indexing a fixed range(10)) avoids an IndexError when fewer than
# ten commits were obtained; output is unchanged otherwise.
for entry in res[:10]:
    print("{0}: {1}, {2}".format(entry["commit"].cdate, entry["value"][0],
                                 entry["commit"].getCommitMessageLines()))
Example #2
0
# Select which repository and revision window the analysis covers.
git.setRepository("/home/wolfgang/git-repos/linux-2.6/.git")
git.setRevisionRange("v2.6.14", "v2.6.33")
# Alternative configurations, kept for reference:
#git.setRevisionRange("v2.6.23", "v2.6.26")
#git.setRepository("/home/wolfgang/git-repos/perl/.git")
#git.setRevisionRange("8d063cd8450e", "HEAD")
#git.setSubsysDescription(kerninfo.subsysDescrLinux)

git.extractCommitData()

###################################################
# Save the git object to disk so later runs can reuse it.
print("Shelfing the git object")
import shelve
shelf = shelve.open("/home/wolfgang/linux-14-33")
shelf["git"] = git
shelf.close()
#
#print("Same in blue after unshelfing:")
#k = shelve.open("/tmp/git-shelf")
#git2 = k["git"]
#k.close()
###################################################

#res = createCumulativeSeries(git, "__main__")
#res = createCumulativeSeries(git, "block")
res = createSeries(git, subsys="__main__", revrange=["v2.6.23", "v2.6.26"])
print("Obtained a list with {0} commits".format(len(res)))

# Dump the first ten entries of the resulting series.
for idx in range(10):
    entry = res[idx]
    print("{0}: {1}, {2}".format(entry["commit"].cdate, entry["value"][0],
                                 entry["commit"].getCommitMessageLines()))
Example #3
0
def doAnalysis(vcs, basedir, revrange=None):
    """Run the complete time-series analysis for one revision range.

    Builds the raw and the cumulative commit series for the "__main__"
    subsystem, transfers them to R, and emits a set of graphics (time
    series, histograms, correlations, density, spectrum, ECDF,
    recurrence diagram) into basedir.

    Arguments:
    vcs      -- VCS object whose commit data have already been extracted.
    basedir  -- destination directory for the generated plot files.
    revrange -- optional [start, end] revision pair forwarded to the
                series-construction helpers.

    Returns None; all results are produced as side effects (plot files
    in basedir and data files written to hardcoded paths, see TODOs).
    """
    # TODO: This needs to include the subrange analysis
    # TODO: Use a temporary dir for data storage (unless the R
    # data exchange problem is solved)
    print("Creating raw series")
    res = createSeries(vcs, "__main__", revrange)
    writeToFile(res, "/home/wolfgang/raw.dat")
    duration = getSeriesDuration(res)

    # Emergency stop: If the cycle is less than 200 commits long,
    # there are no meaningful results to be expected.
    if len(res) < 200:
        print("!!! Not enough commits in list, skipping analysis")
        return

    print("Creating cumulative series")
    res = createCumulativeSeries(vcs, "__main__", revrange)
    writeToFile(res, "/home/wolfgang/cum.dat")

    # TODO: How is it possible to exchange the data directly between python
    # and R? Writing to a file and then re-reading the stuff is a bit stupid
    # (if all else fails, we could at least use a named pipe)
    runR('raw = as.xts(read.zoo(file="/home/wolfgang/raw.dat", '\
             'FUN=tstamp_to_date))')
    raw = RtoPython(runR('raw'))

    # We use the average number of commits per quarter day as basis for the
    # moving average. Guard against duration == 0 (e.g. every commit in
    # the range carries an identical timestamp), which would raise
    # ZeroDivisionError; the clamping below supplies the fallback value.
    secs_per_quarter_day = 6 * 60 * 60
    if duration > 0:
        # The smoothing window is a count of observations, so truncate
        # it to an integer before handing it to R's to.regts().
        smooth_commits = int(len(raw) / (duration / secs_per_quarter_day))
    else:
        smooth_commits = 0

    print("Length: {0}, duration: {1}".format(len(raw), duration))

    # ... but also ensure that we do not get excessively large or
    # small values
    if smooth_commits < 20:
        smooth_commits = 20
    elif smooth_commits > 350:
        smooth_commits = 350

    print("Using {0} as smoothing factor".format(smooth_commits))

    if len(raw) < smooth_commits:
        print("Pathological case: Excessively short series with {} commits "
              "detected, giving up.".format(len(raw)))
        return

    runR('reg = to.regts(raw[,1], {0})'.format(smooth_commits))
    runR('cum = as.xts(read.zoo(file="/home/wolfgang/cum.dat", '\
             'FUN=tstamp_to_date))')

    # reg is needed below to decide whether the recurrence diagram is
    # computationally feasible.
    reg = RtoPython(runR('reg'))

    # HARDCODED assumptions about the position of the data fields
    # TODO: These should get symbolic R labels. How is this possible?
    diff_sizes = RtoPython(runR('coredata(raw)[,1]'))
    descr_sizes = RtoPython(runR('coredata(raw)[,5]'))

    # Time axes for the regularised and the cumulative series.
    timelist_reg = RtoPython(runR('unclass(index(reg))'))
    timelist_cum = RtoPython(runR('unclass(index(cum))'))

    # Plot the cumulative and the averaged series
    # TODO: Use different y axes for the components because they
    # scale vastly different
    # TODO: We need to re-initialise the plot object somehow since
    # in the second run, the histogram of the previous run is
    # plotted here.
    status("Computing Time Series Graphs")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    _setupPythonGraphics(os.path.join(basedir, "timegraph"), "PDF")
    plot(timelist_reg, RtoPython(runR('reg')))
    xlabel("Time (TODO: Label with tags)")
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "timegraph"), "PDF")

    _setupPythonGraphics(os.path.join(basedir, "timegraph_cum"), "PDF")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(timelist_cum, RtoPython(runR('coredata(cum)[,1]')))
    xlabel("Time (TODO: Label with tags)")
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "timegraph_cum"), "PDF")

    # Compare the histograms of commit size and description length
    # distributions
    # TODO: The plots overlap so that information gets lost. This is
    # obviously bad.
    # NOTE(review): ax.hold() and the normed= keyword were removed in
    # modern matplotlib (use density= instead) -- confirm the targeted
    # matplotlib version before upgrading it.
    status("Computing Histograms")
    _setupPythonGraphics(os.path.join(basedir, "histograms"), "PDF")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hold(True)
    ax.hist(descr_sizes, 100, range=(0, 100), normed=True)
    ax.hist(diff_sizes, 100, range=(0, 100), normed=True, alpha=0.5)
    ax.set_xlabel("Commit/Diff size")
    ax.set_ylabel("Probability")
    ax.grid(True)
    ax.hold(False)
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "histograms"), "PDF")

    # Let's look at some correlations: Between different diff approaches,
    # and the correlation between diff size and
    status("Computing Correlations")
    computeDiffsizeCommitlengthCorrelation("raw",
                                           filename=os.path.join(
                                               basedir, "diff_commit_corr"),
                                           backend="PDF")
    computeDifftypeCorrelation("raw",
                               filename=os.path.join(basedir, "difftype_corr"),
                               backend="PDF")

    # Determine the density. TODO: Find the best bandwidth.
    status("Computing Density")
    computeDensity("reg",
                   bandwidth=10,
                   filename=os.path.join(basedir, "density"),
                   backend="PDF")

    # We could also use reg, but coredata gives more regular labels
    status("Computing Spectrum")
    computeSpectrum("coredata(reg)",
                    filename=os.path.join(basedir, "spectrum"),
                    backend="PDF")

    status("Computing ECDF")
    computeECDF("reg", filename=os.path.join(basedir, "ecdf"), backend="PDF")

    # Generate the recurrence diagram for a series
    # NOTE: When the number of considered data points exceeds a
    # certain threshold, we don't do the plot because it's
    # computationally too expensive
    if len(reg) < 5000:
        # We use PNG for plotting here because the PDF gets huge.
        # (we could also just pass reg, but extracting the coredata gives
        # "nicer" labels")
        status("Computing Recurrence Diagram")
        computeRecurrenceDiagram("coredata(reg)[,1]",
                                 filename=os.path.join(basedir, "recurrence"),
                                 backend="PNG")
    else:
        status("Skipping recurrence diagram: Too many data points")
Example #4
0
def doAnalysis(vcs, basedir, revrange=None):
    """Run the complete time-series analysis for one revision range.

    Builds the raw and the cumulative commit series for the "__main__"
    subsystem, transfers them to R, and emits a set of graphics (time
    series, histograms, correlations, density, spectrum, ECDF,
    recurrence diagram) into basedir.

    Arguments:
    vcs      -- VCS object whose commit data have already been extracted.
    basedir  -- destination directory for the generated plot files.
    revrange -- optional [start, end] revision pair forwarded to the
                series-construction helpers.

    Returns None; all results are produced as side effects (plot files
    in basedir and data files written to hardcoded paths, see TODOs).
    """
    # TODO: This needs to include the subrange analysis
    # TODO: Use a temporary dir for data storage (unless the R
    # data exchange problem is solved)
    print("Creating raw series")
    res = createSeries(vcs, "__main__", revrange)
    writeToFile(res, "/home/wolfgang/raw.dat")
    duration = getSeriesDuration(res)

    # Emergency stop: If the cycle is less than 200 commits long,
    # there are no meaningful results to be expected.
    if len(res) < 200:
        print("!!! Not enough commits in list, skipping analysis")
        return

    print("Creating cumulative series")
    res = createCumulativeSeries(vcs, "__main__", revrange)
    writeToFile(res, "/home/wolfgang/cum.dat")

    # TODO: How is it possible to exchange the data directly between python
    # and R? Writing to a file and then re-reading the stuff is a bit stupid
    # (if all else fails, we could at least use a named pipe)
    runR('raw = as.xts(read.zoo(file="/home/wolfgang/raw.dat", '\
             'FUN=tstamp_to_date))')
    raw = RtoPython(runR('raw'))

    # We use the average number of commits per quarter day as basis for the
    # moving average. Guard against duration == 0 (e.g. every commit in
    # the range carries an identical timestamp), which would raise
    # ZeroDivisionError; the clamping below supplies the fallback value.
    secs_per_quarter_day = 6 * 60 * 60
    if duration > 0:
        # The smoothing window is a count of observations, so truncate
        # it to an integer before handing it to R's to.regts().
        smooth_commits = int(len(raw) / (duration / secs_per_quarter_day))
    else:
        smooth_commits = 0

    print("Length: {0}, duration: {1}".format(len(raw), duration))

    # ... but also ensure that we do not get excessively large or
    # small values
    if smooth_commits < 20:
        smooth_commits = 20
    elif smooth_commits > 350:
        smooth_commits = 350

    print("Using {0} as smoothing factor".format(smooth_commits))

    if len(raw) < smooth_commits:
        print("Pathological case: Excessively short series with {} commits "
              "detected, giving up.".format(len(raw)))
        return

    runR('reg = to.regts(raw[,1], {0})'.format(smooth_commits))
    runR('cum = as.xts(read.zoo(file="/home/wolfgang/cum.dat", '\
             'FUN=tstamp_to_date))')

    # reg is needed below to decide whether the recurrence diagram is
    # computationally feasible.
    reg = RtoPython(runR('reg'))

    # HARDCODED assumptions about the position of the data fields
    # TODO: These should get symbolic R labels. How is this possible?
    diff_sizes = RtoPython(runR('coredata(raw)[,1]'))
    descr_sizes = RtoPython(runR('coredata(raw)[,5]'))

    # Time axes for the regularised and the cumulative series.
    timelist_reg = RtoPython(runR('unclass(index(reg))'))
    timelist_cum = RtoPython(runR('unclass(index(cum))'))

    # Plot the cumulative and the averaged series
    # TODO: Use different y axes for the components because they
    # scale vastly different
    # TODO: We need to re-initialise the plot object somehow since
    # in the second run, the histogram of the previous run is
    # plotted here.
    status("Computing Time Series Graphs")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    _setupPythonGraphics(os.path.join(basedir, "timegraph"), "PDF")
    plot(timelist_reg, RtoPython(runR('reg')))
    xlabel("Time (TODO: Label with tags)")
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "timegraph"), "PDF")

    _setupPythonGraphics(os.path.join(basedir, "timegraph_cum"), "PDF")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(timelist_cum, RtoPython(runR('coredata(cum)[,1]')))
    xlabel("Time (TODO: Label with tags)")
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "timegraph_cum"), "PDF")

    # Compare the histograms of commit size and description length
    # distributions
    # TODO: The plots overlap so that information gets lost. This is
    # obviously bad.
    # NOTE(review): ax.hold() and the normed= keyword were removed in
    # modern matplotlib (use density= instead) -- confirm the targeted
    # matplotlib version before upgrading it.
    status("Computing Histograms")
    _setupPythonGraphics(os.path.join(basedir, "histograms"), "PDF")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hold(True)
    ax.hist(descr_sizes, 100, range=(0, 100), normed=True)
    ax.hist(diff_sizes, 100, range=(0, 100), normed=True, alpha=0.5)
    ax.set_xlabel("Commit/Diff size")
    ax.set_ylabel("Probability")
    ax.grid(True)
    ax.hold(False)
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "histograms"), "PDF")

    # Let's look at some correlations: Between different diff approaches,
    # and the correlation between diff size and
    status("Computing Correlations")
    computeDiffsizeCommitlengthCorrelation("raw",
                                           filename=os.path.join(
                                               basedir, "diff_commit_corr"),
                                           backend="PDF")
    computeDifftypeCorrelation("raw",
                               filename=os.path.join(basedir, "difftype_corr"),
                               backend="PDF")

    # Determine the density. TODO: Find the best bandwidth.
    status("Computing Density")
    computeDensity("reg",
                   bandwidth=10,
                   filename=os.path.join(basedir, "density"),
                   backend="PDF")

    # We could also use reg, but coredata gives more regular labels
    status("Computing Spectrum")
    computeSpectrum("coredata(reg)",
                    filename=os.path.join(basedir, "spectrum"),
                    backend="PDF")

    status("Computing ECDF")
    computeECDF("reg", filename=os.path.join(basedir, "ecdf"), backend="PDF")

    # Generate the recurrence diagram for a series
    # NOTE: When the number of considered data points exceeds a
    # certain threshold, we don't do the plot because it's
    # computationally too expensive
    if len(reg) < 5000:
        # We use PNG for plotting here because the PDF gets huge.
        # (we could also just pass reg, but extracting the coredata gives
        # "nicer" labels")
        status("Computing Recurrence Diagram")
        computeRecurrenceDiagram("coredata(reg)[,1]",
                                 filename=os.path.join(basedir, "recurrence"),
                                 backend="PNG")
    else:
        status("Skipping recurrence diagram: Too many data points")