def prob_dispersion(target, result, prob, nclasses=5, save_as=''):
  """Box-plot the probability of the predicted class, split by correctness.

  For every sample ``i`` the value ``prob[i][result[i]]`` is collected into
  one of ``2 * nclasses`` buckets: bucket ``2 * c`` holds the samples
  predicted as class ``c`` whose prediction equals ``target[i]`` ("Good"),
  bucket ``2 * c + 1`` the samples where it does not ("Bad").

  Args:
    target: sequence of expected class indices, aligned with ``result``.
    result: sequence of predicted class indices; each value is used both as
      an index into ``prob[i]`` and to select the bucket, so it must lie in
      ``range(nclasses)``.
    prob: per-sample sequence of class probabilities.
    nclasses: number of classes (two boxes are drawn per class).
    save_as: path passed to ``fig.savefig``; when empty the figure is shown
      interactively instead.
  """
  # ``range(2 * nclasses)`` yields the same number of buckets as the former
  # ``2 * range(nclasses)`` but does not rely on Python 2 list repetition.
  classprob = [[] for _ in range(2 * nclasses)]
  for i in range(len(result)):
    predicted = result[i]
    p = prob[i][predicted]
    # Even slot: correct prediction, odd slot: wrong prediction.
    offset = 0 if predicted == target[i] else 1
    classprob[2 * predicted + offset].append(p)

  # Tick labels are 1-based: "1-Good", "1-Bad", "2-Good", ...
  xlabels = []
  for i in range(nclasses):
    xlabels.append(str(i + 1) + "-Good")
    xlabels.append(str(i + 1) + "-Bad")

  # Plotting the result
  fig = plt.figure()
  fig.suptitle('Probability distribution', fontsize=20)
  plot = fig.add_subplot(111)
  pylab.boxplot(classprob)
  pylab.xticks(range(1, 2 * nclasses + 1), xlabels)
  plot.set_xlabel('Predicted', fontsize=16)
  plot.set_ylabel('Probabilities', fontsize=16)
  plot.tick_params(axis='both', which='major', labelsize=14)
  plot.tick_params(axis='both', which='minor', labelsize=8)

  # Save options
  if save_as == '':
    plt.show()
  else:
    fig.savefig(save_as)
Esempio n. 2
0
def chart(SW, a, b, label, folder, FILE):
    """Draw a three-box plot of SW['SWA'], SW['OA'] and SW['NS'] and save it.

    The y-axis is limited to (a, b) and labelled with `label`; a dotted
    reference line is drawn at y = 1.0.  The figure is written to
    '/home/nealbob' + folder + FILE + '.pdf' and then shown.
    """
    pylab.ioff()

    # Figure geometry derived from the LaTeX column width
    # (obtainable with \showthe\columnwidth).
    column_width_pt = 350
    inch_per_pt = 1.0/72.27                   # Convert pt to inch
    aesthetic_ratio = ((5**0.5)-1.0)/2.0      # Golden mean
    width_in = column_width_pt*inch_per_pt    # width in inches
    height_in = width_in*aesthetic_ratio      # height in inches

    pylab.rcParams.update({
        'backend': 'ps',
        'axes.labelsize': 10,
        'text.fontsize': 10,
        'legend.fontsize': 10,
        'xtick.labelsize': 8,
        'ytick.labelsize': 8,
        'text.usetex': True,
        'figure.figsize': [width_in, height_in],
    })

    pylab.figure()
    pylab.boxplot([SW[series] for series in ('SWA', 'OA', 'NS')], whis=5)
    pylab.axhline(y=1.0, color='0.5', linewidth=0.5, alpha=0.75, linestyle=':')
    pylab.ylim(a, b)
    pylab.ylabel(label)
    # Automatic x tick labels are hidden; captions are placed by hand below.
    pylab.tick_params(axis='x', which='both', labelbottom='off')
    for x_pos, caption in ((0.225, 'SWA'), (0.495, 'OA'), (0.76, 'NS')):
        pylab.figtext(x_pos, 0.06, caption, fontsize=10)
    pylab.savefig('/home/nealbob' + folder + FILE + '.pdf')
    pylab.show()
Esempio n. 3
0
def makeboxplot(filteredclusts, dblibrary, figname, pool=False):
    '''Box-plot the per-individual cluster counts, grouped by lane or by pool.

    filteredclusts -- dict mapping each retained cluster to a dict keyed by
                      individual; only the keys of the inner dicts are used.
    dblibrary      -- passed to gdata_tools.get_table_as_dict to obtain the
                      sample table (an iterable of dicts).
    figname        -- path the figure is saved to.
    pool           -- when true, groups are (flowcell, lane, index, pool);
                      otherwise (flowcell, lane, index).

    Only table rows that contain a 'pool' field and whose
    (flowcell, lane, index, sampleid) tuple is a counted individual
    contribute to the plot.
    '''
    # Number of retained clusters in which each individual occurs.
    indiv_cluster_count = defaultdict(int)
    for inddict in filteredclusts.values():
        for ind in inddict:
            indiv_cluster_count[ind] += 1

    t = gdata_tools.get_table_as_dict(dblibrary)

    indiv_by_group = defaultdict(list)
    for d in t:
        if 'pool' not in d:
            continue
        indkey = (d.get('flowcell', None), d.get('lane', None),
                  d.get('index', None), d.get('sampleid', None))
        if indkey not in indiv_cluster_count:
            continue
        group = (d['flowcell'], d['lane'], d.get('index', None))
        if pool:
            group = group + (d['pool'],)
        indiv_by_group[group].append(indiv_cluster_count[indkey])

    # Keep boxes and their tick labels in the same order.
    labels = list(indiv_by_group)
    boxes = [indiv_by_group[group] for group in labels]

    boxplt = pylab.figure(1)
    pylab.boxplot(boxes)
    pylab.xticks(range(1, len(labels) + 1), labels, fontsize='small')
    boxplt.savefig(figname)
Esempio n. 4
0
def chart(idx, a, b, label, FILE):
    """Draw a narrow box plot of `idx` with y-limits (a, b) and save it.

    The figure is written to
    '/home/nealbob' + '/Dropbox/Thesis/IMG/chapter3/' + FILE + '.pdf' and
    then shown.  `label` is accepted but not drawn (no y-label is set).
    """
    pylab.ioff()

    # Figure geometry derived from the LaTeX column width
    # (obtainable with \showthe\columnwidth).
    column_width_pt = 350
    inch_per_pt = 1.0/72.27                   # Convert pt to inch
    aesthetic_ratio = ((5**0.5)-1.0)/2.0      # Golden mean
    width_in = column_width_pt*inch_per_pt    # width in inches
    height_in = width_in*aesthetic_ratio      # height in inches

    pylab.rcParams.update({
        'backend': 'ps',
        'axes.labelsize': 10,
        'text.fontsize': 10,
        'legend.fontsize': 10,
        'xtick.labelsize': 8,
        'ytick.labelsize': 8,
        'text.usetex': True,
        # Only 42% of the column width is used for this figure.
        'figure.figsize': [width_in*0.42, height_in],
    })

    pylab.figure()
    pylab.boxplot(idx, whis=100)
    pylab.ylim(a, b)
    pylab.tick_params(axis='x', which='both', labelbottom='off')
    pylab.savefig('/home/nealbob' + '/Dropbox/Thesis/IMG/chapter3/' + FILE + '.pdf')
    pylab.show()
def plot():
    """Box-plot run times bucketed by swarm size and save the figure.

    For every entry of the module-level ``lines_to_plot`` the time file and
    the size file under ``parser_results/`` are read in lockstep (first
    whitespace-separated field of each line).  Each time is filed under the
    smallest mark in ``[20, 50, 100, 200]`` that is >= the matching size;
    samples larger than every mark are dropped.  The figure is saved to the
    module-level ``output_filename``.

    Raises:
        StopIteration: if a size file has fewer lines than its time file.
    """
    swarmsize_marks = [20, 50, 100, 200]
    times = dict((mark, []) for mark in swarmsize_marks)

    # ``label`` and ``style`` are part of each entry but not used here.
    for time_filename, size_filename, label, style in lines_to_plot:
        # Context managers release both handles even if parsing fails.
        with open('parser_results/' + time_filename) as time_file, \
                open('parser_results/' + size_filename) as size_file:
            for line in time_file:
                elapsed = float(line.split()[0])
                size = int(next(size_file).split()[0])
                for mark in swarmsize_marks:
                    if size <= mark:
                        times[mark].append(elapsed)
                        break

    xs = [times[mark] for mark in swarmsize_marks]
    labels = ['<=%d' % mark for mark in swarmsize_marks]
    pylab.boxplot(xs)
    pylab.setp(pylab.gca(), 'xticklabels', labels)
    pylab.savefig(output_filename)

    print('Output saved to: %s' % (output_filename,))
Esempio n. 6
0
def quartile_plot(
        fits,
        group_index_start, group_index_end,
        model_param_index,
        ylim=None,
        log=True,
        xlabel=None,
        ylabel=None,
        labels=None):
    """Show one box (with mean marker) per group for a single model parameter.

    The values for each group index in [group_index_start, group_index_end)
    come from ``fit_params(fits, group_index, model_param_index)``.  The
    figure is one inch wide per group and seven inches tall.  The y-axis is
    logarithmic only when ``log`` is exactly ``True``; ``ylim``, ``xlabel``
    and ``ylabel`` are applied when not ``None``; ``labels`` is forwarded to
    ``plt.boxplot``.
    """
    model_param_values = []
    for group_index in xrange(group_index_start, group_index_end):
        model_param_values.append(
            fit_params(fits, group_index, model_param_index))

    plt.figure(figsize=(len(model_param_values), 7))
    if log is True:
        plt.yscale('log')

    # Apply each optional axis setting only when a value was supplied.
    optional_settings = (
        (plt.ylim, ylim),
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
    )
    for apply_setting, value in optional_settings:
        if value is not None:
            apply_setting(value)

    plt.boxplot(model_param_values, labels=labels, showmeans=True)
    plt.grid()
    plt.show()
    def genderBoxplots(self, women, men, labels, path):
        """Save two box plots of edition counts for women vs. men.

        The first figure uses the default flier symbol, the second passes
        ``sym=""`` to ``plt.boxplot``.  In both, the mean of each group is
        printed and marked with a red ``>`` marker.  Files are written under
        ``path`` and the names embed ``self.pre`` and ``self.post``.
        """
        data = [women.edition_count.values, men.edition_count.values]

        # (file name stem, extra boxplot keyword arguments)
        variants = (
            ("/numeditions_gender_box_withOutlier", {}),
            ("/numeditions_gender_box", {"sym": ""}),
        )
        for stem, box_kwargs in variants:
            plt.figure()
            plt.boxplot(data, **box_kwargs)

            # mark the mean
            means = [np.mean(group) for group in data]
            print(means)

            plt.scatter(range(1, len(data) + 1), means, color="red", marker=">", s=20)
            plt.ylabel("num editions")
            plt.xticks(range(1, len(data) + 1), labels)
            plt.savefig(path + stem + self.pre + "-" + self.post + ".png", bbox_inches="tight")
def plot_res_paper(df):
    """

    :param df:  contain field classifier_name, accuarcy, and fold
    :return:
    """
    ticks = []
    i = 0
    data_to_plot = []
    for g, v in df.groupby(df.classifier_name):
        data_to_plot.append(v['accuracy'].values)
        ticks.append(g)
        print v
    pylab.boxplot(data_to_plot)
    pylab.xticks(range(1, 1+ len(data_to_plot)), ticks)



    pylab.gca().invert_xaxis()
    pylab.ylabel('Classification accuracy')
    pylab.xlabel('Fold (cross validation fold for test)')
    pylab.gca().yaxis.set_ticks(np.arange(0, 1, 0.1))
    pylab.ylim((0,1))
    pylab.legend()
    pylab.show()
    return
def whiskers(i1, i2, lab1="", lab2=""):
    """Draw paired box plots of columns ``i1`` and ``i2`` for every data set.

    For each array in the module-level ``data`` two boxes are drawn side by
    side: column ``i1`` in blue (left) and column ``i2`` in green (right).
    Tick labels are derived from the module-level ``fnames``.  ``lab1`` and
    ``lab2`` are accepted but not used.
    """
    width = 0.35
    centers = np.arange(len(data))
    l1 = pb.boxplot([d[:, i1] for d in data], positions=centers-1.03*width/2., widths=width)
    l2 = pb.boxplot([d[:, i2] for d in data], positions=centers+1.03*width/2., widths=width)
    tick_names = [fn.split('raw')[0].replace('_', ' ') for fn in fnames]
    pb.xticks(np.arange(len(data)), tick_names, rotation=45)
    pb.xlim(-1.2*width, len(data)-1+1.2*width)

    # (boxplot artists, series colour, base line width)
    for artists, colour, base_lw in ((l1, 'b', 1), (l2, 'g', 1.2)):
        for key, lines in artists.iteritems():
            pb.setp(lines, lw=base_lw)
            if key == "boxes":
                pb.setp(lines, color=colour, lw=1.4)
            elif key in ('whiskers', 'fliers'):
                pb.setp(lines, color=colour)
            elif key == 'medians':
                # Medians stay black in both series.
                pb.setp(lines, color='k', lw=1.4)
Esempio n. 10
0
def graphPageSizeComparison(benchType, backend, writeUnits):
    """Show a box plot of write latency per page size for one configuration.

    Rows of the module-level ``table`` matching ``benchType``, ``backend``
    and ``writeUnits`` are selected, their latencies divided by 10 and
    grouped with ``condense`` before plotting.
    """
    clf()

    rows = filter(table,
                  benchType=benchType,
                  backend=backend,
                  writeUnits=writeUnits)

    pageSizes = project(rows, 'pageSize')
    # Each sample represents 10 trials.
    latencies = [sample/10 for sample in project(rows, 'latency')]

    pageSizes, latencies = condense(pageSizes, latencies)

    pylab.boxplot(latencies)
    tick_formatter = ticker.FixedFormatter([str(size) for size in pageSizes])
    ax = gca()
    ax.get_xaxis().set_major_formatter(tick_formatter)
    ax.set_ylabel("Sequential 4K block write latency (s)")
    ax.set_xlabel("Page size (B)")
    y_axis = ax.get_yaxis()
    y_axis.grid(color='gray', linestyle='dashed')
    ax.get_yaxis().set_major_locator(ticker.MaxNLocator(15))

    if backend == 's3':
        caption = 'Backend: S3'
    else:
        caption = 'Backend: DynamoDB; Provisioning Units = %d' % writeUnits
    title(caption)

    pylab.show()
Esempio n. 11
0
def finalgen(names):
    """Box-plot the lift/drag efficiency of the final generation of each run.

    ``names`` is a string that evaluates to an iterable of run names.  For
    every run the file ``results/<name>/gen049.dat`` is read, one
    individual per line; each individual's efficiency is
    ``fitness[0] / (5.0 - fitness[1])``.  The per-run average and standard
    deviation (via the module-level ``ave`` and ``std``) are printed, all
    runs are shown in one box plot, and finally the reference efficiency
    computed from the module-level ``LIFT`` and ``DRAG`` is printed.
    """
    # NOTE(review): eval() executes arbitrary expressions.  Both ``names``
    # and the result files must come from a trusted source.
    names = eval(names)
    totaleff = []
    for name in names:
        final = "results/" + name + "/" + "gen049.dat"

        # The context manager closes the file; blank lines are skipped
        # because eval() cannot parse them.
        with open(final, 'r') as resultsfile:
            population = [eval(line) for line in resultsfile if line.strip()]

        efflist = []
        for indiv in population:
            lift = indiv['fitness'][0]
            drag = 5.0 - indiv['fitness'][1]
            efflist.append(lift/drag)

        aveeff = ave(efflist)
        stdeff = std(efflist, aveeff)
        print("efficieny average %s +- %s" % (aveeff, stdeff))
        totaleff.append(efflist)
    pylab.boxplot(totaleff)
    pylab.show()

    bwblift, bwbdrag = LIFT, 5 - DRAG
    print("bwbefficiency %s" % (bwblift/bwbdrag,))
Esempio n. 12
0
def graphDepthComparison(benchType):
    clf()
    
    data = filter(table,
                  timestamp=range(1336921433, 1336922429+1),
                  benchType=benchType)
    writeUnits = 160
    
    xData = project(data, 'depth')
    yData = project(data, 'latency')
    
    print len(xData)
    print len(yData)
    
    # Each sample represents 5 trials.
    yData = map(lambda x:x/5, yData)
    
    (xData, yData) = condense(xData, yData)
    
    pylab.boxplot(yData)
    fmt = ticker.FixedFormatter(map(str, xData))
    ax = gca()
    ax.get_xaxis().set_major_formatter(fmt)
    ax.set_ylabel("Sequential 4K block write latency (s)")
    ax.set_xlabel("Depth (number of parent directories)")
    ax.get_yaxis().grid(color='gray', linestyle='dashed')
    ax.get_yaxis().set_major_locator(ticker.MaxNLocator(10))
    title('Backend: DynamoDB; Provisioning Units = %d' % writeUnits)
    pylab.ylim([0,0.1])
    
    pylab.show()
Esempio n. 13
0
def do_proc(resdir, timedir):
    """
    Build, optionally execute and optionally plot the PROC experiment runs.

    EXPS on IPC6_SEQ_ELEVATORS_12 & IPC6_TEMPO_OPENSTACKS_17
    steadyState=50
    1) popsize=48 & runmax=1 & maxseconds=0
    2) RESTART case: popsize=96 & runmax=0 & maxseconds=1799
    foreach nthreads: 1, 24, 48
    repeat 11 times

    Does nothing unless ``options.cores`` is set.  For every sample in
    ``SAMPLES``, every run number in 1..``options.nruns`` and every thread
    count, the command line is built from ``PATTERN_CMD`` and logged; it is
    executed when ``options.execute`` is set.  When ``options.plot`` is set
    the matching time file is parsed and ``[t1, tp, t1 / tp]`` is collected,
    where ``t1`` is the last field of line 2 and ``tp`` the ``min:sec``
    value in the last field of line 5, converted to seconds.
    """

    if not options.cores: return

    for field, popsize, runmax, maxseconds in [
        ("PROC", 48, 1, 0),
        #("RESTART_PROC", 96, 0, 1799)
        ]:
        for name, domain, instance in SAMPLES:
            local_logger = logging.getLogger("GECCO2011.PROC.%s" % name)
            plotdata = []
            for num in range(1, options.nruns+1):
                subdata = []
                for nthreads in [1, 24, 48]:
                    field_name = "%s_%s_%d" % (field, "DYNAMIC" if options.dynamic else "STATIC", nthreads)
                    time_filename = PATTERN_TIME_FILENAME % {"TIMEDIR": timedir, "NAME": name, "FIELD": field_name, "NUM": num}
                    res_filename = PATTERN_RES_FILENAME % {"RESDIR": resdir, "NAME": name, "FIELD": field_name, "NUM": num}
                    plan_filename = PATTERN_PLAN_FILENAME % {"RESDIR": resdir, "NAME": name, "FIELD": field_name, "NUM": num}
                    cmd = PATTERN_CMD % {"DOMAIN": domain,
                                         "INSTANCE": instance,
                                         "LOOP": 1,
                                         "DYNAMIC": 1 if options.dynamic else 0,
                                         "THREADS": nthreads,
                                         "RUNMAX": runmax,
                                         "POPSIZE": popsize,
                                         "OFFSPRINGS": popsize*7,
                                         "MAXSECONDS": maxseconds,
                                         "GENSTEADY": 50,
                                         "TIME_FILENAME": time_filename,
                                         "RES_FILENAME": res_filename,
                                         "PLAN_FILENAME": plan_filename,
                                         }
                    local_logger.debug(cmd)
                    if options.execute:
                        os.system( cmd )
                    if options.plot:
                        try:
                            # Close the handle as soon as the lines are read.
                            with open(time_filename) as time_file:
                                f = time_file.readlines()
                        except IOError as err:
                            # A missing time file only skips this data point,
                            # but is reported instead of silently ignored.
                            local_logger.warning("skipping %s: %s", time_filename, err)
                        else:
                            t1 = float(f[1].split()[-1])
                            tp = f[4].split()[-1].split(':')
                            tp = float(int(tp[0]) * 60 + float(tp[1]))
                            subdata.append([t1, tp, t1 / tp])

                if options.plot and subdata:
                    plotdata.append(subdata)

            if options.plot:
                local_logger.info(plotdata)
                pylab.boxplot( plotdata )
Esempio n. 14
0
def analyze(real, samples, skip=0, thr=0.9):
  """Box-plot how many random samples are needed to reach a target precision.

  ``real`` and ``samples`` are paths to pickled objects exposing a
  ``results`` mapping of VM -> list of measurement pairs; ``real`` also
  exposes ``mapping``, which supplies the tick labels.  The first ``skip``
  entries of every measurement list are ignored.

  For each VM a reference ratio of two means is computed from ``real``.
  Then, 1000 times, values are drawn at random from the (non-zero) sample
  measurements until the running ratio is within precision ``thr`` of the
  reference; the number of draws is recorded.  A trial is abandoned after
  more than 20 draws.  VMs with no successful trial are skipped.
  """
  # NOTE(review): pickle.load can execute arbitrary code; only feed this
  # function files from a trusted source.
  with open(real, 'rb') as real_file:
    real = pickle.load(real_file)
  with open(samples, 'rb') as samples_file:
    samples = pickle.load(samples_file)
  thr = float(thr)

  def flatten(measurements):
    exclusive, shared = [], []
    for es, ss in measurements:
      exclusive.extend(es[skip:])
      shared.extend(ss[skip:])
    return exclusive, shared

  def nonzero(values):
    return [e for e in values if e != 0]

  # NOTE(review): flatten() returns (exclusive, shared) but every caller
  # unpacks it as (shared, exclusive).  The swap is applied consistently, so
  # it is preserved here; confirm which ratio is actually intended.
  true_values = OrderedDict()
  for vm, measurements in real.results.items():
    shared, exclusive = flatten(measurements)
    true_values[vm] = mean(shared)/mean(exclusive)
  print(true_values)

  nums = []
  for vm, measurements in sorted(samples.results.items()):
    print("calculating", vm)
    shared, exclusive = flatten(measurements)
    shared = nonzero(shared)
    exclusive = nonzero(exclusive)
    ns = []
    true = true_values[vm]
    for _ in range(1000):
      sh_samples, exc_samples = [], []
      n = 0
      while True:
        n += 1
        sh_samples.append(choice(shared))
        exc_samples.append(choice(exclusive))
        cur = mean(sh_samples)/mean(exc_samples)
        prec = 1 - abs(1-cur/true)
        if prec > thr:
          ns.append(n)
          break
        if n > 20:
          print(vm, "max precision:", prec)
          break
    if not ns:
      print("no data points for", vm)
      continue
    nums.append(ns)

  ticks = real.mapping
  # boxplot() places boxes at 1..N and installs its own ticks, so the labels
  # must be set afterwards and at 1-based positions.
  p.boxplot(nums)
  p.xticks(range(1, len(ticks) + 1), ticks)
Esempio n. 15
0
def main():
    # exon, intron, unknown
    specific = [[], [], []]
    nonspecific = [[], [], []]
    foldspecific = [[], [], []]
    foldnonspecific = [[], [], []]
    for prefix in sys.argv[1:]:
        tempexonic, tempspecific, tempnonspecific, tempfoldspecific, tempfoldnonspecific, templength = getData(
            prefix + ".exonic.overlap.out.annotation.txt", ["exon"]
        )
        exonic = tempexonic[0]
        exoniclength = templength[0]
        specific[0].append(tempspecific[0])
        nonspecific[0].append(tempnonspecific[0])
        foldspecific[0].append(tempfoldspecific[0])
        foldnonspecific[0].append(tempfoldnonspecific[0])
        tempData, tempspecific, tempnonspecific, tempfoldspecific, tempfoldnonspecific, templength = getData(
            prefix + ".novel.overlap.out.annotation.txt", ["intron", "unknown"]
        )
        intronic = tempData[0]
        unknown = tempData[1]
        introniclength = templength[0]
        unknownlength = templength[1]
        specific[1].append(tempspecific[0])
        specific[2].append(tempspecific[1])
        nonspecific[1].append(tempnonspecific[0])
        nonspecific[2].append(tempnonspecific[1])
        foldspecific[1].append(tempfoldspecific[0])
        foldspecific[2].append(tempfoldspecific[1])
        foldnonspecific[1].append(tempfoldnonspecific[0])
        foldnonspecific[2].append(tempfoldnonspecific[1])
        plotData = [exonic, intronic, unknown]
        print prefix
        print "exonic: ", len(exonic)
        print "intronic: ", len(intronic)
        print "unknown: ", len(unknown)
        fig = pl.figure()
        pl.boxplot(plotData)
        pl.ylim([2, 15])
        pl.ylabel("Log Expression Level")
        pl.xticks([1, 2, 3], ["Exonic", "Intronic", "Unknown"])
        pl.title(prefix.replace("_fsorted", "").replace("_", " "))
        fig.savefig(prefix + ".expression.png", dpi=fig.dpi)

        fig = pl.figure()
        pl.boxplot([exoniclength, introniclength, unknownlength])
        pl.ylim([60, 2500])
        pl.ylabel("Transcript Length")
        pl.xticks([1, 2, 3], ["Exonic", "Intronic", "Unknown"])
        pl.title(prefix.replace("_fsorted", "").replace("_", " "))
        fig.savefig(prefix + ".length.png", dpi=fig.dpi)
        # pl.show()
    abbr = []
    for i in sys.argv[1:]:
        tokens = i.split("_")
        abbr.append(tokens[0][0].upper() + tokens[1][0:2].title())
    plotSpec(specific, nonspecific, abbr, ["exonic", "intronic", "unknown"], "abs")
    plotSpec(foldspecific, foldnonspecific, abbr, ["exonic", "intronic", "unknown"], "fold")
    def vizFeature(self):
        """Plot the selected feature value of every created glyph, per tab.

        For each tab of ``self.mainWindow.tabWidget`` the selected
        sub-feature value is collected from every thumbnail whose scene is
        marked ``created``.  When the box-plot check box is unchecked, one
        line per tab is drawn with glyph names on the x-axis; otherwise one
        box per tab is drawn with the tab texts on the x-axis.
        """
        col = ['b','g','r','c','m','y','k','w']
        boxpoints = []

        bplot = self.mainWindow.boxPlotCheck.checkState()
        tabWidget = self.mainWindow.tabWidget

        glyphNames = [thumb.scene().glyphtxt for thumb in self.thumbNails[0] if thumb.scene().created]

        for ind in range(tabWidget.count()):
            minPoints = []
            for thumb in self.thumbNails[ind]:
                if thumb.scene().created:
                    featName = self.FeatSelect[self.mainWindow.featCmb.currentText()]
                    feat = getattr(thumb.scene().windowS, featName)
                    subFeat = getattr(self, featName + 'Val')[self.mainWindow.subFeatCmb.currentText()]
                    minPoints.append(feat[subFeat])

            if not bplot:
                # Cycle through the palette so more than len(col) tabs do not
                # raise IndexError.
                pylab.plot(minPoints, col[ind % len(col)], label=tabWidget.tabText(ind))
                pylab.plot(minPoints, 'ro')
            else:
                boxpoints.append(minPoints)

        scriptNames = [tabWidget.tabText(ind) for ind in range(tabWidget.count())]

        if not bplot:
            pylab.xticks(range(len(glyphNames)), glyphNames)
            pylab.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
            pylab.show()
        else:
            # boxplot() places boxes at 1..N and installs its own ticks, so
            # the tab labels are applied afterwards at 1-based positions.
            # No legend is requested here: nothing in this branch carries a
            # label.
            pylab.boxplot(boxpoints)
            pylab.xticks(range(1, len(scriptNames) + 1), scriptNames)
            pylab.show()

        # NOTE(review): the three lists below are never used in the visible
        # code; the original function may continue past this point - confirm
        # before removing them.
        points1 = []
        points2 = []

        strokes = []
Esempio n. 17
0
def plotChart(resultObj, patternName, stockPriceDataObj):
    """Save one annotated chart per detected pattern.

    For every entry of ``resultObj`` the price rows stored under its
    ``code`` in ``stockPriceDataObj`` are drawn as boxes (fields 1..4 of
    each row), overlaid with the pattern's upper and lower lines and a red
    vertical marker at its ``identifyPos``.  X coordinates are shifted by
    one because box positions start at 1.  Each chart is written to
    ``../demoImg/<patternName>/<code>`` and the figure is cleared.

    Returns ``None``; an empty ``resultObj`` is a no-op.
    """
    # Nothing to draw; also avoids IndexError on the debug print below.
    if len(resultObj) == 0:
        return

    print(resultObj[0].identifyPos)
    for patternData in resultObj:
        processedData = stockPriceDataObj[patternData.code]
        plt.boxplot([(res[1], res[2], res[3], res[4]) for res in processedData])
        for line in (patternData.upperLine, patternData.downLine):
            plt.plot([line.startX + 1, line.endX + 1],
                     [line.startPriceY, line.endPriceY])
        plt.axvline(x=patternData.identifyPos + 1, color='red')
        plt.savefig("../demoImg/%s/%s" % (patternName, patternData.code))
        plt.clf()
	def UnivarDescStat(self,Data,FileOutPath):
		"""Save a univariate summary figure for Data to FileOutPath.

		The figure holds a text panel with descriptive statistics (right
		column), a 15-bin normalised histogram overlaid with the normal
		density for the sample mean and standard deviation (top left) and a
		horizontal box plot (bottom left).
		"""
		# Descriptive statistics
		count = len(Data)
		Mean = np.mean(Data)
		smallest = np.min(Data)
		largest = np.max(Data)
		variance = np.var(Data)
		Std = np.std(Data)

		# Five-number summary
		q0 = np.percentile(Data,0)
		q1 = np.percentile(Data,25)
		q2 = np.percentile(Data,50)
		q3 = np.percentile(Data,75)
		q4 = np.percentile(Data,100)

		txt = "".join([
			"\nN : {0:8d}".format(count),
			"\nMean : {0:8.6f}".format(Mean),
			"\nMinimum : {0:8.6f}".format(smallest),
			"\nMaximum : {0:8.6f}".format(largest),
			"\nVariance : {0:8.6f}".format(variance),
			"\nStd. deviation : {0:8.6f}".format(Std),
			"\n\n\n",
			"\nMinimum : {0:8.6f}".format(q0),
			"\n1st Quartile : {0:8.6f}".format(q1),
			"\nMedian : {0:8.6f}".format(q2),
			"\n3rd Quartile : {0:8.6f}".format(q3),
			"\nMaximum : {0:8.6f}".format(q4),
		])

		# 2x2 grid; the left column is twice as wide as the right one.
		G = gridspec.GridSpec(2, 2, width_ratios=[2, 1])

		# Text panel spanning the whole right column
		text_axes = P.subplot(G[:,1])
		text_axes.set_title("Analitics")
		text_axes.axis('off')
		P.text(0.15, 0.25, txt, size=12)

		# Histogram with the fitted normal density on top
		hist_axes = P.subplot(G[0,0])
		hist_axes.set_title("Histogram")
		_, bins, _ = P.hist(Data, 15, normed=1)
		density = mlab.normpdf(bins, Mean, Std)
		P.plot(bins, density, 'r--', linewidth=1)
		P.ylabel('Probability')

		# Box plot
		box_axes = P.subplot(G[1,0])
		box_axes.set_title("Boxplot")
		P.boxplot(Data,0,'rs',0)

		# Write the figure to the requested path
		P.savefig(FileOutPath)
Esempio n. 19
0
def draw_learning_curve(data_first=None,
                        data_second=None,
                        measure=None,
                        x_axis=None,
                        delta=0.1,
                        scaling=100,
                        fname=None):
    """
    Compare two learning curves as box plots with fitted saturation curves.

    ``data_first`` and ``data_second`` each hold one list of numbers per
    entry of ``x_axis``.  The curve ``a * (1 - exp(-b * x))`` is fitted to
    the per-list means of each series.  Boxes of the first series are drawn
    at ``(x + delta) * scaling`` (red markers), those of the second at
    ``(x - delta) * scaling`` (green markers).  The figure is saved to
    ``fname`` when given, otherwise shown.
    """

    def saturating_curve(x, amplitude, rate):
        return amplitude * (1 - np.exp(-rate * x))

    x_axis = np.array(x_axis)
    mean_originals = [np.mean(np.array(values)) for values in data_first]
    mean_originals_and_samples = [np.mean(np.array(values)) for values in data_second]

    # curve_fit returns (optimal parameters, covariance); only the former is used.
    params_first, _ = curve_fit(saturating_curve, x_axis, mean_originals)
    params_second, _ = curve_fit(saturating_curve, x_axis, mean_originals_and_samples)

    x_axis_fit = np.linspace(x_axis.min(), x_axis.max(), 100)
    fit_first = saturating_curve(x_axis_fit, *params_first)
    fit_second = saturating_curve(x_axis_fit, *params_second)

    fig, ax1 = plt.subplots(figsize=(10, 6))
    fig.canvas.set_window_title('Exponential Decay Learning Curves')

    ax1.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax1.set_title('Learning Curve Comparison for %s' % measure)
    ax1.set_xlabel('Dataset Percentage Used for Training')
    ax1.set_ylabel('%s Value' % measure)

    # The two series are shifted in opposite directions so their boxes do
    # not overlap.
    positions_first = (x_axis + delta) * scaling
    positions_second = (x_axis - delta) * scaling
    fit_positions = (x_axis_fit) * scaling

    plt.boxplot(data_first, positions=positions_first, notch=False)
    plt.plot(positions_first, mean_originals, 'ro', label='')
    plt.plot(fit_positions, fit_first, 'r-', label='Original')

    plt.boxplot(data_second, positions=positions_second, notch=False)
    plt.plot(positions_second, mean_originals_and_samples, 'go', label='')
    plt.plot(fit_positions, fit_second, 'g-', label='Original+sampled')

    plt.grid()
    plt.legend(loc='lower right')
    if fname is None:
        plt.show()
    else:
        plt.savefig(fname)
 def make_boxplot(self, data, labels, filename, ylabel):
     plt.figure()
     plt.boxplot(data)
     # mark the mean
     means = [np.mean(x) for x in data]
     print ylabel
     print means
     #print range(1, len(data)+1)
     plt.scatter(range(1, len(data)+1), means, color="red", marker=">", s=20)
     plt.ylabel(ylabel)
     plt.xticks(range(1, len(data)+1), labels)
     plt.savefig(filename)
Esempio n. 21
0
def graphBackendComparison(benchType, pageSize):
    """Box-plot S3 latency next to DynamoDB latency per provisioned unit count.

    Rows of the module-level ``table`` matching ``benchType`` and
    ``pageSize`` are selected for each backend.  Latencies are divided by
    10; the DynamoDB samples are grouped by write units with ``condense``
    and the S3 samples form the first box.  ``benchType`` must be one of
    'seqwrite', 'seqread', 'randwrite' or 'randread'.
    """
    clf()

    dynRows = filter(table,
                     benchType=benchType,
                     backend='dynamodb',
                     pageSize=pageSize)
    s3Rows = filter(table,
                    benchType=benchType,
                    backend='s3',
                    pageSize=pageSize)

    dynUnits = project(dynRows, 'writeUnits')
    dynLatency = project(dynRows, 'latency')
    s3Latency = project(s3Rows, 'latency')

    # Each sample represents 10 trials.
    dynLatency = [sample/10 for sample in dynLatency]
    s3Latency = [sample/10 for sample in s3Latency]

    dynUnits, dynLatency = condense(dynUnits, dynLatency)

    # Merge the dynamodb and s3 datasets so they can be plotted in the same axes.
    pylab.boxplot([s3Latency] + dynLatency)

    # benchType -> (access order, direction, y-axis limits)
    settings = {
        'seqwrite': ('Sequential', 'write', [0, 0.3]),
        'seqread':  ('Sequential', 'read', [0, 0.075]),
        'randwrite':('Random', 'write', [0, 0.3]),
        'randread': ('Random', 'read', [0, 0.075])
    }
    order, direction, y_limits = settings[benchType]

    tick_formatter = ticker.FixedFormatter(['S3'] + [str(units) for units in dynUnits])
    ax = gca()
    ax.get_xaxis().set_major_formatter(tick_formatter)
    ax.set_ylabel(order + " 4K block " + direction + " latency (s)")
    ax.set_xlabel("Provisioned read and write units")
    y_axis = ax.get_yaxis()
    y_axis.grid(color='gray', linestyle='dashed')
    ax.get_yaxis().set_major_locator(ticker.MaxNLocator(10))
    title('Page Size = %dK' % (pageSize / 1024))
    pylab.ylim(y_limits)

    dpi = 60
    figure_handle = gcf()
    figure_handle.dpi = dpi
    gcf().set_size_inches(400 / dpi, 300 / dpi)
Esempio n. 22
0
 def plotStats(self,save) :
     """Show the collected statistics.

     A first figure receives a notched box plot of `self.stat_avg_z`, but
     only when every series in it has more than 3 entries; a second figure
     plots `self.stat_avg_z_total`.  `save` is accepted but not used.
     """
     figure()
     # show boxplot, iff we have enough data
     shortest = min(len(series) for series in self.stat_avg_z)
     if shortest > 3 :
         boxplot(self.stat_avg_z, 1)

     figure()
     plot(self.stat_avg_z_total)
     show()
Esempio n. 23
0
	def generate( filenames ):
		"""Save a box plot (PDF and PNG) for every entry of `filenames`.

		The result file name is built from RESULT_FILE_FORMAT and the
		enclosing-scope values pwd, p, ps, P, d, ds, D, r and s; the data
		comes from get_boxplot_data.  The x-label reports the iteration
		count derived from the (p, P, ps) and (d, D, ds) ranges.
		"""
		def nonzero( x ):
			# Treat non-positive spans as a single step.
			return x if x > 0 else 1

		for cur in filenames:
			filename = RESULT_FILE_FORMAT % (pwd, cur, p, ps, P, d, ds, D, r, s)
			pylab.boxplot( get_boxplot_data( filename ) )
			iters = ( nonzero( P - p ) / ps ) * ( nonzero( D - d ) / ds )
			pylab.xlabel('%d iterations from %d,%d to %d,%d' % ( iters, p, d, P, D) )
			pylab.ylabel('%s - %s' % (cur, name))
			pylab.savefig( filename + '.pdf', format='pdf' )
			pylab.savefig( filename + '.png', format='png' )
			# Reset axes and figure so the next file starts from a blank canvas.
			pylab.cla()
			pylab.clf()
Esempio n. 24
0
def main(args):
    ##======##
    ## init ##
    ##======##
    targetfile = args.input_tsv
    xdatacol   = args.xdatacol
    ydatacol   = args.ydatacol
    delim      = args.delimiter
    if   delim=="t": delim = "\t"
    elif delim=="n": delim = "\n" 
    xlabel     = args.xlabel
    ylabel     = args.ylabel
    tosave     = args.s
    if tosave: savefilename = os.path.splitext(targetfile)[0] + "_boxplot.png"

    ##===========##
    ## read data ##
    ##===========##
    xitems = list(set([float(line.rstrip().split(delim)[xdatacol]) for line in open(targetfile)]))
    #xitems.append(0.0)
    xitems.sort()
    data = {}
    for item in xitems:
        data[item] = []
    #data[0.0] = [0.0]
    for line in open(targetfile):
        lineitems = line.rstrip().split(delim)
        key, val = float(lineitems[xdatacol]), float(lineitems[ydatacol])
        data[key].append(val)
        
    ##======##
    ## show ##
    ##======##
    fig = pylab.figure()
    pylab.xticks(range(len(xitems)), xitems)
    pylab.xlabel(unicode(xlabel, sys.stdin.encoding))
    pylab.ylabel(unicode(ylabel, sys.stdin.encoding))

    for key in xitems:
        print key, data[key]
    showdata = [ data[key] for key in xitems ]
    pylab.boxplot(showdata)

    maxy = max([max(data[key]) for key in xitems])
    pylab.plot([1.0,len(xitems)], [0, maxy], 'k--')
    if not tosave:
        pylab.show()
    else:
        pylab.savefig(savefilename)


    print data
Esempio n. 25
0
    def _plot_nominal(self, data, result_dir, x_key, y_key):
        """ Creates a boxplot of the y_keys for the given nominal parameter x_key.  
        
        A method that allows to create a plot that visualizes the effect 
        of differing one nominal variable onto a second one (e.g. the effect
        of differing the classifier onto the accuracy). 
        
        **Expected arguments**
        
          :data:   A dictionary, that contains a mapping from an attribute 
                   (e.g. accuracy) to a list of values taken by an attribute.
                   An entry is the entirety of all i-th values over all dict-values
          :result_dir: The directory in which the plots will be saved.   
          :x_key:   The key of the dictionary whose values should be used as 
                    values for the x-axis (the independent variables)
          :y_key:   The key of the dictionary whose values should be used as
                    values for the y-axis, i.e. the dependent variable.
                    A key of the form "#w1#key1#w2#key2" selects the weighted
                    sum w1*data[key1] + w2*data[key2] instead.
        """
        # Compare by value: "is not" tests object identity and only worked
        # for string literals by accident of interning.
        weighted = y_key.startswith("#")
        if weighted:
            # A weighted cost function; the key layout does not change per row.
            weight1, y_key1, weight2, y_key2 = y_key[1:].split("#")

        # Create the plot for this specific dependent variable 
        values = defaultdict(list)
        for i, parameter_value in enumerate(data[x_key]):
            if not weighted:
                performance_value = float(data[y_key][i])
            else:
                performance_value = float(weight1) * float(data[y_key1][i]) \
                                        + float(weight2) * float(data[y_key2][i])

            values[parameter_value].append(performance_value)

        values = sorted(values.items())

        pylab.subplots_adjust(bottom = 0.3,   # the bottom of the subplots of the figure
                              )
        pylab.boxplot([samples for _, samples in values])
        pylab.gca().set_xticklabels([parameter for parameter, _ in values])
        pylab.setp(pylab.gca().get_xticklabels(), rotation=-90)
        pylab.setp(pylab.gca().get_xticklabels(), size='x-small')
        pylab.gca().set_xlabel(x_key.replace("_", " "))

        if not weighted:
            pylab.gca().set_ylabel(y_key.replace("_", " "))
        else:
            pylab.gca().set_ylabel("%s*%s+%s*%s" % tuple(y_key[1:].split("#")))

        pylab.savefig("%s%s%s_%s.pdf" % (result_dir, os.sep, y_key, x_key))

        pylab.gca().clear()
        pylab.close("all")
Esempio n. 26
0
def boxplot_data(results_list, title):
    """Draw one box per column of ``results_list`` and save the figure.

    ``results_list`` is a sequence of rows; box ``i`` summarises element ``i``
    of every row. The number of columns is taken from the first row.
    The plot is titled ``<title>_boxplot`` and written to
    ``graphs/<title>_boxplot`` followed by the module-level ``FILETYPE``
    suffix; the ``graphs`` directory is created when it does not exist.
    """
    pylab.clf()
    fig = pylab.figure(1)

    # Transpose rows into columns: one list of samples per box.
    n_columns = len(results_list[0])
    columns = [[row[col] for row in results_list] for col in range(n_columns)]

    pylab.boxplot(columns)
    fig.autofmt_xdate()

    plot_title = title + '_boxplot'
    pylab.title(plot_title)

    if not os.path.exists('./graphs'):
        os.makedirs('./graphs')
    pylab.savefig('graphs/' + plot_title + FILETYPE)
Esempio n. 27
0
def boxPlot( rankDict ):
  """Draw a box plot with one box per entry of ``rankDict``.

  ``rankDict`` maps a label to the sequence of values summarised by that
  box; the labels become the x tick labels (rotated by 60 degrees).
  The plot is drawn on the current pylab figure. Nothing is saved, shown
  or returned here.
  """
  import numpy as np

  pylab.rcParams.update({'lines.linewidth' : 2.0})

  # Take one snapshot of the keys and look the values up through it, so
  # tick i always names box i. This also avoids the Python-2-only
  # ``iteritems`` the function used to rely on.
  labels = list(rankDict.keys())
  counts = [rankDict[label] for label in labels]
  pylab.boxplot(counts)
  pylab.xticks(np.arange(1,len(labels)+1),labels, rotation=60)
  #pylab.ylim(0.0, 1.0)
  pylab.title('Rank Distributions by Orthology Group')
  pylab.xlabel('Orthology Classes')
  pylab.ylabel('Relative Ranks')
Esempio n. 28
0
def draw_ind_by_clust_plots(rc_sort,cd_sort,inds,rc_orig,cd_cut=None,win=1000,rc_low=4,rc_hi=10,fignum=1,figsize=(8,10),filename=None):
    """Draw four stacked box-plot panels of per-window counts; optionally save them.

    ``rc_sort`` and ``cd_sort`` are each cut into windows with
    ``lol_by_segment(..., win)``. As used here, every window of ``rc_sort``
    is a list of dicts and every window of ``cd_sort`` is a parallel list of
    comparable values. For each dict the plotted quantity is the number of
    its values that reach a threshold:

    * panel 1: values >= ``rc_low``, every dict in the window
    * panel 2: values >= ``rc_hi``, every dict in the window
    * panel 3: values >= ``rc_low``, only dicts whose paired ``cd_sort``
      entry is <= ``cd_cut``
    * panel 4: values >= ``rc_hi``, same filter as panel 3

    ``cd_cut`` defaults to 0.1 when None. ``inds`` and ``rc_orig`` are
    accepted but not used by this function. The panels are drawn on pylab
    figure ``fignum``; when ``filename`` is given the figure is saved there,
    and an ``IOError`` while saving is reported on stderr instead of raised.
    Returns None.
    """
    if cd_cut is None:
        cd_cut = 0.1

    pylab.figure(fignum,figsize)

    lol = lol_by_segment(rc_sort,win)
    cdlol = lol_by_segment(cd_sort,win)
    ncat = len(lol)
    # Aim for about 20 ticks. Clamp to 1: with fewer than 20 windows the
    # integer division gives 0, and numpy.arange cannot take a zero step.
    step = max(1, ncat//20)
    tick_pos = numpy.arange(0,ncat,step)
    tick_labels = (tick_pos*win)/1000
    sys.stderr.write('draw boxplots\n')

    def n_reaching(d, threshold):
        # Number of values in dict d that are at least `threshold`.
        return len([v for v in d.values() if v>=threshold])

    # Panels 1-2: all dicts of each window.
    for row, threshold in enumerate((rc_low, rc_hi), 1):
        pylab.subplot(4,1,row)
        pylab.boxplot([[n_reaching(d, threshold) for d in li] for li in lol])
        pylab.xticks(tick_pos,tick_labels,rotation=90)

    # Panels 3-4: only dicts whose paired cd value passes the cut.
    for row, threshold in enumerate((rc_low, rc_hi), 3):
        pylab.subplot(4,1,row)
        pylab.boxplot([[n_reaching(d, threshold) for this_cd,d in zip(cdli,li) if this_cd <= cd_cut] for cdli,li in zip(cdlol,lol)])
        pylab.xticks(tick_pos,tick_labels,rotation=90)

    if filename is not None:
        sys.stderr.write('store boxplots\n')
        try:
            pylab.savefig(filename)
        except IOError:
            sys.stderr.write('unable to write %s, output not stored\n' % filename)
Esempio n. 29
0
def fwhm_whisker_plot(stampImgList=None,bkgList=None,sigma=1.1/scale):
    """Box-plot the whisker and FWHM measurements of a list of stamp images.

    The measurements come from ``get_fwhm_whisker_list(stampImgList, bkgList,
    sigma=sigma)``. Two figures are created: one with the whisker columns
    (reference line at 0.2) and one with the FWHM columns (reference line at
    0.9). Returns the string ``'-----done !----'``.
    """
    whisker_arr, fwhm_arr = get_fwhm_whisker_list(stampImgList, bkgList, sigma=sigma)
    # Transposing gives one sequence per estimator, i.e. one box each.
    whisker_cols = list(whisker_arr.T)
    fwhm_cols = list(fwhm_arr.T)

    whisker_names = ['whisker_Wmoments', 'whisker_Amoments']
    fwhm_names = ['fwhm_weighted', 'fwhm_Amoments', 'fwhm_moffat',
                  'fwhm_gauss', 'fwhm_sech2']

    # Figure 1: whisker estimators.
    pl.figure(figsize=(7, 5))
    pl.boxplot(whisker_cols)
    pl.hlines(0.2, 0, 3, linestyle='solid', color='g')
    pl.ylim(0., .4)
    pl.xticks(np.arange(1, 3), whisker_names)

    # Figure 2: FWHM estimators.
    pl.figure(figsize=(12, 5))
    pl.boxplot(fwhm_cols)
    pl.ylim(0.4, 1.5)
    pl.hlines(0.9, 0, 6, linestyle='solid', color='g')
    pl.xticks(np.arange(1, 6), fwhm_names)

    return '-----done !----'
Esempio n. 30
0
	def boxPlot(self):
		"""
		Draw a box plot of the contig lengths.

		The length of every entry of ``self.contigsInfo`` is collected and
		plotted on the current pylab figure.

		Returns
		------
		None; the figure is saved in the file contig_boxplot.png
		"""
		contigs = self.contigsInfo
		seqLengths = [len(contigs[name]) for name in contigs.keys()]

		pylab.boxplot(seqLengths)
		pylab.savefig('contig_boxplot.png')
Esempio n. 31
0
def estimate_possibility(n,
                         mu,
                         k,
                         g,
                         cs,
                         ks,
                         num_holdouts,
                         percentages,
                         fuzzifier,
                         verbose=False):
    """Evaluate membership/possibility estimation over repeated holdouts.

    The ``n`` items (indices ``0 .. n-1``) and their labels ``mu`` are grouped
    into consecutive pairs. For each of ``num_holdouts`` rounds the pairs are
    split by ``split_indices`` into train/validate/test parts, a model is
    chosen with ``model_selection_holdout`` and then scored on the test part.

    Two error families are collected per round, both on squared errors:

    * membership: estimated membership of each test item vs. its label;
    * possibility: (first item of a pair - second item of the pair), computed
      once from the labels and once from the estimates.

    For each family the RMSE, and the median and standard deviation of the
    squared errors, are recorded.

    Side effects, all under ``data/`` and keyed by ``fuzzifier.name``:
    per round ``...-<h>-details.csv``, ``...-<h>-global.csv``,
    ``...-<h>-boxplot.png`` and ``...-<h>-histogram.png``; after all rounds
    ``...-average-metrics.csv`` with the per-round metrics averaged.

    Rounds in which ``model_selection_holdout`` returns ``None`` as best C
    are skipped and contribute to neither the files nor the averages.

    :n:            number of items; must equal ``len(mu)`` (asserted)
    :mu:           labels of the items, sliceable in steps of two
    :k:            not used in this function body
    :g:            forwarded as ``sample_generator``
    :cs, ks:       forwarded to ``model_selection_holdout``
    :num_holdouts: number of holdout rounds
    :percentages:  forwarded to ``split_indices``
    :fuzzifier:    forwarded to ``model_selection_holdout``; its ``name``
                   attribute is used in the output file names
    :verbose:      when true, progress and the averaged metrics are printed

    Returns None.
    """
    axiom_indices = range(n)
    assert (len(axiom_indices) == len(mu) == n)

    # Items 2i and 2i+1 form pair i; indices and labels are paired the same way.
    paired_axioms = [axiom_indices[i:i + 2] for i in range(0, n, 2)]
    paired_labels = [mu[i:i + 2] for i in range(0, n, 2)]

    # One entry per successful holdout round.
    metrics_membership_rmse = []
    metrics_membership_median = []
    metrics_membership_stdev = []

    metrics_possibility_rmse = []
    metrics_possibility_median = []
    metrics_possibility_stdev = []

    for h in range(num_holdouts):
        (paired_values_train, paired_values_validate, paired_values_test,
         paired_mu_train, paired_mu_validate,
         paired_mu_test) = split_indices(paired_axioms, paired_labels,
                                         percentages)

        if verbose:
            print 'holdout {} of {}'.format(h, num_holdouts)

        # NOTE(review): `adjustment` is neither a parameter nor assigned in
        # this function; it has to come from an enclosing/module scope --
        # confirm it is defined there.
        best_c, _, result = model_selection_holdout(paired_values_train,
                                                    paired_mu_train,
                                                    paired_values_validate,
                                                    paired_mu_validate,
                                                    cs,
                                                    ks,
                                                    sample_generator=g,
                                                    log=False,
                                                    adjustment=adjustment,
                                                    fuzzifier=fuzzifier,
                                                    verbose=verbose)
        if best_c is None:
            # No usable model for this round: skip it entirely.
            if verbose:
                print 'in holdout {} optimization always failed!'.format(h)
            continue

        if verbose:
            print 'in holdout {} best C is {}'.format(h, best_c)
        # The first element of the result is used as a callable: value -> membership.
        estimated_membership = result[0]

        # values and labels are still paired, we need to flatten them out
        values_test = flatten(paired_values_test)
        mu_test = flatten(paired_mu_test)

        # --- membership error on the flattened test items ---
        membership_square_err = [(estimated_membership(v) - m)**2
                                 for v, m in zip(values_test, mu_test)]
        membership_rmse = math.sqrt(
            sum(membership_square_err) / len(values_test))
        metrics_membership_rmse.append(membership_rmse)

        # Median and standard deviation are taken over the squared errors.
        membership_median = np.median(membership_square_err)
        metrics_membership_median.append(membership_median)

        membership_stdev = np.std(membership_square_err)
        metrics_membership_stdev.append(membership_stdev)

        # --- possibility error: (first of pair) - (second of pair) ---
        # Even positions hold the first member of each pair, odd positions
        # the second one.
        # NOTE(review): the slicing below needs `estimated_mu` to be a list,
        # which `map` only returns on Python 2.
        estimated_mu = map(estimated_membership, values_test)
        actual_possibility = [
            mfi - mnotfi for mfi, mnotfi in zip(mu_test[::2], mu_test[1::2])
        ]
        estimated_possibility = [
            mfi - mnotfi
            for mfi, mnotfi in zip(estimated_mu[::2], estimated_mu[1::2])
        ]

        possibility_square_err = [
            (actual - estimated)**2 for actual, estimated in zip(
                actual_possibility, estimated_possibility)
        ]
        possibility_rmse = math.sqrt(
            sum(possibility_square_err) / len(possibility_square_err))
        metrics_possibility_rmse.append(possibility_rmse)

        possibility_median = np.median(possibility_square_err)
        metrics_possibility_median.append(possibility_median)

        possibility_stdev = np.std(possibility_square_err)
        metrics_possibility_stdev.append(possibility_stdev)

        # Row label for each test pair: its two indices joined by '-'.
        indices = ['-'.join(map(str, pair)) for pair in paired_values_test]

        # One detail row per test pair:
        # (label, mu first, mu second, max of both, estimated first,
        #  estimated second, max of both estimates, actual possibility,
        #  estimated possibility, squared possibility error)
        results = [
            (i, phi, notphi, max(phi, notphi), ephi, enotphi,
             max(ephi, enotphi), p, ep, (p - ep)**2)
            for i, phi, notphi, p, ephi, enotphi, ep in zip(
                indices, mu_test[::2], mu_test[1::2], actual_possibility,
                estimated_mu[::2], estimated_mu[1::2], estimated_possibility)
        ]

        # Ascending by squared possibility error (last column).
        results.sort(key=lambda r: r[-1])

        with open(
                'data/axioms-results-holdout-{}-{}-details.csv'.format(
                    fuzzifier.name, h), 'w') as output_file:
            writer = csv.writer(output_file)
            writer.writerows(results)

        with open(
                'data/axioms-results-holdout-{}-{}-global.csv'.format(
                    fuzzifier.name, h), 'w') as output_file:
            writer = csv.writer(output_file)
            writer.writerows([
                ('membership RMSE', membership_rmse),
                ('membership median', membership_median),
                ('membership STDEV', membership_stdev),
                ('possibility RMSE', possibility_rmse),
                ('possibility median', possibility_median),
                ('possibility STDEV', possibility_stdev),
            ])

        # Distribution of the squared possibility errors of this round,
        # once as a box plot and once as a histogram. The figure is cleared
        # after each save so the plots do not pile up.
        errors = [r[-1] for r in results]
        p = plt.boxplot(errors)
        plt.savefig('data/axioms-results-holdout-{}-{}-boxplot.png'.format(
            fuzzifier.name, h))
        plt.clf()

        p = plt.hist(errors, bins=50)
        plt.savefig('data/axioms-results-holdout-{}-{}-histogram.png'.format(
            fuzzifier.name, h))
        plt.clf()

        gc.collect()

    # NOTE(review): if every round was skipped the metric lists are empty
    # when averaged below -- confirm that case is acceptable.
    if verbose:
        print 'Membership average values:'
        print 'RMSE: {}'.format(np.average(metrics_membership_rmse))
        print 'Median: {}'.format(np.average(metrics_membership_median))
        print 'STDEV: {}'.format(np.average(metrics_membership_stdev))

        print 'Possibility average values:'
        print 'RMSE: {}'.format(np.average(metrics_possibility_rmse))
        print 'Median: {}'.format(np.average(metrics_possibility_median))
        print 'STDEV: {}'.format(np.average(metrics_possibility_stdev))

    # Averages over the successful rounds.
    with open(
            'data/axioms-results-holdout-{}-average-metrics.csv'.format(
                fuzzifier.name), 'w') as output_file:
        writer = csv.writer(output_file)
        writer.writerows([
            ('membership average RMSE', np.average(metrics_membership_rmse)),
            ('membership average median',
             np.average(metrics_membership_median)),
            ('membership average STDEV', np.average(metrics_membership_stdev)),
            ('possibility average RMSE', np.average(metrics_possibility_rmse)),
            ('possibility average median',
             np.average(metrics_possibility_median)),
            ('possibility average STDEV',
             np.average(metrics_possibility_stdev)),
        ])
Esempio n. 32
0
# Each group pairs the matching entries of p and q.
B = [p[1], q[1]]
C = [p[2], q[2]]

# Groups D..I (entries 3-8 of p and q) are disabled for this figure.

fig = figure()
ax = axes()
hold(True)

# First pair of boxes, drawn at x = 1 and x = 2.
# NOTE(review): the very large `whis` presumably stretches the whiskers to
# the data extremes so no fliers are drawn -- confirm.
bp = boxplot(A, positions=[1, 2], widths=0.6, whis=100000000)
setBoxColors(bp)

# Thicken the drawn lines: boxes to 3, whiskers and medians to 4.
for part, line_width in (('boxes', 3), ('whiskers', 4), ('medians', 4)):
    for artist in bp[part]:
        artist.set(linewidth=line_width)

# Second pair of boxes, drawn at x = 4 and x = 5.
bp = boxplot(B, positions=[4, 5], widths=0.6, whis=100000000)
setBoxColors(bp)

for box in bp['boxes']:
def boxplot(data, **kwargs):
    """Call ``pylab.boxplot(data, **kwargs)`` after marking flagged series.

    ``kwargs`` must contain ``positions`` (one x position per series in
    ``data``); a missing key raises ``KeyError``. For every series whose
    ``numpy.percentile(series, 0.25)`` equals -2, an underscore marker is
    plotted at that series' position, at the height of its maximum.
    Returns None; the result of ``pylab.boxplot`` is discarded.
    """
    x_positions = kwargs['positions']
    for series, x_pos in zip(data, x_positions):
        # NOTE(review): numpy.percentile takes q on a 0-100 scale, so 0.25
        # is the 0.25th percentile rather than the lower quartile --
        # confirm which one was intended.
        if numpy.percentile(series, 0.25) != -2:
            continue
        pylab.plot([x_pos], [max(series)], '_')
    pylab.boxplot(data, **kwargs)
Esempio n. 34
0
    def box_plot(self,
                 df,
                 x_label=None,
                 fontsize=25,
                 figsize=(15, 10),
                 markersize=12,
                 colors=None,
                 custom_legend=None,
                 legend_loc='best',
                 legend_font_size='10',
                 legend_marker_size=0.85,
                 box_line_thickness=1.75,
                 draw_points=False):
        """
        Plots all data in a dataframe as a horizontal box-and-whisker plot
        with optional axis label.

        One box is drawn per column of ``df``, labelled on the y-axis with
        the column name. Each column's median is marked with a white dot
        (the median line itself is drawn with no line style). The finished
        figure is displayed with ``plt.show()``; nothing is returned.

        :df:                 dataframe; ``df.values`` is box-plotted and
                             ``df.columns`` provides the y tick labels
        :x_label:            label of the x-axis
        :fontsize:           font size of the tick labels and the x label
        :figsize:            figure size passed to ``plt.subplots``
        :markersize:         size of the median dots and of the optional
                             data points
        :colors:             optional sequence; box ``i`` is coloured with
                             ``colors[i]``. Boxes are black when omitted.
        :custom_legend:      optional pair; element 1 is passed to
                             ``ax.legend`` as its first argument and
                             element 0 as its second
        :legend_loc:         legend location
        :legend_font_size:   font size of the legend texts
        :legend_marker_size: used as both handle length and handle height
        :box_line_thickness: line width of boxes, caps, whiskers and medians
        :draw_points:        when true, every data point is drawn over its box
        """
        tick_labels = [str(column) for column in df.columns]

        # Draw figure and axis
        fig, ax = plt.subplots(figsize=figsize)

        # Set background to opaque
        fig.patch.set_facecolor('white')

        # Set grid parameters: vertical guide lines only
        ax.yaxis.grid(False)
        ax.xaxis.grid(True,
                      linestyle='--',
                      which='both',
                      color='black',
                      alpha=0.5,
                      zorder=1)

        # Set left frame attributes
        left_spine = ax.spines['left']
        left_spine.set_linewidth(1.8)
        left_spine.set_color('gray')
        left_spine.set_alpha(1.0)

        # Hide every other frame line
        for side in ('right', 'top', 'bottom'):
            ax.spines[side].set_visible(False)

        # Draw box plot
        box_plot_kwargs = dict(notch=0,
                               sym='+',
                               vert=False,
                               whis=5,
                               patch_artist=True,
                               capprops=dict(color='k',
                                             linestyle='-',
                                             linewidth=box_line_thickness),
                               boxprops=dict(linestyle='-',
                                             linewidth=box_line_thickness,
                                             color='black'),
                               medianprops=dict(linestyle='none',
                                                color='k',
                                                linewidth=box_line_thickness),
                               whiskerprops=dict(color='k',
                                                 linestyle='-',
                                                 linewidth=box_line_thickness))

        bp = plt.boxplot(df.values, **box_plot_kwargs)

        # Box colours: caller-supplied (box i gets colors[i]) or all black
        if colors:
            for patch, color in zip(bp['boxes'], colors):
                patch.set_color(color)
        else:
            for patch in bp['boxes']:
                patch.set_color('black')

        # Medians are black in either case (medianprops already asks for 'k')
        for median_line in bp['medians']:
            median_line.set_color('black')

        # Draw overlying data points; boxes sit at y = 1, 2, ...
        if draw_points:
            for position, column in enumerate(df.columns, 1):
                x = df[column].values
                y = position * np.ones(len(df[column]))
                plt.plot(x, y, '.', color='k', markersize=markersize)

        # Set tick labels and sizes
        plt.setp(ax, yticklabels=tick_labels)
        plt.setp(ax.get_yticklabels(), fontsize=fontsize)

        plt.setp(ax.get_xticklabels(), fontsize=fontsize)

        # Adjust limits so plot elements aren't cut off
        x_ticks, x_tick_labels = plt.xticks()

        # Extend the right limit by 1/range_factor of the last tick spacing
        range_factor = 2

        x_min = x_ticks[0]
        x_max = x_ticks[-1] + (x_ticks[-1] - x_ticks[-2]) / float(range_factor)

        # Set new limits
        plt.xlim(x_min, x_max)

        # Set tick positions
        plt.xticks(x_ticks)

        # Place x-label
        plt.xlabel(x_label, size=fontsize)

        # No tick marks on the x-axis, tick marks on the left of the y-axis
        ax.xaxis.set_ticks_position('none')
        ax.yaxis.set_ticks_position('left')

        if custom_legend:
            ax.legend(custom_legend[1],
                      custom_legend[0],
                      handlelength=legend_marker_size,
                      handleheight=legend_marker_size,
                      frameon=False,
                      loc=legend_loc)

            plt.setp(plt.gca().get_legend().get_texts(),
                     fontsize=legend_font_size)

        # Draw a white dot for medians
        for position, column in enumerate(df.columns, 1):
            x_median = np.median(df[column].values)
            y_median = position * np.ones(1)

            plt.plot(x_median,
                     y_median,
                     'o',
                     color='white',
                     markersize=markersize,
                     markeredgecolor='white',
                     zorder=3)

        # Display plot
        plt.show()
Esempio n. 35
0
    h = np.random.uniform(0.020 * 0.95, 0.020 * 1.05)
    l = np.random.uniform(0.95, 1.05)
    b = np.random.uniform(0.95, 1.05)
    T = np.random.uniform(70 * 0.95, 70 * 1.05) - np.random.uniform(
        200 * .095, 200 * 1.05)
    to = np.random.uniform(0.15 * 0.95, 0.15 * 1.05)
    ti = np.random.uniform(0.1 * 0.95, 0.1 * 1.05)
    P = np.random.uniform(2000 * 0.95, 2000 * 1.05)
    omegaVal = omega(G, h, Eo, to, Ei, ti)
    x = np.linspace(0, 0.5, 50)
    tensoes = []

    for xi in x:
        tensoes.append(
            tensaoCisalhamento(P, omegaVal, b, xi, Eo, to, Ei, ti, alfao,
                               alfai, T, l))
    vetor.append(tensoes)
    if min(tensoes) < minimo:
        minimo = min(tensoes)
    if max(tensoes) > maximo:
        maximo = max(tensoes)
    pl.plot(x, tensoes)
    pl.grid()
    pl.title("Tensão cisalhante por distância")
    pl.xlabel("Distância (in)")
    pl.ylabel("Tensão cisalhante (lbf/in²)")
pl.show()
pl.figure()
pl.boxplot(vetor)
print("valor mínimo:" + str(minimo) + "\nvalor máximo: " + str(maximo))
Esempio n. 36
0
# Earlier hard-coded sample groups, kept for reference:
# B = [q, [7, 2, 5]]
# C = [[3, 2, 5, 7], [6, 7, 3]]

# Each group pairs the matching entries of p and q.
A = [p[0], q[0]]
B = [p[1], q[1]]
C = [p[2], q[2]]

D = [p[3], q[3]]
E = [p[4], q[4]]

fig = figure()
ax = axes()
hold(True)

# First pair of boxes, drawn at x = 1 and x = 2.
# NOTE(review): the very large `whis` presumably stretches the whiskers to
# the data extremes so no fliers are drawn -- confirm.
bp = boxplot(A, positions=[1, 2], widths=0.6, whis=100000000)
setBoxColors(bp)

# Draw boxes, whiskers and medians with line width 2.
for part in ('boxes', 'whiskers', 'medians'):
    for artist in bp[part]:
        artist.set(linewidth=2)

# Second pair of boxes, drawn at x = 4 and x = 5.
bp = boxplot(B, positions=[4, 5], widths=0.6, whis=100000000)
setBoxColors(bp)

for box in bp['boxes']:
Esempio n. 37
0
# Load the housing table and give its 14 columns their usual short names.
df = pd.read_excel('/Users/mac/Desktop/Machine Learning/module4/housing.xlsx',
                   header=0)
df.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]

# EDA: summary statistics, printed and written to disk.
summary = df.describe()
print(summary)
# The context manager closes the file even if printing fails.
# NOTE(review): this writes the printed table, not comma-separated values,
# despite the .csv extension -- confirm that is intended.
with open('/Users/mac/Desktop/Machine Learning/module4/summary.csv',
          'w') as des:
    print(summary, file=des)

# Box plot of the first 13 columns (every column except MEDV).
from pylab import boxplot
array = df.iloc[:, 0:13].values
boxplot(array)
plt.xlabel("Attribute Index")
plt.ylabel("Quartile Ranges")
plt.savefig('/Users/mac/Desktop/Machine Learning/module4/box-plot.jpg')
plt.show()

# Pairwise scatter plots on the rows without missing values.
df = df.dropna(axis=0)
import seaborn as sns
# NOTE(review): newer seaborn releases call this argument `height` --
# confirm the installed version still accepts `size`.
sns.pairplot(df, size=2.5)
plt.tight_layout()
plt.savefig('/Users/mac/Desktop/Machine Learning/module4/pairplot.jpg')
plt.show()

# Correlation matrix of all columns (input for the heatmap).
from pandas import DataFrame
corMat = DataFrame(df.corr())
Esempio n. 38
0
    setp(bp['whiskers'][0], color='blue')
    setp(bp['whiskers'][1], color='blue')
    setp(bp['fliers'][0], color='blue')
    setp(bp['fliers'][1], color='blue')
    setp(bp['medians'][0], color='blue')

    setp(bp['boxes'][1], color='red')
    setp(bp['caps'][2], color='red')
    setp(bp['caps'][3], color='red')
    setp(bp['whiskers'][2], color='red')
    setp(bp['whiskers'][3], color='red')
    setp(bp['fliers'][2], color='red')
    setp(bp['fliers'][3], color='red')
    setp(bp['medians'][1], color='red')


# The two samples to compare. auc1 and auc2 are defined outside this
# fragment; judging by the labels below they hold the AUC scores of the
# deep-learning model and of the SVM, in that order -- confirm.
A = [auc1, auc2]
fig = figure()
ax = axes()
# NOTE(review): hold() only exists in old Matplotlib releases -- confirm
# the targeted version.
hold(True)
# One box per sample, at x = 1 and x = 2.
bp = boxplot(A, positions=[1, 2], widths=0.6)
# Colouring of the two boxes is currently disabled:
#setBoxColors(bp)
# set axes limits and labels
ylim(0.8, 1.05)
ax.set_xticklabels(['Deep Learning', 'SVM'])
title('Comparision between Deep Learning and SVM')
ylabel('AUC Score')
# Save the figure to the working directory, then display it.
savefig('boxcompare.png')
show()
Esempio n. 39
0
import numpy as np
# Histogram demo: the same 1000 random samples drawn two ways, side by side.
data = np.random.randn(1000)
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 3))
# Left: 30 bins with normed=True (blue). Right: cumulative histogram with
# 10 bins and normed=False (red).
# NOTE(review): newer Matplotlib releases replaced `normed` with `density`
# -- confirm the targeted version still accepts it.
ax1.hist(data, bins=30, normed=True, color='b')
ax2.hist(data, bins=10, normed=False, color='r', cumulative=True)

# Subplot demo 1: two line plots stacked vertically (2 rows, 1 column).
x = [1, 2, 3, 2, 1]
y = [3, 2, 1, 3, 1]
pl.subplot(2, 1, 1)
pl.plot(x)
pl.subplot(2, 1, 2)
pl.plot(y)

# Subplot demo 2: the same two line plots side by side (1 row, 2 columns).
x = [1, 2, 3, 2, 1]
y = [3, 2, 1, 3, 1]
pl.subplot(1, 2, 1)
pl.plot(x)
pl.subplot(1, 2, 2)
pl.plot(y)

# Box-plot demo 1: a single horizontal box (vert=0) of 256 random samples.
x = np.random.randn(256)
pl.boxplot(x, vert=0)
pl.show()

# Box-plot demo 2: three normal samples with different location, scale and
# size, drawn as three labelled boxes on one axes.
samp1 = np.random.normal(loc=0, scale=3., size=200)
samp2 = np.random.normal(loc=5., scale=10., size=500)
samp3 = np.random.normal(loc=0.3, scale=1.2, size=100)
f, ax = plt.subplots(1, 1, figsize=(5, 4))
ax.boxplot((samp1, samp2, samp3))
ax.set_xticklabels(['sample1', 'sample2', 'sample3'])
    else:
        data = numpy.hstack((data, data_d))

    mi_ests[nr] = mi_est

    nr += 1

# Figure 1: one scatter panel per data set, titled with its ring width.
pl.figure(tight_layout=True, figsize=(len(widths) * 4, 4))
for i, T_s in enumerate(datas):
    pl.subplot(1, len(datas), i + 1)
    pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81)
    pl.title('w: ' + str(widths[i]))
    pl.xlim([-1, 1])
    pl.ylim([-1, 1])

pl.savefig("ring_data.png")

# Figure 2: box plot of `data` per ring width, with the estimates stored
# in `mi_ests` overlaid as a red line at the box positions 1..len(widths).
pl.figure(tight_layout=True, figsize=(6, 4))
pl.boxplot(data)
title = "N=%i (ring)" % (n)
pl.ylabel('MI')
pl.xlabel('ring width')
pl.gca().set_xticklabels(widths)
pl.plot(range(1, len(widths) + 1), mi_ests, c="red", label="est MI (kd-tree)")
pl.legend(loc=0, prop={'size': 8})
pl.title(title)

pl.savefig("mi_vs_ring.png")
Esempio n. 41
0
def analysis_results(options):
    """
    Analyzes the results of the comparisons
    """

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n"
    )
    print(
        "\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [
            f for f in os.listdir(results_dir)
            if os.path.isdir(os.path.join(results_dir, f))
        ]

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print(
                    'The comparison {} is not in the pair2comb dictionary!\n'.
                    format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.
                      format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns,
                                             combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan
    if 'None' in df['dcstructure']:
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.
          format(num_dc))
    print(
        'Number of non-drug combinations after removing missing values:\t{}\n'.
        format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir,
                                           'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir,
                                               'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(
            me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs,
                     open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(
            open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[
                drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print(
        'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_dc))
    print(
        'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_ndc))

    #-------------------------#
    #   EVALUATE PERFORMANCE  #
    #-------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Machine learning parameters
    repetitions = 25  # Number of repetititons
    n_fold = 2  # Number of folds
    min_num_dc_group = 10
    greater_or_smaller = 'greater'
    classifier = 'SVC'
    classifiers = {
        'KNeighbors':
        KNeighborsClassifier(3),
        'SVC':
        SVC(probability=True),
        'SVC linear':
        SVC(kernel="linear", C=0.025),
        'SVC rbf':
        SVC(gamma=2, C=1),
        'DecisionTree':
        DecisionTreeClassifier(max_depth=5),
        'RandomForest':
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP':
        MLPClassifier(alpha=1),
        'AdaBoost':
        AdaBoostClassifier(),
        'GaussianNB':
        GaussianNB(),
        'QuadraticDiscr.':
        QuadraticDiscriminantAnalysis(),
        'SVC best 1':
        SVC(kernel="linear", C=0.1, probability=True),
        'SVC best 2':
        SVC(kernel="rbf", gamma=0.01, C=100.0, probability=True)
    }

    # Plot of distributions of AUC
    plot_name = os.path.join(img_dir,
                             'dcGUILD_1_threshold_auc.{}'.format(fig_format))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir,
                                            'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file))

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir,
                                             'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))

    print('\nEVALUATION OF DCGUILD\n')
    repetitions = 25
    n_fold = 10
    analysis_results = {}

    # Obtain the different non-drug combination groups to repeat the analysis
    ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
        ndc_data, repetitions, num_dc
    )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

    # dcGUILD_features = [str(x) for x in threshold_list]
    # dcGUILD_feature_to_columns = {}
    # # Get dcGUILD columns
    # for top_threshold in threshold_list:
    #     for data_type in ['node', 'edge', 'function']:
    #         for scoring_function in ['dot_product', 'spearman', 'jaccard']:
    #             col = 'dcg'+'_'+data_type+'_'+str(top_threshold)+'_'+scoring_function
    #             dcGUILD_feature_to_columns.setdefault(str(top_threshold), [])
    #             dcGUILD_feature_to_columns[str(top_threshold)].append(col)
    #     dcGUILD_feature_to_columns[str(top_threshold)].append('combination')

    dcGUILD_features = []
    dcGUILD_feature_to_columns = {}
    # Get dcGUILD columns
    for top_threshold in [1]:
        for data_type in ['node', 'edge', 'function']:
            for scoring_function in ['dot_product', 'spearman', 'jaccard']:
                col = 'dcg' + '_' + data_type + '_' + str(
                    top_threshold) + '_' + scoring_function
                dcGUILD_features.append(col)
                dcGUILD_feature_to_columns[col] = [col, 'combination']

    for feature in dcGUILD_features:

        df_method = df[dcGUILD_feature_to_columns[feature]]

        dc_data = df_method[df_method['combination'] == 1]
        ndc_data = df_method[df_method['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)

        print(feature)
        print(
            'Building {} repetition groups of {} (same) DC and {} (different) non-DC'
            .format(repetitions, num_dc, num_dc))
        ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
            ndc_data, repetitions, num_dc
        )  # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

        mean_aucs = [
        ]  # Here we will store the means of AUCs from the cross-validations
        std_aucs = [
        ]  # Here we will store the standard deviations of the AUCs from the cross-validations
        all_aucs = []  # Here we will store ALL the AUCs
        all_probs = []  # Here we store all the probabilities and labels

        num_repetitions = 0
        for ndc_data_equal in ndc_repetitions:

            num_repetitions += 1
            num_items_group = int(
                float(num_dc) / float(n_fold)
            )  # Calculate the number of items in each group of the cross-validation
            if num_repetitions == 1:
                print(
                    'Building {} fold groups of {} DC and {} non-DC x {} repetitions'
                    .format(n_fold, num_items_group, num_items_group,
                            repetitions))

            dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                dc_data, n_fold, num_items_group, me_too_drug_combinations
            )  # Defining the drug combination groups in each cross-validation step
            ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data_equal, n_fold, num_items_group,
                me_too_drug_combinations
            )  # Defining the non-drug combination groups in each cross-validation step
            merged_groups = [
                pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)
            ]

            mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                n_fold, merged_groups, classifiers[classifier])

            mean_aucs.append(mean)
            std_aucs.append(std)
            all_aucs = all_aucs + list_auc
            all_probs = all_probs + list_prob

        final_mean = np.mean(mean_aucs)
        mean_std = np.mean(std_aucs)
        std_means = np.std(mean_aucs)
        std = np.std(all_aucs)
        print('FINAL MEAN: {}'.format(final_mean))
        print('MEAN of STD: {}'.format(mean_std))
        print('STD: {}\n'.format(std))

        # Store the distribution of AUCs in the dictionary
        analysis_results[feature] = all_aucs

    #------------------------------#
    #   PLOT DISTRIBUTION OF AUC   #
    #------------------------------#

    fig = pylab.figure(dpi=300)
    ax = pylab.axes()
    #pylab.hold(True)
    pos = 1
    col_num = 0

    xticks = []  # Define the places in which the labels will be
    xlabels = []  # Define the labels (the names of the features)
    #colors = [ ['#9ed0ff, blue'], ['#32f232', 'green'], ['#fbc562', '#d48900'], ['#ff7373', '#b80000'], ['grey', 'black'] ]

    for feature in dcGUILD_features:

        positions = []
        positions.append(pos)  # Define the positions of the boxplots
        pos += 2  # Add separation between boxplots
        xlabels.append(feature)  # Add the feature used at the x axis

        # Boxplot group
        #bp = boxplot(data, positions = positions, widths = 0.6)
        bp = pylab.boxplot(analysis_results[feature],
                           positions=positions,
                           widths=0.6,
                           patch_artist=True)

        tick = np.mean(
            positions
        )  # The label will be at the mean of the positions (in the middle)
        xticks.append(tick)

    # Set axes limits and labels
    pylab.xlim(0, pos - 1)
    pylab.ylim(0, 1)
    ax.set_xticklabels(xlabels)
    ax.set_xticks(xticks)
    pylab.xlabel('Features')
    pylab.ylabel('Distribution of AUC values')

    fig.autofmt_xdate()
    pylab.savefig(plot_name, format=fig_format)
    pylab.show()

    # End marker for time
    end = time.time()
    print(
        '\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'
        .format(end - start, (end - start) / 60))

    return
Esempio n. 42
0
    liveness = Float32Col()
    energy = Float32Col()
    speechiness = Float32Col()
    valence = Float32Col()
    tempo = Float32Col()
    key = Int32Col()
    mode = StringCol(5)

# Open the track database read-only; it is closed again once every table has
# been copied into memory and plotted.
h5file = open_file('output.h5', mode='r', title='Spotify Tracks')

for table in h5file.root.trackinfo:
    # Collect the whole 'acousticness' column in a single pass (repeated
    # np.append re-allocates the array for every row).
    acousticness = np.array([track['acousticness'] for track in table],
                            dtype=float)

    p = P.figure()
    bp = P.boxplot(acousticness)

    p.suptitle('Acousticness Distribution for ' + table.name, fontsize=20)
    P.ylabel('Acousticness Score')
    P.ylim([0, 1])

    # Overlay the individual samples ONCE, jittered horizontally around the
    # box position (x = 1).  The previous loop re-plotted the complete data
    # set once per sample, each time shifted one unit further to the right.
    x = np.random.normal(1, 0.04, size=acousticness.size)
    P.plot(x, acousticness, 'ro', alpha=0.2)

# All values have been copied into numpy arrays, so the file can be released
# before the (blocking) call that displays the figures.
h5file.close()

P.show()
Esempio n. 43
0
                S = cc_state.cc_state([X[:, 0], X[:, 1]], ['normal'] * 2,
                                      ct_kernel=kernel,
                                      distargs=[None] * 2)
                S.transition(N=200)

                mi = iu.mutual_information(S, 0, 1)
                # linfoot = iu.mutual_information_to_linfoot(MI)

                MI[r, c] = mi

                print("w: %1.2f, MI: %1.6f" % (w, mi))
                print("%i of %i" %
                      (i + 1, len(W_list) * n_data_sets * n_samples * 2))

                del S

                i += 1
                r += 1
        c += 1

    w_labs = [str(w) for w in W_list]

    ax = pylab.subplot(1, 2, kernel + 1)
    pylab.boxplot(MI)
    pylab.ylim([0, 1])
    pylab.ylabel('MI')
    pylab.xlabel('ring width')
    pylab.title("kernel %i" % kernel)
    ax.set_xticklabels(w_labs)

pylab.show()
Esempio n. 44
0
    blah = numpy.load(outFileRaw + '.npz')
    W = blah['X']

    for ii in range(X.shape[0]):
        Xi = X[ii, :, :]
        Zi = Z[ii, :, :]
        Wi = W[ii, :, :]

        pylab.subplot(2, 3, 1)
        pylab.imshow(Xi, interpolation='nearest', cmap='gist_earth')
        pylab.title('(mu=%0.2f, std=%0.2f, sk=%0.2f)' %
                    (numpy.mean(Xi), numpy.std(Xi), stats.skew(col(Xi))))
        pylab.colorbar()

        pylab.subplot(2, 3, 4)
        pylab.boxplot(col(Xi))

        pylab.subplot(2, 3, 2)
        pylab.imshow(Zi, interpolation='nearest', cmap='gist_earth')
        pylab.title('(mu=%0.2f, std=%0.2f, sk=%0.2f)' %
                    (numpy.mean(Zi), numpy.std(Zi), stats.skew(col(Zi))))
        pylab.colorbar()

        pylab.subplot(2, 3, 5)
        pylab.boxplot(col(Zi))

        pylab.subplot(2, 3, 3)
        pylab.imshow(Wi, interpolation='nearest', cmap='gist_earth')
        pylab.title('(mu=%0.2f, std=%0.2f, sk=%0.2f)' %
                    (numpy.mean(Wi), numpy.std(Wi), stats.skew(col(Wi))))
        pylab.colorbar()
Esempio n. 45
0
sns.violinplot(x='target', y='lwt', hue="sit",
data=D_, palette="muted", split=True, zorder=2)

#%%

# 2.2.3. Análise usando bwt (variável quantitativa)

vars = ['smoke', 'race2', 'ptl2', 'ht', 'ui', 'ftv2']

f_trues = []
f_falses = []
for v in vars: ax = sns.boxplot(x="day", y="total_bill", hue="smoker",
...                  data=tips, palette="Set3")
    f_trues.append(X[X[v] == True]['bwt'])
    f_falses.append(X[X[v] == False]['bwt'])
    pl.boxplot([f_falses[-1], f_trues[-1]])
    pl.xticks([1,2], [0, 1])
    pl.yticks(range(0, 5001, 500))
    # pl.grid(axis='y')
    pl.show()
    pl.close()

# %%
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: iagorosa
"""

# In[]
import numpy as np 
Esempio n. 46
0
    def plot(self, vert=True, alpha=0.4, widths=0.5, **kwargs):
        """Plot the boxplots and dots

        One box is drawn per entry of ``self.names`` (values looked up in
        ``self.data``).  The individual values are overlaid as dots whose
        position along the category axis comes from ``self.beeswarm``, and
        two purple lines mark mean +/- one standard deviation of each group.

        :param bool vert: vertical boxes if True, horizontal boxes otherwise.
        :param float alpha: transparency of the overlaid dots.
        :param float widths: width of the boxes; also stored in
            ``self.widths``.
        :param kwargs: accepted for compatibility; not used by this method.
        :return: the current axes (``pylab.gca()``).
        """
        self.widths = widths
        if self.hold is False:
            pylab.clf()

        ordered_data = [self.data[key] for key in self.names]

        for i, vector in enumerate(ordered_data):
            color = self.colors[i % len(self.colors)]
            if vert is True:
                X, Y = self.beeswarm(vector, i + 1), vector
            else:
                X, Y = vector, self.beeswarm(vector, i + 1)

            pylab.plot(X,
                       Y,
                       'o',
                       markersize=self.markersize,
                       markerfacecolor=color,
                       markeredgewidth=1,
                       alpha=alpha)

        # Arguments shared by the preferred call and the fallback below.
        box_kwargs = dict(widths=self.widths,
                          vert=vert,
                          patch_artist=True,
                          positions=range(1, len(ordered_data) + 1))

        # show means but not outliers
        try:
            d = pylab.boxplot(ordered_data,
                              showmeans=True,
                              showfliers=False,
                              **box_kwargs)
        except Exception:
            # ReadTheDocs uses matplotlib 1.3.1 for now, so
            # need this without showmeans parameter
            d = pylab.boxplot(ordered_data, **box_kwargs)

        # for further tuning if needed.
        self.tuning = d
        # This is now in matplotlib 1.4.3 (dots instead of lines
        # though)

        # additional line for the 1 std
        means = [pylab.mean(data) for data in ordered_data]
        stds = [pylab.std(data) for data in ordered_data]
        for i, this in enumerate(means):
            # Extent of the std marker along the category axis: a bit
            # narrower than the box itself.
            lo = (i + 1) - widths / 2. / 1.5
            hi = (i + 1) + widths / 2. / 1.5
            span = pylab.array([lo, hi])
            for level in (this + stds[i], this - stds[i]):
                if vert is True:
                    pylab.plot(span, [level, level], lw=2, color='purple')
                else:
                    pylab.plot([level, level], span, lw=2, color='purple')

        for i, this in enumerate(d['boxes']):
            this.set_color('k')
            this.set_linewidth(self.lw)
            color = self.colors[i % len(self.colors)]
            this.set_facecolor(color)
            # box alpha is lower than the alpha of the dots so as to see
            # the dots inside the boxes
            this.set_alpha(0.3)
            this.set_zorder(10)  # this moves the box on top of all dots
        for this in d['caps']:
            this.set_linewidth(self.lw)
        for this in d['whiskers']:
            this.set_linewidth(self.lw)
        for this in d['medians']:
            this.set_linewidth(self.lw)

        # we will extend the limits by 5%
        m = min(min(this) for this in self.data.values())
        M = max(max(this) for this in self.data.values())
        extend = 0.05
        R = (M - m) * extend
        X, Y = range(1, len(self.names) + 1), self.names
        Y = [y.replace("_", " ") for y in Y]
        if vert is True:
            pylab.ylabel(self.ylabel, fontsize=self.fontsize)
            pylab.xticks(X, Y, fontsize=self.fontsize, rotation=90)
            # This used to be a second ylabel() call, which overwrote the
            # y label with self.xlabel and left the x axis unlabelled.
            pylab.xlabel(self.xlabel, fontsize=self.fontsize)
            pylab.yticks(pylab.yticks()[0], fontsize=self.fontsize)
            pylab.ylim([m - R, M + R])
        else:
            pylab.xlabel(self.xlabel, fontsize=self.fontsize)
            if len(X) > 20:
                # many categories: shrink the tick labels so they fit
                pylab.yticks(X, Y, fontsize=self.fontsize / 1.6, rotation=0)
            else:
                pylab.yticks(X, Y, fontsize=self.fontsize, rotation=0)

            pylab.ylabel(self.ylabel, fontsize=self.fontsize)
            pylab.xticks(pylab.xticks()[0], fontsize=self.fontsize)
            pylab.xlim([m - R, M + R])

        pylab.title(self.title, fontsize=self.fontsize * 1.25)
        pylab.grid()
        try:
            pylab.tight_layout()
        except Exception:
            # tight_layout is best effort only; keep the figure as it is
            pass

        return pylab.gca()
Esempio n. 47
0
    def _plot_nominal(self, data, result_dir, fig1, ax, x_key, y_key):
        """ Creates a boxplot of the y_keys for the given nominal parameter x_key.

        A method that allows to create a plot that visualizes the effect
        of differing one nominal variable onto a second one (e.g. the effect of
        differing the classifier onto the accuracy).

        **Expected parameters**
          *data*: A dictionary that contains a mapping from an attribute
                  (e.g. accuracy) to a list of values taken by an attribute.
                  An entry is the entirety of all i-th values over all
                  dict-values.

          *result_dir*: The directory in which the plots will be saved
                  (not used inside this method).

          *fig1*: The figure a new subplot is added to.

          *ax*: List of axes; the newly created subplot is appended to it.

          *x_key*: The key of the dictionary whose values should be used as
                  values for the x-axis (the independent variables).

          *y_key*: The key of the dictionary whose values should be used as
                  values for the y-axis, i.e. the dependent variable. A key
                  of the form ``#w1#key1#w2#key2`` selects the weighted sum
                  ``w1 * data[key1] + w2 * data[key2]`` instead.

        **Returns**
          The tuple ``(fig1, ax)``.
        """

        ax.append(fig1.add_subplot(111, label="%d" % (len(ax) + 1)))
        fig1.sca(ax[-1])

        # Compare by value: "is not" on a string literal tests object
        # identity and only worked thanks to interning.
        weighted = y_key[0] == "#"
        if weighted:  # A weighted cost function
            weight1, y_key1, weight2, y_key2 = y_key[1:].split("#")

        # Create the plot for this specific dependent variable
        values = defaultdict(list)
        for i in range(len(data[x_key])):
            parameter_value = data[x_key][i]
            if not weighted:
                performance_value = float(data[y_key][i])
            else:
                performance_value = float(weight1) * float(data[y_key1][i]) \
                                        + float(weight2) * float(data[y_key2][i])

            values[parameter_value].append(performance_value)

        values = sorted(values.items())

        # the bottom of the subplots of the figure
        pylab.subplots_adjust(bottom=0.3)

        # Real lists (not map objects / ranges) so that this also works on
        # Python 3, where map() is lazy and range() does not support item
        # assignment.
        b = pylab.boxplot([samples for _, samples in values])
        medians = [medline.get_ydata()[0] for medline in b['medians']]
        # create array with median labels with 2 decimal places of precision
        upperLabels = [str(numpy.round(m, 2)) for m in medians]

        pylab.gca().set_xticklabels([name for name, _ in values])
        pylab.setp(pylab.gca().get_xticklabels(), rotation=-90)
        pylab.setp(pylab.gca().get_xticklabels(), size='x-small')
        pylab.gca().set_xlabel(x_key.replace("_", " "))

        # Write the median of every box next to the lower axis limit
        bottom = pylab.gca().get_ylim()[0]
        for i in range(len(medians)):
            pylab.gca().text(i + 1,
                             bottom + (bottom * 0.05),
                             upperLabels[i],
                             horizontalalignment='center',
                             size='x-small')

        if not weighted:
            pylab.gca().set_ylabel(y_key.replace("_", " "))
        else:
            pylab.gca().set_ylabel("%s*%s+%s*%s" % tuple(y_key[1:].split("#")))

        return fig1, ax
def plot_deviation(vals_of_replicas,
                   vals_of_graph,
                   metrics,
                   figpath,
                   jaccard_edges=None,
                   title_infix='',
                   seed=0,
                   Gname=''):
    """Plot, per metric, the replica values relative to the original graph.

    For every metric the replica values are divided by the value of the
    original graph (or by the average over the model graphs when
    ``vals_of_graph`` holds one list per metric) and drawn as a horizontal
    boxplot.  Summary statistics of the replicas are collected and returned.

    Parameters:
        vals_of_replicas: per metric, the sequence of values measured on
            the replicas.
        vals_of_graph: per metric, either a number or a list with as many
            entries as there are replicas.
        metrics: sequence of dicts; the 'name' entry is used for labels.
        figpath: output path of the figure; when None a path is built from
            Gname, title_infix, seed and the current time.
        jaccard_edges: optional number shown in the figure header.
        title_infix: used only when building the default figpath.
        seed: used only when building the default figpath.
        Gname: graph name (legend label and default figpath).

    Returns:
        (replica_stats, figpath), where replica_stats is a dict of
        per-metric statistics.
    """
    #vals_of_graph could be a number (level 0) or a list (the same as the number of replicas)
    clean_names = {
        'num nodes': 'num nodes',
        'num edges': 'num edges',
        'clustering': 'clustering',
        'average degree': 'avg\ndegree',
        'degree assortativity': 'degree\nassortativity',
        'degree connectivity': 'degree\nconnectivity',
        'total deg*deg': 'total deg*deg\nassortativity',
        's-metric': 's metric',
        'mean ecc': 'avg\neccentricity',
        'num comps': 'num comps',
        'L eigenvalue sum': 'L eigen-\nvalue sum',
        'average shortest path': 'avg\ndistance',
        'harmonic mean path': 'harmonic avg\ndistance',
        'avg flow closeness': 'avg flow\ncloseness',
        'avg eigvec centrality': 'avg eigenvec.\ncentrality',
        'avg between. central.': 'avg between.\ncentrality',
        'modularity': 'modularity'
    }

    multiple_models = isinstance(vals_of_graph[0], list)

    # pylab.hold() no longer exists in matplotlib >= 3.0, where holding is
    # always on; only call it when the installed version provides it.
    has_hold = hasattr(pylab, 'hold')

    pylab.show()
    fig = pylab.figure()
    if has_hold:
        pylab.hold(True)
    num_of_metrics = len(metrics)
    med_vals = [np.median(vals_of_replicas[i]) for i in range(num_of_metrics)]
    avg_vals = [np.average(vals_of_replicas[i]) for i in range(num_of_metrics)]
    p25_vals = [
        np.percentile(vals_of_replicas[i], 25) for i in range(num_of_metrics)
    ]
    p75_vals = [
        np.percentile(vals_of_replicas[i], 75) for i in range(num_of_metrics)
    ]
    max_vals = [np.max(vals_of_replicas[i]) for i in range(num_of_metrics)]
    min_vals = [np.min(vals_of_replicas[i]) for i in range(num_of_metrics)]
    std_vals = [np.std(vals_of_replicas[i]) for i in range(num_of_metrics)]

    replica_stats = {
        'median_of_replicas': med_vals,
        'avg_of_replicas': avg_vals,
        'p25_of_replicas': p25_vals,
        'p75_of_replicas': p75_vals,
        'max_of_replicas': max_vals,
        'min_of_replicas': min_vals,
        'std_of_replicas': std_vals
    }

    normed_replica_vals = []
    avg_norms = []
    print('Medians' +
          (' (average of model graphs)' if multiple_models else ''))
    print('-------')
    print('metric\t\tOriginalG\t\tReplicas')
    for met_num, metric in enumerate(metrics):
        try:
            model_val = np.average(
                vals_of_graph[met_num]
            ) if multiple_models else vals_of_graph[met_num]
            print('%s\t\t%.5f\t\t%.5f' %
                  (metric['name'], model_val, med_vals[met_num]))
        except Exception:
            # The old format string '%\tserror' was itself invalid and
            # raised ValueError from inside this handler.
            print('%s\terror' % metric['name'])
    for met_num, metric in enumerate(metrics):
        #handle error in original, 0 in original, error in one replica, error in all replicas
        nor_vals = []
        if multiple_models:
            assert len(vals_of_graph[met_num]) == len(
                vals_of_replicas[met_num])
            pruned_model_vals = [
                v for v in vals_of_graph[met_num]
                if v != graphutils.METRIC_ERROR
            ]
            if len(pruned_model_vals) > 0:
                v_graph = np.average(pruned_model_vals)
            else:
                v_graph = graphutils.METRIC_ERROR
        else:
            v_graph = vals_of_graph[met_num]

        v_reps = vals_of_replicas[met_num]
        if v_graph != graphutils.METRIC_ERROR:
            if v_graph != 0.0:
                nor_vals = [
                    float(v) / v_graph for v in v_reps
                    if v != graphutils.METRIC_ERROR
                ]
            else:
                if v_reps != [] and np.abs(v_reps).sum() == 0.:
                    # original and all replicas are zero: perfect agreement.
                    # Keep this a flat list; appending the list produced a
                    # nested (1, n) array further down.
                    nor_vals = len(v_reps) * [1.0]
            pylab.plot(1.0, met_num, 'o', color='k', linewidth=2., label=Gname)
            pylab.text(x=.0,
                       y=(met_num - 2. / len(metrics)),
                       s='%.2e' % v_graph)
            nor_vals = np.array(nor_vals)
            normed_replica_vals.append(nor_vals)
            if len(nor_vals) > 0:
                pylab.boxplot(nor_vals,
                              positions=[met_num],
                              vert=0,
                              widths=0.5)
                if (nor_vals == graphutils.METRIC_ERROR).any():
                    val_str = r'undefined'
                    avg_norm = -np.inf
                elif np.abs(nor_vals).sum() < 1000:
                    avg_norm = np.average(nor_vals)
                    val_str = (r'$%.2f$' if latex_available else
                               r'%.2f') % avg_norm
                else:
                    avg_norm = np.inf
                    val_str = r'$\gg0$' if latex_available else r'>>0'
                avg_norms.append(avg_norm)
            else:
                val_str = r'undefined'
                avg_norms.append(None)
        else:
            val_str = r'undefined'
            normed_replica_vals.append([None, None])
            avg_norms.append(None)
        pylab.text(x=1.74, y=(met_num - 2. / len(metrics)), s=val_str)
    try:
        pylab.yticks(
            list(range(num_of_metrics)),
            [clean_names.get(met['name'], met['name']) for met in metrics],
            rotation=0)
        if multiple_models:
            pylab.xlabel(r'Relative to mean of coarse networks',
                         rotation=0,
                         fontsize='20')  #, x=0.1)
        else:
            pylab.xlabel(r'Relative to real network',
                         rotation=0,
                         fontsize='20')  #, x=0.1)
        max_axis = 2
        pylab.xlim(-0.02, max_axis)
        pylab.ylim(-1.0, len(metrics))
        pylab.text(x=0.00,
                   y=len(metrics) + 0.05,
                   s='Template\ngraph',
                   va='bottom')
        pylab.text(x=1.650, y=-1.05, s='Median of\nreplicas', va='top')
        if jaccard_edges is not None:
            pylab.text(x=0.30,
                       y=len(metrics) + 0.05,
                       s='(Jaccard=%.3f)' % jaccard_edges,
                       va='bottom')

        fig.subplots_adjust(left=0.17, right=0.95)

        if figpath is None:
            figpath = 'output/replica_vs_original_' + Gname + '_' + title_infix + '_' + str(
                seed) + '__' + timeNow()
            figpath = clean_path(figpath)
        save_figure_helper(figpath)
        if has_hold:
            pylab.hold(False)
    except Exception as inst:
        # figpath may still be None if the failure happened before the
        # default path was built, hence str().
        print('Warning: could not save stats figure ' + str(figpath) + ':\n' +
              str(inst))
        exc_traceback = sys.exc_info()[2]
        print(
            str(inst) + "\n" +
            str(traceback.format_tb(exc_traceback)).replace('\\n', '\n'))

    replica_stats['normed_replica_vals'] = normed_replica_vals
    replica_stats['avg_norm_of_replicas'] = avg_norms

    mean_rel_errors = []
    mean_relstd_errors = []
    for met_i in range(num_of_metrics):
        normed_vals = normed_replica_vals[met_i]
        if graphutils.METRIC_ERROR in normed_vals or len(normed_vals) == 1:
            mean_rel_errors.append(None)
            mean_relstd_errors.append(None)
            continue
        rel_error_ar = [v - 1.0 for v in normed_vals if v is not None]
        if len(rel_error_ar) == 0:
            rel_error_ar = [graphutils.METRIC_ERROR, graphutils.METRIC_ERROR]
        mean_rel_errors.append(np.average(rel_error_ar))
        # 1E-20 keeps the division defined when all errors are identical
        mean_relstd_errors.append(
            np.average(rel_error_ar) / (1E-20 + np.std(rel_error_ar)))

    replica_stats['mean_rel_errors'] = mean_rel_errors
    replica_stats['mean_relstd_errors'] = mean_relstd_errors
    try:
        replica_stats['mean_mean_error'] = np.average(
            mean_rel_errors)  #the grand stat
        replica_stats['mean_mean_errorstd'] = np.average(
            mean_relstd_errors)  #the grand stat
    except Exception:
        # averaging fails when any metric contributed None above
        replica_stats['mean_mean_error'] = None
        replica_stats['mean_mean_errorstd'] = None

    return replica_stats, figpath
Esempio n. 49
0
def create_boxplot(label, data, title, xlabel, ylabel):
    """Draw a labelled boxplot (with mean markers) on the current figure.

    ``label`` supplies the tick labels of the boxes, ``data`` the values;
    ``title`` becomes the bold figure title and ``xlabel``/``ylabel`` the
    axis captions.
    """
    pl.boxplot(data, showmeans=True, labels=label)
    # Axis captions and figure title are independent of each other.
    pl.ylabel(ylabel)
    pl.xlabel(xlabel)
    pl.suptitle(title, fontweight='bold')
make_lists(1, WK38)
make_lists(2, WK48)
make_lists(3, WK59)
make_lists(4, WK119)
make_lists(5, WK206)

############################################################################
#PLOT BOXPLOTS AND CALCULATE P-VALUES
############################################################################

P.figure()

# One group of values per box, in the order they are drawn (positions 1..5).
data = [WK38, WK48, WK59, WK119, WK206]

bp = P.boxplot(
    data, whis=1000000
)  # setting whiskers to an unreasonably large number forces a plot of the whiskers to represent the min and max of the data

# For every group: jitter the points horizontally around the box position
# and order them by their estimated density.
for i in range(len(data)):
    y = data[i]
    y = np.array(y)
    print i
    print y
    # random x offsets centred on the position of box i (boxes start at 1)
    x = np.random.normal(i + 1, 0.08, size=len(y))
    x = np.array(x)
    # calculate point density
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    # sort points by density (ascending, so the densest points come last)
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
Esempio n. 51
0
      repr((avg_values_RBBUP[1] / 2500) * 100) + " Length = " +
      repr((avg_values_RBBUP[2] / 2500) * 100) + " Turns = " +
      repr((avg_values_RBBUP[3] / avg_values_RBBUP[2]) * 100))

# One figure holding a 2x2 grid of horizontal boxplots.
fig = plt.figure()
fig.set_size_inches(10, 7)

# Set properties for the plot.
n_groups = 6
index = np.arange(n_groups)  # 0..5; shifted by one below to match box positions
# NOTE(review): bar_width and opacity are not used in the visible part of
# this script - presumably consumed further down; confirm or remove.
bar_width = 0.4
opacity = 0.8

# Dead ends
subplot1 = plt.subplot(221)
plot1 = plt.boxplot(dead_ends, vert=0)
plt.xlabel('Algorithm')
plt.ylabel('Cell')
plt.title('Dead Ends')
# Boxes sit at positions 1..6; label them with the algorithm abbreviations.
plt.yticks(index + 1, ('RB', 'P', 'W', 'RC', 'BU', 'RBBU'))

# Rivers
plt.subplot(222)
plot2 = plt.boxplot(rivers, vert=0)
plt.xlabel('Algorithm')
plt.ylabel('Cell')
plt.title('River Factor')
plt.yticks(index + 1, ('RB', 'P', 'W', 'RC', 'BU', 'RBBU'))

# Length
plt.subplot(223)
Esempio n. 52
0
def compare_nbrs():
    """Decide which number of nbrs is best

    For each matrix (2 and 4) and each neighbour count, the results of the
    ten seeded runs are loaded and three counts are collected per run: the
    number of polymers, how many of them are in ``chosentetramers`` and how
    many are in ``besttetramers``.  The per-setting means are printed and one
    boxplot per count is saved under ``pictures/``.

    Output of a previous run:

    *** 3 ***
    [1428.5, 49.100000000000001, 6.2999999999999998]
    *** 7 ***
    [2240.9000000000001, 56.799999999999997, 6.4000000000000004]
    *** 11 ***
    [3262.0, 66.900000000000006, 7.7999999999999998]
    *** 15 ***
    [4020.5, 66.299999999999997, 7.9000000000000004]
    *** 31 ***
    [5613.6999999999998, 70.700000000000003, 8.0]
    *** 66 ***
    [6858.1999999999998, 55.200000000000003, 6.5]
    *** 3 ***
    [1489.7, 40.700000000000003, 5.0999999999999996]
    *** 7 ***
    [2357.4000000000001, 58.700000000000003, 7.2000000000000002]
    *** 11 ***
    [3079.3000000000002, 66.700000000000003, 7.2000000000000002]
    *** 15 ***
    [3791.8000000000002, 62.799999999999997, 7.0999999999999996]
    *** 31 ***
    [5291.3999999999996, 68.799999999999997, 8.0999999999999996]
    *** 66 ***
    [6714.8999999999996, 60.600000000000001, 7.0999999999999996]
    """
    # Loop invariants: the neighbour counts, the database path and the
    # y-axis captions are the same for every matrix.
    nbrs = [3, 7, 11, 15, 31, 66]
    db = os.path.join("..", "dims_and_tets", "alltetramers")
    ylabels = [
        'Number of polymers', 'Number of chosen tetramers',
        'Number of top 10 most efficient tetramers'
    ]

    for matrix in [2, 4]:
        alldata = []
        for nbr in nbrs:
            # Single-argument print() calls give the same output on
            # Python 2 and Python 3.
            print("*** %d ***" % nbr)
            data = [[], [], []]
            for seed in range(10):
                filename = os.path.join(
                    "paramsweep", "output",
                    "alltetramers_CHR64_NBR%d_MAT%d_distance_SEED%d.txt" %
                    (nbr, matrix, seed))
                res = GA_Results(filename, db, 4)  # Check
                result = (len(res.pols),
                          len(res.pols & chosentetramers),
                          len(res.pols & besttetramers))

                for i in range(3):
                    data[i].append(result[i])
            print([pylab.mean(x) for x in data])
            alldata.append(data)

        # One picture per collected count, one box per neighbour setting
        for i, x in enumerate(ylabels):
            pylab.boxplot([y[i] for y in alldata])
            pylab.xlabel("Number of neighbours")
            pylab.ylabel(x)
            pylab.gca().set_xticklabels(nbrs)
            pylab.savefig(
                os.path.join("pictures",
                             "Nnbrs_matrix%d_%d.png" % (matrix, i)))
            pylab.clf()
Esempio n. 53
0
def box_plot(df,
             val,
             factors=None,
             where=None,
             fname=None,
             output_dir='',
             quality='medium'):
    """
    Makes a box plot

    args:
       df:
          a pyvttbl.DataFrame object

       val:
          the label of the dependent variable

    kwds:
       factors:
          an iterable of factors to include in boxplot

       where:
          a string, list of strings, or list of tuples
          applied to the DataFrame before plotting

       fname:
          output file name (must end with .png or .svg); built from
          val and factors when None

       output_dir:
          directory that is joined in front of fname

       quality:
          {'low' | 'medium' | 'high'} specifies image file dpi

    returns:
       a dict describing what was plotted if df.TESTMODE is true,
       otherwise None

    raises:
       Exception: empty table, unequal column lengths, duplicate labels
          or an fname with an unsupported extension
       KeyError: val or one of the factors is not a column of df
       TypeError: factors is not iterable, or fname is neither None
          nor a string
    """

    if factors is None:
        factors = []

    if where is None:
        where = []

    # check to see if there is any data in the table
    if df == {}:
        raise Exception('Table must have data to print data')

    # check to see if data columns have equal lengths
    if not df._are_col_lengths_equal():
        raise Exception('columns have unequal lengths')

    # check the supplied arguments
    keys = list(df.keys())
    if val not in keys:
        raise KeyError(val)

    if not hasattr(factors, '__iter__'):
        raise TypeError("'%s' object is not iterable" % type(factors).__name__)

    # Work on a real list so that tuples and other iterables are accepted
    # by the list concatenation and the emptiness checks below.
    factors = list(factors)

    for k in factors:
        if k not in keys:
            raise KeyError(k)

    # check for duplicate names
    dup = Counter([val] + factors)
    del dup[None]
    if not all(count == 1 for count in dup.values()):
        raise Exception('duplicate labels specified as plot parameters')

    # check fname
    if fname is not None and not isinstance(fname, _strobj):
        raise TypeError('fname must be None or string')

    if isinstance(fname, _strobj):
        if not fname.lower().endswith(('.png', '.svg')):
            raise Exception('fname must end with .png or .svg')

    # collects what has been plotted; handed back in TESTMODE
    test = {}

    if factors == []:
        # a single box holding all values of val
        d = df.select_col(val, where=where)
        fig = pylab.figure()
        pylab.boxplot(np.array(d))
        xticks = pylab.xticks()[0]
        xlabels = [val]
        pylab.xticks(xticks, xlabels)

        test['d'] = d
        test['val'] = val

    else:
        # one box per combination of factor levels
        D = df.pivot(val, rows=factors, where=where, aggregate='tolist')

        fig = pylab.figure(figsize=(6 * len(factors), 6))
        fig.subplots_adjust(left=.05, right=.97, bottom=0.24)
        pylab.boxplot([np.array(_flatten(d)) for d in D])
        xticks = pylab.xticks()[0]
        xlabels = ['\n'.join('%s = %s' % fc for fc in c) for c in D.rnames]
        pylab.xticks(xticks, xlabels, rotation=35, verticalalignment='top')

        test['d'] = [np.array(_flatten(d)) for d in D]
        test['xlabels'] = xlabels

    maintitle = '%s' % val

    if factors != []:
        maintitle += ' by '
        maintitle += ' * '.join(factors)

    fig.text(0.5,
             0.95,
             maintitle,
             horizontalalignment='center',
             verticalalignment='top')

    test['maintitle'] = maintitle

    if fname is None:
        fname = 'box(%s' % val
        if factors != []:
            fname += '~' + '_X_'.join([str(f) for f in factors])
        fname += ').png'

    fname = os.path.join(output_dir, fname)

    test['fname'] = fname

    # save figure; svg output and unknown quality values use the
    # matplotlib default dpi
    if quality == 'low' or fname.endswith('.svg'):
        pylab.savefig(fname)

    elif quality == 'medium':
        pylab.savefig(fname, dpi=200)

    elif quality == 'high':
        pylab.savefig(fname, dpi=300)

    else:
        pylab.savefig(fname)

    pylab.close()

    if df.TESTMODE:
        return test
    # setp(bp['fliers'][2], color='red')
    # setp(bp['fliers'][3], color='red')
    setp(bp['medians'][1], color='red')


# Some fake data to plot
A = [[1, 2, 5], [7, 2]]
B = [[5, 7, 2, 2, 5], [7, 2, 5]]
C = [[3, 2, 5, 7], [6, 7, 3]]

fig = figure()
ax = axes()
# hold(True)

# first boxplot pair
bp = boxplot(data, positions=[1, 2], widths=0.6)
setBoxColors(bp)

# # second boxplot pair
# bp = boxplot(B, positions = [4, 5], widths = 0.6)
# setBoxColors(bp)
#
# # thrid boxplot pair
# bp = boxplot(C, positions = [7, 8], widths = 0.6)
# setBoxColors(bp)

# set axes limits and labels
# xlim(0,9)
# ylim(0,9)
# ylim(ymin=0)
xlim(xmin=0)
Esempio n. 55
0
def crossvalidate_krr(X,
                      Y,
                      f=5,
                      kwidths=10.0**np.array([0, 1, 2]),
                      llambdas=10.0**np.array([-4, -2, 0])):
    '''
    Test generalization performance of kernel ridge regression with gaussian
    kernel, using nested cross-validation, and compare it to a linear model.

    The samples are randomly split into ``f`` folds of equal size; samples
    that do not fit into ``f`` equal folds are left out. For every outer fold
    the kernel width and regularizer are selected on the remaining ``f - 1``
    folds (inner cross-validation) and the selected model is then scored on
    the held-out outer fold.

    Input:      X         data (dims-by-samples)
                Y         labels (dims2-by-samples)
                f         number of cross-validation folds
                kwidths   candidate widths of the gaussian kernel function
                llambdas  candidate regularizers (height of ridge on kernel
                          matrix)

    Output:     r2_outer  r-square of kernel ridge regression per outer fold
                r2_linear r-square of the linear model per outer fold

    Side effects: prints one summary line per outer fold and saves a boxplot
    of both scores to 'krr_vs_lin_comparison.pdf'.
    '''
    # Floor division keeps the fold size an integer on Python 3 as well;
    # a float here would be rejected as a reshape dimension.
    fold_size = X.shape[-1] // f
    N = f * fold_size
    # Row k of idx holds the (shuffled) sample indices of fold k.
    idx = np.reshape(np.random.permutation(np.arange(N, dtype=int)),
                     (f, fold_size))

    r2_outer = np.zeros((f))
    r2_linear = np.zeros((f))
    # Fully overwritten for every outer fold, so it can be allocated once.
    r2_inner = np.zeros((f - 1, kwidths.shape[-1], llambdas.shape[-1]))

    # do outer cross-validation (model evaluation)
    for ofold in range(f):
        # split in training and test (outer fold)
        otestidx = np.zeros((f), dtype=bool)
        otestidx[ofold] = True
        otest = idx[otestidx, :].flatten()

        otrain = idx[~otestidx, :]
        otrain_all = otrain.flatten()

        # inner cross-validation (model selection)
        for ifold in range(f - 1):
            # split in training and test (inner fold)
            itestidx = np.zeros((f - 1), dtype=bool)
            itestidx[ifold] = True
            itest = otrain[itestidx, :].flatten()

            itrain = otrain[~itestidx, :].flatten()

            # score every (kernel width, regularizer) combination
            for illambda in range(llambdas.shape[-1]):
                for ikwidth in range(kwidths.shape[-1]):
                    alphas = train_krr(X[:, itrain], Y[:, itrain],
                                       kwidths[ikwidth], llambdas[illambda])
                    yhat = apply_krr(alphas, X[:, itrain], X[:, itest],
                                     kwidths[ikwidth])
                    r2_inner[ifold, ikwidth,
                             illambda] = compute_rsquare(yhat, Y[:, itest])

        # pick the parameters with the best mean inner r-square
        r2_across_folds = r2_inner.mean(axis=0)
        optkwidthidx, optllambdaidx = np.unravel_index(
            r2_across_folds.flatten().argmax(), r2_across_folds.shape)

        # train again using optimal parameters, evaluate on outer test fold
        alphas = train_krr(X[:, otrain_all], Y[:, otrain_all],
                           kwidths[optkwidthidx], llambdas[optllambdaidx])
        yhat = apply_krr(alphas, X[:, otrain_all], X[:, otest],
                         kwidths[optkwidthidx])
        r2_outer[ofold] = compute_rsquare(yhat, Y[:, otest])

        # for comparison: predict with linear model
        w_est = train_ols(X[:, otrain_all], Y[:, otrain_all])
        y_est_lin = apply_ols(w_est, X[:, otest])
        r2_linear[ofold] = compute_rsquare(y_est_lin, Y[:, otest])

        print('Fold %d best kernel width %f best regularizer %f'
              ' rsquare %f rsquare linear %f' %
              (ofold, kwidths[optkwidthidx], llambdas[optllambdaidx],
               r2_outer[ofold], r2_linear[ofold]))

    # one box per model: column 0 = KRR, column 1 = linear
    pl.figure()
    pl.boxplot(np.vstack((r2_outer, r2_linear)).T)
    pl.ylabel('$r^2$')
    pl.xticks((1, 2), ('KRR', 'Lin'))
    pl.savefig('krr_vs_lin_comparison.pdf')
    return r2_outer, r2_linear
        break
    n += 1

# Bin edges built from the integers 0..20 divided by 10
# (0.0, 0.1, ..., 2.0 under true division -- presumably Python 3, confirm).
bins = np.array(list(range(0, 21))) / 10
# For every value in x, the index of the bin it falls into.
inds = np.digitize(x, bins)
print(bins)
print(inds)

# Collect the y values belonging to each bin index.
# NOTE(review): x_bin stays empty -- its only append is commented out.
x_bin = []
y_bin = []
# NOTE(review): range() stops before max(inds), so the highest occupied bin
# is not collected -- confirm this is intended.
for group in range(min(inds), max(inds)):
    # x_bin.append(x[inds == group])
    y_bin.append(np.array(y)[inds == group])

# First figure: one box per collected bin.
fig, ax = plt.subplots()
# presumably matplotlib's boxplot, where the 2nd and 3rd positional
# arguments are `notch` and `sym` (no notches, no flier markers) -- confirm.
boxplot(y_bin, 0, '')
# NOTE(review): labels are taken from `bins` in order, which lines up with
# the boxes only if min(inds) is 1 -- verify against the data.
ax.set_xticklabels(bins)
plt.xlabel("Reserve Price", fontsize=30)
ax.xaxis.set_tick_params(labelsize=20)
plt.ylabel("Impression Revenue", fontsize=30)
ax.yaxis.set_tick_params(labelsize=20)
plt.show()

# Second figure: raw scatter of y against x, restricted to
# 0 <= x <= 2 and 0 <= y <= 4.
fig, ax = plt.subplots()
ax.plot(x, y, '.', markersize=0.3, color='black')
plt.xlim(0, 2)
plt.ylim(0, 4)
plt.xlabel("Reserve Price", fontsize=30)
ax.xaxis.set_tick_params(labelsize=20)
plt.ylabel("Impression Revenue", fontsize=30)
ax.yaxis.set_tick_params(labelsize=20)
Esempio n. 57
0
#############################################################################

#############################################################################

# 1st component

# Column 0 of `projections`, split by the label stored in y (1, 2 or 3).
scz = projections[y == 1, 0]
scz_asd = projections[y == 2, 0]
asd = projections[y == 3, 0]
data = [scz, scz_asd, asd]

import pylab as P
import numpy as np
P.figure()
bp = P.boxplot(data)
# NOTE(review): two y-labels are set back to back; if `plt` is
# matplotlib.pyplot, the second call replaces the first -- confirm which
# label is wanted.
P.ylabel('Predicted')
plt.ylabel('Score on 1rst component')
P.xticks([1, 2, 3], ['SCZ', 'SCZ-ASD', 'ASD'])
# Overlay the individual points on each box, with a small random horizontal
# jitter around the box position (1, 2, 3).
for i in range(3):
    # This rebinds `y`, which held the labels until here.
    y = data[i]
    x = np.random.normal(1 + i, 0.04, size=len(y))
    P.plot(x, y, 'bo', alpha=0.6)

P.show()

# 2nd component
# Reload the labels, because `y` was overwritten by the plotting loop above,
# and drop the entries equal to 0.
y = np.load(DATA_Y)
y = y[y != 0]

scz = projections[y == 1, 1]
		while any(len(b.exp_frac_bi) < 10000 for b in bins):
			r = random.random()**2
			sim_states = [(random.random() < r, random.random() < r) for p in sample_pairs]
			num_bi = sum(p and m for p,m in sim_states)
			num_silent = sum(not (p or m) for p,m in sim_states)
			bins[num_silent].addsim(num_bi)
	
	if 'sayN' in o.plotstyle:
		Ns = [len(b.obs_frac_bi) for b in bins[:-1]]
		print 'min N =', min(Ns), 'max =', max(Ns)
	if 'violin' in o.plotstyle:
		violin_plot(pylab.axes(), [b.obs_frac_bi for b in bins[:-1]], [b.frac_silent for b in bins[:-1]], leftside=False, color='b', widthf=0.1)
		violin_plot(pylab.axes(), [random.sample(b.exp_frac_bi, len(o.obs_frac_bi)) for b in bins[:-1]], [b.frac_silent for b in bins[:-1]], rightside=False, color='r', widthf=0.1)
	if 'boxplot' in o.plotstyle:
		pylab.plot([b.frac_silent for b in bins], [numpy.mean(b.exp_frac_bi) for b in bins], color='r', label='expected')
		pylab.boxplot([b.obs_frac_bi for b in bins], positions=[b.frac_silent for b in bins], widths=0.5/len(sample_pairs))
	if 'mean_graph' in o.plotstyle:
		pylab.plot([b.frac_silent for b in bins], [numpy.mean(b.exp_frac_bi) for b in bins], color='r', label='expected')
		pylab.plot([b.frac_silent for b in bins], [numpy.mean(b.obs_frac_bi) for b in bins], color='b', label='observed')
	if 'mean_sem' in o.plotstyle:
		pylab.plot([b.frac_silent for b in bins], [numpy.mean(b.exp_frac_bi) for b in bins], color='r', label='expected')
		pylab.errorbar([b.frac_silent for b in bins], [numpy.mean(b.obs_frac_bi) for b in bins], [sem(b.obs_frac_bi) for b in bins], label='observed')
	if 'std' in o.plotstyle:
		pylab.plot([b.frac_silent for b in bins], [numpy.mean(b.exp_frac_bi) for b in bins], color='r', label='expected')
		pylab.errorbar([b.frac_silent for b in bins], [numpy.mean(b.obs_frac_bi) for b in bins], [numpy.std(b.obs_frac_bi) for b in bins], label='observed')
	if 'sayY' in o.plotstyle:
		print [numpy.mean(b.obs_frac_bi) for b in bins]
		print [numpy.mean(b.exp_frac_bi) for b in bins]
		from scipy import stats
		print 'paired t test', stats.ttest_rel([numpy.mean(b.obs_frac_bi) for b in bins if not b.hasNaN()], [numpy.mean(b.exp_frac_bi) for b in bins if not b.hasNaN()])
	pylab.title(' '.join(o.stages))
Esempio n. 59
0
def plotGCContent(all_result_outputs, label=''):
    """Box-plot the percentage of mutated reads against GC content.

    Read counts are merged across samples and summed, restricted to
    microhomologies of length 9 with an 'MH Dist' between 0 and 10
    (inclusive), and shown as one box per GC content value that has more than
    20 data points. The figure is shown without blocking and then saved via
    ``saveFig('gc_content_mh9')``.

    Args:
        all_result_outputs: sequence of per-sample results; for each item
            ``x`` the code reads ``x[0]['Data']`` (a DataFrame containing the
            columns used below) and ``x[0]['RegrLines']`` (indexed by
            microhomology length).
        label: not used by this function.
    """
    #Merge data across samples
    unique_cols = ['Oligo ID', 'Indel', 'GC Content', 'MH Len', 'MH Dist']
    datas = [
        x[0]['Data'][unique_cols + ['Indel Reads', 'Non-Null Reads']]
        for x in all_result_outputs
    ]
    merged_data = datas[0]
    for i, data in enumerate(datas[1:]):
        # The read-count columns of sample k (k >= 2) get the suffix 'k';
        # the first sample keeps the plain column names.
        merged_data = pd.merge(merged_data,
                               data,
                               on=unique_cols,
                               suffixes=('', '%d' % (i + 2)),
                               how='outer')

    def suffix(i):
        # Column suffix given to the i-th sample (0-based) by the merge above.
        return '%d' % (i + 1) if i > 0 else ''

    n_samples = len(datas)
    merged_data['Indel Reads Sum'] = merged_data[[
        'Indel Reads' + suffix(i) for i in range(n_samples)
    ]].sum(axis=1)
    merged_data['Non-Null Reads Sum'] = merged_data[[
        'Non-Null Reads' + suffix(i) for i in range(n_samples)
    ]].sum(axis=1)

    #Compute mean regression lines across samples for each MH length
    # NOTE: only needed by the regression adjustment, which is currently
    # disabled (see the residual computation below).
    mean_lines = {}
    for mh_len in range(2, 16):
        if mh_len not in all_result_outputs[0][0]['RegrLines']: continue
        regr_lines = [
            x[0]['RegrLines'][mh_len][:2] for x in all_result_outputs
        ]
        mean_lines[mh_len] = np.mean(regr_lines, axis=0)

    #Restrict to only MH dist in (0,10) and adjust for mh len-dist relationship
    for mh_len in [9]:
        in_range = ((merged_data['MH Len'] == mh_len)
                    & (merged_data['MH Dist'] >= 0)
                    & (merged_data['MH Dist'] <= 10))
        # Work on an explicit copy: adding columns to a filtered slice of
        # merged_data triggers pandas' SettingWithCopyWarning.
        sel_data = merged_data.loc[in_range].copy()
        sel_data['Perc Reads'] = sel_data[
            'Indel Reads Sum'] * 100.0 / sel_data['Non-Null Reads Sum']
        # The residual is currently the raw percentage; the adjustment
        #   - getRegrValue(row['MH Len'],row['MH Dist'],mean_lines)
        # is disabled.
        sel_data['Perc Reads Residual'] = sel_data['Perc Reads']

        PL.figure(figsize=(4, 4))
        gcs = sel_data['GC Content'].unique()
        gcs.sort()
        boxdata_lk = {
            gc:
            sel_data.loc[sel_data['GC Content'] == gc]['Perc Reads Residual']
            for gc in gcs
        }
        #Limit to GC with more than 20 data points
        gcs = [gc for gc in gcs if len(boxdata_lk[gc]) > 20]
        boxdata = [boxdata_lk[gc] for gc in gcs]
        print([len(x) for x in boxdata])
        PL.boxplot(boxdata)
        PL.ylabel('Percent total mutated reads of MH-mediated deletion')
        PL.xlabel('GC content of microhomologous sequence')
        PL.title('Microhomology of length %d\n(at max 10 distance)' % mh_len)
        # boxplot() draws the boxes at x = 1..len(gcs)
        PL.xticks(range(1, len(gcs) + 1), gcs)
        PL.show(block=False)
        saveFig('gc_content_mh%d' % mh_len)
Esempio n. 60
0
# Compression ratios of the blocks that hold more than 1000 transactions.
large_size_ratios = [
    ratio for size, ratio in zip(sizes, ratios) if size > 1000
]

print('best compression overall: ', max(ratios))

# Same three-line summary for all blocks and for the large blocks only,
# each preceded by a blank line.
for heading, sample in (
        ('Results for all blocks, n: ', ratios),
        ('Results for blocks with more than 1K txs, n: ', large_size_ratios),
):
    print()
    print(heading, len(sample))
    print('mean compression: ', np.mean(sample))
    print('median compression: ', np.median(sample))

# Scatter of compression ratio against block size, zoomed to 0.95..1.0.
plt.scatter(sizes, ratios, alpha=0.75)
plt.xlabel('transactions in block')
plt.ylabel('compression ratio')
plt.ylim((0.95, 1.0))
plt.grid(True)

# Group the compression ratios into buckets of 50 transactions: bucket k
# collects the blocks whose size is in 50*k .. 50*k + 49.
size_ratios_map = defaultdict(list)
for size, ratio in zip(sizes, ratios):
    size_ratios_map[size//50].append(ratio)

# Draw the buckets in ascending size order. boxplot() places the i-th dataset
# at x = i + 1, so the tick positions must be 1..n (not the bucket keys) for
# every label to sit under its own box.
bucket_keys = sorted(size_ratios_map)

plt.figure()
plt.boxplot([size_ratios_map[k] for k in bucket_keys])
plt.xlabel('transactions in block')
plt.ylabel('compression rate')
plt.ylim((0.95, 1.0))
plt.xticks(range(1, len(bucket_keys) + 1), [50*k for k in bucket_keys], rotation='vertical')
plt.tight_layout()
plt.show()