Example #1
def dispcouplets(fname, rows=2, cols=2, size='small',
                 divs=7, normalized=False):
  import numpy as np
  import matplotlib.pylab as plt
  chars=sorted(nletddict(fname, 1))
  sp = nletddict(fname, 2)
  mat = [[sp[ci+cj] for cj in chars] for ci in chars]
  matlab = [[ci+cj for cj in chars] for ci in chars]
  maxcount = max(map(max, mat))  # largest count across all rows, not the max of the lexicographically largest row
  l = len(chars)
  pos = np.arange(l)+.5
  for s in range(0, l, rows*cols):
    plt.figure()
    for i in range(rows*cols):
      if i+s<l:
        plt.subplot(rows, cols, i+1)
        plt.barh(pos,mat[i+s],align='center')
        plt.yticks(pos, [repr(second(c)) for c in matlab[i+s]])  # list comprehension: map() is lazy in Python 3
        plt.ylabel("couplets")
        plt.xlabel("count")
        if not normalized:
          plt.xticks(np.arange(divs+1)*maxcount/divs, size=size)
        else:
          plt.xticks(size=size)
        plt.title("The %d couplets that begin with %s" % (sum(mat[i+s]), repr(matlab[i+s][0][0])))
  plt.show()
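
A hedged usage sketch (`corpus.txt` is a hypothetical input file; `nletddict` and `second` are n-gram helpers assumed to be defined elsewhere in the same module):

# Plot couplet counts for a text file, four panels per figure.
dispcouplets('corpus.txt', rows=2, cols=2, size='x-small')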
Example #2
import numpy as np
import scipy.sparse as sp
import matplotlib.pylab as plb
from operator import itemgetter


def plot(W, idx2term):
    """
    Plot the interpretation of NMF basis vectors on the Medlars data set.

    :param W: Basis matrix of the fitted factorization model.
    :type W: `scipy.sparse.csr_matrix`
    :param idx2term: Index-to-term translator.
    :type idx2term: `dict`
    """
    print("Plotting highest weighted terms in basis vectors ...")
    for c in range(W.shape[1]):
        if sp.isspmatrix(W):
            top10 = sorted(enumerate(W[:, c].todense().ravel().tolist()[0]), key=itemgetter(1), reverse=True)[:10]
        else:
            top10 = sorted(enumerate(W[:, c].ravel().tolist()[0]), key=itemgetter(1), reverse=True)[:10]
        pos = np.arange(10) + .5
        val = [weight for _, weight in top10][::-1]
        plb.figure(c + 1)
        plb.barh(pos, val, color="yellow", align="center")
        plb.yticks(pos, [idx2term[idx] for idx, _ in top10][::-1])
        plb.xlabel("Weight")
        plb.ylabel("Term")
        plb.title("Highest Weighted Terms in Basis Vector W%d" % (c + 1))
        plb.grid(True)
        plb.savefig("documents_basisW%d.png" % (c + 1), bbox_inches="tight")
    print("... Finished.")
Example #3
def plmyfig(df, bgname, dirname, tar, count=10):
    #plot fig!
    print("Starting Plot %s %s" % (dirname, bgname))
    if len(df) > count:
        df = df.head(count)
    pos = np.arange(len(df)) + 0.5  # use numpy (assumes `import numpy as np`); plain pyplot has no arange
    ytick = _getTerm(df['Term_description'], df['Term_ID'], bgname)
    xs = [float(n) for n in df[' -log10(pvalue)']]
    ytick.reverse()
    xs.reverse()
    plt.barh(pos, xs, align='center', height=0.5, alpha=1, color='orange')
    plt.yticks(pos, ytick, size='x-small')
    plt.xlabel('$-Log10(pValue)$')
    plt.title('%s' % bgname)
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    try:
        plt.tight_layout()
    except ValueError:
        pass
    filename = os.path.join(tar, dirname, dirname + '_' + bgname)
    plt.savefig(filename + '.png', dpi=72)
    plt.savefig(filename + '.pdf')
    plt.close()
def main(argv=None):
    # assumes module-level: import sys, os, shutil, csv, pylab; from glob import glob
    if argv is None:
        argv = sys.argv
    if len(argv) != 4:
        print("Usage: " + argv[0] + " <report name> <report dir> <graph dir>")
        return 1

    report = argv[1]
    report_dir = argv[2]
    graph_dir = argv[3]

    file_list = glob(os.path.join(report_dir, 'part*'))
    #Copy the raw data file to the graph_dir
    raw_file = os.path.join(graph_dir, report + '.tsv')
    shutil.copyfile(file_list[0], raw_file)

    #Process the file into a graph, ideally I would combine the two into one but for now I'll stick with two
    data_file = csv.DictReader(open(raw_file, 'r', newline=''), fieldnames=['IP', 'Requests', 'Bytes'], delimiter="\t")
    ips = []
    requests = []
    num_bytes = []
    for row in data_file:
        ips.append(row['IP'])
        requests.append(int(row['Requests']))
        num_bytes.append(int(row['Bytes']))

    if len(ips) > 25:
        length = 25
    else:
        length = len(ips)

    fig = pylab.figure(1)
    pos = pylab.arange(length) + .5
    pylab.barh(pos, requests[:length], align='center', aa=True, ecolor='r')
    pylab.yticks(pos, ips[:length])
    pylab.xlabel('Requests')
    pylab.title('Top %d ips ordered by # of requests' % length)
    pylab.grid(True)

    #Save the figure
    pylab.savefig(os.path.join(graph_dir, report + '.pdf'), bbox_inches='tight', pad_inches=1)
def main(argv=None):
    # assumes module-level: import sys, os, shutil, csv, pylab; from glob import glob
    if argv is None:
        argv = sys.argv
    if len(argv) != 4:
        print("Usage: " + argv[0] + " <report name> <report dir> <graph dir>")
        return 1

    report = argv[1]
    report_dir = argv[2]
    graph_dir = argv[3]

    file_list = glob(os.path.join(report_dir, "part*"))
    # Copy the raw data file to the graph_dir
    raw_file = os.path.join(graph_dir, report + ".tsv")
    shutil.copyfile(file_list[0], raw_file)

    # Process the file into a graph, ideally I would combine the two into one but for now I'll stick with two
    data_file = csv.DictReader(open(raw_file, "r", newline=""), fieldnames=["page", "avgTime"], delimiter="\t")
    pages = []
    avg_time = []
    for row in data_file:
        pages.append(row["page"])
        avg_time.append(int(row["avgTime"]))

    if len(pages) > 25:
        length = 25
    else:
        length = len(pages)

    fig = pylab.figure(1)
    pos = pylab.arange(length) + 0.5
    pylab.barh(pos, avg_time[:length], align="center", aa=True, ecolor="r")
    pylab.yticks(pos, pages[:length])
    pylab.xlabel("Average Time Taken")
    pylab.title("Top %d pages ordered by average time taken" % length)
    pylab.grid(True)

    # Save the figure
    pylab.savefig(os.path.join(graph_dir, report + ".pdf"), bbox_inches="tight", pad_inches=2)
def plot_importance(model, df, max_features=10):
    '''
    plot feature importances as a horizontal bar chart
    :param model: fitted model exposing `feature_importances_`
    :param df: pandas dataframe of the training features
    :param max_features: int, maximum number of features to plot
    :return: None; renders a bar chart
    '''
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    # Show only top features
    pos = pos[-max_features:]
    feature_importance = (feature_importance[sorted_idx])[-max_features:]
    feature_names = (df.columns[sorted_idx])[-max_features:]

    plt.barh(pos, feature_importance, align='center')
    plt.yticks(pos, feature_names)
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
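
A hedged usage sketch (toy data; `GradientBoostingRegressor` is chosen only for illustration):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor

df_toy = pd.DataFrame(np.random.rand(200, 5), columns=list('abcde'))
target = df_toy['a'] * 3 + df_toy['b'] + 0.1 * np.random.rand(200)
model = GradientBoostingRegressor().fit(df_toy, target)
plot_importance(model, df_toy, max_features=5)
plt.show()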
    def matplot_metrics(self, metrics, topk=None):
        """
        可视化特征的得分
        :param metrics:
        :param topk:
        :return:
        """

        if isinstance(metrics[0], float):
            metrics = list(map(lambda x: round(x, 2), metrics))

        plt.rcParams['font.sans-serif'] = ['SimHei']
        data = list(zip(self.feature_name, metrics))
        data = sorted(data, key=lambda x: x[1])
        y = list(map(lambda x: x[0], data))
        x = list(map(lambda x: x[1], data))

        if topk:
            # the list is sorted ascending, so the top-k scores sit at the tail
            y = y[-topk:]
            x = x[-topk:]
        fig, ax = plt.subplots()
        b = ax.barh(range(len(y)), x, color='#6699CC')

        for rect in b:
            w = rect.get_width()
            ax.text(w,
                    rect.get_y() + rect.get_height() / 2,
                    '%s' % (w),
                    ha='left',
                    va='center')
        ax.set_yticks(range(len(y)))
        ax.set_yticklabels(y)
        plt.ylabel('特征')  # "feature"
        plt.xlabel('指标')  # "metric"
        plt.show()
def friends_barhplot():
    fig, ax = plt.subplots(figsize=(8, 1.5))
    reader = csv.reader(open('percents.csv', "r"))
    next(reader)
    prev_bar = 0
    for values in reader:
        bottom = prev_bar
        plt.barh(y='a',
                 width=float(values[1]) - bottom,
                 left=bottom,
                 color=values[2],
                 label=values[0])
        ax.text((float(values[1]) - bottom) / 2 + bottom,
                -0.53,
                values[0],
                ha='center')
        ax.text((float(values[1]) - bottom) / 2 + bottom,
                -0.04,
                str(round((float(values[1]) - bottom) * 100, 1)) + '%',
                ha='center')
        prev_bar = float(values[1])

    plt.axis('off')
    plt.show()
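
The function assumes a `percents.csv` with a header row and columns of label, cumulative fraction, and bar color. A hedged sketch that writes such a file with hypothetical data and then draws the chart:

import csv

rows = [("Close friends", 0.31, "#4c72b0"),
        ("Acquaintances", 0.74, "#dd8452"),
        ("Strangers", 1.00, "#55a868")]
with open("percents.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "cum_fraction", "color"])  # header, skipped by next(reader)
    writer.writerows(rows)
friends_barhplot()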
Example #9
print(next(color))
print(next(color))

# sector distribution chart
import numpy as np
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
fontprop = fm.FontProperties(fname="fonts/malgun.ttf")

top20 = sector_counts[0:20]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title(u'업종분포', fontproperties=fontprop)  # "sector distribution"
pos = np.arange(20)
pos = pos[::-1] # reverse pos list
plt.yticks(pos, list(top20.index), fontproperties=fontprop)  # in Python 3 the index labels are already str
plt.barh(pos, top20.values, align='center', color=colors_list, alpha=0.7)
# plt.show()

df_semi = df[df['sector']=='소프트웨어']  # the 'software' sector
print(df_semi.head(10))

# aggregate market cap (pandas.tools.pivot was removed in modern pandas; groupby is the equivalent)
ttable = df[['sector', 'marcap']]
sector_marcap = ttable.groupby('sector')['marcap'].sum()
sector_marcap = sector_marcap.sort_values(ascending=False)  # Series.sort() was removed; use sort_values
print(sector_marcap[:10])

# market cap by sector chart
import matplotlib.pylab as plt
# providing y is sufficient to generate the splits and
# hence np.zeros(n_samples) may be used as a placeholder for X
# instead of actual training data.
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

mean_scores = []
for kind in kinds:
    svc = LinearSVC(random_state=0)
    cv_scores = cross_val_score(svc,
                                connectivity_biomarkers[kind],
                                y=adhd_labels,
                                cv=cv,
                                groups=adhd_labels,
                                scoring='accuracy',
                                )
    mean_scores.append(cv_scores.mean())

###############################################################################
# Finally, we can display the classification scores.
plt.figure(figsize=(6, 4))
positions = np.arange(len(kinds)) * .1 + .1
plt.barh(positions, mean_scores, align='center', height=.05)
yticks = [kind.replace(' ', '\n') for kind in kinds]
plt.yticks(positions, yticks)
plt.xlabel('Classification accuracy')
plt.grid(True)
plt.tight_layout()

plt.show()
def plot_feature_importances_cancer(model, train_data, feature_names):
    n_features = train_data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
Example #12
def test_accuracy(svm, brt, prepro_data, col_names, pca=False):
    # in this function we are really computing the accuracy, as well as
    # plotting the solution of the best brt algorithm
    # get X_test and y_test
    X_test = prepro_data['X_test']
    y_test = prepro_data['y_test']
    # get scaling factors and apply them to X_test; note: only to the indices that
    # are numeric -- verify this against the normalization in the prepro function
    scaler = prepro_data['Scaler']
    pca_ob = prepro_data['pca_object']
    idx_to_norm = prepro_data['idx_to_norm']

    # get data to normalize; copy() so X_test itself is not modified
    # (slicing a numpy array with [:] returns a view, not a copy)
    X_test_norm = X_test.copy()
    X_to_norm = X_test[:, idx_to_norm]
    X_norm = scaler.transform(X_to_norm)
    X_test_norm[:, idx_to_norm] = X_norm
    
    # get pca object and project X_test
    if not pca:
        X_test_proj = X_test_norm
    else:
        X_test_proj = pca_ob.transform(X_test_norm)
    

    #get the best estimator
    svm_best = svm.best_estimator_
    brt_best = brt.best_estimator_
    
    print(svm_best)
    print(brt_best)
    
    y_pred_svm = svm_best.predict(X_test_proj)
    y_pred_brt = brt_best.predict(X_test_proj)
    
    print(np.mean(y_test == y_pred_svm))
    acc_svm = metrics.accuracy_score(np.ravel(y_test), y_pred_svm)
    acc_brt = metrics.accuracy_score(np.ravel(y_test), y_pred_brt)
    
    print "Accuracy svm: ", acc_svm
    print "Accuracy brt: ", acc_brt

    print "-----------------"
    print " Classification report SVM"
    print(metrics.classification_report(y_test, y_pred_svm))   

    print "-----------------"
    print " Classification report BRT"
    print(metrics.classification_report(y_test, y_pred_brt)) 

    feature_importance = brt_best.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    # plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    cl_n = np.array(col_names)
    plt.yticks(pos, cl_n[sorted_idx])
    plt.xlabel('Relative importance')
    
    #plot some partial dependence plots    
    plt.figure()
    sorted_idx_list = sorted_idx.tolist()
    features = sorted_idx_list[0:4] #+ [(sorted_idx_list[0],sorted_idx_list[1])]
    
    fig, axs = plot_partial_dependence(brt_best, prepro_data['X_train_proj'], features, n_jobs=-1, grid_resolution=2000)
    fig.suptitle('Partial dependence of ECI data')
    plt.subplots_adjust(top=0.9)
    
    #boxplot for predicted probabilities
    
    pred_prob = brt_best.predict_proba(X_test_proj)
    y_1 = pred_prob[y_test.ravel() == 1, 1]
    y_0 = pred_prob[y_test.ravel() == 0, 1]
    
    plt.figure()
    plt.boxplot([y_0.tolist(), y_1.tolist()], labels=['0', '1'])
    plt.ylabel('Predicted probability')
    plt.xlabel('Transaction')
    
    plt.figure()
    
    xx = np.arange(len(pred_prob))
    
    print(len(xx))
    print(len(pred_prob))
    print(len(y_test))
    plt.scatter(xx[y_test.ravel() == 0], pred_prob[y_test.ravel() == 0, 1], s=60, c='r', alpha=0.6, label='Trans = 0')
    plt.scatter(xx[y_test.ravel() == 1], pred_prob[y_test.ravel() == 1, 1], s=60, c='b', alpha=0.6, label='Trans = 1')
    plt.legend(scatterpoints=1)
    plt.ylabel('Predicted probability')
    plt.xlabel('# sample')
Example #13
#==============================================================================
"""----------------------- feature importances ------------------------- """
#==============================================================================
"""--------------- feature selection with GradientBoosting -------------- """

gbc = ensemble.GradientBoostingClassifier()
gbc.fit(X, y)

# Get Feature Importance from the classifier
feature_importance = gbc.feature_importances_
# Normalize The Features
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(16, 12))
plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
plt.yticks(pos, np.asanyarray(df.columns[1:].tolist())[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.savefig('Relative_Importance_GBoosting_5c.png')
plt.show()
"""--------------- selection de variables adaboost -------------- """

gbc = ensemble.AdaBoostClassifier()
gbc.fit(X, y)
# Get Feature Importance from the classifier
feature_importance = gbc.feature_importances_
# Normalize The Features
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
import pandas as pd
import matplotlib.pylab as plt
cars = pd.read_csv("C:\\project\\Python\\cars.csv")
# Rename the column of "Unnamed" to "model"
cars = cars.rename(columns={'Unnamed: 0': 'model'})
# print(cars.describe())
print(cars.head())
#########################
y1 = cars['hp']
y2 = cars['disp']
x = range(32)
plt.plot(x, y1)
plt.plot(x, y2)
# plt.legend()
# plt.show()
plt.plot(x, y1, linewidth=2.0, color='c')  # 'c' = cyan; bare 'C' is not a valid matplotlib color
plt.stackplot(x, y1, color='purple', alpha=0.7)
plt.plot(x, y2, linewidth=1.0, color='r')
plt.stackplot(x, y2, color='black', alpha=0.5)
# plt.show() #Print the area plot
###########
x1 = cars['model'].tolist()
# Add a figure to adjust figsize
fig = plt.figure(figsize=(30, 15))
# See how hp changes with bar plot
# plt.bar(x1,y1,color='purple', alpha=0.8)
plt.barh(x1, y1, color='purple', alpha=0.8)
plt.show()
Example #15
def hist(x,bins, weight=None, weights=None, index=None, 
         norm=None, frac=False, total=False, dist=False,
         cumulative=False, revcumulative=False,
         bottom=None, filled=False, 
         **kwargs):
    '''
    norm: if set, rescale the histogram so its maximum equals this value
    frac: plot each bin as the fraction that the `index` subset makes up of the entire sample
    '''
    rotate = kwargs.pop('rotate',False)
    noplot = kwargs.pop('noplot',False)
    if bottom is None: bottom=0.0
    if index is not None:
        xx = x[index]
        if weight is not None and weights is not None:
            raise ValueError('Supply only one of weight and weights; both names are accepted for convenience')
        if weight is not None:
            ww = weight[index]
        elif weights is not None:
            ww = weights[index]
        else:
            ww = None
    else:
        xx = x
        ww = weight
    
    v,l = np.histogram(xx,bins,weights=ww)
    d = np.diff(l)
    l = l[:-1] + d/2.0
    
    if frac:
        vv = np.histogram(x, bins, weights=weight)[0]  # pass weights by keyword; the third positional arg is `range`
        ii = np.where(vv == 0)
        v = np.array(v)*1.0/np.array(vv)
        v[ii] = 0
    
    if cumulative:
        v = np.cumsum(v)
    if revcumulative:
        v = np.cumsum(v[::-1])[::-1] 
    
    if norm is not None:
        v = v/float(np.max(v))*float(norm)
    if total:
        v = v / (1.0*np.sum(v))
    
    if dist:
        v /= d
    
    if bottom is not None:
        v += bottom
    # if rotate:
    #     l,v = v,l
    if not noplot:
        if filled:
            kwargs.setdefault('align', 'center')
            # hack to fix pylab.bar's coloring 
            if 'color' not in kwargs:
                kwargs['color'] = next(pylab.gca()._get_lines.color_cycle)
            if rotate:
                pylab.barh(l,v-bottom, height=d, left=bottom, **kwargs)
            else:
                pylab.bar(l,v-bottom, width=d, bottom=bottom, **kwargs)
        else:
            if rotate:
                pylab.step(v,l, where='mid', **kwargs)
            else:
                pylab.step(l,v, where='mid', **kwargs)
    # if rotate:
    #     l,v = v,l
    return l,v
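
A minimal usage sketch (assuming `import numpy as np` and `import matplotlib.pylab as pylab`, which the function body expects):

import numpy as np
import matplotlib.pylab as pylab

x = np.random.normal(size=1000)
bins = np.linspace(-4, 4, 41)
l, v = hist(x, bins, filled=True, color='steelblue', alpha=0.6)  # filled bar histogram
l, v = hist(x, bins, cumulative=True)                            # cumulative step outline
pylab.show()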
#Plot x-y
plt.cla() #clears past graphs
plt.plot(data1,data2) #plt.plot creates quick plot 
plt.cla() #clears past graphs
plt.plot(data1,data3)

#Plot scatter
plt.cla()
plt.scatter(carDf.PRICE, carDf.WEIGHT, color='r') 

#Plot bar charts
plt.cla()
plt.bar(carDf.ID, carDf.PRICE)
plt.cla()
plt.barh(carDf.ID, carDf.WEIGHT)
plt.yticks(carDf.ID, carDf.MODEL)

#Plot pie chart
plt.cla()
plt.pie(carDf.PRICE, labels=carDf.MODEL, shadow=True, autopct='%1.1f')

#Plot a histogram
plt.cla()
plt.hist(data3, color='g')
plt.title('Demo Histogram')
plt.xlabel('Sin Weights')
plt.ylabel('Frequency')

#Plot a boxplot
plt.cla() #pass a list of lists
Example #17
    def plot(self):
        def colorhash(s):
            colorsum = 0
            for i in range(len(s)):
                modpos = 255**(i % 3)
                colorsum += ord(s[i])*modpos
            colorsum = colorsum % 0xffffff
            return '#%0*X' % (6, colorsum)

        filters = {'U': 'darkviolet',
                   'B': 'blue',
                   'V': 'green',
                   'R': 'red',
                   'I': 'maroon',
                   'u': 'darkviolet',
                   'v': 'purple',
                   'b': 'blue',
                   'y': 'yellow',
                   'hbn': 'darkcyan', 'hbw': 'cyan',
                   'han': 'firebrick', 'haw': 'darkred',
                   'clear': 'ghostwhite',
                   'up': 'midnightblue',
                   'gp': 'steelblue',
                   'rp': 'orangered',
                   'ip': 'brown',
                   'zp': 'maroon',
                   'ThAr night': 'orange', 'ThAr day': 'orange',
                   'ThAr bad weather': 'orange', 'Bias STELLA2': 'black',
                   'Flat Field long': 'slategrey',
                   'Twilight Sky Spectra': 'skyblue',
                   'Vega': 'azure'}

        plt.subplot(1, 1, 1)
        obj_pos = np.arange(len(self.objects))
        objarray = list(self.objects)
        for i in range(len(self.data['object'])):
            width = self.data['exptime'][i]/86400.0
            left = self.data['dateobs'][i]
            obj = self.data['object'][i]

            ypos = objarray.index(obj)
            cfilter = str(self.data['filter'][i]).rstrip()
            if cfilter in filters:
                filtercol = filters[cfilter]
            elif obj in filters:
                filtercol = filters[obj]
            else:
                #print('"%s" "%s"' % (cfilter, obj))
                #print(colorhash(obj))
                filtercol = colorhash(obj)
                #filtercol='c'

            plt.barh(ypos, width, left=left, align='center', color=filtercol, edgecolor=filtercol)
        plt.yticks(obj_pos, self.objects)
        plt.xlabel('Performance')
        if self.instrument == 1:
            plt.title('Observation Log STELLA2')
        elif self.instrument == 2:
            plt.title('Observation Log STELLA1')
        plt.subplots_adjust(top=0.90, left=0.22, bottom=0.10, right=0.95)

        try:
            start = self.data['dateobs'][0]
        except IndexError:
            start = datetime.datetime.now() - datetime.timedelta(hours=12)
        try:
            end = self.data['dateobs'][-1] + datetime.timedelta(hours=2)
        except IndexError:
            end = datetime.datetime.now()
        hours = []
        labels = []
        start = start.replace(minute=0, second=0, microsecond=0)
        current = start + datetime.timedelta(hours=1)
        while current < end:
            hours.append(current)
            labels.append('%02dh' % current.hour)
            current += datetime.timedelta(hours=2)
        plt.xticks(hours, labels)
        plt.xlabel('time')
        plt.ylim(-1, len(self.objects))
        plt.grid()
        plt.savefig('obslog%d.svg' % self.instrument)
        #plt.show()
        plt.close()
Example #18
Here is the number of events associated with the negative class:

no_CME_data = negative_result[0]
negative_class = negative_result[1]
print("There are", len(no_CME_data), "no-CME events in the negative class.")

## Step 3: Feature selection

Some of the features within a data set may be powerful for distinguishing between the positive and negative class, whereas others may be redundant or irrelevant. To identify features in the former category, we use a univariate feature selection method, which is implemented in the feature selection module of the scikit-learn library, for feature scoring.

To improve the performance of the feature selection algorithm, we'll normalize each feature so that they lie within similar ranges. To do this, we subtract from every feature its median value and divide by its standard deviation.

CME_data = np.array(CME_data)
no_CME_data = np.array(no_CME_data)

def normalize_the_data(flare_data):
    flare_data = np.array(flare_data)
    n_elements = flare_data.shape[0]
    for j in range(flare_data.shape[1]):
        standard_deviation_of_this_feature = np.std(flare_data[:, j])
        median_of_this_feature = np.median(flare_data[:, j])
        for i in range(n_elements):
            flare_data[i, j] = (
                flare_data[i, j] - median_of_this_feature) / (standard_deviation_of_this_feature)
    return flare_data


no_CME_data = normalize_the_data(no_CME_data)
CME_data = normalize_the_data(CME_data)

print("There are", no_CME_data.shape[0], "flares with no associated CMEs.")
print("There are", CME_data.shape[0], "flares with associated CMEs.")

Let's look at the distribution of one feature for the active regions that both flared and produced a CME (green) and for the active regions that flared but did not produce a CME (red). You can change the value of `i` in the code block below to see that some features are totally useless as there is barely any difference in the distributions for the positive and negative class. As such, we can throw such features out of our sample. It's a good idea to do some feature selection before running the SVM, so as to reduce noise (in this case, with only 18 features, there's not too much noise to begin with). 

sharps = ['Total unsigned flux', 'Mean gradient of total field',
          'Mean current helicity (Bz contribution)', 'Mean photospheric magnetic free energy',
          'Fraction of Area with Shear > 45 deg', 'Total unsigned current helicity',
          'Mean gradient of horizontal field', 'Mean characteristic twist parameter, alpha',
          'Mean angle of field from radial', 'Mean gradient of vertical field',
          'Mean vertical current density', 'Total unsigned vertical current',
          'Sum of the modulus of the net current per polarity',
          'Total photospheric magnetic free energy density', 'Mean shear angle',
          'Area of strong field pixels in the active region', 'Sum of flux near polarity inversion line',
          'Absolute value of the net current helicity']

i = 2

# For the positive class (green)
mu_fl = np.mean(CME_data[:, i])
sigma_fl = np.std(CME_data[:, i])
num_bins = 15
n_fl, bins_fl, patches_fl = plt.hist(
    CME_data[:, i], num_bins, density=True, facecolor='green', alpha=0.5)  # `normed` was removed from Matplotlib; use density
y_fl = scipy.stats.norm.pdf(bins_fl, mu_fl, sigma_fl)
plt.plot(bins_fl, y_fl, 'g--', label='positive class')

# For the negative class (red)
mu_nofl = np.mean(no_CME_data[:, i])
sigma_nofl = np.std(no_CME_data[:, i])
n_nofl, bins_nofl, patches_nofl = plt.hist(
    no_CME_data[:, i], num_bins, density=True, facecolor='red', alpha=0.5)
y_nofl = scipy.stats.norm.pdf(bins_nofl, mu_nofl, sigma_nofl)
plt.plot(bins_nofl, y_nofl, 'r--', label='negative class')

text_style = dict(fontsize=16, fontdict={'family': 'monospace'})
plt.xlabel('Normalized '+sharps[i], **text_style)
plt.ylabel('Number (normalized)', labelpad=20, **text_style)
fig = plt.gcf()
fig.set_size_inches(10, 5)
fig.savefig('fscore_tmp.png', bbox_inches='tight')
legend = plt.legend(loc='upper right', fontsize=12, framealpha=0.0, title='')
legend.get_frame().set_linewidth(0.0)

Now we will compute the Univariate F-score for feature selection. It is a very simple method: the F-score measures the distance between the two distributions for a given feature (inter-class distance), divided by the sum of the variances for this feature (intra-class distance). We can use the `sklearn.feature_selection` module to do this:

# import the feature selection method
from sklearn.feature_selection import SelectKBest, f_classif
# select the number of features
N_features = 18
Nfl = CME_data.shape[0]
Nnofl = no_CME_data.shape[0]
yfl = np.ones(Nfl)
ynofl = np.zeros(Nnofl)
# k is the number of features
selector = SelectKBest(f_classif, k=N_features)
selector.fit(np.concatenate((CME_data, no_CME_data), axis=0),
             np.concatenate((yfl, ynofl), axis=0))
scores = selector.scores_
print(scores)
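
For intuition, each score behaves like the simple two-class Fisher ratio sketched below (a simplified illustration; `f_classif` itself computes the ANOVA F-value):

# Simplified Fisher-style score for feature i: squared distance between the
# class means (inter-class) over the sum of the class variances (intra-class).
mu_pos, mu_neg = np.mean(CME_data[:, i]), np.mean(no_CME_data[:, i])
var_pos, var_neg = np.var(CME_data[:, i]), np.var(no_CME_data[:, i])
print((mu_pos - mu_neg) ** 2 / (var_pos + var_neg))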

It's not easy to interpret the scores in this fashion, so let's plot the results. The higher the Univariate Fisher Score, the more predictive the feature.

plt.clf()
order = np.argsort(scores)
orderedsharps = [sharps[i] for i in order]
y_pos2 = np.arange(18)
plt.barh(y_pos2, sorted(scores/np.max(scores)))
plt.ylim((-1, 19))
plt.yticks(y_pos2, orderedsharps, fontsize=12)
plt.xlabel('Normalized Fisher Score', fontsize=15)
plt.title('Normalized Univariate Fisher Score Per Feature', fontsize=15)
plt.subplots_adjust(left=0.5, right=1.0)
fig = plt.gcf()
fig.set_size_inches(9, 5)
plt.show()

## Step 4: The support vector machine

Now we initialize the support vector machine on the data. The SVM uses non-linear decision functions to map the feature space into a higher-dimensional space, where the positive and negative examples can be separated linearly by a hyperplane. <br>

This is incredibly non-intuitive. But we can think of a simpler example. Suppose we had two classes: CME-producing and non-CME-producing active regions. And suppose we had two features: the total flux in these regions, and the total area of these regions. We could construct a two-dimensional feature space, where we plot the flux against the area of each active region. Positive examples could be indicated by an X and negative ones by an O. In theory, if our data behaved well, we could draw a line between these classes. <br>

Since we have 18 features, the SVM constructs an 18-dimensional feature space. In this feature space, the decision boundary separating the positive and negative examples may be non-linear. As such, the algorithm then enlarges this 18-dimensional feature space (using the function indicated by the `kernel` parameter in the [`svm.SVC`](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) function) into a higher-dimensional feature space wherein it is possible to linearly separate the positive and negative classes. There are lots of people working on how to [visualize these multi-dimensional feature spaces](https://github.com/tmadl/highdimensional-decision-boundary-plot), which is an active area of research.
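
To make the simpler two-feature picture concrete, here is a hedged toy sketch (synthetic data, purely illustrative):

import numpy as np
from sklearn import svm

# two synthetic features (think: total flux and total area) for 20 regions
rng = np.random.RandomState(0)
X_toy = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(3, 1, (10, 2))])
y_toy = np.array([0] * 10 + [1] * 10)
toy_clf = svm.SVC(kernel='linear').fit(X_toy, y_toy)
print(toy_clf.predict([[0.0, 0.0], [3.0, 3.0]]))  # a separating line learned in 2-D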

number_of_examples = Nfl + Nnofl
C = 4.0
gamma = 0.075
class_weight = {1: 6.5}
clf = svm.SVC(C=C, gamma=gamma, kernel='rbf', class_weight=class_weight,
              cache_size=500, max_iter=-1, shrinking=True, tol=1e-8, probability=True)

## Step 5: Stratified k-folds cross-validation

Now we run and evaluate the performance of the SVM. There are lots of different ways to evaluate the performance of a classifier, which we discuss in Section 4 of [Bobra & Couvidat (2015)](https://arxiv.org/abs/1411.1405). We're going to choose a metric called the True Skill Score, or the TSS, which we can calculate from four quantities: true positives, true negatives, false positives, and false negatives. We prefer the TSS to all the other metrics as it is insensitive to the class imbalance ratio and thus best for comparison to other groups. The TSS is symmetrically distributed about 0: i.e., it goes from [-1, 1] where 0 represents no skill and a negative value represents a perverse prediction. Thus we are able to predict CMEs in a fashion better than randomly guessing. Here we define a confusion table to measure the performance of our binary classification: <br>

def confusion_table(pred, labels):
    """
    computes the number of TP, TN, FP, FN events given the arrays with predictions and true labels
    and returns the true skill score

    Args:
    pred: np array with predictions (1 for flare, 0 for nonflare)
    labels: np array with true labels (1 for flare, 0 for nonflare)

    Returns: true negative, false positive, true positive, false negative
    """
    Nobs = len(pred)
    TN = 0.
    TP = 0.
    FP = 0.
    FN = 0.
    for i in range(Nobs):
        if (pred[i] == 0 and labels[i] == 0):
            TN += 1
        elif (pred[i] == 1 and labels[i] == 0):
            FP += 1
        elif (pred[i] == 1 and labels[i] == 1):
            TP += 1
        elif (pred[i] == 0 and labels[i] == 1):
            FN += 1
        else:
            print("Error! Observation could not be classified.")
    return TN, FP, TP, FN
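
In terms of these four quantities, the skill score used below is simply TSS = TP/(TP+FN) - FP/(FP+TN), i.e. the hit rate minus the false-alarm rate.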

Now we run the SVM on our data and cross-validate our results. In our case, the positive sample size is quite small (both objectively and compared to the negative sample size). Therefore, we use a stratified k-folds cross-validation method, which makes k partitions of the data set and uses k-1 folds for training the SVM and 1 fold for testing the trained SVM. The stratification preserves the ratio of positive to negative examples per fold. Then we can permute over the partitions such that each partition eventually makes its way into the testing set. For each individual testing set, we can calculate a skill score. Then we can average the skill scores over the total number of testing sets. 

To compute the TSS, we must first select a value of k. k can be arbitrarily defined and take any value between 2 and `number_of_examples`, so we can explore this parameter space. As k approaches `number_of_examples`, the k-fold method reduces to the Leave One Out method, in which only one example is in the testing set and all other examples are in the training set. The literature suggests this method is not the best, so we steer away from high values of k. Many studies (e.g. [Kohavi, 1995](http://web.cs.iastate.edu/~jtian/cs573/Papers/Kohavi-IJCAI-95.pdf)) recommend stratified 10-fold cross-validation to reduce variance and bias. Here, we test that recommendation by computing the TSS for 50 values of k, ranging from 2 to 51.

# lists to hold the TSS and standard deviation of the TSS
array_of_avg_TSS = np.ndarray([50])
array_of_std_TSS = np.ndarray([50])

# xdata are the examples
# ydata are the labels
xdata = np.concatenate((CME_data, no_CME_data), axis=0)
ydata = np.concatenate((np.ones(Nfl), np.zeros(Nnofl)), axis=0)

# mdata contain metadata about the active region that will be useful
# when we interpret the results using LIME
mdata = np.concatenate((positive_class, negative_class), axis=0)

# compute the TSS for a variety of k ranging from 2 to 51
# this is to see how the TSS varies as a function of k, and to test if k=10 really makes sense
for k in range(2, 52):
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    these_TSS_for_this_k = []
    for train_index, test_index in skf.split(xdata, ydata):
        # xtrain are the examples in the training set
        xtrain = xdata[train_index]
        # ytrain are the labels in the training set
        ytrain = ydata[train_index]
        # xtest are the examples in the testing set
        xtest = xdata[test_index]
        ytest = ydata[test_index]    # ytest are the labels in the testing set
        # metadata useful for interpreting with LIME
        mtrain = mdata[train_index]
        # metadata useful for interpreting with LIME
        mtest = mdata[test_index]
        clf.fit(xtrain, ytrain)
        TN, FP, TP, FN = confusion_table(clf.predict(xtest), ytest)
        if (((TP+FN) == 0.0) or (FP+TN) == 0.0):
            these_TSS_for_this_k.append(np.nan)
            continue
        else:
            these_TSS_for_this_k.append(TP/(TP+FN) - FP/(FP+TN))
    TSS_k = np.array(these_TSS_for_this_k)
    array_of_avg_TSS[k-2] = np.mean(TSS_k)
    array_of_std_TSS[k-2] = np.std(TSS_k)

Now we can plot the mean TSS per k, using the standard deviation as the error in the TSS. We see that for high values of k, the standard deviation in the TSS can be greater than the mean. These points are indicated in blue.

fig, ax = plt.subplots(figsize=(10, 8))      # define the size of the figure
orangered = (1.0, 0.27, 0, 1.0)              # create an orange-red color
cornblue = (0.39, 0.58, 0.93, 1.0)           # create a cornflower-blue color

# define some style elements
marker_style_red = dict(linestyle='', markersize=8,
                        fillstyle='full', color=orangered, markeredgecolor=orangered)
marker_style_blue = dict(linestyle='', markersize=8,
                         fillstyle='full', color=cornblue, markeredgecolor=cornblue)
text_style = dict(fontsize=16, fontdict={'family': 'monospace'})

# ascribe the data to the axes
k = np.arange(50)+2
for i in range(50):
    if (array_of_avg_TSS[i] > array_of_std_TSS[i]):
        ax.errorbar(k[i], array_of_avg_TSS[i],
                    yerr=array_of_std_TSS[i], linestyle='', color=orangered)
        ax.plot(k[i], array_of_avg_TSS[i], 'o', **marker_style_red)
    if (array_of_avg_TSS[i] <= array_of_std_TSS[i]):
        ax.errorbar(k[i], array_of_avg_TSS[i],
                    yerr=array_of_std_TSS[i], linestyle='', color=cornblue)
        ax.plot(k[i], array_of_avg_TSS[i], 'o', **marker_style_blue)

# set plot limits
plt.xlim(0, 52)
plt.ylim(0, 1.1)

# label the axes and the plot
ax.set_xlabel('k', **text_style)
ax.set_ylabel('TSS', labelpad=20, **text_style)
plt.title(r'TSS per k using stratified k-fold cross-validation', **text_style)
fig = plt.gcf()
fig.set_size_inches(10, 5)

As such, we confirm that high k-values result in a high variance. We find it reasonable to use the stratified 10-fold cross-validation method to compute the TSS and will follow this recommendation. Therefore we report this score as our final result:

print("The TSS equals", array_of_avg_TSS[9],
      "plus or minus", array_of_std_TSS[9], ".")

## Addendum : Local Interpretable Model-Agnostic Explanations (LIME)

Machine-learning is a powerful technique that can help us predict CMEs. However, our goal is not only to predict CMEs, but also to quantitatively understand which signatures indicate the imminent eruption of a CME. But the practical successes of machine-learning algorithms are often not matched by successes in understanding, and this has become an issue within the machine-learning community ([Rahimi and Recht, 2017](http://www.argmin.net/2017/12/11/alchemy-addendum/)).

The SVM is a good model to start with, because it is (relatively) simple and we can use the Univariate Fisher Score to identify the most predictive features. But it would also be useful to figure out why each individual active region was classed as positive or negative. To do this, we can use a tool called [LIME](https://github.com/marcotcr/lime) (or Local Interpretable Model-Agnostic Explanations). <br>

First, we initialize the LIME explainer:

explainer = lime.lime_tabular.LimeTabularExplainer(
    xtrain, feature_names=sharps, class_names=['CME', 'no CME'], discretize_continuous=True)

Then we use the explainer to explain its choice for a particular active region. To do this, the LIME module generates neighborhood data by randomly perturbing the values of the features associated with this active region. If, for any given feature, this perturbation does not change the outcome of the prediction, this feature isn't useful along the perturbed dimension. If, for any given feature, the perturbation does change the outcome of the prediction, this feature is useful along the perturbed dimension. Thus the explainer can determine which features are useful under which conditions.

i = np.random.randint(0, xtest.shape[0])
exp = explainer.explain_instance(xtest[i], clf.predict_proba, num_features=8)

Now we can visualize the results. The bottom left panel shows the probabilities assigned to this particular example (which are computed by the SVM via the `probability=True` parameter). The right panel plots the weights per feature (and indicates the values of these weights at the end of each horizontal bar). The text describes the conditions under which this feature is predictive.

print("Here is the prediction explanation for NOAA Active Region",
      mtest[i][1], "(HARPNUM ", mtest[i][0], "),\n which produced a", mtest[i][2], "class flare on", mtest[i][3], ".")
exp.show_in_notebook(show_table=False, show_all=False)

Here is the same information in words:

explained_list = exp.as_list()
for i in range(len(explained_list)):
    if (explained_list[i][1]) < 0:
        feature_sign = 'no CME'
    else:
        feature_sign = 'CME'
    print("The following condition:", explained_list[i][0], "\n predicts",
          feature_sign, "with a model weight of", abs(explained_list[i][1]), ".")
Example #19
fig, ax = plt.subplots(figsize=(8, 5))
Cert = np.asarray([.1, .6, .85, .4, .1])
Prob = np.asarray([.6, .2, .1, .05, .05])
objects = ('Worse a lot', 'Worse a little', 'Same', 'Better a little',
           'Better a lot')
N = len(objects)
y_pos = len(objects) - np.arange(len(objects)) - 1
colors = rvb(1 - Cert)

# Add colormap
y = np.unique(Cert)
newcmp = ListedColormap(rvb(1 - y))
plot = plt.scatter(y, y, c=y, cmap=newcmp, alpha=0.5)
plt.clf()
plt.colorbar(plot)

#! Barplot
plt.barh(y_pos, Prob, align='center', alpha=0.5, color=colors)
plt.yticks(y_pos, objects)
plt.xlabel('Probability')
plt.title('HAB Condition Forecast')

# Turns off grid on the left Axis.
ax.grid(False)

#! Add current date
sns.despine(ax=ax, offset=10)
fig.tight_layout()
plt.savefig("./PNG/Fig_prediction.png", dpi=600)  # save before show(); the figure may be blank afterwards
plt.show()
Example #20
print(ax2)
ax3 = plt.subplot(2,2,3)
plt.plot(x3)
print(ax3)
ax4 = plt.subplot(2,2,4)
plt.plot(x4)
print(ax4)


y = [2,3,1]
x = np.arange(len(y))
xlabel = ['가', '나', '다']
plt.bar(x,y)
plt.xticks(x, xlabel)

plt.barh(x,y)
plt.yticks(x, xlabel)
plt.show()

men = [20, 35, 30, 35, 27]
women = [25, 32, 34, 20, 25]
x = np.arange(len(men))
plt.bar(x,men,width=0.3,label='men')
plt.bar(x+0.3,women,width=0.3,label='women')
plt.xticks(x+0.15, np.arange(len(men)))
plt.legend(loc=0)


data = np.random.normal(0,6,100)
plt.boxplot(data)
    def plot_predictions(self):
        for i in range(0,10):
            data = self.get_next_batch(train=False)[2] # get a test batch
            num_classes = self.test_data_provider.get_num_classes()
            NUM_ROWS = 2
            NUM_COLS = 4
            NUM_IMGS = NUM_ROWS * NUM_COLS
            NUM_TOP_CLASSES = min(num_classes, 4) # show this many top labels
            
            label_names = self.test_data_provider.batch_meta['label_names']
            label_names = [label.split(',')[0] for label in label_names]
            if self.only_errors:
                preds = n.zeros((data[0].shape[1], num_classes), dtype=n.single)
            else:
                preds = n.zeros((NUM_IMGS, num_classes), dtype=n.single)
                rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
                data[0] = n.require(data[0][:,rand_idx], requirements='C')
                data[1] = n.require(data[1][:,rand_idx], requirements='C')
            data += [preds]

            # Run the model
            self.libmodel.startFeatureWriter(data, self.sotmax_idx)
            self.finish_batch()
            
            fig = pl.figure(i)
            fig.text(.4, .95, '%s test case predictions' % ('Mistaken' if self.only_errors else 'Random'))
            if self.only_errors:
                err_idx = nr.permutation(n.where(preds.argmax(axis=1) != data[1][0,:])[0])[:NUM_IMGS] # what the net got wrong
                data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:]
                
            data[0] = self.test_data_provider.get_plottable_data(data[0])
            for r in range(NUM_ROWS):
                for c in range(NUM_COLS):
                    img_idx = r * NUM_COLS + c
                    if data[0].shape[0] <= img_idx:
                        break
                    pl.subplot(NUM_ROWS*2, NUM_COLS, r * 2 * NUM_COLS + c + 1)
                    pl.xticks([])
                    pl.yticks([])
                    try:
                        img = data[0][img_idx,:,:,:]
                    except IndexError:
                        # maybe greyscale?
                        img = data[0][img_idx,:,:]
                    img = ndimage.rotate(img,270)    
                    pl.imshow(img, interpolation='nearest')
                    true_label = int(data[1][0,img_idx])

                    img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:]
                    pl.subplot(NUM_ROWS*2, NUM_COLS, (r * 2 + 1) * NUM_COLS + c + 1, aspect='equal')

                    ylocs = n.array(range(NUM_TOP_CLASSES)) + 0.5
                    height = 0.5
                    width = max(ylocs)
                    pl.rc('font', **{'family': 'serif', 'serif': ['Helvetica']})
                    pl.rc('text', usetex=True)
                    pl.rc('font', size=9)  # 'text.fontsize' is no longer a valid rc key; use font.size
                    pl.barh(ylocs, [l[0]*width for l in img_labels], height=height, \
                            color=['r' if l[1] == label_names[true_label] else 'b' for l in img_labels])
                    pl.title(label_names[true_label])
                    pl.yticks(ylocs + height/2, [l[1] for l in img_labels])
                    pl.xticks([width/2.0, width], ['50%', ''])
                    pl.ylim(0, ylocs[-1] + height*2)
            pl.savefig('preds%i.png'%(i))
Example #22
dist = {}
prev = ""
for line in f:
    pos = line.rfind('unit')
    words = line.split(' ')
    if (pos != -1):
        diff = get_ms(line[:pos].strip()) - get_ms(prev)

        if dist.get(words[3]) is None:
            dist[words[3]] = diff
        else:
            dist[words[3]] += diff
    prev = words[0]
for key in dist:
    dist[key] = dist[key] / n


import matplotlib.pylab as plt

lists = sorted(dist.items(),
               key=lambda x: x[1])  # sorted by key, return a list of tuples

x, y = zip(*lists)  # unpack a list of pairs into two tuples
plt.figure(figsize=(10, 10))
plt.barh(x, y, color=(0.2, 0.4, 0.6, 0.6))
plt.xlabel('init time, ms')
plt.ylabel('module names')
plt.tight_layout()
plt.savefig('mod_dist.png')
df.loc[:4, "chrom"]


df["test"] = "test"


df.groupby(df.chrom).mean()


# kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter'}

df.coverage.plot(kind="barh")  # plt.barh needs y positions and widths; the pandas plot API takes `kind`


df = pd.DataFrame({'coverage':[632, 1638, 569, 115, 433, 1130, 754, 555, 345],
                     'sample':["A", "A", "A", "B","B","B", "C","C","C"],
                     'chrom':[1,2,3,1,2,3,1,2,3]})

plt.hist(df.coverage, bins=10)
plt.xlabel("X")
plt.ylabel("Y")
plt.xlim(0,1000)
plt.title("Title")

Example #24
    def plot_predictions(self):
        epoch, batch, data = self.get_next_batch(
            train=False)  # get a test batch
        num_classes = self.test_data_provider.get_num_classes()
        NUM_ROWS = 2
        NUM_COLS = 4
        NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[
            0].shape[1]
        NUM_TOP_CLASSES = min(num_classes, 5)  # show this many top labels
        NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs']
        PRED_IDX = 1

        label_names = [
            lab.split(',')[0]
            for lab in self.test_data_provider.batch_meta['label_names']
        ]
        if self.only_errors:
            preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single)
        else:
            preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single)
            #rand_idx = nr.permutation(n.r_[n.arange(1), n.where(data[1] == 552)[1], n.where(data[1] == 795)[1], n.where(data[1] == 449)[1], n.where(data[1] == 274)[1]])[:NUM_IMGS]
            rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
            if NUM_IMGS < data[0].shape[1]:
                data = [
                    n.require(d[:, rand_idx], requirements='C') for d in data
                ]


#        data += [preds]
# Run the model
        print([d.shape for d in data], preds.shape)
        self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name])
        IGPUModel.finish_batch(self)
        print(preds)
        data[0] = self.test_data_provider.get_plottable_data(data[0])

        if self.save_preds:
            if not gfile.Exists(self.save_preds):
                gfile.MakeDirs(self.save_preds)
            preds_thresh = preds > 0.5  # Binarize predictions
            data[0] = data[0] * 255.0
            data[0][data[0] < 0] = 0
            data[0][data[0] > 255] = 255
            data[0] = n.require(data[0], dtype=n.uint8)
            dir_name = '%s_predictions_batch_%d' % (os.path.basename(
                self.save_file), batch)
            tar_name = os.path.join(self.save_preds, '%s.tar' % dir_name)
            tfo = gfile.GFile(tar_name, "w")
            tf = TarFile(fileobj=tfo, mode='w')
            for img_idx in range(NUM_IMGS):
                img = data[0][img_idx, :, :, :]
                imsave = Image.fromarray(img)
                prefix = "CORRECT" if data[1][0, img_idx] == preds_thresh[
                    img_idx, PRED_IDX] else "FALSE_POS" if preds_thresh[
                        img_idx, PRED_IDX] == 1 else "FALSE_NEG"
                file_name = "%s_%.2f_%d_%05d_%d.png" % (prefix, preds[
                    img_idx, PRED_IDX], batch, img_idx, data[1][0, img_idx])
                #                gf = gfile.GFile(file_name, "w")
                file_string = StringIO()
                imsave.save(file_string, "PNG")
                tarinf = TarInfo(os.path.join(dir_name, file_name))
                tarinf.size = file_string.tell()
                file_string.seek(0)
                tf.addfile(tarinf, file_string)
            tf.close()
            tfo.close()
            #                gf.close()
            print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name)
        else:
            fig = pl.figure(3, figsize=(12, 9))
            fig.text(
                .4, .95, '%s test samples' %
                ('Mistaken' if self.only_errors else 'Random'))
            if self.only_errors:
                # what the net got wrong
                if NUM_OUTPUTS > 1:
                    err_idx = [
                        i for i, p in enumerate(preds.argmax(axis=1))
                        if p not in n.where(data[2][:, i] > 0)[0]
                    ]
                else:
                    err_idx = n.where(data[1][0, :] != preds[:, 0].T)[0]
                    print(err_idx)
                err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS))
                data[0], data[1], preds = data[0][:, err_idx], data[
                    1][:, err_idx], preds[err_idx, :]

            import matplotlib.gridspec as gridspec
            import matplotlib.colors as colors
            cconv = colors.ColorConverter()
            gs = gridspec.GridSpec(NUM_ROWS * 2,
                                   NUM_COLS,
                                   width_ratios=[1] * NUM_COLS,
                                   height_ratios=[2, 1] * NUM_ROWS)
            #print data[1]
            for row in range(NUM_ROWS):
                for col in range(NUM_COLS):
                    img_idx = row * NUM_COLS + col
                    if data[0].shape[0] <= img_idx:
                        break
                    pl.subplot(gs[(row * 2) * NUM_COLS + col])
                    #pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1)
                    pl.xticks([])
                    pl.yticks([])
                    img = data[0][img_idx, :, :, :]
                    pl.imshow(img, interpolation='lanczos')
                    show_title = data[1].shape[0] == 1
                    true_label = [int(data[1][0, img_idx])
                                  ] if show_title else n.where(
                                      data[1][:, img_idx] == 1)[0]
                    #print true_label
                    #print preds[img_idx,:].shape
                    #print preds[img_idx,:].max()
                    true_label_names = [label_names[i] for i in true_label]
                    img_labels = sorted(zip(preds[img_idx, :], label_names),
                                        key=lambda x: x[0])[-NUM_TOP_CLASSES:]
                    #print img_labels
                    axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col])
                    height = 0.5
                    ylocs = n.array(range(NUM_TOP_CLASSES)) * height
                    pl.barh(ylocs, [l[0] for l in img_labels], height=height, \
                            color=['#ffaaaa' if l[1] in true_label_names else '#aaaaff' for l in img_labels])
                    #pl.title(", ".join(true_labels))
                    if show_title:
                        pl.title(", ".join(true_label_names),
                                 fontsize=15,
                                 fontweight='bold')
                    else:
                        print(true_label_names)
                    pl.yticks(ylocs + height / 2, [l[1] for l in img_labels],
                              x=1,
                              backgroundcolor=cconv.to_rgba('0.65', alpha=0.5),
                              weight='bold')
                    for line in enumerate(axes.get_yticklines()):
                        line[1].set_visible(False)
                    #pl.xticks([width], [''])
                    #pl.yticks([])
                    pl.xticks([])
                    pl.ylim(0, ylocs[-1] + height)
                    pl.xlim(0, 1)
Example #25
                 textcoords='offset points',
                 fontsize=16,
                 arrowprops=dict(arrowstyle='->'))
plt.show()

#==============================================================================
# various matplotlib plot types
#==============================================================================
# using Korean text in matplotlib
# if the NanumGothic font is installed, apply it like this; Korean strings must be given as Unicode
mpl.rc('font', family='nanumgothic')

#bar chart
#http://matplotlib.org/1.5.1/api/pyplot_api.html#matplotlib.pyplot.bar
#http://matplotlib.org/1.5.1/api/pyplot_api.html#matplotlib.pyplot.barh
y = [2, 3, 1]
x = np.arange(len(y))
xlabel = [u'가', u'나', u'다']
plt.bar(x, y, align='center')  # this way the bars are centered on the ticks
plt.xticks(x, xlabel)
plt.show()
# error bars can be added with xerr and yerr
people = (u'가', u'나', u'다', u'라', u'마')
y_pos = np.arange(len(people))
performance = 3 + 10 * np.random.rand(len(people))
error = np.random.rand(len(people))
plt.barh(y_pos, performance, xerr=error, align='center', alpha=0.4)
plt.yticks(y_pos, people)
plt.xlabel(u'x 라벨')  # "x label"
# drawing two bar charts at once
    kinds
}]

cv = StratifiedShuffleSplit(n_splits=15, random_state=0, test_size=5)
gs = GridSearchCV(pipe,
                  param_grid,
                  scoring='accuracy',
                  cv=cv,
                  verbose=1,
                  refit=False)

######################################################################
# fit grid search
gs.fit(pooled_subjects, classes)
mean_scores = gs.cv_results_['mean_test_score']
scores_std = gs.cv_results_['std_test_score']

######################################################################
plt.figure(figsize=(6, 4))
positions = np.arange(len(kinds) + 1) * .1 + .1
plt.barh(positions, mean_scores, align='center', height=.05, xerr=scores_std)
yticks = ['dummy'] + list(gs.cv_results_['param_connectivity__kind'].data[1:])
yticks = [t.replace(' ', '\n') for t in yticks]
plt.yticks(positions, yticks)
plt.xlabel('Classification accuracy')
plt.gca().grid(True)
plt.gca().set_axisbelow(True)
plt.tight_layout()

plt.show()
Example #27
def sellout_analysis( discography, artist ):

  """

  sellout_analysis

  Conducts 'sellout' analysis on audio features

  Inputs: discography, list        -  discography of artist, with each
                                      element being a dictionary of songs,
                                      with the following keys:

                                        'album_name'    - name of album song appears on
                                        'album_date'    - date of album release
                                        'song_title'    - title of song
                                        'track number'  - number it appears on
                                                          album

  Outputs: sellout_feature, string - which feature in features is indicative
                                     of the artist selling out

              sellout_index, int   - index of the album at which the artist sold out
                               
  """

  if discography is None and artist is None:

    return False

  # first kill off any albums which have no data
  # discography = dict()

  # for album in raw_discography:

  #   skipit = all( [ item == {} for key, item in raw_discography['SIX HITS']['tracks'].items() ] )
  
  #   if skipit:

  #     pass

  #   else:
      
  #     discography[ album ] = raw_discography[ album ]

  # for album, songs in raw_discography.items():

  #   if raw_discography[ album ]  	

  feature_names = [ 'energy', 'liveness', 'speechiness',
                   'acousticness',
                   'valence', 'danceability']

  # are these features global maxes or mins for 
  # 'hardcore' bands?
  feature_max_mins = { 
                       'energy':         'max',
                       'liveness':       'max',
                       'speechiness':    'min',
                       'acousticness':   'min',
                       'valence':        'min',
                       'danceability':   'min'
                      }

  # collect median feature values for each album
  for album, songs in discography.items():

    # initialise
    for f in feature_names:

      discography[ album ][ f ] = []

    for song, features in discography[ album ][ 'tracks' ].items():

      if features != {}:

        # and features
        for f in feature_names:

          discography[ album ][ f ].append( features[ f ] )    	
       
  # calculate the medians, standard deviations
  n_features = len( feature_names )

  n_albums = len( discography )

  # keep track of best stuff
  best_score = -np.inf
  best_vals = None
  best_feature = None
  best_sellout_index = None

  for ifeat, feature in enumerate( feature_names ):

    vals = []

    album_dates = []

    album_titles = []

    for album, data in discography.items():

      album_dates.append( data[ 'album_date' ] )

      vals.append( np.median( discography[ album ][ feature ] ) )      
  
      album_titles.append( album )

    # sort these vals by album date
    sort_inds = np.argsort( album_dates )

    vals = [ vals[ i ] for i in sort_inds ]

    album_titles = [ album_titles[ i ] for i in sort_inds ]

    album_dates = [ album_dates[ i ] for i in sort_inds ]

    print vals, feature

    # plot if monotonic max found
    if feature_max_mins[ feature ] == 'max':

      score, sellout_index = score_max( vals )

    else:

      score, sellout_index = score_min( vals )
      	
    if score > best_score:

      best_score = score
      best_vals = vals
      best_feature = feature
      best_sellout_index = sellout_index

  # plot the best feature
  fig, ax = plt.subplots( 1 )

  plt.barh( range( n_albums ), best_vals, align='center',
            color=cm.PuBu( 0.5 ), edgecolor=cm.PuBu( 0.9 ) )

  fig.patch.set_facecolor('white')

  # Remove all ticks
  ax.xaxis.set_ticks_position('none')
  ax.yaxis.set_ticks_position('none')

  # Remove spines
  spines_to_remove = ['top', 'right']

  for spine in spines_to_remove:

    ax.spines[ spine ].set_visible( False )

  # make lines almost black
  almost_black = '#262626'

  spines_to_keep = ['bottom', 'left']

  for spine in spines_to_keep:

    ax.spines[ spine ].set_linewidth( 0.5 )

    ax.spines[ spine ].set_color( almost_black )

  # tex font
  plt.rc( 'text', usetex=True )
  plt.rc( 'font', family='serif' )

  # sort out y limits
  plt.ylim( [ -0.5, n_albums - 0.5 ] )

  # xlabel = feature
  plt.xlabel( best_feature[ 0 ].upper() + best_feature[ 1 : ] )

  # yticks = album names
  titles_years = [ a[ 0 ] + a[ 1 : ].lower() + ' (' + str( album_dates[ ialbum ] ) + ')' for ialbum, a in enumerate( album_titles ) ]

  plt.yticks( range( len( album_titles ) ), titles_years )

  # title text
  sellout_album = album_titles[ best_sellout_index ]

  # make a goofy title
  title = 'Scientific$^{*}$ proof that ' + artist + ' sold out after recording ' + sellout_album[ 0 ] + sellout_album[ 1 : ].lower()

  # put a disclaimer
  plt.text( 0, -1, '$^{*}$in no way scientific' )

  plt.title( title )

  # tight layout
  plt.tight_layout()

  # save
  plt.savefig( artist + '.pdf' )

  return best_feature, best_sellout_index
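# score_max and score_min are not defined in this example. A hypothetical
# sketch of what such scorers might look like, assuming they reward a
# monotonic run-up to a peak and return the peak's index; the names match the
# calls above, but the scoring rule here is an assumption, not the original.
import numpy as np

def score_max(vals):
    """Score how strongly vals rises to its maximum.

    Returns (score, index of the maximum): the summed positive steps before
    the peak minus the magnitude of the steps after it, so a clean monotonic
    rise to a late peak scores highest.
    """
    vals = np.asarray(vals, dtype=float)
    peak = int(np.argmax(vals))
    diffs = np.diff(vals)
    score = diffs[:peak].sum() - np.abs(diffs[peak:]).sum()
    return score, peak

def score_min(vals):
    """Hypothetical mirror of score_max for features expected to fall."""
    return score_max([-v for v in vals])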
Beispiel #28
0
from opyenxes.data_in.XUniversalParser import XUniversalParser
from opyenxes.classification.XEventAttributeClassifier import XEventAttributeClassifier
from collections import defaultdict
from matplotlib import pylab as plt
import numpy as np


with open("xes_file/example_compress_log.xes.gz") as file:
    logs = XUniversalParser().parse(file)

classifier = XEventAttributeClassifier("Resource", ["Resource"])
people_dict = defaultdict(lambda: 0)
for log in logs:
    for trace in log:
        for event in trace:
            people = classifier.get_class_identity(event)
            people_dict[people] += 1

name_people = list(people_dict.keys())
posicion_y = np.arange(len(name_people))
units = list(map(lambda people: people_dict[people], name_people))
plt.barh(posicion_y, units, align ="center")
plt.yticks(posicion_y, name_people)
plt.xlabel('Number of activities')
plt.title("Activities done by people")
plt.show()
# bar chart: bar (vertical, default) / barh (horizontal)
# bar(x axis, y axis)
y = [2,3,1]
x = np.arange(3)
xlabel = ['A','B','C']
plt.bar(x,y)
plt.xticks(x,xlabel)
plt.grid(True)
plt.show()

# barh
np.random.seed(0)
yLabel = ['A','B','C','D']
yPos = np.arange(4)
yValue = 2+10*np.random.rand(4)
plt.barh(yPos,yValue, alpha=0.5)    # alpha: bar transparency
plt.yticks(yPos,yLabel)     # horizontal bars, so yticks rather than xticks
plt.grid(True)
plt.show()

# histogram
# hist
# bins: sets the intervals into which the data are aggregated
np.random.seed(0)
x = np.random.randn(1000)
arrays, bins, patches = plt.hist(x, bins=10)     # ten bins
print(arrays)
# [  9.  20.  70. 146. 217. 239. 160.  86.  38.  15.]
print(bins)
# [-3.04614305 -2.46559324 -1.88504342 -1.3044936  -0.72394379 -0.14339397
#   0.43715585  1.01770566  1.59825548  2.1788053   2.75935511]
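# The bins argument also accepts explicit edges, and (in matplotlib versions
# where density= has replaced the older normed=) the bars can be normalized to
# a probability density. A short sketch:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
x = np.random.randn(1000)
edges = np.linspace(-4, 4, 17)           # 16 equal-width bins
counts, edges, patches = plt.hist(x, bins=edges, density=True, alpha=0.5)
print(counts.sum() * np.diff(edges)[0])  # ~1.0: a density integrates to one
plt.show()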
Beispiel #30
0
Nnofl = no_CME_data.shape[0]
yfl = np.ones(Nfl)
ynofl = np.zeros(Nnofl)
selector = SelectKBest(f_classif, k=N_features)  # k is the number of features
selector.fit(np.concatenate((CME_data, no_CME_data), axis=0),
             np.concatenate((yfl, ynofl), axis=0))
scores = selector.scores_
print scores

#interpret the scores in plot:
mpld3.disable_notebook()
plt.clf()
order = np.argsort(scores)
orderedsharps = [sharps[i] for i in order]
y_pos2 = np.arange(19)
plt.barh(y_pos2, sorted(scores / np.max(scores)), align='center')
plt.ylim((-1, 19))
plt.yticks(y_pos2, orderedsharps)
plt.xlabel('Normalized Fisher Score', fontsize=15)
plt.title('Ranking of SHARP features', fontsize=15)
fig = plt.gcf()
fig.set_size_inches(8, 10)
fig.savefig('sharp_ranking_48hours.png', bbox_inches='tight')
plt.show()

#Pearson linear correlation coefficients
xdata = np.concatenate((CME_data, no_CME_data), axis=0)
ydata = np.concatenate((np.ones(Nfl), np.zeros(Nnofl)), axis=0)

for i in range(len(sharps)):
    for j in range(len(sharps)):
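# The Pearson double loop above is cut off at this point. A minimal sketch of
# what a pairwise computation could look like, assuming xdata has shape
# (n_samples, n_features) aligned with sharps; the loop body is an assumption,
# not the original code.
import numpy as np

corr = np.corrcoef(xdata, rowvar=False)  # feature-by-feature correlation matrix
for i in range(len(sharps)):
    for j in range(i + 1, len(sharps)):
        print('%s vs %s: r = %+.3f' % (sharps[i], sharps[j], corr[i, j]))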
# bar chart
# bar(x, y); x gives the bar positions on the x axis, y the bar heights

y = [2, 3, 1]
x = np.arange(3)
xlabel = ['A', 'B', 'C']
plt.bar(x, y)
plt.xticks(x, xlabel)
#plt.show()

np.random.seed(0)
yLabel = ['A', 'B', 'C', 'D']
yPos = np.arange(4)
yValue = 2 + 10 * np.random.rand(4)

plt.barh(yPos, yValue, alpha=0.5)  # alpha is the transparency, from 0 to 1
plt.yticks(yPos, yLabel)
#plt.show()
plt.cla()

x = np.random.randn(1000)
bins = plt.hist(x, bins=10)
print(bins)

arrays, bins, patches = plt.hist(x, bins=10)
print(bins)

plt.hist(x, bins=10)
#plt.show()

# pie chart
            which_model, catchment, output_dem, run_id,
            model_swe_sc_threshold),
    dpi=600)

model_swe = np.nanmean(ann_av_swe, axis=0)
mean_swe = np.full(np.arange(0, 3600 + 1, 200).shape, np.nan)
area = np.full(np.arange(0, 3600 + 1, 200).shape, np.nan)
for i, x in enumerate(np.arange(0, 3600 + 1, 200)):
    mean_swe[i] = np.nanmean(model_swe[np.logical_and(nztm_dem > x,
                                                      nztm_dem <= x + 200)])
    area[i] = np.nansum(np.logical_and(nztm_dem > x,
                                       nztm_dem <= x + 200)) * .25 * .25
fig, ax = plt.subplots(figsize=(4, 4))
# plt.barh(np.arange(0, 3600 + 1, 200) + 100, mean_swe_dsc * area / 1e6, height=200, label='dsc_snow')
plt.barh(np.arange(0, 3600 + 1, 200) + 100,
         mean_swe * area / 1e6,
         height=200,
         label='clark')
plt.yticks(np.arange(0, 3600 + 1, 400))
plt.ylim(0, 3600)
plt.ylabel('Elevation (m)')
plt.xlabel('Average snow storage (cubic km)')
plt.tight_layout()
# fig.savefig(plot_folder + '/hist av snow storage clark.png')
#
# model_max_swe = np.nanmean(ann_max_swe, axis=0)
# max_swe = np.full(np.arange(0, 3600 + 1, 200).shape, np.nan)
# area = np.full(np.arange(0, 3600 + 1, 200).shape, np.nan)
# for i, x in enumerate(np.arange(0, 3600 + 1, 200)):
#     max_swe[i] = np.nanmean(model_max_swe[np.logical_and(nztm_dem > x, nztm_dem <= x + 200)])
#     area[i] = np.nansum(np.logical_and(nztm_dem > x, nztm_dem <= x + 200)) * .25 * .25
# fig, ax = plt.subplots(figsize=(4, 4))
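# The masking loop above can also be written with np.digitize, which assigns
# each grid cell an elevation band in one pass. A rough equivalent sketch,
# assuming nztm_dem and model_swe as above; the open-ended top band differs
# slightly from the original's fixed 200 m bins.
import numpy as np

edges = np.arange(0, 3600 + 1, 200)
band = np.digitize(nztm_dem, edges, right=True)  # band b: edges[b-1] < z <= edges[b]
mean_swe_alt = np.array([np.nanmean(model_swe[band == b + 1])
                         if np.any(band == b + 1) else np.nan
                         for b in range(len(edges))])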

df_active_top=df_confirmed_top-df_recovered_top - df_deaths_top
df_active1_top=df_confirmed_top-df_recovered_top

#df_active_top.to_csv(odir+'active-data-top-affected-countries.csv')
#df_active1_top.to_csv(odir+'active1-data-top-affected-countries.csv')


# In[9]:


plt.figure(figsize=(10,10))
plt.title('Total Number of Cases',fontsize=15)
y_pos=np.arange(0,len(top_affected_countries))
p1 = plt.barh(y_pos,width=np.flip((df_confirmed_top[date].values)),align='center',label='Confirmed')
p2 = plt.barh(y_pos,width=np.flip((df_active_top[date].values)),align='center', label='Active')
plt.yticks(y_pos,np.flip(top_affected_countries),rotation=0,fontsize=13)
plt.ylim(0,len(top_affected_countries))
plt.legend(fontsize=14)
plt.xlabel('Log Scale',fontsize=12)
plt.xscale('log')
plt.grid()
#plt.xlim([9e4,0])
plt.tight_layout()
plt.savefig(odir+'Confirmed_active_hor.png')
#pd.plotting.table(data=df_confirmed_top['3/21/20'])


# In[10]:
Beispiel #34
0
import numpy as np
import matplotlib as mpl
import matplotlib.pylab as plt
y = [2, 3, 1]
x = np.arange(len(y))
print(x)
xlabel = ['x label']
error = np.random.rand(len(y))
plt.title("Bar Chart")
plt.barh(x, y, alpha=0.5, xerr=error)  # alpha is the transparency; xerr draws horizontal error bars
plt.show()

# Stem plot (a bar chart whose bars have no width)
x = np.linspace(0.1, 2 * np.pi, 10)
plt.title("Stem Plot")
plt.stem(x, np.cos(x), '-.')
plt.show()

# Pie chart
label = ['Java', 'C', 'C++', 'Python']
sizes = [15, 30, 45, 10]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
plt.title('Pie Chart')
plt.pie(sizes, labels=label, colors=colors,
        startangle=90)  # shadow=True would add a drop shadow
plt.axis('equal')  # keep the pie circular
plt.show()

# Histogram (a bar-graph rendering of a frequency table)
x = np.random.randn(1000)
plt.title("histogram")
Beispiel #35
0
def tradeanalyze4(rpt,fname,outputDir,data):
    
    """ エグジット要因ごとの勝率
    
    """
    print "\n--------------------------------------------" 
    print "- Exit  \n"
    print 'Entry -> Exit : profit rate [win/total]'    
    cc = []
    cd = []
    count = 1
    for t in ['close','t/p','s/l']:
        f = filter(lambda n:n[6]==t ,rpt.buyorder)
        cc.append(len(f))        
        pc,nc = winlossCount(f)
        if pc+nc>0:
            t0 = "%6.2f %% [ %3d / %3d ]" % (100*float(pc)/float(pc+nc),pc,\
            (pc+nc))            
            cd.append(100*float(pc)/float(pc+nc))
            t1 = "Long - %6s : " % (t)            
            print t1 + t0
            t3 = "$$T0040%02d$$" % (count)  
            count = count+1
            data.update({t3:t0})
        else:
            t3 = "$$T0040%02d$$" % (count)
            count = count+1
            data.update({t3:'NA'})
            cd.append(0.)
    for t in ['close','t/p','s/l']:
        f = filter(lambda n:n[6]==t ,rpt.sellorder)
        cc.append(len(f))        
        pc,nc = winlossCount(f)
        if pc+nc>0:
            t0 = "%6.2f %% [ %3d / %3d ]" % (100*float(pc)/float(pc+nc),pc,\
                (pc+nc))           
            cd.append(100*float(pc)/float(pc+nc))
            t1 = "Short- %6s : " % (t)            
            print t1 + t0
            t3 = "$$T0040%02d$$" % (count)  
            count = count+1
            data.update({t3:t0})
        else:
            t3 = "$$T0040%02d$$" % (count)
            count = count+1
            data.update({t3:'NA'})
            cd.append(0.)

    s = np.array([cd[3],cd[4],cd[5]])   # sell (Short) win rates
    b = np.array([cd[0],cd[1],cd[2]])   # buy (Long) win rates
    
    x = np.array(range(len(s)))
    xt = ['close','t/p','s/l']
    gw = 0.4
    plt.figure(figsize=(5,2))
    plt.title('Profit rate by exit ')
    plt.xlabel("Profit rate")
    plt.xlim(0,100)
    #plt.ylim(0+0.5,5-0.5)    
    plt.yticks(x,xt)
    plt.grid(True)
    plt.barh(x -gw/2, s, height = gw, align='center',color='b',alpha=0.5,\
            label='Short')
    plt.barh(x +gw/2, b, height = gw, align='center',color='g',alpha=0.5,\
            label='Long'    )
    plt.legend()
    plt.axvline(50,color='r')
    plt.savefig(outputDir+fname+"test41.png",dpi=72)
    data.update({'$$I004001$$':fname+"test41.png"})       
    #plt.show()
    plt.draw()
    

    ac = cc[0]+cc[3]
    tc = cc[1]+cc[4]
    sc = cc[2]+cc[5]
    tt = ac+tc+sc    

    
    print " "
    t0 = "%6.2f %% [ %4d / %4d ]" % (100*float(ac)/float(tt),ac,tt)
    t1 = "%6.2f %% [ %4d / %4d ]" % (100*float(tc)/float(tt),tc,tt)   
    t2 = "%6.2f %% [ %4d / %4d ]" % (100*float(sc)/float(tt),sc,tt)
    print "Exit  : rate [count/total]" 
    print "Close : " + t0
    print "t/p   : " + t1
    print "s/l   : " + t2
    data.update({'$$T005001$$':t0})    
    data.update({'$$T005002$$':t1})    
    data.update({'$$T005003$$':t2})    
    
    from matplotlib import cm
    plt.figure(figsize=(2,2))
    names =[ 'close','t/p', 's/l']
    # the share of each exit type
    ratios = [ac, tc,sc]
    # how far each wedge is exploded from the center
    moves=(0, 0, 0)
    # map a reasonable set of colors
    col = cm.Set2(np.arange(3)/3.,0.7)
    # draw the pie chart (with a shadow)
    plt.pie(ratios, explode=moves, labels=names, autopct='%1d%%',\
    shadow=True,colors=col)
    # it's a pie chart, so keep the aspect ratio equal
    plt.gca().set_aspect('equal')
    plt.title('Exit count')
    plt.savefig(outputDir+fname+"test42.png",dpi=72)
    data.update({'$$I004002$$':fname+"test42.png"})       
    #plt.show()
    plt.draw()
    plt.close('all')    
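# The paired barh calls at x - gw/2 and x + gw/2 above are the standard recipe
# for grouping two horizontal series around shared tick positions. A
# standalone sketch with made-up numbers:
import numpy as np
import matplotlib.pyplot as plt

labels = ['close', 't/p', 's/l']
long_rate = [48.0, 70.0, 33.3]   # toy win rates, not real results
short_rate = [55.0, 40.0, 62.5]
y = np.arange(len(labels))
gw = 0.4
plt.barh(y - gw / 2, long_rate, height=gw, align='center', label='Long')
plt.barh(y + gw / 2, short_rate, height=gw, align='center', label='Short')
plt.yticks(y, labels)
plt.axvline(50, color='r')       # 50 % break-even line
plt.xlim(0, 100)
plt.legend()
plt.show()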
Beispiel #36
0
print(bestY_pred)


########################################################################
# From here on, optionally plot the results

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, gbm0.train_score_, 'b-', label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

plt.show()


# Find the most influential features
feature_importance = gbm0.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, data[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
# Estimators = estimator.estimators_
# for index, model in enumerate(Estimators):
#     filename = 'iris_' + str(index) + '.pdf'
#     dot_data = tree.export_graphviz(model , out_file=None,
#                          feature_names=featureName,
#                          class_names='CM_purchase',
#                          filled=True, rounded=True,
#                          special_characters=True)
#     graph = pydot.graph_from_dot_data(dot_data)
#     # Display in an IPython terminal or Jupyter notebook.
#     #Image(graph.create_png())
#     graph[0].write_pdf(filename)
############ Feature importance visualization #######################
y_importances = estimator.feature_importances_
x_importances = featureName
y_pos = np.arange(len(x_importances))
# Horizontal bar chart
plt.barh(y_pos, y_importances, align='center')
plt.yticks(y_pos, x_importances)
plt.xlabel('Importances')
plt.xlim(0, 1)
plt.title('Features Importances')
plt.show()

# Vertical bar chart
plt.bar(y_pos, y_importances, width=0.4, align='center', alpha=0.4)
plt.xticks(y_pos, x_importances)
plt.ylabel('Importances')
plt.ylim(0, 1)
plt.title('Features Importances')
plt.show()
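# Both charts above plot importances in raw feature order; sorting first (as
# the earlier GBDT example does with np.argsort) usually reads better. A
# sketch reusing y_importances and x_importances from above:
import numpy as np
import matplotlib.pyplot as plt

order = np.argsort(y_importances)  # ascending, so the largest bar ends up on top
y_pos = np.arange(len(order))
plt.barh(y_pos, y_importances[order], align='center')
plt.yticks(y_pos, np.asarray(x_importances)[order])
plt.xlabel('Importances')
plt.title('Features Importances (sorted)')
plt.show()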
Beispiel #38
0
y = [2, 3, 1]
x = np.arange(3)
xlabel = ['A', 'B', 'C']
plt.bar(x, y)
plt.xticks(x, xlabel)
plt.grid(True)
plt.show()


np.random.seed(0)

yLabel = ['A', 'B', 'C', 'D']
yPos = np.arange(4)
yValue = 2+10*np.random.rand(4)

plt.barh(yPos, yValue, alpha=0.5)
plt.yticks(yPos, yLabel)
plt.show()

# Histogram

# A histogram is created with the hist command; use the bins argument.
# bins: sets the intervals into which the data are aggregated.

np.random.seed(0)
x = np.random.randn(1000)
plt.hist(x, bins=10)
plt.show()

# Pie chart
Beispiel #39
0
def hist(x,
         bins,
         weight=None,
         weights=None,
         index=None,
         norm=None,
         frac=False,
         total=False,
         dist=False,
         cumulative=False,
         revcumulative=False,
         bottom=None,
         filled=False,
         **kwargs):
    '''
    norm == rescale the histogram so its maximum equals norm
    frac == plot the fraction that the index subset makes up of the entire sample

    '''
    rotate = kwargs.pop('rotate', False)
    noplot = kwargs.pop('noplot', False)
    if bottom is None: bottom = 0.0
    if index is not None:
        xx = x[index]
        if weight is not None and weights is not None:
            raise ValueError(
                'Only supply one.  This is just so that you can push weight and weights'
            )
        if weight is not None:
            ww = weight[index]
        elif weights is not None:
            ww = weights[index]
        else:
            ww = None
    else:
        xx = x
        ww = weight

    v, l = np.histogram(xx, bins, weights=ww)
    d = np.diff(l)
    l = l[:-1] + d / 2.0

    if frac:
        vv = np.histogram(x, bins, weight)[0]
        ii = np.where(vv == 0)
        v = np.array(v) * 1.0 / np.array(vv)
        v[ii] = 0

    if cumulative:
        v = np.cumsum(v)
    if revcumulative:
        v = np.cumsum(v[::-1])[::-1]

    if norm is not None:
        v = v / float(np.max(v)) * float(norm)
    if total:
        v = v / (1.0 * np.sum(v))

    if dist:
        v /= d

    if bottom is not None:
        v += bottom
    # if rotate:
    #     l,v = v,l
    if not noplot:
        if filled:
            kwargs.setdefault('align', 'center')
            # hack to fix pylab.bar's coloring
            if 'color' not in kwargs:
                kwargs['color'] = next(pylab.gca()._get_lines.color_cycle)
            if rotate:
                pylab.barh(l, v - bottom, height=d, left=bottom, **kwargs)
            else:
                pylab.bar(l, v - bottom, width=d, bottom=bottom, **kwargs)
        else:
            if rotate:
                pylab.step(v, l, where='mid', **kwargs)
            else:
                pylab.step(l, v, where='mid', **kwargs)
    # if rotate:
    #     l,v = v,l
    return l, v
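# A hypothetical usage sketch for the helper above. Color is passed explicitly
# because the filled branch otherwise reaches into a matplotlib-1.x private
# color cycle that no longer exists in current releases.
import numpy as np
import pylab

x = np.random.randn(5000)
bins = np.linspace(-4, 4, 41)

hist(x, bins)                            # stepped outline of the full sample
hist(x, bins, index=np.where(x > 0)[0],  # positive half only, drawn sideways,
     norm=1, filled=True, rotate=True,   # rescaled so the tallest bar is 1
     color='g')
pylab.show()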
Beispiel #40
0
def graphStacked(histogramMap,
                 prefix,
                 titleText,
                 leftAdjust=0.23,
                 maxCols=None,
                 figsize=(6, 6),
                 showEntries=True,
                 bottomAdjust=0.19,
                 topAdjust=0.75,
                 condensed=False,
                 xticks=None,
                 makeLegend=True):

    if condensed:
        histogramMap = condenseHistograms(histogramMap)
    #print "user", histogramMap
    #print "bin", histogramMap[6].bins
    allValues = Histogram()

    for histogram in histogramMap.values():
        allValues.merge(histogram)
    mpl.figure(figsize=figsize)

    if not showEntries:
        histogramMap = {"all": allValues}

    if xticks is None:
        maxval = na.max(allValues.bins.values())
        xticks = na.arange(0, maxval, maxval / 5)

    width = 0.7
    allTuples = allValues.tuples()
    if maxCols is not None:
        allTuples = allTuples[0:maxCols]
    allTuples.reverse()
    #print "tuples", allTuples
    colors = get_colors(len(histogramMap))
    yoff = na.array([0.0] * len(allTuples))
    ylocations = na.arange(len(allTuples)) + 0.5
    mpl.xlabel("Count", fontsize=20)
    legendArray = []
    legendNames = []
    #print allTuples
    for row, userId in enumerate(sorted(histogramMap.keys())):
        histogram = histogramMap[userId]
        data = [
            histogram.bins[desc] if desc in histogram.bins else 0
            for desc, count in allTuples
        ]
        rects = mpl.barh(ylocations,
                         data,
                         width,
                         left=yoff,
                         color=colors[row],
                         align="center")

        if len(rects) != 0:
            legendArray.append(rects[0])
            legendNames.append(userId)

        yoff = yoff + data

    mpl.yticks(
        ylocations,
        [label.replace("Davin", "the baby") for label, count in allTuples])
    mpl.subplots_adjust(left=leftAdjust, bottom=bottomAdjust, top=topAdjust)

    if len(legendArray) > 1 and makeLegend:
        mpl.legend(legendArray, legendNames, loc="lower right")
    mpl.xticks(xticks)
    mpl.title(titleText, fontsize=20)
    ax = mpl.axes()
    for tick in ax.xaxis.get_major_ticks():
        tick.label1.set_fontsize(15)
    for tick in ax.yaxis.get_major_ticks():
        tick.label1.set_fontsize(15)

    basename = "%s.%s%s" % (prefix, titleText.replace(
        " ", "_"), "" if not condensed else ".condensed")
    mpl.savefig("%s.png" % (basename))
    mpl.savefig("%s.pdf" % (basename))
Beispiel #41
0
def predictTest(X_test, y_test):
    # Load the model
    print("load GBDT model ...")
    model = joblib.load('clfGBDT.model')

    # Validate the best model on the test set
    t0 = time()
    print("Predicting label on the test set")
    y_pre = model.predict(X_test)
    y_preprob = model.predict_proba(X_test)[:, 1]
    print('Accuracy: %.4g' % metrics.accuracy_score(y_test, y_pre))
    print('AUC Score (Test): %f' % metrics.roc_auc_score(y_test, y_preprob))
    print("done in %0.3fs" % (time() - t0))

    # classification_report
    # Input: the true and the predicted labels for the test set.
    # Returns: precision, recall and F-score per class, plus macro averages.
    print("classification report:")
    print(classification_report(y_test, y_pre))

    # Confusion matrix
    # Output:
    # Row 0, col 0 counts y_true == 0 with y_pred == 0; row 0, col 1 counts y_true == 0 with y_pred == 1.
    # Row 1, col 0 counts y_true == 1 with y_pred == 0; row 1, col 1 counts y_true == 1 with y_pred == 1.
    matrix = confusion_matrix(y_test, y_pre)
    print("confusion matrix:")
    print(matrix)
    # Plot the confusion matrix
    font = FontProperties(fname=r"c:\windows\fonts\msyh.ttc", size=10)  # font
    plt.matshow(matrix)
    plt.colorbar()
    plt.xlabel('Predicted class', fontproperties=font)
    plt.ylabel('Actual class', fontproperties=font)
    labels = ['0', '1']
    plt.xticks(np.arange(matrix.shape[1]), labels)
    plt.yticks(np.arange(matrix.shape[1]), labels)
    plt.show()

    # Plot the ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_preprob)
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='GBDT')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

    plt.figure(2)
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='GBDT')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (zoomed in at top left)')
    plt.legend(loc='best')
    plt.show()

    # Output the feature importances
    feature_names = []
    for i in range(1, 343):  # feature_names is a list
        feature_names.append('feature' + str(i))
    feature_names = np.array(feature_names)  # convert feature_names to an ndarray
    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    sorted_idx = np.argsort(feature_importance)  # np.argsort() returns the indices that sort the array in ascending order
    pos = np.arange(sorted_idx.shape[0]) + 0.5
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, feature_names[sorted_idx])  # y-axis tick labels; feature_names is already 0-indexed, so no -1 offset
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
Beispiel #42
0
def first_round_user_query_visualization(df, plot_flag, fig_dir_path):
    if plot_flag == 'min':
        title_prefix = "Minimum"
    else:
        title_prefix = "Preferred"

    df3, category, perc = get_degree(df, plot_flag)
    df4_sorted, color = get_sorted_degree_area(df, plot_flag)
    df5_sorted = get_sorted_skills(df, plot_flag)
    #print df5_sorted
    #df5_sorted = df5_sorted.head(10)

    # Create 2x2 sub plots
    pie_colors = [[183, 183, 183], [230, 252, 209], [252, 230, 174],
                  [168, 247, 239], [255, 188, 122], [214, 198, 255],
                  [255, 226, 242]]

    pie_colors = [[
        pie_colors[i][0] / 255.0, pie_colors[i][1] / 255.0,
        pie_colors[i][2] / 255.0
    ] for i in range(len(pie_colors))]

    gs = gridspec.GridSpec(2, 2)
    #plot pie chart for degree
    fig = pl.figure(figsize=(18, 12))
    ax = pl.subplot(gs[0, 0])  # row 0, col 0
    n = pl.pie(perc,
               labels=category,
               colors=pie_colors[:len(category)],
               startangle=90,
               autopct='%.1f%%',
               textprops={'fontsize': small_fontsize})
    ax.set_aspect('equal')
    pl.title(title_prefix + " Qualifications - Degree",
             fontsize=20,
             fontweight='bold')

    # plot bar chart for degree areas
    ax = pl.subplot(gs[0, 1])  # row 0, col 1
    objects = df4_sorted["Degree_Area"]
    y_pos = np.arange(len(objects))
    performance = df4_sorted["Percent"]
    pl.barh(y_pos, performance, align='center', alpha=0.5, color=tuple(color))
    pl.yticks(y_pos, objects, rotation=0, fontsize=small_fontsize)
    pl.xlabel('Percent %', fontsize=14)
    pl.xlim([0, 100])
    pl.title('Degree Majors', fontsize=20, fontweight='bold')
    #pl.rcParams.update({'font.size': 8})

    ax = pl.subplot(gs[1, :])  # row 1, span all columns
    df5_sorted = df5_sorted.iloc[:15]
    df5_sorted = df5_sorted.sort_values(['Count'], ascending=[1])

    objects = df5_sorted["Exp/Skills"]
    y_pos = np.arange(len(objects))
    performance = df5_sorted["Percent"]

    pl.barh(y_pos, performance, align='center', alpha=0.5)
    pl.yticks(y_pos, objects, rotation=0, fontsize=small_fontsize)
    pl.xlabel('Percent %', fontsize=14)
    pl.xlim([0, 30])
    pl.title('Top 15 Exp/Skills', fontsize=20, fontweight='bold')
    pl.show()
Beispiel #43
0
 def plot_predictions(self):
     epoch, batch, data = self.get_next_batch(train=False) # get a test batch
     num_classes = self.test_data_provider.get_num_classes()
     NUM_ROWS = 2
     NUM_COLS = 4
     NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[0].shape[1]
     NUM_TOP_CLASSES = min(num_classes, 5) # show this many top labels
     NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs']
     PRED_IDX = 1
     if self.save_preds:
         # print preds
         if not os.path.exists(self.save_preds):   
             os.makedirs(self.save_preds)
         # we process all the batches
         while True:
             # some constant
             NUM_IMGS = data[0].shape[1]
             NUM_TOP_CLASSES = min(num_classes, 5) # show this many top labels
             NUM_OUTPUTS = self.model_state['layers'][self.softmax_name]['outputs']
             PRED_IDX = 1
             preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single)
             # we only save the prediction result instead of the image
             dir_name = 'predictions_batch_%d' % (batch)
             tar_name = os.path.join(self.save_preds, dir_name)
             # Run the model
             print  [d.shape for d in data], preds.shape
             self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name])
             # meanwhile, prepare to load the next batch of data
             new_epoch, batch, new_data = self.get_next_batch(train=False)
             IGPUModel.finish_batch(self)
             # swap the data
             # concatenate the preds onto the ground-truth labels
             preds=n.concatenate((n.transpose(data[1]), preds), axis=1);
             tfo = open(tar_name, "wb");
             cPickle.dump(preds, tfo)
             tfo.close()
             print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name)
             if new_epoch!=epoch:
                 print "All batches process"
                 break;
             data=new_data
     else:
         label_names = [lab.split(',')[0] for lab in self.test_data_provider.batch_meta['label_names']]
         if self.only_errors:
             preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single)
         else:
             preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single)
             rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
             if NUM_IMGS < data[0].shape[1]:
                 data = [n.require(d[:,rand_idx], requirements='C') for d in data]
         # Run the model
         print  [d.shape for d in data], preds.shape
         self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name])
         IGPUModel.finish_batch(self)
         # print preds
         data[0] = self.test_data_provider.get_plottable_data(data[0])
         fig = pl.figure(3, figsize=(12,9))
         fig.text(.4, .95, '%s test samples' % ('Mistaken' if self.only_errors else 'Random'))
         if self.only_errors:
             # what the net got wrong
             if NUM_OUTPUTS > 1:
                 err_idx = [i for i,p in enumerate(preds.argmax(axis=1)) if p not in n.where(data[2][:,i] > 0)[0]]
             else:
                 err_idx = n.where(data[1][0,:] != preds[:,0].T)[0]
                 print err_idx
             err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS))
             data[0], data[1], preds = data[0][:,err_idx], data[1][:,err_idx], preds[err_idx,:]
             
         
         import matplotlib.gridspec as gridspec
         import matplotlib.colors as colors
         cconv = colors.ColorConverter()
         gs = gridspec.GridSpec(NUM_ROWS*2, NUM_COLS,
                                width_ratios=[1]*NUM_COLS, height_ratios=[2,1]*NUM_ROWS )
         #print data[1]
         for row in xrange(NUM_ROWS):
             for col in xrange(NUM_COLS):
                 img_idx = row * NUM_COLS + col
                 if data[0].shape[0] <= img_idx:
                     break
                 pl.subplot(gs[(row * 2) * NUM_COLS + col])
                 #pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1)
                 pl.xticks([])
                 pl.yticks([])
                 img = data[0][img_idx,:,:,:]
                 pl.imshow(img, interpolation='lanczos')
                 show_title = data[1].shape[0] == 1
                 true_label = [int(data[1][0,img_idx])] if show_title else n.where(data[1][:,img_idx]==1)[0]
                 #print true_label
                 #print preds[img_idx,:].shape
                 #print preds[img_idx,:].max()
                 true_label_names = [label_names[i] for i in true_label]
                 img_labels = sorted(zip(preds[img_idx,:], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:]
                 #print img_labels
                 axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col])
                 height = 0.5
                 ylocs = n.array(range(NUM_TOP_CLASSES))*height
                 pl.barh(ylocs, [l[0] for l in img_labels], height=height, \
                         color=['#ffaaaa' if l[1] in true_label_names else '#aaaaff' for l in img_labels])
                 #pl.title(", ".join(true_labels))
                 if show_title:
                     pl.title(", ".join(true_label_names), fontsize=15, fontweight='bold')
                 else:
                     print true_label_names
                 pl.yticks(ylocs + height/2, [l[1] for l in img_labels], x=1, backgroundcolor=cconv.to_rgba('0.65', alpha=0.5), weight='bold')
                 for line in enumerate(axes.get_yticklines()): 
                     line[1].set_visible(False) 
                 #pl.xticks([width], [''])
                 #pl.yticks([])
                 pl.xticks([])
                 pl.ylim(0, ylocs[-1] + height)
                 pl.xlim(0, 1)
Beispiel #44
0
#!/usr/bin/env python

import numpy as np
import matplotlib.pylab as pl
from scipy.stats import norm


def ginv(x):
    """transform func"""
    return 1 / (1 + np.exp(-x + 5))

mu, sigma = 6, 1
n = 10 ** 6
x = norm.rvs(size=n, loc=mu, scale=sigma)
x_range = np.arange(0, 10, 0.01)
#plot the histogram
hist, bin_edges = np.histogram(x, bins=50, normed=True)
pl.bar(bin_edges[:-1], hist, width=bin_edges[1] - bin_edges[0], color='r')
hist, bin_edges = np.histogram(ginv(x), bins=50, normed=True)
pl.barh(bin_edges[:-1], hist, height=bin_edges[1] - bin_edges[0], color='g')

#plot transform function
pl.plot(x_range, ginv(x_range), 'b', lw=5)

#plot line at mu
pl.plot([mu, mu], [0, ginv(mu)], 'y', lw=5)
pl.plot([0, mu], [ginv(mu), ginv(mu)], 'y', lw=5)

pl.savefig('bayesChangeOfVar.png')
pl.show()
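# The green histogram above is a Monte Carlo estimate of the density of
# ginv(X). The analytic density follows from the change-of-variables formula
# p_Y(y) = p_X(ginv^{-1}(y)) * |d ginv^{-1}/dy|; for this logistic transform
# ginv^{-1}(y) = 5 - ln(1/y - 1) and dy/dx = y(1 - y). A sketch of the exact
# curve, to be drawn before the savefig above (reuses pl, norm, mu, sigma):
y_range = np.linspace(1e-3, 1 - 1e-3, 500)
x_of_y = 5 - np.log(1.0 / y_range - 1)  # inverse of ginv
p_y = norm.pdf(x_of_y, loc=mu, scale=sigma) / (y_range * (1 - y_range))
pl.plot(p_y, y_range, 'k', lw=2)  # sideways, matching the barh orientation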
Beispiel #45
0
    def run(self):

        # plt.figure()
        plt.figure(figsize=(15, 10))
        plt.ion()
        plt.show()

        bar_edge = [-0.1, 0, 0.1]
        while not rospy.is_shutdown():
            if not self.sensor_front or not self.sensor_back or not self.sensor_vehicle:
                continue
            plt.gcf().clear()

            plt.subplot(232)
            plt.ylim([0, 15])
            plt.bar(bar_edge,
                    self.sensor_front,
                    width=0.04,
                    color=self.color_front)
            for i, v in enumerate(self.sensor_front):
                plt.text(bar_edge[i],
                         v + 1,
                         str(round(v, 2)),
                         fontsize=20,
                         bbox=dict(facecolor='brown', alpha=0.6))

            plt.subplot(235)
            plt.ylim([0, 15])
            plt.bar(bar_edge,
                    self.sensor_back,
                    width=0.04,
                    color=self.color_back)
            for i, v in enumerate(self.sensor_back):
                plt.text(bar_edge[i],
                         v + 1.5,
                         str(round(v, 2)),
                         fontsize=20,
                         bbox=dict(facecolor='brown', alpha=0.6))
            plt.gca().invert_yaxis()

            width = 0.2
            plt.subplot(131)
            plt.xlim([0, 4])
            plt.barh(np.arange(2),
                     self.sensor_vehicle[:2],
                     width,
                     align="center",
                     color=self.color_vehicle[:2])
            for i, v in enumerate(self.sensor_vehicle[:2]):
                plt.text(i,
                         v - 2,
                         str(round(-v, 2)),
                         fontsize=20,
                         bbox=dict(facecolor='brown', alpha=0.6))

            plt.subplot(133)
            plt.xlim([0, 4])
            plt.barh(np.arange(2),
                     self.sensor_vehicle[2:],
                     width,
                     align="center",
                     color=self.color_vehicle[2:])
            for i, v in enumerate(self.sensor_vehicle[2:]):
                plt.text(i,
                         v - 2,
                         str(round(-v, 2)),
                         fontsize=20,
                         bbox=dict(facecolor='brown', alpha=0.6))
            plt.gca().invert_xaxis()

            plt.pause(0.01)
            self.color_front = []
            self.color_back = []
            self.color_vehicle = []
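# The structure above (plt.ion() once, then clear, redraw and plt.pause()
# inside the loop) is the generic matplotlib live-update recipe. A ROS-free
# sketch:
import numpy as np
import matplotlib.pyplot as plt

plt.ion()
plt.figure()
for _ in range(50):
    plt.gcf().clear()
    plt.barh(np.arange(3), np.random.rand(3), align='center')
    plt.xlim(0, 1)
    plt.pause(0.05)  # flushes GUI events and redraws the figure
plt.ioff()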
Beispiel #46
0
    def plot_predictions(self):
        epoch, batch, data = self.get_next_batch(train=False)  # get a test batch
        num_classes = self.test_data_provider.get_num_classes()
        NUM_ROWS = 2
        NUM_COLS = 4
        NUM_IMGS = NUM_ROWS * NUM_COLS if not self.save_preds else data[0].shape[1]
        NUM_TOP_CLASSES = min(num_classes, 5)  # show this many top labels
        NUM_OUTPUTS = self.model_state["layers"][self.softmax_name]["outputs"]
        PRED_IDX = 1

        label_names = [lab.split(",")[0] for lab in self.test_data_provider.batch_meta["label_names"]]
        if self.only_errors:
            preds = n.zeros((data[0].shape[1], NUM_OUTPUTS), dtype=n.single)
        else:
            preds = n.zeros((NUM_IMGS, NUM_OUTPUTS), dtype=n.single)
            # rand_idx = nr.permutation(n.r_[n.arange(1), n.where(data[1] == 552)[1], n.where(data[1] == 795)[1], n.where(data[1] == 449)[1], n.where(data[1] == 274)[1]])[:NUM_IMGS]
            rand_idx = nr.randint(0, data[0].shape[1], NUM_IMGS)
            if NUM_IMGS < data[0].shape[1]:
                data = [n.require(d[:, rand_idx], requirements="C") for d in data]
        #        data += [preds]
        # Run the model
        print [d.shape for d in data], preds.shape
        self.libmodel.startFeatureWriter(data, [preds], [self.softmax_name])
        IGPUModel.finish_batch(self)
        print preds
        data[0] = self.test_data_provider.get_plottable_data(data[0])

        if self.save_preds:
            if not gfile.Exists(self.save_preds):
                gfile.MakeDirs(self.save_preds)
            preds_thresh = preds > 0.5  # Binarize predictions
            data[0] = data[0] * 255.0
            data[0][data[0] < 0] = 0
            data[0][data[0] > 255] = 255
            data[0] = n.require(data[0], dtype=n.uint8)
            dir_name = "%s_predictions_batch_%d" % (os.path.basename(self.save_file), batch)
            tar_name = os.path.join(self.save_preds, "%s.tar" % dir_name)
            tfo = gfile.GFile(tar_name, "w")
            tf = TarFile(fileobj=tfo, mode="w")
            for img_idx in xrange(NUM_IMGS):
                img = data[0][img_idx, :, :, :]
                imsave = Image.fromarray(img)
                prefix = (
                    "CORRECT"
                    if data[1][0, img_idx] == preds_thresh[img_idx, PRED_IDX]
                    else "FALSE_POS"
                    if preds_thresh[img_idx, PRED_IDX] == 1
                    else "FALSE_NEG"
                )
                file_name = "%s_%.2f_%d_%05d_%d.png" % (
                    prefix,
                    preds[img_idx, PRED_IDX],
                    batch,
                    img_idx,
                    data[1][0, img_idx],
                )
                #                gf = gfile.GFile(file_name, "w")
                file_string = StringIO()
                imsave.save(file_string, "PNG")
                tarinf = TarInfo(os.path.join(dir_name, file_name))
                tarinf.size = file_string.tell()
                file_string.seek(0)
                tf.addfile(tarinf, file_string)
            tf.close()
            tfo.close()
            #                gf.close()
            print "Wrote %d prediction PNGs to %s" % (preds.shape[0], tar_name)
        else:
            fig = pl.figure(3, figsize=(12, 9))
            fig.text(0.4, 0.95, "%s test samples" % ("Mistaken" if self.only_errors else "Random"))
            if self.only_errors:
                # what the net got wrong
                if NUM_OUTPUTS > 1:
                    err_idx = [i for i, p in enumerate(preds.argmax(axis=1)) if p not in n.where(data[2][:, i] > 0)[0]]
                else:
                    err_idx = n.where(data[1][0, :] != preds[:, 0].T)[0]
                    print err_idx
                err_idx = r.sample(err_idx, min(len(err_idx), NUM_IMGS))
                data[0], data[1], preds = data[0][:, err_idx], data[1][:, err_idx], preds[err_idx, :]

            import matplotlib.gridspec as gridspec
            import matplotlib.colors as colors

            cconv = colors.ColorConverter()
            gs = gridspec.GridSpec(NUM_ROWS * 2, NUM_COLS, width_ratios=[1] * NUM_COLS, height_ratios=[2, 1] * NUM_ROWS)
            # print data[1]
            for row in xrange(NUM_ROWS):
                for col in xrange(NUM_COLS):
                    img_idx = row * NUM_COLS + col
                    if data[0].shape[0] <= img_idx:
                        break
                    pl.subplot(gs[(row * 2) * NUM_COLS + col])
                    # pl.subplot(NUM_ROWS*2, NUM_COLS, row * 2 * NUM_COLS + col + 1)
                    pl.xticks([])
                    pl.yticks([])
                    img = data[0][img_idx, :, :, :]
                    img = img.squeeze()
                    if len(img.shape) > 2:  # more than 2 dimensions
                        if img.shape[2] == 2:  # if two channels ('is' on ints is unreliable)
                            # copy 2nd to 3rd channel for visualization
                            a1 = img
                            a2 = img[:, :, 1]
                            a2 = a2[:, :, n.newaxis]
                            img = n.concatenate((a1, a2), axis=2)
                        pl.imshow(img, interpolation="lanczos")
                    else:
                        pl.imshow(img, interpolation="lanczos", cmap=pl.gray())
                    show_title = data[1].shape[0] == 1
                    true_label = [int(data[1][0, img_idx])] if show_title else n.where(data[1][:, img_idx] == 1)[0]
                    # print true_label
                    # print preds[img_idx,:].shape
                    # print preds[img_idx,:].max()
                    true_label_names = [label_names[i] for i in true_label]
                    img_labels = sorted(zip(preds[img_idx, :], label_names), key=lambda x: x[0])[-NUM_TOP_CLASSES:]
                    # print img_labels
                    axes = pl.subplot(gs[(row * 2 + 1) * NUM_COLS + col])
                    height = 0.5
                    ylocs = n.array(range(NUM_TOP_CLASSES)) * height
                    pl.barh(
                        ylocs,
                        [l[0] for l in img_labels],
                        height=height,
                        color=["#ffaaaa" if l[1] in true_label_names else "#aaaaff" for l in img_labels],
                    )
                    # pl.title(", ".join(true_labels))
                    if show_title:
                        pl.title(", ".join(true_label_names), fontsize=15, fontweight="bold")
                    else:
                        print true_label_names
                    pl.yticks(
                        ylocs + height / 2,
                        [l[1] for l in img_labels],
                        x=1,
                        backgroundcolor=cconv.to_rgba("0.65", alpha=0.5),
                        weight="bold",
                    )
                    for line in enumerate(axes.get_yticklines()):
                        line[1].set_visible(False)
                    # pl.xticks([width], [''])
                    # pl.yticks([])
                    pl.xticks([])
                    pl.ylim(0, ylocs[-1] + height)
                    pl.xlim(0, 1)
Beispiel #47
0
        cv=cv,
        groups=groups,
        scoring='accuracy',
    )
    mean_scores.append(cv_scores.mean())

###############################################################################
# Finally, we can display the classification scores.
from nilearn.plotting import show

plt.figure(figsize=(6, 4))
positions = np.arange(len(kinds)) * .1 + .1
plt.barh(positions, mean_scores, align='center', height=.05)
yticks = [kind.replace(' ', '\n') for kind in kinds]
plt.yticks(positions, yticks)
plt.xlabel('Classification accuracy')
plt.grid(True)
plt.tight_layout()

###############################################################################
# While the comparison is not fully conclusive on this small dataset,
# `Dadi et al 2019
# <https://www.sciencedirect.com/science/article/pii/S1053811919301594>`_
# showed that across many cohorts and clinical questions, the tangent
# kind should be preferred.

show()
def run_models(data, unit_col, time_col, cohort_col, X_cols, y_col, knowledge_date=True, X_categories=None, output_dir=''):
    dummy_dict = {
        'clf': DummyClassifier,
        'param_dict': {
            'strategy': ['stratified', 'uniform'],
            'random_state': [0],
            }
        }
    ab_dict = {
        'clf': AdaBoostClassifier,
        'param_dict': {
            'base_estimator': ['DecisionTreeClassifier'],
            'n_estimators': [5, 10, 25, 50],
            'learning_rate': [0.01, 0.1, 0.5, 1],
            'algorithm': ['SAMME', 'SAMME.R'],
            'random_state': [0],
            }
        }
    dt_dict = {
        'clf': DecisionTreeClassifier,
        'param_dict': {
            'criterion': ['entropy', 'gini'],
            'splitter': ['best', 'random'],
            'max_features': ['sqrt', 'log2', None],
            'max_depth': [3, 5, 10, 25, None],
            'random_state': [0],
            }
        }
    et_dict = {
        'clf': ExtraTreesClassifier,
        'param_dict': {
            'n_estimators': [5, 10, 25, 50, 100], 
            'criterion': ['entropy', 'gini'],
            'max_features': ['sqrt', 'log2', None],
            'max_depth': [3, 5, 10, 25, None],
            'bootstrap': [True, False],
            'random_state': [0],
            'n_jobs': [-1],
            }
        }
    gb_dict = {
        'clf': GradientBoostingClassifier,
        'param_dict': {
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.01, 0.1, 0.5, 1],
            'n_estimators': [5, 10, 25, 100],
            'max_depth': [3, 5, 10, 25, None],
            'subsample': [0.01, 0.1, 0.5, 1],
            'max_features': ['sqrt', 'log2', None],
            'random_state': [0],
            }
        }
    lr_dict = {
        'clf': LogisticRegression,
        'param_dict': {
            'penalty': ['l1', 'l2'],
            'C': [0.001, 0.01, 1, 10],
            'random_state': [0],
            }
        }
    mnb_dict = {
        'clf': MultinomialNB,
        'param_dict': {
            'alpha': [0.0001, 0.001, 0.01, 0.1], 
            }
        }
    rf_dict = {
        'clf': RandomForestClassifier, 
        'param_dict': {
            'n_estimators': [5, 10, 25, 50, 100], 
            'criterion': ['entropy', 'gini'],
            'max_features': ['sqrt', 'log2', None],
            'max_depth': [3, 5, 10, 25, None],
            'bootstrap': [True, False],
            'random_state': [0],
            'n_jobs': [-1],
            }
        }
    sgd_dict = {
        'clf': SGDClassifier, 
        'param_dict': {
            'alpha': [0.0001, 0.001, 0.01, 0.1], 
            'penalty': ['l1', 'l2'],
            'random_state': [0],
            }
        }
    svm_dict = {
        'clf': SVC, 
        'param_dict': {
            'C': [0.001, 0.01, 0.1, 1, 10],
            'kernel': ['poly', 'rbf', 'sigmoid'],
            'probability': [True],
            'random_state': [0],
            }
        }

    rf_best_dict = {
        'clf': RandomForestClassifier, 
        'param_dict': {
            'n_estimators': [5, 50], 
            'criterion': ['entropy'],
            'max_features': [None, 'log2'],
            'max_depth': [3, 10],
            'bootstrap': [True],
            'random_state': [0],
            'n_jobs': [-1],
            }
        }

    rf_very_best_dict = {
        'clf': RandomForestClassifier, 
        'param_dict': {
            'n_estimators': [50], 
            'criterion': ['entropy'],
            'max_features': [None],
            'max_depth': [3],
            'bootstrap': [True],
            'random_state': [0],
            'n_jobs': [-1],
            }
        }

    clf_library = [
                   dummy_dict,
                   #ab_dict, 
                   #dt_dict, 
                   #et_dict, 
                   #gb_dict, 
                   #lr_dict, 
                   #mnb_dict, 
                   #rf_dict, 
                   rf_best_dict, 
                   #rf_very_best_dict, 
                   #sgd_dict, 
                   #svm_dict, 
                   ]

    # Make sure clf_library is iterable (a list).
    if type(clf_library) is not list:
        clf_library = [clf_library]
    # Generate list of instantiated objects from list of dictionaries.
    clf_library = generate_models(clf_library)

    # Feature category sets to use for modeling.
    feat_sets = [['all']]
    if X_categories is not None:
        print X_categories.head()
        # Determine the unique feature categories.
        unique_categories = X_categories['feature_category_primary'].unique()
        # Drop feature categories that should not be used for modeling.
        categories_to_drop = np.array(['id', 'attendance', 'coursework', 'demographic', 'school'])
        unique_categories = np.array(list(set(x for x in unique_categories.tolist()).difference(set(x for x in categories_to_drop.tolist()))))
        # Add single feature categories to the feature sets.
        feat_sets.extend([[x] for x in unique_categories])
        # Add all but single feature categories to the feature sets.
        feat_sets.extend(list(itertools.combinations(unique_categories, len(unique_categories)-1)))
        # Add all categories to the feature sets.
        feat_sets.extend(list(itertools.combinations(unique_categories, len(unique_categories))))

    # Dataset modifiers to use prior to modeling (all combinations)
    mod_dict = {
        'subsample_rate':        [None],
        'oversample_rate_SMOTE': [None],
        #'subsample_rate':        [None, 1.0, 2.5, 5.0],
        #'oversample_rate_SMOTE': [None, 100, 200, 500],
        'imputation':            ['mean'],
        }
    mod_names = sorted(mod_dict)
    mod_sets = [dict(zip(mod_names, product)) for product in itertools.product(*(mod_dict[mod_names] for mod_names in mod_names))]

    top_k = [.001, .002, .005, .01, .02, .05, .1, .5, 1]

    results = []

    train_only_most_recent = True
    train_cohort_years = None
    test_only_most_recent = False
    #test_cohort_years = range(2011, 2016)
    test_cohort_years = [2014]
    grades = range(6, 12)
    evaluations = ['accuracy', 'brier', 'f1', 'roc', 'prc', 'precision', 'recall']

    cohorts = sorted(data[cohort_col].unique())
    cohorts_train = cohorts[:-1]
    cohorts_test = cohorts[1:]
    if train_cohort_years is not None:
        cohorts_train = train_cohort_years
    if test_cohort_years is not None:
        cohorts_test = test_cohort_years

    print("Grades %s" % grades)
    print("Cohorts %s" % cohorts)

    for cohort in cohorts:
        print("Cohort %s with shape %s" % (cohort, data[data['cohort'] == cohort].shape))

    # Iterate over grade levels.
    for grade in grades:
        print("==========\nGrade %s\n==========" % grade)

        X_cols_filtered = X_cols

        # Filter features from higher grade levels.
        X_cols_filtered_by_grade = X_cols_filtered
        highest_grade = 12
        higher_grade_regex = '^(?!' # negation
        if grade < highest_grade:
            for higher_grade in range(grade+1, highest_grade+1):
                higher_grade_regex += r"{time_col}_{grade_level}|".format(time_col=str(time_col), 
                                                                          grade_level=str(higher_grade)
                                                                          )
            higher_grade_regex = higher_grade_regex[:-1] # remove last '|'
            higher_grade_regex = higher_grade_regex + ').*'
            #data = data.filter(regex=higher_grade_regex)
            regex = re.compile(higher_grade_regex)
            # The regex should select all columns except those prefixed by a higher grade level.
            X_cols_filtered_by_grade = filter(lambda i: regex.search(i), X_cols_filtered_by_grade)


        # Iterate over feature sets.
        for features in feat_sets:
            print("  Features: %s" % ', '.join(features))

            if X_categories is not None:
                # Filter to only columns/features to be used for modeling.
                filtered_feats = X_categories.loc[X_categories['exclude_when_modeling'] == 0]
                # Filter to only columns/features in the selected feature categories.
                filtered_feats = filtered_feats['feature_name'].loc[filtered_feats['feature_category_primary'].isin(features)]
                category_regex = '('
                for feat in filtered_feats:
                    category_regex += r"{feat}|".format(feat=str(feat))
                category_regex = category_regex[:-1] # remove last '|'
                category_regex = category_regex + ')'
                regex = re.compile(category_regex)
                # The regex should select all columns that are a member of each selected feature category.
                X_cols_filtered_by_grade_and_category = filter(lambda i: regex.search(i), X_cols_filtered_by_grade)

                print("  %i features." % (len(X_cols_filtered_by_grade_and_category)))
                if len(X_cols_filtered_by_grade_and_category) == 0:
                    continue
            else:
                X_cols_filtered_by_grade_and_category = X_cols_filtered_by_grade

            # Iterate over cohort train/test pairs.
            for train_year in cohorts_train:
                for test_year in cohorts_test:
                    # Train cohort must be prior to test cohort.
                    if train_year >= test_year:
                        continue
                    if knowledge_date:
                        # Train and test cohorts must be separated according to grade level.
                        if train_year > test_year - (12 - grade):
                            continue
                    # Train only on the latest possible cohort, given the test cohort.
                    #if (train_only_most_recent == True) and (train_year != test_year - (12 - grade)):
                    #    continue
                    # Test only on the most recently available cohort.
                    if (test_only_most_recent == True) and (test_year != cohorts[-1]):
                        continue
                    print("  %s %s" % (train_year, test_year))

                    train_and_test = data[(data['cohort'] <= train_year) | (data['cohort'] >= test_year)]

                    # Set training/testing labels (train = 0, test = 1).
                    kf_labels = np.where((train_and_test['cohort'] >= test_year), 1, 0)

                    if len(kf_labels[kf_labels == 0]) == 0:
                        print("No training data.")
                    if len(kf_labels[kf_labels == 1]) == 0:
                        print("No testing data.")
                    if len(kf_labels[kf_labels == 0]) == 0 or len(kf_labels[kf_labels == 1]) == 0:
                        continue

                    if kf_labels is not None:
                        # The cross-validation labels are our train/test cohorts.
                        kf = [(np.where(kf_labels == 0)[0], np.where(kf_labels == 1)[0])]
                    else:
                        kf = cross_validation.StratifiedKFold(train_and_test[y_col], n_folds=n_folds, shuffle=True)

                    X = train_and_test[X_cols_filtered_by_grade_and_category].as_matrix()
                    y = train_and_test[y_col].as_matrix()
                    sid = train_and_test[unit_col].as_matrix() # student IDs

                    # Iterate over train/test sets (only one iteration if on entire cohorts).
                    for train, test in kf:
                        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                        sid_train, sid_test = sid[train], sid[test]

                        print("  %i training instances and %i testing instances." % (len(X_train), len(X_test)))
                        print("  %i class 0 instances in the training data." % len(y_train[y_train == 0]))
                        print("  %i class 1 instances in the training data." % len(y_train[y_train == 1]))
                        print("  %i class 0 instances in the testing data." % len(y_test[y_test == 0]))
                        print("  %i class 1 instances in the testing data." % len(y_test[y_test == 1]))
                        
                        # Iterate over dataset modifiers (imputation, sampling, etc.).
                        for mods in mod_sets:
                            imputation_method = mods['imputation']
                            oversample_rate_SMOTE = mods['oversample_rate_SMOTE']
                            subsample_rate = mods['subsample_rate']

                            # Copy the training data and testing feature data before modifications.
                            X_train_t = np.copy(X_train)
                            y_train_t = np.copy(y_train)
                            X_test_t = np.copy(X_test)

                            # Imputation methods.
                            print "  Performing %s-based imputation" % str(imputation_method)
                            if imputation_method == 'mean':
                                #imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)
                                #X_train_t = imp.fit_transform(X_train_t)
                                #X_train_t = DataFrameImputer().fit_transform(X_train_t)
                                X_train_t = pd.DataFrame(X_train_t).fillna(pd.DataFrame(X_train_t).mean().fillna(0)).as_matrix()
                                X_test_t = pd.DataFrame(X_test_t).fillna(pd.DataFrame(X_test_t).mean().fillna(0)).as_matrix()
                            elif imputation_method == 'regression':
                                pass

                            # Assert that there are no missing or infinite training values.
                            #assert not np.any(np.isnan(X_train_t) | np.isinf(X_train_t))
                            #assert not np.any(np.isnan(y_train_t) | np.isinf(y_train_t))

                            # Warn (but proceed) if the training data has only one class label;
                            # most classifiers will fail to fit or produce degenerate probabilities.
                            if len(np.unique(y_train_t)) < 2:
                                print("Warning: The data contains only one class: %s" % (str(np.unique(y_train_t))))

                            # Sampling methods.
                            sampled = False
                            print "  Oversampling with SMOTE rate is %s" % str(oversample_rate_SMOTE)
                            print "  Subsample rate is %s" % str(subsample_rate)

                            if oversample_rate_SMOTE is not None:
                                print "  SMOTEing data..."
                                minority = X_train_t[np.where(y_train_t == 1)]
                                smoted = SMOTE(minority, oversample_rate_SMOTE, 5)
                                X_train_t = np.vstack((X_train_t, smoted))
                                y_train_t = np.append(y_train_t, np.ones(len(smoted), dtype=np.int32))
                                sampled = True
                            if subsample_rate is not None:
                                print "  Subsampling data..."
                                sampled = subsample(X_train_t, y_train_t, subsample_rate)
                                X_train_t = X_train_t[sampled]
                                y_train_t = y_train_t[sampled]
                                sampled = True

                            if sampled:
                                print("  Data sampled. There are now:")
                                print("  %i training instances and %i testing instances." % (len(X_train_t), len(X_test_t)))
                                print("  %i class 0 instances in the training data." % len(y_train_t[y_train_t == 0]))
                                print("  %i class 1 instances in the training data." % len(y_train_t[y_train_t == 1]))

                            # Iterate over classification models.
                            for i, clf in enumerate(clf_library):
                                clf_name = clf.__class__.__name__  # e.g. 'RandomForestClassifier'
                                print("  Model: %s" % clf_name)
                                print("  Parameters: %s" % clf.get_params())

                                summary = {}
                                summary_headers = [
                                    'grade',
                                    'test',
                                    'train',
                                    'model',
                                    'params',
                                    'features',
                                    'subsampled',
                                    'smoted',
                                    'summary_hash',
                                    'acc',
                                    'brier',
                                    'f1',
                                    'roc_auc',
                                    'prc_auc',
                                    'recall',
                                    'pre',
                                    ]
                                for k in top_k:
                                    summary_headers.append('pre@' + str(k * 100) + '%')
                                for k in top_k:
                                    summary_headers.append('ap@' + str(k * 100) + '%')
                                for header in summary_headers:
                                    summary[header] = ''
                                summary['grade'] = grade
                                summary['model'] = clf_name
                                summary['params'] = clf.get_params()
                                summary['features'] = ', '.join(features)
                                summary['test'] = test_year
                                summary['train'] = train_year
                                summary['subsampled'] = ''
                                summary['smoted'] = ''
                                summary['summary_hash'] = ''
                                if oversample_rate_SMOTE is not None:
                                    summary['smoted'] = oversample_rate_SMOTE
                                if subsample_rate is not None:
                                    summary['subsampled'] = subsample_rate

                                # Accumulators for predicted labels, probabilities, true labels,
                                # and test indexes (initialized separately to avoid aliasing).
                                y_pred = np.array([])
                                y_prob = np.array([])
                                y_true = np.array([])
                                test_indexes = np.array([])

                                # Generate "probabilities" for the current hold-out sample being predicted.
                                fitted_clf = clf.fit(X_train_t, y_train_t)
                                #feature_importances = getattr(fitted_clf, 'feature_importances_', None)

                                # Generate predicted labels and label probabilities.
                                preds_ = fitted_clf.predict(X_test_t)
                                probas_ = fitted_clf.predict_proba(X_test_t)

                                # Define the actual test indexes and labels.
                                test_indexes = np.concatenate((test_indexes, test), axis=0)
                                y_true = np.concatenate((y_true, y_test), axis=0)

                                # Aggregated (if applicable) model predictions of labels and label probabilities.
                                y_pred = np.concatenate((y_pred, preds_), axis=0)
                                y_prob = np.concatenate((y_prob, probas_[:, 1]), axis=0)
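                                # predict_proba returns one column per class in clf.classes_
                                # order, so column 1 holds P(class == 1) here.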

                                if clf_name == 'RandomForestClassifier':
                                    features_list = train_and_test[X_cols_filtered_by_grade_and_category].columns.values

                                    # Use the fitted forest's impurity-based feature importances
                                    # (no additional fit is performed here).
                                    feature_importance = fitted_clf.feature_importances_

                                    # make importances relative to max importance
                                    feature_importance = 100.0 * (feature_importance / feature_importance.max())
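                                    # E.g. raw importances [0.4, 0.1, 0.02] rescale to
                                    # [100.0, 25.0, 5.0] percent of the top feature.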

                                    # A threshold below which features are excluded from the
                                    # printout and plot, expressed as a percentage of the most
                                    # important feature's (rescaled) importance.
                                    fi_threshold = 1

                                    # Get the indexes of all features over the importance threshold
                                    important_idx = np.where(feature_importance > fi_threshold)[0]

                                    # Create a list of all the feature names above the importance threshold
                                    important_features = features_list[important_idx]
                                    print "\n", important_features.shape[0], "Important features(>", fi_threshold, "% of max importance):\n", \
                                            important_features

                                    # Get the sorted indexes of important features
                                    sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
                                    print "\nFeatures sorted by importance (DESC):\n", important_features[sorted_idx]

                                    # Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
                                    pos = np.arange(sorted_idx.shape[0]) + .5

                                    #importances_df = pd.DataFrame({'pos': pos.tolist(), 'feature_importance': feature_importance[important_idx][sorted_idx[::-1]]})
                                    #sns.barplot('pos', 'feature_importance', importances_df)

                                    f, ax = plt.subplots(figsize=(50, 15))

                                    plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]], align='center', color='#7777FF')
                                    plt.yticks(pos, important_features[sorted_idx[::-1]])
                                    plt.xlabel('Relative Importance', fontsize=35)
                                    plt.ylabel('Feature', fontsize=35)
                                    plt.title('Variable Importance')
                                    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
                                        label.set_fontsize(20)  # Tick-label size here overrides font_prop.

                                    #plt.draw()
                                    #plt.show()
                                    figure_dir = os.path.join(output_dir, 'figures')
                                    if not os.path.exists(figure_dir):
                                        os.makedirs(figure_dir)
                                    plt.savefig(os.path.join(figure_dir, 'feature_importance_rf_' + str(grade) + '.pdf'), dpi=100, transparent=True)
                                    plt.close(f)  # Avoid accumulating open figures across grades/models.


                                # Iterate over evaluation metrics.
                                for evaluation in evaluations:

                                    if evaluation == 'accuracy':
                                        # Compute the accuracy.
                                        accuracy = metrics.accuracy_score(y_true, y_pred)
                                        if accuracy is not None:
                                            summary['acc'] = accuracy

                                    elif evaluation == 'brier':
                                        # Compute the Brier score.
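                                        # (Mean squared difference between the predicted
                                        # probability and the 0/1 outcome; lower is better.)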
                                        brier_score = metrics.brier_score_loss(y_true, y_prob)
                                        if brier_score is not None:
                                            summary['brier'] = brier_score

                                    elif evaluation == 'f1':
                                        # Compute the F1 score.
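                                        # (Harmonic mean of precision and recall for the
                                        # positive class.)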
                                        f1_score = metrics.f1_score(y_true, y_pred)
                                        if f1_score is not None:
                                            summary['f1'] = f1_score

                                    elif evaluation == 'roc':
                                        # Compute the ROC curve and the area under the curve.
                                        mean_tpr = 0.0
                                        mean_fpr = np.linspace(0, 1, 100)

                                        fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob)
                                        mean_tpr += np.interp(mean_fpr, fpr, tpr)
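                                        # np.interp resamples the TPR onto the common 100-point
                                        # FPR grid so curves from multiple folds could be
                                        # averaged; with the single cohort split, len(kf) == 1
                                        # and the "mean" curve is just this one curve.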

                                        # Plot the ROC baseline.
                                        #pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Baseline')

                                        # Compute true positive rates.
                                        mean_tpr /= len(kf)
                                        mean_tpr[-1] = 1.0
                                        mean_auc = metrics.auc(mean_fpr, mean_tpr)

                                        # Plot the ROC curve.
                                        #pl.plot(mean_fpr, mean_tpr, 'k-',
                                        #        label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

                                        #pl.xlim([-0.05, 1.05])
                                        #pl.ylim([-0.05, 1.05])
                                        #pl.xlabel('False Positive Rate')
                                        #pl.ylabel('True Positive Rate')
                                        #pl.title(models[ix] + ' ROC')
                                        #pl.legend(loc="lower right")
                                        #pl.show()

                                        if mean_auc is not None:
                                            summary[evaluation + '_auc'] = mean_auc

                                    elif evaluation == 'prc':
                                        # Compute overall precision, recall, and area under PR-curve.
                                        precision, recall, thresholds = metrics.precision_recall_curve(y_true, y_prob)
                                        pr_auc = metrics.auc(recall, precision)
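                                        # metrics.auc integrates the (recall, precision) points
                                        # with the trapezoidal rule.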

                                        # Plot the precision-recall curve.
                                        #pl.plot(recall, precision, color='b', label='Precision-Recall curve (area = %0.2f)' % pr_auc)
                                        #pl.xlim([-0.05, 1.05])
                                        #pl.ylim([-0.05, 1.05])
                                        #pl.xlabel('Recall')
                                        #pl.ylabel('Precision')
                                        #pl.title(models[ix] + ' Precision-Recall')
                                        #pl.legend(loc="lower right")
                                        #pl.show()

                                        if pr_auc is not None:
                                            summary[evaluation + '_auc'] = pr_auc

                                    elif evaluation == 'recall':
                                        # Compute the recall.
                                        recall_score = metrics.recall_score(y_true, y_pred, average='binary')

                                        if recall_score is not None:
                                            summary[evaluation] = recall_score

                                    elif evaluation == 'precision':
                                        # Compute the overall precision once; it does not depend on k.
                                        precision = metrics.precision_score(y_true, y_pred)
                                        summary['pre'] = precision

                                        for k in top_k:
                                            # Compute the precision on the top k%.
                                            ord_prob = np.argsort(y_prob)[::-1]  # Indexes sorted by predicted risk, descending.
                                            r = int(k * len(y_true))
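                                            # E.g. with k = 0.1 and 200 test students, r = 20:
                                            # precision@10% is the fraction of the 20 highest-risk
                                            # students whose true label is 1.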

                                            if r == 0:
                                                pre_score_k = 0.0
                                            else:
                                                pre_score_k = np.sum(y_true[ord_prob][:r]) / r

                                            if pre_score_k is not None:
                                                summary['pre@' + str(k * 100) + '%'] = pre_score_k

                                            # Compute the average precision on the top k%.
                                            ap_score_k = 0.0
                                            num_hits = 0.0
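                                            # Note: this loop follows the ranked-list average-precision
                                            # (apk) recipe but is applied to binary class labels rather
                                            # than item ids: `p in y_true` checks whether the label value
                                            # occurs anywhere in y_true, and `p not in y_pred[:i]` is true
                                            # only for the first occurrence of each label value, so at
                                            # most two "hits" can register.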

                                            for i, p in enumerate(y_pred[:r]):
                                                if p in y_true and p not in y_pred[:i]:
                                                    num_hits += 1.0
                                                    ap_score_k += num_hits / (i + 1.0)

                                            if min(len(y_true), r) == 0:
                                                ap_score_k = 0.0
                                            else:
                                                ap_score_k = ap_score_k / min(len(y_true), r)

                                            if ap_score_k is not None:
                                                summary['ap@' + str(k * 100) + '%'] = ap_score_k

                                    elif evaluation == 'risk':
                                        # Output a list of the topK% students at highest risk along with their risk scores.
                                        #test_indexes = test_indexes.astype(int)
                                        #sort_ix = np.argsort(test_indexes)
                                        #students_by_risk = X.index[test_indexes]
                                        #y_prob = ((y_prob[sort_ix]) * 100).astype(int)
                                        #probas = np.column_stack((students_by_risk, y_prob))
                                        #r = int(top_k * len(y_original_values))
                                        #logging.info(models[ix] + ' top ' + str(100 * top_k) + '%' + ' highest risk')
                                        #logging.info('--------------------------')
                                        #logging.info('%-15s %-10s' % ('Student', 'Risk Score'))
                                        #logging.info('%-15s %-10s' % ('-------', '----------'))
                                        #probas = probas[np.argsort(probas[:, 1])[::-1]]
                                        #for i in range(r):
                                        #    output += '%-15s %-10d' % (probas[i][0], probas[i][1])
                                        #logging.info('\n')
                                        pass

                                summary_hash = str(hashlib.sha1(json.dumps((str(summary['grade']), 
                                                                           str(summary['test']), 
                                                                           str(summary['train']), 
                                                                           str(summary['model']), 
                                                                           str(summary['params']), 
                                                                           str(summary['features']), 
                                                                           str(summary['subsampled']), 
                                                                           str(summary['smoted'])), 
                                                                           sort_keys=True)).hexdigest())
                                summary['summary_hash'] = summary_hash
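                                # The hash identifies this (grade, cohort, model, params,
                                # features, sampling) configuration, letting rows in
                                # summary.csv be joined to the per-model prediction file
                                # written below.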
                                summary_output = []

                                if clf_name == 'DummyClassifier':
                                    summary_file = os.path.join(output_dir, 'summary_dummy.csv')
                                else:
                                    summary_file = os.path.join(output_dir, 'summary.csv')

                                # All headers were initialized in summary above, so this emits
                                # one value per header, in header order.
                                summary_output = [[summary[key] for key in summary_headers if key in summary]]
                                write_summary_results(summary_file, summary_output, summary_headers)

                                model_file = os.path.join(output_dir, 'predictions',
                                                          'grade', str(summary['grade']), 
                                                          'test', str(summary['test']), 
                                                          'train', str(summary['train']),
                                                          'model', str(summary['model']), 
                                                          str(summary['summary_hash']) + '.csv')
                                write_model_results(model_file, y_true, y_pred, y_prob, sid_test)