Example #1
    def find_data_point_from_coordinate(filepath, input_sen_path, labels,
                                        syllable_data_tag, area):

        # Read data file
        data_point = Utility.load_obj(filepath)
        #         print data_point

        # Get input sensitivity
        input_sen_obj = Utility.load_obj(input_sen_path)
        input_sensitivity = Utility.get_input_sensitivity(input_sen_obj, 3)
        #         print input_sensitivity

        x_coordinate = data_point[:,
                                  [input_sensitivity[0], input_sensitivity[1]]]
        #         print x_coordinate
        x_cor = np.array(x_coordinate)
        index = DataReader.filter_data(x_cor, area)

        print index

        lab = Utility.load_obj(labels)
        print len(lab)
        print lab[index]

        syllable_tag = Utility.load_obj(syllable_data_tag)
        print len(syllable_tag)
        print syllable_tag

        # Return

        pass
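
All of these examples lean on a project-specific Utility helper that is not shown. A minimal sketch of what Utility.load_obj and Utility.get_input_sensitivity(values, n) appear to do, assuming load_obj is a plain pickle loader and get_input_sensitivity returns the indices of the n largest sensitivity values (ARD inverse lengthscales); both are assumptions about the original project, not its actual implementation:

import pickle

import numpy as np


class UtilitySketch(object):
    """Assumed behaviour of the Utility helpers used throughout these examples."""

    @staticmethod
    def load_obj(path):
        # Presumably a thin pickle wrapper around the serialized objects.
        with open(path, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def get_input_sensitivity(sensitivity_values, n):
        # Presumably the indices of the n most sensitive latent dimensions,
        # i.e. the n largest values, most dominant first.
        values = np.asarray(sensitivity_values).ravel()
        return np.argsort(values)[::-1][:n]
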
Example #2
def analysis(main_path):
    # main_path = '/work/w13/decha/Inter_speech_2016_workplace/Data/07c-5dims_missing_data_delta_deltadelta/BayesianGPLVMMiniBatch_Missing/Tone_4/'
    gpmodel = Utility.load_obj('{}/GP2dRegression.npy'.format(main_path))

    model_path = '{}/GP_model.npy'.format(main_path)
    model = Utility.load_obj(model_path)
    data = model.X.mean

    x = []

    input_sensitivity = model.input_sensitivity()
    print input_sensitivity

    index = Utility.get_input_sensitivity(input_sensitivity, 2)
    print index

    for i in range(len(data)):
        x.append([data[i, index[0]], data[i, index[1]]])

    x = np.array(x)

    y = np.array(gpmodel.predict(x)[0])
    print y.shape

    plt.clf()
    plt.scatter(x[:, 0], x[:, 1], c=y, cmap='gray')
    plt.savefig('{}/gpregression.pdf'.format(main_path))
    pass
Example #3
def run_training(base_path, db_file, name_out_path):

    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    names = Utility.load_obj(names_file)
    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append(d['id'])

    label = []
    for nn in names:
        idx = name_list.index(nn)

        if nn in potential_list:
            label.append('3')
        elif db[idx]['stress'] == '1':
            label.append(db[idx]['stress'])
        else:
            label.append(db[idx]['stress'])

    out = Utility.load_obj(out_data)
    input_sent = Utility.load_obj(input_sensitivity)

    print 'Input sensitivity', input_sent

    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    label = map(int, label)
    label = np.array(label)

    train = np.append(out[label == 2], out[label == 3], axis=0)

    train = np.c_[train[:, most_dominants[0]], train[:, most_dominants[1]]]

    print train.shape

    global kern
    lengthscale = 1 / np.array(input_sent, dtype=float)
    kern = GPy.kern.RBF(len(train[0]),
                        ARD=True,
                        lengthscale=[
                            lengthscale[most_dominants[0]],
                            lengthscale[most_dominants[1]]
                        ])

    print most_dominants

    xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
    plane = np.c_[xx.ravel(), yy.ravel()]
    svm_classifier(train, '', '', '', '', plane, xx, yy)

    pass
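
svm_classifier is not shown in the example. One plausible reading, sketched below, is that it fits a one-class SVM on the two-column training points and shades its decision function over the meshgrid plane; the function name, the sklearn kernel, and the nu value are all assumptions (the original presumably uses the global GPy kern to build a custom kernel matrix instead):

import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm


def sketch_svm_classifier(train, plane, xx, yy, out_file='decision_boundary.pdf'):
    # Fit a one-class SVM on the (N, 2) training points and shade its decision
    # function over the evaluation grid built with np.meshgrid above.
    clf = svm.OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
    clf.fit(train)
    z = clf.decision_function(plane).reshape(xx.shape)

    plt.clf()
    plt.contourf(xx, yy, z, 20, cmap='coolwarm', alpha=0.6)
    plt.scatter(train[:, 0], train[:, 1], c='black', s=5)
    plt.savefig(out_file)
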
Example #4
def plot_result(model, data_object, out_file_path):

        data = model.X.mean

        y, name_index, tone, stress, syllable_short_long_type, syllable_positions, phonemes, syllable_type = data_object.get_GP_LVM_training_data(
            Syllable.TRAINING_FEATURE_POLYNOMIAL_2_DEGREE_VOICE, 
            dur_position=[1,2] ,
            num_sampling=25)

        # print syllable_type
        # print model.X.mean
        x = []
        y = []

        input_sensitivity = model.input_sensitivity()
        print input_sensitivity

        index = Utility.get_input_sensitivity(input_sensitivity, 2)
        print index

        data = np.array(data)
        stress = np.array(stress)

        labels_true = np.arange(len(stress), dtype=int)
        labels_true[stress == 'Stress'] = 1
        labels_true[stress == 'Unstress'] = 0

        new_label = []
        for idx, t in enumerate(tone):
            if (labels_true[idx] == 1):
                if (t in [0,1]) :
                    new_label.append(1)
                elif (t in [2]) :
                    new_label.append(2)
                else :
                    new_label.append(3)
            else:
                new_label.append(0)

        try:
            DBSCAN_executioner.run(
                data, 
                new_label, 
                os.path.dirname(out_file_path), 
                [index[0], index[1]], 
                input_sensitivity, 
                stress_only=False,
                stress_list=labels_true)
            # Kmeans_executioner.run(data, labels_true, os.path.dirname(outpath), [index[0], index[1]], input_sensitivity)
        except:
            print 'Error at path : {}'.format(out_file_path)
            traceback.print_exc()
def plot(data, inverselengthscale, labels):
    most_dominants = Utility.get_input_sensitivity(inverselengthscale, 2)

    x = data[:, most_dominants[0]]
    y = data[:, most_dominants[1]]

    label = map(int, labels)
    label = np.array(label)

    print set(labels)

    colors = ['red', 'green', 'blue', 'purple']

    plt.clf()
    plt.scatter(x, y, c=labels, cmap=matplotlib.colors.ListedColormap(colors))
    plt.savefig('./link_clustering_test.eps')
def plot_latent_space(base_path, db_file, name_out_path):

    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    if not Utility.is_file_exist(out_data):
        print out_data
        print 'Not exist'
        return

    names = Utility.load_obj(names_file)

    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append(d['id'])

    label = []
    for nn in names:
        idx = name_list.index(nn)
        label.append(db[idx]['stress'])

    out = Utility.load_obj(out_data)

    print out.shape

    input_sent = Utility.load_obj(input_sensitivity)
    print input_sent
    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    x = out[:, most_dominants[0]]
    y = out[:, most_dominants[1]]

    label = map(int, label)
    label = np.array(label)

    print set(label)

    colors = ['red', 'green', 'blue', 'purple']

    plt.clf()
    plt.scatter(x, y, c=label, cmap=matplotlib.colors.ListedColormap(colors))
    plt.savefig(name_out_path)
def plot(data, inverselengthscale, labels):
    most_dominants = Utility.get_input_sensitivity(inverselengthscale, 2)

    x = data[ :, most_dominants[0] ]
    y = data[ :, most_dominants[1] ]

    label = map(int, labels)
    label = np.array(label)

    print set(labels)

    colors = Utility.get_color_map(len(set(labels)))

    plt.clf()

    for idx, s in enumerate(set(labels)):
        print s
        plt.scatter(x[labels==s], y[labels==s], c=colors[idx] , label=s)
    plt.legend()
    plt.savefig( './dbscan_test.eps' )
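
Utility.get_color_map is another project helper that is not shown; a minimal sketch, assuming it simply samples n distinct colours from a matplotlib colormap (the choice of 'jet' here is arbitrary):

import matplotlib.pyplot as plt
import numpy as np


def sketch_get_color_map(n):
    # n distinct RGBA colours, usable as the c= argument of plt.scatter().
    cmap = plt.get_cmap('jet')
    return [cmap(v) for v in np.linspace(0.0, 1.0, n)]
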
    def find_data_point_from_coordinate(filepath, input_sen_path, labels,
                                        syllable_data_path, area, tone):

        # Read data file
        data_point = Utility.load_obj(filepath)
        #         print data_point

        # Get input sensitivity
        input_sen_obj = Utility.load_obj(input_sen_path)
        input_sensitivity = Utility.get_input_sensitivity(input_sen_obj, 3)
        #         print input_sensitivity

        x_coordinate = data_point[:,
                                  [input_sensitivity[0], input_sensitivity[1]]]
        #         print x_coordinate
        x_cor = np.array(x_coordinate)
        index = DataReader.filter_data(x_cor, area)

        print index

        lab = Utility.load_obj(labels)
        print len(lab)
        print lab[index]

        syllable_tag = DataReader.gen_syllable_tag(syllable_data_path, tone,
                                                   'a', 'h', 'tscsd_manual')

        print len(syllable_tag)

        syllable_tag = np.array(syllable_tag)

        print syllable_tag[index]

        lab_indexed = lab[index]
        syllable_tag_indexed = syllable_tag[index]

        print syllable_tag_indexed[lab_indexed == 'Tone 2']

        # Return

        pass
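
DataReader.filter_data(x_cor, area) is also project-specific. A minimal sketch of what it appears to do, assuming area is a rectangle [x_min, x_max, y_min, y_max] in the 2-D latent projection and the return value is an index array (both assumptions):

import numpy as np


def sketch_filter_data(points, area):
    # Indices of the 2-D points that fall inside the rectangular area.
    x_min, x_max, y_min, y_max = area
    inside = ((points[:, 0] >= x_min) & (points[:, 0] <= x_max) &
              (points[:, 1] >= y_min) & (points[:, 1] <= y_max))
    return np.where(inside)[0]
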
Example #9
def run(main_path, syllable_management_path):
    # main_path = '/work/w13/decha/Inter_speech_2016_workplace/Data/07c-5dims_missing_data_delta_deltadelta/BayesianGPLVMMiniBatch_Missing/Tone_4/'

    # syllable_management_path = '/home/h1/decha/Dropbox/Inter_speech_2016/Syllable_object/01_manual_labeling_object/syllable_4.pickle'

    model_path = '{}/GP_model.npy'.format(main_path)
    outpath = '{}/GP2dRegression.npy'.format(main_path)

    model = Utility.load_obj(model_path)

    data = model.X.mean

    x = []

    input_sensitivity = model.input_sensitivity()
    print input_sensitivity

    index = Utility.get_input_sensitivity(input_sensitivity, 2)
    print index

    for i in range(len(data)):
        x.append([data[i, index[0]], data[i, index[1]]])

    x = np.array(x)

    syllable_management = Utility.load_obj(syllable_management_path)
    y, name_index, tone, stress, syllable_short_long_type, syllable_positions, phonemes, syllable_type = syllable_management.get_GP_LVM_training_data(
        Syllable.TRAINING_FEATURE_POLYNOMIAL_2_DEGREE_VOICE,
        subtract_typical_contour=False)

    y = np.array(y)
    # print y[:,50]
    y = y[:, 50]
    y = y[np.newaxis].T
    print y.shape

    GPModelByGPy.execute_training(x, y, outpath)

    pass
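
GPModelByGPy.execute_training(x, y, outpath) is not shown either; it presumably trains the 2-D GP regression that Example #2 later reloads as GP2dRegression.npy and queries with gpmodel.predict(x)[0]. A minimal GPy sketch of that step, under the assumption that it is a plain RBF-ARD GP regression (the helper name and its saving convention belong to the original project):

import GPy


def sketch_execute_training(x, y):
    # x: (N, 2) coordinates in the two most sensitive latent dimensions,
    # y: (N, 1) target values, as assembled in run() above.
    kern = GPy.kern.RBF(input_dim=2, ARD=True)
    model = GPy.models.GPRegression(x, y, kernel=kern)
    model.optimize(messages=False)
    # model.predict(x_new) returns (mean, variance); Example #2 uses the mean.
    return model
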
def plot(data, inverselengthscale, labels, name_out_file, title):

    most_dominants = Utility.get_input_sensitivity(inverselengthscale, 2)

    x = data[ :, most_dominants[0] ]
    y = data[ :, most_dominants[1] ]

    label = map(int, labels)
    label = np.array(label)

    # print set(labels)

    # colors = ['red','green','blue','purple']

    colors = Utility.get_color_map(len(set(labels)))

    plt.clf()
    # plt.scatter(x, y, c=labels, cmap=matplotlib.colors.ListedColormap(colors))
    for idx, s in enumerate( sorted( set(labels)) ):
        plt.scatter(x[labels==s], y[labels==s], c=colors[idx] , label=s)
    plt.legend()
    plt.title(title)

    plt.savefig( name_out_file )
Example #11
model_path = '/work/w13/decha/Inter_speech_2016_workplace/mix-projection-addtional/01_mix_a-5dims_BayesianGPLVMMiniBatch_data_no_delta_missing_data_subtract_typical_contour/BayesianGPLVMMiniBatch_Missing/Tone_01234/GP_model.npy'

# data_path (the syllable-management pickle) is assumed to be defined earlier in the original script
syllable_management = Utility.load_obj(data_path)
y, name_index, tone, stress, syllable_short_long_type, syllable_positions, phonemes = syllable_management.get_GP_LVM_training_data(Syllable.TRAINING_FEATURE_POLYNOMIAL_2_DEGREE_VOICE, subtract_typical_contour=False)

model = Utility.load_obj(model_path)
data = model.X.mean

x = []
y = []

input_sensitivity = model.input_sensitivity()
print input_sensitivity

index = Utility.get_input_sensitivity(input_sensitivity, 2)
print index

for i in range(len(data)):
    x.append(data[i,index[0]])
    y.append(data[i,index[1]])

x = np.asarray(x)
y = np.asarray(y)

stress = np.array(stress)
stress_index = np.where(stress=='Stress')

x_stress = x[stress_index]
y_stress = y[stress_index]
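
The snippet ends after selecting the stressed coordinates. A purely illustrative continuation (not part of the original) would scatter them over the full latent projection:

import matplotlib.pyplot as plt

plt.clf()
plt.scatter(x, y, c='lightgray', s=7, label='All syllables')
plt.scatter(x_stress, y_stress, c='red', s=7, label='Stress')
plt.legend()
plt.savefig('./latent_projection_stress.eps')
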
Example #12
def run_training(base_path, db_file, name_out_path, name):

    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    names = Utility.load_obj(names_file)
    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append( d['id'] )

    label = []

    train_name_list = []

    train_idx = []

    true_label = []

    for i, nn in enumerate(names) :
        idx = name_list.index(nn)
    
        if 'j' in nn:
            train_name_list.append(nn)
            train_idx.append(i)

        elif db[idx]['stress'] == '2':
            train_name_list.append(nn)
            train_idx.append(i)

        if db[idx]['stress'] == '1':
            # label.append(db[idx]['stress'])
            label.append(-1)
        elif nn in potential_list:
            # label.append('3')
            label.append(1)
        elif db[idx]['stress'] == '2':
            label.append(1)
        else :
            # label.append(db[idx]['stress'])
            label.append(-1)

        if db[idx]['stress'] == '2':
            true_label.append(1)
        else:
            true_label.append(int(db[idx]['stress']))

    out = Utility.load_obj(out_data)
    input_sent = Utility.load_obj(input_sensitivity)

    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    label = map(int, label)
    label = np.array(label)

    train = out[train_idx]
    train_lab = label[train_idx]

    # print len(train), len(train_lab), set(train_lab)

    global kern
    lengthscale=1/np.array(input_sent, dtype=float)

    lengthscale = lengthscale/lengthscale.min()

    # print 'lengthscale : ', lengthscale
    # kern = GPy.kern.RBF(len(train[0]), ARD=True, lengthscale=lengthscale)
    kern = GPy.kern.RBF(len(train[0]), ARD=True, lengthscale=lengthscale)

    min_max_dims = []
    ten_or_not = []

    for d in xrange(len(out[0])):
        out_d = out[:, d]
        m = []
        m.append(min(out_d))
        m.append(max(out_d))
        min_max_dims.append(m)

        if d in most_dominants:
            ten_or_not.append(100)
        else :
            ten_or_not.append(1)

    m = min_max_dims
    # for mm in m:
    #     print mm

    print most_dominants

    m_grid = []
    for d in xrange(len(out[0])):
        if ten_or_not[d] != 1:
            m_grid.append( np.linspace(m[d][0], m[d][1], ten_or_not[d]) )
        else:
            m_grid.append([0])

    d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 = np.meshgrid(
        m_grid[0], m_grid[1], m_grid[2], m_grid[3], m_grid[4], 
        m_grid[5], m_grid[6], m_grid[7], m_grid[8], m_grid[9]
        )

    plane = np.c_[
        d0.ravel(), d1.ravel(), d2.ravel(), d3.ravel(), d4.ravel(), 
        d5.ravel(), d6.ravel(), d7.ravel(), d8.ravel(), d9.ravel()
        ]

    # print plane

    y_pred_test = svm_classifier(
        train, train_lab, out, '', name_out_path, plane,
        np.linspace(m[most_dominants[0]][0], m[most_dominants[0]][1],
                    ten_or_not[most_dominants[0]]),
        np.linspace(m[most_dominants[1]][0], m[most_dominants[1]][1],
                    ten_or_not[most_dominants[1]]),
        most_dominants, input_sent)

    global syl_dict

    for n, y_pred, true_lab in zip(names, y_pred_test, true_label) :
        syllable = dict()

        if y_pred == 1:
            syllable['stress'] = 2
        else:
            syllable['stress'] = true_lab

        syl_dict[n] = syllable

    pass
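
Both run_training variants in this example build a 10-dimensional evaluation grid in which only the two most dominant latent dimensions are actually sampled (100 points each) and every other dimension is pinned at 0. A compact, behaviour-equivalent way to build that plane (a sketch; the original spells out all ten dimensions by hand):

import numpy as np


def build_plane(min_max_dims, most_dominants, n_points=100, n_dims=10):
    # Grid only the dominant dimensions; keep the rest fixed at 0.
    axes = []
    for d in range(n_dims):
        if d in most_dominants:
            lo, hi = min_max_dims[d]
            axes.append(np.linspace(lo, hi, n_points))
        else:
            axes.append(np.array([0.0]))
    mesh = np.meshgrid(*axes)
    return np.column_stack([m.ravel() for m in mesh])
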
def run_training(base_path, db_file, name_out_path, name):

    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    names = Utility.load_obj(names_file)
    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append(d['id'])

    label = []
    for nn in names:
        idx = name_list.index(nn)

        if db[idx]['stress'] == '1':
            label.append(db[idx]['stress'])
        elif nn in potential_list:
            label.append('3')
        else:
            label.append(db[idx]['stress'])

    out = Utility.load_obj(out_data)
    input_sent = Utility.load_obj(input_sensitivity)

    print 'Input sensitivity', input_sent

    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    label = map(int, label)
    label = np.array(label)

    train = np.append(out[label == 2], out[label == 3], axis=0)

    global kern
    lengthscale = 1 / np.array(input_sent, dtype=float)
    lengthscale = lengthscale / lengthscale.min()
    kern = GPy.kern.RBF(len(train[0]), ARD=True, lengthscale=lengthscale)

    min_max_dims = []
    ten_or_not = []

    for d in xrange(len(out[0])):
        out_d = out[:, d]
        m = []
        m.append(min(out_d))
        m.append(max(out_d))
        min_max_dims.append(m)

        if d in most_dominants:
            ten_or_not.append(100)
        else:
            ten_or_not.append(1)

    m = min_max_dims
    # print ten_or_not, m

    print most_dominants

    m_grid = []
    for d in xrange(len(out[0])):
        if ten_or_not[d] != 1:
            m_grid.append(np.linspace(m[d][0], m[d][1], ten_or_not[d]))
        else:
            m_grid.append([0])

    d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 = np.meshgrid(
        m_grid[0], m_grid[1], m_grid[2], m_grid[3], m_grid[4], m_grid[5],
        m_grid[6], m_grid[7], m_grid[8], m_grid[9])

    plane = np.c_[d0.ravel(),
                  d1.ravel(),
                  d2.ravel(),
                  d3.ravel(),
                  d4.ravel(),
                  d5.ravel(),
                  d6.ravel(),
                  d7.ravel(),
                  d8.ravel(),
                  d9.ravel()]

    # print plane

    svm_classifier(
        train, '', out, '', name_out_path, plane,
        np.linspace(m[most_dominants[0]][0], m[most_dominants[0]][1],
                    ten_or_not[most_dominants[0]]),
        np.linspace(m[most_dominants[1]][0], m[most_dominants[1]][1],
                    ten_or_not[most_dominants[1]]), most_dominants)

    pass
def plot_latent_space(base_path, db_file, name_out_path):

    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    if not Utility.is_file_exist(out_data) : 
        print out_data
        print 'Not exist'
        return

    names = Utility.load_obj(names_file)

    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append( d['id'] )

    label = []
    for nn in names:
        idx = name_list.index(nn)
        
        if db[idx]['stress'] == '1':
            label.append(db[idx]['stress'])
        elif nn in potential_list:
            label.append('3')
        else :
            label.append(db[idx]['stress'])


    out = Utility.load_obj(out_data)

    # print out.shape

    input_sent = Utility.load_obj(input_sensitivity)
    # print 'input_sensitivity : ', sorted(input_sent)
    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    x = out[ :, most_dominants[0] ]
    y = out[ :, most_dominants[1] ]

    label = map(int, label)
    label = np.array(label)

    # print set(label)

    train = np.append( out[label==2] , out[label==3], axis=0 )
    # train = out[label==2] 
    # print train.shape

    test = out[label==0]

    lengthscale=1/np.array(input_sent, dtype=float)
    k = GPy.kern.RBF(len(train[0]), ARD=True, lengthscale=lengthscale)

    plt.clf()

    colors = ['red','green','blue','purple']

    md = most_dominants[0]

    mean = np.mean(train, axis=0)
    var = np.var(train, axis=0)
    # rv = multivariate_normal(mean=np.mean(train, axis=0), cov=np.var(train, axis=0))

    sd = np.std(train[:,most_dominants[0]], axis=0)

    for idx, lab in enumerate(label): 
        # if lab == 2: continue
        # if lab == 3: continue

        # print out[idx][md], mean

        d = distance.euclidean(out[idx][md], mean[md])
        # print d

        if d < sd:
            # label[idx] = 4
            label[idx] = 2
        elif d < 2*sd:
            # label[idx] = 6
            label[idx] = 2
            pass

    label[label==3] = 2
    print len(out), len(names)
    print set(label), len(label)

    if len(set(label)) > 3:
        print 'error : ', name_out_path
        raise

    global syl_dict

    for n, lab in zip(names, label) :
        syllable = dict()
        syllable['stress'] = lab
        syl_dict[n] = syllable

    # print names
    # print label

    # return

    for idx, s in enumerate( [0,1,4,5,6,2,3] ):
        # if (s == 3) | (s == 2):
        if (s == 2):
            # plt.scatter(x[label==s], y[label==s], c=colors[s], label=s, s=100)
            plt.scatter(x[label==s], y[label==s], c=colors[s], label='Manual weak stress labeling', s=100)
            pass
        elif (s==-1):
            plt.scatter(x[label==s], y[label==s], c='red', label=s, s=20)
        elif (s==1):
            plt.scatter(x[label==s], y[label==s], c='red', label='Stress', s=7)
            pass
        elif (s==5):
            # plt.scatter(x[label==s], y[label==s], c='yellow', label=s, s=20, marker='^', linewidth='0')
            pass
        elif (s==4):
            plt.scatter(x[label==s], y[label==s], c='green', label='Weak stress in 1 SD', s=20, marker='*', linewidth='0')
        elif (s==6):
            plt.scatter(x[label==s], y[label==s], c='orange', label='Weak stress in 2 SD', s=20, marker='h', linewidth='0')
        # else:
        elif (s==0):
            plt.scatter(x[label==s], y[label==s], c='black', label='Unstress', s=7, marker='.', linewidth='0')
            pass

    # Note: `s` here still holds the last value from the loop above.
    plt.scatter(mean[most_dominants[0]], mean[most_dominants[1]], c='red', label=s, s=200, marker='x')

    x_lim = plt.xlim()
    y_lim = plt.ylim()

    xx, yy = np.mgrid[x_lim[0]:x_lim[1]:.01, y_lim[0]:y_lim[1]:.01]
    pos = np.empty(xx.shape + (2,))
    pos[:, :, 0] = xx; pos[:, :, 1] = yy

    x_train = train[ :, most_dominants[0] ]
    y_train = train[ :, most_dominants[1] ]

    # rv = multivariate_normal(
    #     [mean[most_dominants[0]], mean[most_dominants[1]] ], 
    #     [var[most_dominants[0]], var[most_dominants[1]] ])
    # print rv
    # print 'means : ', rv.pdf([np.mean(x_train), np.mean(y_train)])

    # plt.contourf(xx, yy, rv.pdf(pos), alpha=0.5)

    # plt.legend(prop={'size':12})
    plt.savefig( name_out_path )
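
The per-syllable loop above relabels every syllable whose most-dominant latent coordinate lies within two standard deviations of the stress training mean as weak stress (label 2). A vectorised sketch of the same computation, using the names from the function above:

import numpy as np


def relabel_within_two_sd(out, label, train, dominant_dim):
    # Same effect as the loop: |coordinate - training mean| < 2*SD  ->  label 2.
    mean = np.mean(train[:, dominant_dim])
    sd = np.std(train[:, dominant_dim])
    close = np.abs(out[:, dominant_dim] - mean) < 2 * sd
    label = label.copy()
    label[close] = 2
    return label
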
def plot_latent_space(base_path, db_file, name_out_path):

    names_file = '{}/names.pkl'.format(base_path)
    out_data = '{}/x.pkl'.format(base_path)
    input_sensitivity = '{}/input_sensitivity.pkl'.format(base_path)

    if not Utility.is_file_exist(out_data):
        print out_data
        print 'Not exist'
        return

    names = Utility.load_obj(names_file)

    db = Utility.load_obj(db_file)

    name_list = []
    for d in db:
        name_list.append(d['id'])

    label = []
    iden = []

    target_id = []

    for nn in names:
        idx = name_list.index(nn)
        label.append(db[idx]['stress'])
        iden.append(nn)
        if nn in ['tscsdm38_55', 'tscsdu01_32', 'tscsdg02_21']:
            target_id.append(idx)

    target_id = np.array(target_id)

    out = Utility.load_obj(out_data)

    iden = np.array(iden)

    print out.shape

    input_sent = Utility.load_obj(input_sensitivity)
    print input_sent
    most_dominants = Utility.get_input_sensitivity(input_sent, 2)

    x = out[:, most_dominants[0]]
    y = out[:, most_dominants[1]]

    label = map(int, label)
    label = np.array(label)

    print set(label)

    # ind = np.random.choice(len(label), 20)
    # x = x[ind]
    # y = y[ind]
    # label = label[ind]
    # iden = iden[ind]

    colors = ['red', 'green', 'blue', 'purple']

    plt.clf()
    plt.scatter(x,
                y,
                c=label,
                cmap=matplotlib.colors.ListedColormap(colors),
                alpha=0.5)

    el = Ellipse((2, -1), 0.5, 0.5)

    for lab, xx, yy, yyy in zip(['Unstress', 'Strong stress', 'Weak stress'],
                                x[target_id], y[target_id], [50, 100, 50]):
        # for lab, xx, yy in zip(iden, x, y):
        # yyy = 20
        plt.annotate(lab,
                     xy=(xx, yy),
                     xytext=(0, yyy),
                     textcoords='offset points',
                     ha='left',
                     va='bottom',
                     bbox=dict(boxstyle='round,pad=0.5',
                               fc='yellow',
                               alpha=0.5),
                     arrowprops=dict(arrowstyle="simple",
                                     fc="0.6",
                                     ec="none",
                                     patchB=el,
                                     connectionstyle="arc3,rad=0.3",
                                     color='g'))
    plt.savefig(name_out_path)
    def plot_scatter(model, data_object, outpath, label_type=None,
                     target_tone=None, name_index_list=None, phoneme_list=None,
                     plotted_tone=None, bivariate=False, followed_list_file=None,
                     perform_unsupervised=False, get_only_stress=False,
                     non_unlabelled_stress=False, get_only_gpr_data=False,
                     return_after_dbscan=False, get_only_manual_data=False,
                     no_short_duration=False):

        data = model.X.mean

        y, name_index, tone, stress, syllable_short_long_type, syllable_positions, phonemes, syllable_type = data_object.get_GP_LVM_training_data(
            Syllable.Training_feature_tonal_part_raw_remove_head_tail_interpolated , 
            dur_position=[1,2] , no_short_duration=no_short_duration, 
            num_sampling=50, get_only_stress=get_only_stress, non_unlabelled_stress=non_unlabelled_stress, get_only_gpr_data=get_only_gpr_data, get_only_manual_data=get_only_manual_data)

        # print 'Plot scatter'
        # print stress
        # sys.exit()

        # print syllable_type
        # print model.X.mean
        x = []
        y = []

        input_sensitivity = model.input_sensitivity()
        print input_sensitivity

        index = Utility.get_input_sensitivity(input_sensitivity, 3)
        print index

        data = np.array(data)

        name_index_list = np.array(name_index_list)

        index_filter = []
        for n in name_index:
            # print n
            idx = np.where( name_index_list==n ) [0]
            # print idx
            index_filter.append(idx[0])

        data = data[index_filter]

        stress = np.array(stress)

        labels_true = np.arange(len(stress), dtype=int)
        labels_true[stress == 'Stress'] = 1
        labels_true[stress == 'Unstress'] = 0

        # print len(data), len(stress)
        # print len(labels_true), set(labels_true)
        # sys.exit()

        if len(data) != len(stress):
            print 'Error data is not equal'
            return

        plt.clf()

        if perform_unsupervised:
            try:

                DBSCAN_executioner.run(data, labels_true, os.path.dirname(outpath), [index[0], index[1]], input_sensitivity)
                # Kmeans_executioner.run(data, labels_true, os.path.dirname(outpath), [index[0], index[1]], input_sensitivity)
            except:
                print 'Error at path : {}'.format(outpath)
                traceback.print_exc()

        if return_after_dbscan:
            return

        plt.clf()

        print 'Data : {}'.format(len(data))
        print 'Stress : {}'.format(len(stress))
        # print stress

        x = data[:,index[0]]
        x = data[:,1]  # note: overrides the sensitivity-based column with column 1
        y = data[:,index[1]]
        y = data[:,0]  # note: overrides the sensitivity-based column with column 0
        z = data[:,index[2]]

        print 'syllable_positions', len(syllable_positions)

        if label_type is GP_LVM_Scatter.LABEL_TYPE_STRESS:
            # Scatter.plot(x, y, outpath, label_list=stress, color=['r','b','g'])
            
            stress_index = np.where(stress == 'Stress')
            unstress_index = np.where(stress == 'Unstress')

            mask = np.ones(len(stress), dtype=bool)
            mask[unstress_index] = False
            # print stress
            # sys.exit()
            # Scatter.plot(x[mask], y[mask], outpath, label_list=stress[mask], color=['r','b','g'], bivariate=bivariate, X_bi=x[stress_index], Y_bi=y[stress_index])
            Scatter.plot(x, y, outpath, label_list=stress, color=['r','b','g'], bivariate=bivariate, X_bi=x[stress_index], Y_bi=y[stress_index])
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_STRESS_3D_COLORING:
            # Scatter.plot(x, y, outpath, label_list=stress, color=['r','b','g'])
            
            stress_index = np.where(stress == 'Stress')
            unstress_index = np.where(stress == 'Unstress')
            normalized = (z-min(z))/(max(z)-min(z)) * 100
            Scatter.plot(x, y, outpath, label_list=None, color=normalized.astype(int).tolist(), cmap='gray')
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_STRESS_SEP_GPR:

            gpr_file_list = []
            for idx, n in enumerate(name_index): 
                if 'gpr' in n:
                    gpr_file_list.append(idx)

            gpr_file_list = np.array(gpr_file_list)

            stress[gpr_file_list] = 'GPR_Stress'

            stress_index = np.where(stress == 'Stress')
            unstress_index = np.where(stress == 'Unstress')

            mask = np.ones(len(stress), dtype=bool)
            mask[unstress_index] = False

            Scatter.plot(x, y, outpath, label_list=stress, color=['r','b','g'], bivariate=bivariate, X_bi=x[stress_index], Y_bi=y[stress_index])
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_STRESS_AND_SPLIT_TONE:

            stress_index = np.where(stress == 'Stress')
            unstress_index = np.where(stress == 'Unstress')

            tone = np.array(tone)

            mask = np.ones(len(stress), dtype=bool)
            mask[unstress_index] = False

            outpath = Utility.get_base_path(outpath)

            canplot = True

            try:
                labels_object = Utility.load_obj('{}/clustered_label.npy'.format(outpath))
                if len(labels_object)!=len(stress):
                    canplot = False
            except:
                canplot = False

            for t in set(tone):

                Utility.make_directory('{}/tone_stress_label/'.format(outpath))
                Utility.make_directory('{}/clustering_label/'.format(outpath))
                print len(x), len(y), len(tone), len(stress) 
                Scatter.plot(x[tone==t], y[tone==t], '{}/tone_stress_label/tone_{}.eps'.format(outpath, t), label_list=stress[tone==t], bivariate=bivariate, X_bi=x[stress_index], Y_bi=y[stress_index])
                if canplot:
                    print 'Plot label tone {}'.format(t)
                    Scatter.plot(x[tone==t], y[tone==t], '{}/clustering_label//tone_{}.eps'.format(outpath, t), label_list=labels_object[tone==t], bivariate=bivariate, X_bi=x[stress_index], Y_bi=y[stress_index])

        elif label_type is GP_LVM_Scatter.LABEL_TYPE_SYLLABLE_SHORT_LONG:
            Scatter.plot(x, y, outpath, label_list=syllable_short_long_type)
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_SYLLABLE_POSITIONS:

            long_list = []
            short_list = []

            for idx, p in enumerate(phonemes):
                v = p.split('-')[1]
                if v not in Syllable.short_vowel:
                    long_list.append(idx)
                else:
                    short_list.append(idx)

            print len(long_list) , len(x)
            x = np.array(x)
            y = np.array(y)
            syllable_positions = np.array(syllable_positions)
            Scatter.plot(x[long_list], y[long_list], outpath, label_list=syllable_positions[long_list])
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_TONES:
            Scatter.plot(x, y, outpath, label_list=tone, color=['r','g','b','black','yellow'])
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_ONE_TONE_STRESS_UNSTRESS:  
            tone = np.array(map(str, tone))
            stress = np.core.defchararray.add(stress, '_' )
            stress_tone = np.core.defchararray.add(stress, tone)

            target_list = np.array([])
            print target_tone
            for t in target_tone:
                print t, target_list, np.where(tone == t)
                target_list = np.union1d(target_list, np.where(tone == t)[0])
            stress_tone = stress_tone[target_list.astype(int)]#np.delete(stress_tone, delete_list)
            x = x[target_list.astype(int)]#np.delete(x, delete_list)
            y = y[target_list.astype(int)]#np.delete(y, delete_list)
            Scatter.plot(x, y, outpath, label_list=stress_tone)
        elif label_type is None :
            Scatter.plot(x, y, outpath, label_list=None)
        elif label_type is GP_LVM_Scatter.LABEL_TYPE_SYLLABLE_IN_MANUAL_PHRASE:
            name_index = np.array(name_index)
            # print name_index
            single_list = np.array(Utility.load_obj(name_index_list['single']))
            followed_by_sil_list = np.array(Utility.load_obj(name_index_list['followed_by_sil']))
            poly_list = np.array(Utility.load_obj(name_index_list['poly']))

            all_union = []

            single_indices = [] 
            for syl in single_list:
                single_indices = np.union1d(single_indices, np.where( name_index == syl)[0])
            
            followed_by_sil_indices = [] 
            for syl in followed_by_sil_list:
                followed_by_sil_indices = np.union1d(followed_by_sil_indices, np.where( name_index == syl)[0])

            poly_indices = [] 
            for syl in poly_list:
                poly_indices = np.union1d(poly_indices, np.where( name_index == syl)[0])

            name_index[single_indices.astype(int)] = 'Single '
            name_index[followed_by_sil_indices.astype(int)] = 'Followed'
            name_index[poly_indices.astype(int)] = 'Poly'

            all_union = np.union1d(all_union, single_indices)
            all_union = np.union1d(all_union, followed_by_sil_indices)
            all_union = np.union1d(all_union, poly_indices)

            mask = np.ones(len(name_index), dtype=bool)
            mask[all_union.astype(int)] = False
            name_index[mask] = 'Other'

            Scatter.plot(x, y, outpath, label_list=name_index, color=['r','g','b','y'])

        elif label_type is GP_LVM_Scatter.LABEL_TYPE_SYLLABLE_IN_MANUAL_PHRASE_PLUS_SHORT_LONG_SYLLABLE:
            
            name_index = np.array(name_index)
            # print name_index
            single_list = np.array(Utility.load_obj(name_index_list['single']))
            followed_by_sil_list = np.array(Utility.load_obj(name_index_list['followed_by_sil']))
            poly_list = np.array(Utility.load_obj(name_index_list['poly']))

            all_union = []

            single_indices = [] 
            for syl in single_list:
                single_indices = np.union1d(single_indices, np.where( name_index == syl)[0])
            
            followed_by_sil_indices = [] 
            for syl in followed_by_sil_list:
                followed_by_sil_indices = np.union1d(followed_by_sil_indices, np.where( name_index == syl)[0])

            poly_indices = [] 
            for syl in poly_list:
                poly_indices = np.union1d(poly_indices, np.where( name_index == syl)[0])

            name_index[single_indices.astype(int)] = 'Single '
            name_index[followed_by_sil_indices.astype(int)] = 'Followed'
            name_index[poly_indices.astype(int)] = 'Poly'

            all_union = np.union1d(all_union, single_indices)
            all_union = np.union1d(all_union, followed_by_sil_indices)
            all_union = np.union1d(all_union, poly_indices)

            mask = np.ones(len(name_index), dtype=bool)
            mask[all_union.astype(int)] = False
            name_index[mask] = 'Other'

            outpath = outpath.split('.')[0]

            syllable_short_long_type = np.array(syllable_short_long_type)
            short_list = np.where(syllable_short_long_type=='short')[0]
            long_list = np.where(syllable_short_long_type=='long')[0]

            # print short_list, long_list

            Scatter.plot(x[short_list], y[short_list], '{}_short.pdf'.format(outpath), label_list=name_index[short_list], color=['r','g','b','y'])
            Scatter.plot(x[long_list], y[long_list], '{}_long.pdf'.format(outpath), label_list=name_index[long_list], color=['r','g','b','y'])

        elif label_type is GP_LVM_Scatter.LABEL_TYPE_PHONEME:
            phonemes = np.array(phonemes)
            stress = np.array(stress)
            for phoneme in phoneme_list:
                if plotted_tone != '01234':
                    if plotted_tone not in phoneme: continue

                target_index = np.where(phonemes == phoneme)
                stress_index = np.where(stress == 'Stress')
                # print stress_index

                outpath = outpath.split('.')[0]
                Scatter.plot(x[target_index], y[target_index], '{}_{}.pdf'.format(outpath, phoneme), label_list=stress[target_index], bivariate=True, X_bi=x[stress_index], Y_bi=y[stress_index], title=phoneme, xlim=(-4.4657748693986417, 8.1238328278216105), ylim=(-7.2366812187855185, 6.1187134324317736))

        elif label_type is GP_LVM_Scatter.LABEL_TYPE_SYLLABLE_TYPE:

            syllable_type = np.array(syllable_type)
            stress = np.array(stress)
            types = set(syllable_type)

            for typ in types:

                print typ

                typ_index = np.where(syllable_type==typ)

                sub_stress = stress[typ_index]
                sub_x = x[typ_index]
                sub_y = y[typ_index]
                
                stress_index = np.where(sub_stress == 'Stress')
                unstress_index = np.where(sub_stress == 'Unstress')

                mask = np.ones(len(sub_stress), dtype=bool)
                mask[unstress_index] = False

                outpath = outpath.split('.')[0]

                Scatter.plot(sub_x, sub_y, '{}_{}.pdf'.format(outpath, typ), label_list=sub_stress, color=['r','b','g'], bivariate=False, X_bi=sub_x[stress_index], Y_bi=sub_y[stress_index], title=typ, xlim=(-4.4657748693986417, 8.1238328278216105), ylim=(-7.2366812187855185, 6.1187134324317736))

        elif label_type is GP_LVM_Scatter.LABEL_TYPE_FOLLOWED_BY_SIL:
            followed_list = Utility.load_obj(followed_list_file)
            fow_index = []
            name_index = np.array(name_index)
            for f in followed_list:
                k = np.where(name_index == f)[0]
                for kk in k:
                    fow_index.append(kk.astype(int))

            # print fow_index

            stress = np.array(stress)
            stress_index = np.where(stress == 'Stress')
            unstress_index = np.where(stress == 'Unstress')

            stress[stress_index] = 'Unstress'
            stress[fow_index] = 'Stress'

            Scatter.plot(x, y, outpath, label_list=stress, color=['r','b','g'], bivariate=True, X_bi=x[fow_index], Y_bi=y[fow_index])

            tone = np.array(tone)
            for t in [0,1,2,3,4]:

                x_tone = x[np.where(tone == t)]
                y_tone = y[np.where(tone == t)]
                stress_tone = stress[np.where(tone == t)]
                tone_path = '{}_{}.pdf'.format(outpath.split('.')[0], t)
                Scatter.plot(x_tone, y_tone, tone_path, label_list=stress_tone, color=['r','b','g'], bivariate=True, X_bi=x[fow_index], Y_bi=y[fow_index], title='Tone {}'.format(t), xlim=(-3.7420549236630576, 3.7939531202951904), ylim=(-4.2426927228030289, 6.3913714950885101))

            base_path = outpath.split('.')[0] 
            Utility.save_obj(x, '{}_{}.pickle'.format(base_path,'x'))
            Utility.save_obj(y, '{}_{}.pickle'.format(base_path,'y'))
            Utility.save_obj(stress, '{}_{}.pickle'.format(base_path,'stress_followed'))
            Utility.save_obj(tone, '{}_{}.pickle'.format(base_path,'tone'))

        # elif label_type is GP_LVM_Scatter.LABEL_TYPE_SEPARATED_UNSUPERVISED_GROUP:


        pass
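
DBSCAN_executioner.run is not included in any of these examples; judging by its arguments (latent data, reference labels, an output directory, the two dominant dimensions, and the sensitivity vector), its core is presumably a DBSCAN clustering of the latent points projected onto those two dimensions. A minimal sketch of that core with scikit-learn; eps and min_samples are placeholders, not the project's values:

import numpy as np
from sklearn.cluster import DBSCAN


def sketch_dbscan_run(data, dominant_dims, eps=0.5, min_samples=10):
    # Cluster the latent points projected onto the two most sensitive dimensions.
    projected = np.asarray(data)[:, dominant_dims]
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(projected)
    return labels  # -1 marks noise points
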