Example #1
def get_j_set(db_file, out_file, sort_list_out_file):

    j_set_db = []
    j_set_sort_list = []

    db = Utility.load_obj(db_file)
    for syl in db:
        if 'j' in syl['id']:
            j_set_db.append(syl)
            # print syl['dur']
            # Accumulate total duration, skipping the first element.
            dur = 0
            for idx, d in enumerate(syl['dur']):
                if idx == 0:
                    continue
                dur = dur + d

            j_set_sort_list.append((syl['id'], dur, syl['stress']))

    Utility.sort_by_index(j_set_sort_list, 1)
    print j_set_sort_list

    Utility.save_obj(j_set_sort_list, sort_list_out_file)
    Utility.save_obj(j_set_db, out_file)

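Utility.sort_by_index is a project-specific helper used throughout these examples; its source is not shown here. A minimal stand-in, assuming it performs an in-place ascending sort of a list of tuples keyed on one tuple position:

def sort_by_index(tuple_list, index):
    # Assumed behavior: sort the list in place, ascending,
    # by the tuple element at position `index`.
    tuple_list.sort(key=lambda item: item[index])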
Example #2
def is_finish(rmse_list):

    if len(rmse_list) < 2:
        print 'Finished: fewer than two results to compare'
        return True

    Utility.sort_by_index(rmse_list, 0)
    print rmse_list

    first = rmse_list[0]
    second = rmse_list[1]

    # `threshold` is assumed to be defined at module level.
    for a, b in zip(first, second):
        if abs(a - b) > threshold:
            return False

    return True
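A minimal usage sketch for is_finish, assuming the module context above and a hypothetical module-level threshold: the function sorts the candidates by RMSE and reports convergence when the two best entries agree to within the tolerance in every position.

threshold = 0.001  # hypothetical convergence tolerance

candidates = [(0.8514,), (0.8512,), (0.9001,)]
print is_finish(candidates)  # True: the two best RMSEs differ by less than threshold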
Example #3
        # the histogram of the data
        n, bins, patches = plt.hist(errors_list,
                                    100,
                                    normed=1,  # 'density=True' in matplotlib >= 2.1
                                    facecolor='green',
                                    alpha=0.75)

        plt.savefig('hist.eps')

        Utility.save_obj(errors, './errors_dict.pkl')

        rmse = np.sqrt(sklearn.metrics.mean_squared_error(
            true, dct_regen)) * 1200 / np.log(2)
        print 'Coeff {} all rmse : '.format(coeff), rmse

        Utility.sort_by_index(errors_tuple, 1)
        Utility.write_to_file_line_by_line('./errors_sorted.txt', errors_tuple)

        print len(syl_dct)

        base = '/work/w2/decha/Data/GPR_speccom_data/Interspeech2017/tone_separated/'

        Utility.make_directory(base)

        Utility.save_obj(
            syl_dct,
            '/work/w2/decha/Data/GPR_speccom_data/Interspeech2017/tone_separated/tone_all_dct_coeff_{}.pkl'
            .format(coeff))

        for t in range(5):
            pass  # loop body not included in this excerpt
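The factor 1200 / np.log(2) in Example #3 converts an RMSE computed on natural-log F0 values into musical cents: an octave (a factor of 2 in frequency) spans 1200 cents, so an error of d in ln f corresponds to 1200 * d / ln 2 cents. A quick check with hypothetical values:

import numpy as np

# A true F0 of 200 Hz predicted as 210 Hz:
err_log = abs(np.log(210.0) - np.log(200.0))  # error in natural-log frequency
err_cents = err_log * 1200 / np.log(2)        # about 84.5 cents
print err_cents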
Example #4
def run(main_fig_path, out_path):
    vowel_type = ['v', 'vv', 'vn', 'vvn', 'vsg', 'vvsg']
    tones = ['0', '1', '2', '3', '4']

    # vowel_type = ['vv']
    # tones = ['1']

    c = dict()
    c[-1] = 'black'
    c[0] = 'blue'
    c[1] = 'red'
    c[2] = 'green'
    c[3] = 'yellow'

    fig_per_line = 4

    syllable_lists = dict()
    # for v in vowel_type:
    for v in ['vvv', 'vvvn', 'vvvsg']:
        for t in tones:

            if v == 'vvv':
                vv = ['v', 'vv']
            elif v == 'vvvn':
                vv = ['vn', 'vvn']
            elif v == 'vvvsg':
                vv = ['vsg', 'vvsg']
            else:
                print 'Unexpected vowel group: {}'.format(v)

            dire = '{}/{}/'.format(out_path, v)
            Utility.make_directory(dire)
            latext_out_file = '{}/stress-list_vowel-type-{}_Tone-{}.tex'.format(
                dire, v, t)

            # Collect files across all vowel spellings in vv.
            file_list = []
            for vi in vv:
                path = '{}/{}/{}/'.format(main_fig_path, vi, t)
                files = Utility.list_file(path)
                # print files
                for f in files:
                    # tscsd_manual_f17_22_tone_3_dur_15.025_syl_n-aa-m^_stress_0

                    if f.startswith('.'): continue

                    pattern = re.compile(
                        r"""(?P<name>.+)_tone.+dur_(?P<dur>.+)_syl.+_stress_(?P<stress>.+)\.eps""",
                        re.VERBOSE)
                    match = re.match(pattern, f)
                    # print match
                    if match:
                        dur = float(match.group('dur'))

                        stress = 'Unstress'
                        if int(match.group('stress')) == 1:
                            stress = 'Stress'

                        file_list.append(('{}/{}'.format(path, f), dur, stress,
                                          match.group('name')))

                    else:
                        print f

            Utility.sort_by_index(file_list, 1)
            # print file_list

            eps_list = []
            caption_list = []

            temp_eps = []
            temp_cap = []

            for fi in file_list:
                file_path = fi[0]
                dur = fi[1]
                stress = fi[2]
                name = fi[3]

                group = find_group(t, v, name)
                color = c[group]
                # if group != 0:
                #     print v, t
                #     print group, color
                # print '\\textcolor{{{}}}{{{}}}'.format(color,name)
                name = name.replace('_', r'\_')  # escape underscores for LaTeX
                # {\color{red} Panda }
                temp_eps.append(file_path)
                temp_cap.append('{{\color{{{}}} {} }}'.format(color, name))

                if fig_per_line == len(temp_eps):
                    eps_list.append(temp_eps)
                    caption_list.append(temp_cap)

                    temp_eps = []
                    temp_cap = []

            # Flush the last, possibly partial, row of figures.
            eps_list.append(temp_eps)
            caption_list.append(temp_cap)

            # Skip categories that never filled a complete row.
            if len(eps_list) == 1: continue
            Latext_Tool.plot_all_data(eps_list, caption_list, latext_out_file)

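The temp_eps/temp_cap logic in run() above is a hand-rolled chunking of a flat figure list into rows of fig_per_line items, with a final flush for the partial remainder. The same pattern as a generic sketch:

def chunk(items, size):
    # Split a flat list into consecutive rows of at most `size` items;
    # the final row carries the remainder, like the flush in run() above.
    return [items[i:i + size] for i in range(0, len(items), size)]

print chunk([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 4)
# [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10]]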
Example #5
def get_data(path):

    clustered_label = Utility.load_obj('{}/clustered_label.npy'.format(path))
    # print len(clustered_label)
    best_measure_params = Utility.load_obj('{}/best_measure_params.npy'.format(path))
    # print best_measure_params
    name_index = np.array(Utility.load_obj('{}/name_index.npy'.format(path)))
    # print len(name_index)
    m = Utility.load_obj('{}/GP_model.npy'.format(path))
    model = m.X.mean
    model = np.array(model)
    # print model

    input_sensitivity = m.input_sensitivity()
    print 'Input sensitivity : ', input_sensitivity

    group_list = ['-1', '0', '1', '2']

    sort_list = []

    for group in group_list:
        g = '{}/group_list/{}.npy'.format(path, group)
        if Utility.is_file_exist(g):
            names = Utility.load_obj(g)
            sort_list.append((group, len(names)))

    Utility.sort_by_index(sort_list, 1)
    print sort_list

    # sort_list is ascending by group size: take the largest group as
    # unstressed and the second largest as stressed.
    unstressed_mean = get_means(sort_list, len(sort_list)-1, name_index, model, path)
    unstressed_list = Utility.load_obj('{}/group_list/{}.npy'.format(path, sort_list[len(sort_list)-1][0]))

    stressed_mean = get_means(sort_list, len(sort_list)-2, name_index, model, path)
    stressed_list = Utility.load_obj('{}/group_list/{}.npy'.format(path, sort_list[len(sort_list)-2][0]))

    Utility.save_obj(
        {'unstress_mean': unstressed_mean, 'stress_mean': stressed_mean},
        '{}/mean_of_unstress_stress.npy'.format(path))

    lengthscale = 1 / np.array(input_sensitivity, dtype=float)
    kernel = GPy.kern.RBF(10, lengthscale=lengthscale, ARD=True)

    print len(unstressed_list), len(stressed_list)

    for idx, g in enumerate(sort_list):
        if idx == (len(sort_list)-2): break
        names = Utility.load_obj('{}/group_list/{}.npy'.format(path, g[0]))

        print len(names)

        latent_data = get_data_from_a_giving_name_index(names, name_index, model)

        # Assign each remaining point to whichever mean is nearer in the
        # kernel-induced distance (-log of the RBF similarity).
        stu = np.array([unstressed_mean, stressed_mean])
        distance = -1 * np.log(kernel.K(latent_data, stu))

        for n, dis in zip(names, distance):
            if dis[0] > dis[1]:
                stressed_list = np.append(stressed_list, n)
            else:
                unstressed_list = np.append(unstressed_list, n)

    print len(unstressed_list), len(stressed_list)
    # print unstressed_list, stressed_list

    unstressed_data = get_data_from_a_giving_name_index(unstressed_list, name_index, model)
    stressed_data = get_data_from_a_giving_name_index(stressed_list, name_index, model)

    unstress_distance = -1 * np.log(kernel.K(unstressed_data, np.array([stressed_mean])))
    # print unstress_distance

    # Append 0.0 so the min-max scale is anchored at a true zero distance.
    unstress_distance = np.append(unstress_distance, 0.0)
    unstress_distance = np.reshape(unstress_distance, (len(unstress_distance), 1))

    min_max_scaler = preprocessing.MinMaxScaler()
    unstress_distance = min_max_scaler.fit_transform(unstress_distance)
    unstress_distance = np.reshape(unstress_distance, len(unstress_distance))

    # Turn the scaled distance into a similarity-like score in [0, 1].
    unstress_distance = 1 - unstress_distance
    # print unstress_distance
    # print min(unstress_distance), max(unstress_distance)

    # `out_dict` is assumed to be defined at module level.
    for nam, dis in zip(unstressed_list, unstress_distance):
        out_dict[nam] = float("{0:.4f}".format(dis))

    for nam, dis in zip(stressed_list, [1.0] * len(stressed_list)):
        out_dict[nam] = dis

    # print out_dict

    return (unstressed_list, stressed_list)
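The quantity -np.log(kernel.K(a, b)) used in get_data behaves as a squared distance: a GPy ARD RBF kernel with unit variance gives K(a, b) = exp(-0.5 * sum_d ((a_d - b_d) / l_d)^2), so -log K(a, b) is half the lengthscale-weighted squared Euclidean distance. A small check of the identity, assuming GPy is available:

import numpy as np
import GPy

lengthscale = np.array([0.5, 2.0])
kern = GPy.kern.RBF(2, lengthscale=lengthscale, ARD=True)  # variance defaults to 1

a = np.array([[0.0, 0.0]])
b = np.array([[1.0, 1.0]])

neg_log_k = -np.log(kern.K(a, b))[0, 0]
half_sq_dist = 0.5 * np.sum(((a - b) / lengthscale) ** 2)
print neg_log_k, half_sq_dist  # both approximately 2.125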
Example #6
            child = ''
            for opt in base_opt:
                child = child + '{}:'.format(opt)

            if child not in rmse_list:
                optimal_queue.append((name, base_opt))

    print rmse_list

    print '-------------------------'
    tups = []
    # rmse_list behaves as a dict here: key -> RMSE value.
    for key in rmse_list:
        tups.append((key, rmse_list[key]))

    Utility.sort_by_index(tups, 1)

    print tups
    Utility.save_obj(
        tups,
        '{}/results_{}.pkl'.format(outname,
                                   strftime("%Y-%m-%d %H:%M:%S", gmtime())))

    # for idx, oooo in enumerate(st) :
    #     if (st[idx]-ed[idx]) == 0:
    #         optimal_threshold.append(st[idx])
    #         continue

    #     print 'start : ', st

    #     rmse_list = []
    def run(X, labels_true, path, dominant, inverselengthscale, stress_only=False, stress_list=None):

        ##############################################################################
        X = np.array(X)
        labels_true = np.array(labels_true)
        if stress_only:
            print 'stress_only'

            stress_index = np.where(stress_list==1)

            print stress_index

            X = np.copy(X[stress_index])
            labels_true = np.copy(labels_true[stress_index])

        # Compute DBSCAN
        print 'Stress : {}, Unstress: {}'.format(len(np.where(labels_true==1)[0]), len(np.where(labels_true==0)[0]))
        lengthscale = 1 / np.array(inverselengthscale, dtype=float)
        kernel = GPy.kern.RBF(len(X[0]), lengthscale=lengthscale, ARD=True)

        print lengthscale

        # Kernel-induced distance matrix: -log of the RBF similarity.
        XX = -1*np.log(kernel.K(X, X))

        # incre = 0.00005
        incre = 0.00001

        jncre = 1
        done = False

        measure_list = []
        outfile = []
        # print labels_true
        # Best :  (0.0025000000000000005, 35.0, 0.69180773481515445)
        print XX.shape, len(labels_true)
        print 'Mean, min, max'
        print np.mean(XX), np.amin(XX), np.amax(XX)

        # sys.exit()
        outfile.append('Incre : {}'.format(incre))
        outfile.append('Mean, min, max')
        outfile.append('{}, {}, {}'.format(np.mean(XX), np.amin(XX), np.amax(XX)))

        for i in np.flipud(np.arange(0.00, 0.01, incre)):
        # for i in np.flipud(np.arange(0.001, 0.004, incre)):
            if done: break
            for j in np.flipud(np.arange(jncre, 40.0, jncre)):
                try:
                    db = DBSCAN(eps=i, min_samples=j, metric='precomputed').fit(XX)
                    labels = db.labels_
                    
                    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                    # if n_clusters_ == len(set(labels_true)):
                    # print n_clusters_, i, j
                    measure_list.append((i, j, metrics.v_measure_score(labels_true, labels)))
                except:
                    # Skip eps/min_samples combinations that fail.
                    # print 'Error at : eps={}, min_samples={}'.format(i, j)
                    # traceback.print_exc()
                    pass
                    
        Utility.sort_by_index(measure_list, 2)

        if len(measure_list) == 0:
            print 'Error: Cannot find best at : {}'.format(path)
            return labels_true, None

        print 'Best : {}'.format(measure_list[len(measure_list)-1])
        v_best = measure_list[len(measure_list)-1][2]

        outfile.append('Best : {}'.format(measure_list[len(measure_list)-1]))

        for m in measure_list:
            if m[2] == v_best:
                print m
                outfile.append('{}'.format(m))

        db = DBSCAN(
            eps=measure_list[len(measure_list)-1][0], 
            min_samples=int(measure_list[len(measure_list)-1][1]),
            metric='precomputed').fit(XX)

        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

        acc = accuracy_score(labels_true, labels)
        swap = np.copy(labels_true)
        stress_index = np.where(swap==1)
        unstress_index = np.where(swap==0)
        swap[stress_index] = 0
        swap[unstress_index] = 1
        acc_swap = accuracy_score(swap, labels) 
        if acc_swap > acc:
            acc = acc_swap

        print 'Accuracy score : {} / swap: {}'.format(acc, acc_swap)

        # for idx, t in enumerate(labels):
        #     print labels[idx], labels_true[idx]

        # print db.core_sample_indices_
        # print labels
        Utility.save_obj(measure_list[len(measure_list)-1], '{}/best_measure_params.npy'.format(path))
        Utility.save_obj(labels, '{}/clustered_label.npy'.format(path))

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        print('Estimated number of clusters: %d' % n_clusters_)
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
        print("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(labels_true, labels))
        print("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(labels_true, labels))

        outfile.append('Estimated number of clusters: %d' % n_clusters_)
        outfile.append("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
        outfile.append("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
        outfile.append("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
        outfile.append("Adjusted Rand Index: %0.3f"
              % metrics.adjusted_rand_score(labels_true, labels))
        outfile.append("Adjusted Mutual Information: %0.3f"
              % metrics.adjusted_mutual_info_score(labels_true, labels))

        Utility.write_to_file_line_by_line('{}/clustering_result.txt'.format(path), outfile)

        # print("Silhouette Coefficient: %0.3f"
        #       % metrics.silhouette_score(X, labels))

        ##############################################################################
        # Plot result
        import matplotlib.pyplot as plt
        plt.clf()
        # Black removed and is used for noise instead.
        unique_labels = set(labels)
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black used for noise.
                col = 'k'

            class_member_mask = (labels == k)

            xy = X[class_member_mask & core_samples_mask]
            plt.plot(xy[:, dominant[0]], xy[:, dominant[1]], 'o', markerfacecolor=col,
                     markeredgecolor='k', markersize=14)

            xy = X[class_member_mask & ~core_samples_mask]
            plt.plot(xy[:, dominant[0]], xy[:, dominant[1]], 'o', markerfacecolor=col,
                     markeredgecolor='k', markersize=6)

        plt.title('Estimated number of clusters: %d' % n_clusters_)
        # plt.show()
        print '{}/stress_unstress_clustering_lengthscale.eps'.format(path)
        plt.savefig('{}/stress_unstress_clustering_lengthscale.eps'.format(path))
        return labels_true, labels
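Example #6 grid-searches DBSCAN's eps and min_samples over a kernel-induced distance matrix and keeps the setting with the best V-measure against the reference labels. A condensed, self-contained sketch of the same idea on synthetic data (hypothetical blobs, unit lengthscale and variance, scikit-learn API only):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Two synthetic blobs with known labels.
rng = np.random.RandomState(0)
X = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 4.0])
labels_true = np.array([0] * 50 + [1] * 50)

# Kernel-induced distance: -log of an RBF similarity with unit variance
# and lengthscale, which reduces to half the squared Euclidean distance.
sq = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
D = 0.5 * sq

best = None
for eps in np.arange(0.5, 5.0, 0.5):
    for min_samples in (3, 5, 10):
        labels = DBSCAN(eps=eps, min_samples=min_samples,
                        metric='precomputed').fit(D).labels_
        v = metrics.v_measure_score(labels_true, labels)
        if best is None or v > best[2]:
            best = (eps, min_samples, v)

print 'Best (eps, min_samples, v-measure):', best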