Example #1
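All of the examples on this page come from the same visualization module, so they share one preamble. A sketch of the imports they assume is below; the project-level globals and helpers (SENTENCES, LABELS, CATEGORIES, CATEGORIES_COUNT, DATA_SIZE, PROCESSED_DATA_PATH, save_current_plot, get_unique_words, mix_colors, get_best_from_grid_search_results, build_features) are defined elsewhere in the project and are not reconstructed here.

# Assumed imports for the examples below; project-specific globals and
# helpers are provided by the surrounding project.
import itertools
import multiprocessing
import sys

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from matplotlib import cm
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
from scipy.interpolate import griddata
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold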
def visualize_sentence_embeddings(word_emb, sentence_embeddings):
    # take only 100 examples for each category for visualization
    examples_from_category = 100
    folds_count = int(DATA_SIZE / (examples_from_category * CATEGORIES_COUNT))

    folds_count = max(folds_count, 3)
    skf = StratifiedKFold(n_splits=folds_count)
    _, example_data_indices = next(skf.split(SENTENCES, LABELS))

    example_labels = LABELS[example_data_indices]
    example_sentences = SENTENCES[example_data_indices]

    word_emb.build()

    fig = plt.figure(figsize=(20, 10))
    fig.suptitle("Example of several sentence embeddings in action")
    gs = gridspec.GridSpec(1, len(sentence_embeddings))
    colors = ['r', 'g', 'b', 'yellow', 'magenta', 'cyan', 'gray', 'white']
    legend_handles = []

    for i, sen_emb_class in enumerate(sentence_embeddings):
        sen_emb = sen_emb_class(3) # PCA to 3 dimensions
        print ("Building sentence embedding: " + type(sen_emb).__name__ + "...")
        sen_emb.build(word_emb, SENTENCES)

        example_sentences_vectors = [sen_emb[s] for s in example_sentences]

        ax = plt.subplot(gs[i], projection='3d')
        ax.set_title(type(sen_emb).__name__)

        colors_gen = itertools.cycle(colors)

        # plot dots representing sentences
        xs, ys, zs = [], [], []

        for j in range(CATEGORIES_COUNT):
            xs.append([])
            ys.append([])
            zs.append([])
            category_vectors = [(k, s) for k, s in enumerate(example_sentences_vectors)
                                if example_labels[k] == j]
            for k, sentence_vector in category_vectors:
                xs[j].append(sentence_vector[0])
                ys[j].append(sentence_vector[1])
                zs[j].append(sentence_vector[2])

            color = next(colors_gen)
            ax.scatter(xs[j], ys[j], zs[j], c=color, s=60, picker=True)

            if i == len(sentence_embeddings) - 1:
                legend_handles.append(mpatches.Patch(color=color, label=CATEGORIES[j]))

        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')

    plt.legend(handles=legend_handles)
    plt.tight_layout()
    save_current_plot('sentence_embeddings.svg')
    plt.show()
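The StratifiedKFold above is not used for cross-validation at all: the test indices of the first split form a class-balanced subsample of roughly 1/folds_count of the data. A minimal self-contained illustration of the trick:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
X = np.zeros((len(y), 1))  # features are irrelevant for the split itself

# 4 splits -> each test fold holds 1/4 of the data, stratified by label
skf = StratifiedKFold(n_splits=4)
_, sample_indices = next(skf.split(X, y))
print(y[sample_indices])  # one sample per class, e.g. [0 1 2]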
Example #2
def plot_pca_accuracies(classifier_class, pca_lengths, pca_accuracies,
                        training_times, testing_times):
    accuracy_legend = mpatches.Patch(color='b', label="Accuracy")
    training_time_legend = mpatches.Patch(color='r',
                                          label="Relative model training time")
    testing_time_legend = mpatches.Patch(
        color='g', label="Relative predicting time using model")
    no_pca_legend = mpatches.Patch(color='black', label="* = no PCA")

    plt.rcParams["figure.figsize"] = [11, 8]
    plt.legend(handles=[
        accuracy_legend, training_time_legend, testing_time_legend,
        no_pca_legend
    ])
    lines = plt.plot(pca_lengths[:-1], pca_accuracies[:-1], 'b',
                     pca_lengths[:-1], pca_accuracies[:-1], 'bo',
                     pca_lengths[:-1], training_times[:-1], 'r',
                     pca_lengths[:-1], training_times[:-1], 'ro',
                     pca_lengths[:-1], testing_times[:-1], 'g',
                     pca_lengths[:-1], testing_times[:-1], 'go')
    plt.setp(lines, linewidth=2, markersize=8)
    plt.scatter(pca_lengths[-1:],
                pca_accuracies[-1:],
                c='b',
                s=150,
                marker='*',
                edgecolors='black')
    plt.scatter(pca_lengths[-1:],
                training_times[-1:],
                c='r',
                s=150,
                marker='*',
                edgecolors='black')
    plt.scatter(pca_lengths[-1:],
                testing_times[-1:],
                c='g',
                s=150,
                marker='*',
                edgecolors='black')

    plt.title(
        'How does PCA dimension reduction affect model accuracy (using {0})?'
        .format(classifier_class.__name__))
    plt.xlabel('PCA dimensions')
    plt.ylabel('Accuracy vs execution time')
    save_current_plot('pca_{0}.svg'.format(classifier_class.__name__))
    plt.show()
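plot_pca_accuracies expects accuracies together with training and prediction times already rescaled onto the same percent axis; the last list element is the no-PCA baseline drawn as a star. One plausible way to produce those inputs (measure_pca_tradeoff is a hypothetical helper, not part of the project):

import time

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

def measure_pca_tradeoff(features, labels, pca_lengths):
    # hypothetical sketch: time training/prediction for each PCA size
    X_tr, X_te, y_tr, y_te = train_test_split(features, labels, random_state=0)
    accuracies, train_times, test_times = [], [], []
    for n in pca_lengths:
        pca = PCA(n_components=n).fit(X_tr)
        clf = SVC()
        t0 = time.time()
        clf.fit(pca.transform(X_tr), y_tr)
        train_times.append(time.time() - t0)
        t0 = time.time()
        accuracies.append(100.0 * clf.score(pca.transform(X_te), y_te))
        test_times.append(time.time() - t0)
    # rescale times so they fit on the same percent axis as accuracy
    train_times = [100.0 * t / max(train_times) for t in train_times]
    test_times = [100.0 * t / max(test_times) for t in test_times]
    return accuracies, train_times, test_times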
Example #3
def compare_models_bar_chart(best_results_for_models):
    # best_results_for_models = [(cls_class, result_tuple)]; the last four
    # entries of result_tuple are: max CV result, evaluation on the test set,
    # evaluation on the training set, and average CV result

    # sort results by evaluation on the test set
    best_results_for_models.sort(key=lambda item: item[1][-3])

    fig, ax = plt.subplots()
    N, width = len(best_results_for_models), 0.15
    ind = np.arange(N)

    classifier_classes = [clf.__name__ for clf, _ in best_results_for_models]
    average_cv_results = [params[-1] for _, params in best_results_for_models]
    train_evaluations = [params[-2] for _, params in best_results_for_models]
    test_evaluations = [params[-3] for _, params in best_results_for_models]
    max_cv_results = [params[-4] for _, params in best_results_for_models]

    rects1 = ax.bar(ind, test_evaluations, width, color='b')
    rects2 = ax.bar(ind + width, train_evaluations, width, color='g')
    rects3 = ax.bar(ind + 2 * width, max_cv_results, width, color='orange')
    rects4 = ax.bar(ind + 3 * width, average_cv_results, width, color='r')

    plt.xticks(ind + (1.5 * width), classifier_classes)

    eval_leg = mpatches.Patch(color='b', label="Model Evaluation on test set")
    t_eval_leg = mpatches.Patch(color='g',
                                label="Model Evaluation on training set")
    max_cv_leg = mpatches.Patch(color='orange', label="Max CV result")
    avg_cv_leg = mpatches.Patch(color='r', label="Average CV result")

    plt.legend(handles=[eval_leg, t_eval_leg, avg_cv_leg, max_cv_leg])

    plt.title('Comparison of performance of tested models')
    plt.ylabel('Cross-validation results')

    # Attach a text label above each bar displaying its value
    for rects in [rects1, rects2, rects3, rects4]:
        for rect in rects:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2.,
                    height,
                    "{:4.2f}%".format(height),
                    ha='center',
                    va='bottom')

    save_current_plot('models_comparison.svg')
    plt.show()
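The function only reads the last four positions of each result tuple, so a toy call looks like this (classes and numbers are invented purely for illustration):

class DummySVM(object): pass
class DummyForest(object): pass

# tuple tail: (..., max CV, test evaluation, train evaluation, average CV)
best_results_for_models = [
    (DummySVM,    (None, None, 78.2, 71.5, 93.0, 74.9)),
    (DummyForest, (None, None, 74.0, 69.1, 99.5, 70.3)),
]
compare_models_bar_chart(best_results_for_models)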
Example #4
def compare_sentence_embeddings_bar_chart(results_for_classifiers):

    fig, ax = plt.subplots()
    N, width = len(results_for_classifiers), 0.35
    ind = np.arange(N)

    max_colors = ['#641E16', '#154360', "#7D6608", "#7B7D7D"]
    avg_colors = ['#C0392B', '#2980B9', "#F1C40F", "#ECF0F1"]

    classifier_classes = [clf.__name__ for clf, _ in results_for_classifiers]
    sen_embeddings = [
        sen_emb for sen_emb, max_r, avg_r in results_for_classifiers[0][1]
    ]

    legend_handles = []

    for i, sen_embedding in enumerate(sen_embeddings):
        max_performances = [
            params[i][1] for clf, params in results_for_classifiers
        ]
        average_performances = [
            params[i][2] for clf, params in results_for_classifiers
        ]
        rects1 = ax.bar(ind + i * width,
                        max_performances,
                        width,
                        color=max_colors[i])
        rects2 = ax.bar(ind + i * width,
                        average_performances,
                        width,
                        color=avg_colors[i])

        # Attach a text label above each bar displaying its value
        for shift, rects in [(0, rects1), (-2, rects2)]:
            for rect in rects:
                height = rect.get_height()
                ax.text(rect.get_x() + rect.get_width() / 2.,
                        height + shift,
                        "{:4.2f}%".format(height),
                        ha='center',
                        va='bottom')

        # Add description to legend
        legend_handles.append(
            mpatches.Patch(color=max_colors[i],
                           label=sen_embedding + " best CV result"))
        legend_handles.append(
            mpatches.Patch(color=avg_colors[i],
                           label=sen_embedding + " average CV result"))

    plt.xticks(ind + width * (len(sen_embeddings) - 1) / 2.0,
               classifier_classes)

    plt.legend(handles=legend_handles)

    plt.title('Comparison of performance of Sentence Embeddings')
    plt.ylabel('Cross-validation results')

    save_current_plot('sentence_embeddings_comparison.svg')
    plt.show()
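Each classifier entry carries one (embedding name, max CV, average CV) triple per sentence embedding; the average bar is deliberately drawn over the max bar at the same x offset, so the max result shows as a darker cap above it. A toy call (all names and numbers invented):

class DummySVM(object): pass
class DummyForest(object): pass

results_for_classifiers = [
    (DummySVM,    [("ConcatenationEmbedding", 74.0, 70.1),
                   ("SumEmbedding",           72.5, 68.9)]),
    (DummyForest, [("ConcatenationEmbedding", 71.8, 68.4),
                   ("SumEmbedding",           70.3, 66.0)]),
]
compare_sentence_embeddings_bar_chart(results_for_classifiers)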
Example #5
def visualize_word_embedding(word_emb):
    print("Training PCA on words from dataset...")

    dataset_words = get_unique_words(PROCESSED_DATA_PATH)
    dataset_words_with_emb = [
        word_emb[w] for w in dataset_words if word_emb[w] is not None
    ]
    print("{:d}/{:d} ({:4.2f}% words from dataset exist in embedding".format(
        len(dataset_words_with_emb), len(dataset_words),
        len(dataset_words_with_emb) / float(len(dataset_words)) * 100))

    pca = PCA(n_components=3)
    pca.fit(dataset_words_with_emb)

    # take all the words from dataset and count their occurrences in categories
    # take only words which occur at least in 3 different tweets and are longer than 2 letters
    uniq_sens = (set(sen)
                 for sen in SENTENCES)  # remove duplicate words from tweets
    words_with_tweets_counts = {}
    for i, sen in enumerate(uniq_sens):
        sen_category = LABELS[i]
        for word in filter(lambda x: len(x) > 2, sen):
            if word not in words_with_tweets_counts:
                words_with_tweets_counts[word] = [0] * CATEGORIES_COUNT
            words_with_tweets_counts[word][sen_category] += 1

    words_with_tweets_counts = [
        (word, counters) for word, counters in words_with_tweets_counts.items()
        if sum(counters) >= 3 and word_emb[word] is not None
    ]
    trimmed_words = []
    words_from_category = 75  # leave only 75 words from each "category"
    categories_counters = [0] * CATEGORIES_COUNT
    for word, counters in words_with_tweets_counts:
        word_category = counters.index(max(counters))
        if categories_counters[word_category] >= words_from_category:
            continue
        categories_counters[word_category] += 1
        trimmed_words.append((word, counters))

    words = [word for word, _ in trimmed_words]

    colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (1, 0, 1), (0, 1, 1),
              (0.3, 0.3, 0.3), (1, 1, 1)]  # colors in RGB [0,1]
    words_colors = [mix_colors(counters, colors) for _, counters in trimmed_words]

    fig = plt.figure(figsize=(10, 10))
    fig.suptitle("Example of word embedding reduced by PCA to 3 dimensions")
    legend_handles = []

    ax = plt.subplot(projection='3d')

    # plot dots representing words
    xs, ys, zs = [], [], []
    for word in words:
        word_vector = pca.transform([word_emb[word]])[0]
        xs.append(word_vector[0])
        ys.append(word_vector[1])
        zs.append(word_vector[2])

    ax.scatter(xs, ys, zs, c=words_colors, s=60, picker=True)

    for i in range(CATEGORIES_COUNT):
        legend_handles.append(
            mpatches.Patch(color=colors[i], label=CATEGORIES[i]))

    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')

    def on_pick(event):
        try:
            if event.mouseevent.button != 1:
                return  # react only to left-clicks
            print(', '.join(words[ind] for ind in event.ind))
        except Exception as e:
            print("Exception on pick event: " + str(e))

    fig.canvas.mpl_connect('pick_event', on_pick)

    plt.legend(handles=legend_handles)
    plt.tight_layout()
    save_current_plot('word_embedding.svg')
    plt.show()
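mix_colors above is a project helper that turns a word's per-category tweet counters into a single RGB color. A plausible implementation, assuming it simply averages the category base colors weighted by the counters:

def mix_colors(counters, colors):
    # hypothetical sketch: blend the category base colors, weighting each
    # one by how often the word appears in tweets of that category
    total = float(sum(counters))
    return tuple(
        sum(count * color[channel] for count, color in zip(counters, colors)) / total
        for channel in range(3)
    )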
def visualize_2d(word_emb, classifier_classes):
    trained_classifiers = []
    classifiers_features = []
    subplots = []

    fig = plt.figure(figsize=(8 * len(classifier_classes), 8))
    fig.suptitle("Example of several sentence embeddings in action")
    gs = gridspec.GridSpec(1, len(classifier_classes))

    colors = ['r', 'y', 'b', 'g', 'cyan', 'magenta', 'gray', 'white', 'black']

    legend_handles = []
    colors_gen = itertools.cycle(colors)
    color_map = ListedColormap(list(itertools.islice(colors_gen, CATEGORIES_COUNT)), name='classifiers_color_map')

    colors_gen = itertools.cycle(colors)
    for category in CATEGORIES:
        color = next(colors_gen)
        legend_handles.append(mpatches.Patch(color=color, label=category))


    for classifier_index, Classifier in enumerate(classifier_classes):
        best_parameters = get_best_from_grid_search_results(Classifier)
        if best_parameters is None:
            sys.exit(-1)
        _, _, sen_emb_class, params = best_parameters

        print ("\nEvaluating model for sentence embedding {:s} with params {:s}..."
               .format(sen_emb_class.__name__, str(params)))
        params["n_jobs"] = multiprocessing.cpu_count()

        # for the sake of visualization we will use 2 dimensional sentence vectors
        sen_emb = sen_emb_class(2)

        sen_emb.build(word_emb, SENTENCES)

        fb = build_features.FeatureBuilder()
        fb.build(sen_emb, LABELS, SENTENCES)
        classifiers_features.append(fb.features)

        params['probability'] = True
        classifier = Classifier(sen_emb, **params)
        classifier.fit(fb.features, fb.labels)
        trained_classifiers.append(classifier)

        print ("Rendering plot...")

        xs, ys = [], []
        for i in range(CATEGORIES_COUNT):
            category_vectors = [(k, s) for k, s in enumerate(fb.features) if fb.labels[k] == i]
            xs.append([vec[0] for _, vec in category_vectors])
            ys.append([vec[1] for _, vec in category_vectors])

        x_min, x_max = min(min(x) for x in xs), max(max(x) for x in xs)
        y_min, y_max = min(min(y) for y in ys), max(max(y) for y in ys)
        MESHGRID_SIZE = 300
        xx, yy = np.meshgrid(np.linspace(x_min - 0.1, x_max + 0.1, MESHGRID_SIZE),
                             np.linspace(y_min - 0.1, y_max + 0.1, MESHGRID_SIZE))

        ax = fig.add_subplot(gs[classifier_index])
        ax.text(.5, .92, Classifier.__name__, horizontalalignment='center', transform=ax.transAxes)
        subplots.append(ax)
        classifier.visualize_2d(xx, yy, ax, color_map)

        colors_gen = itertools.cycle(colors)

        for i, category in enumerate(CATEGORIES):
            ax.scatter(xs[i], ys[i], color=next(colors_gen), picker=True, edgecolors='black', s=30)

    def on_click(event):
        x, y = event.xdata, event.ydata
        print("Point {:5.4f}, {:5.4f}:".format(x, y))
        for classifier in trained_classifiers:
            proba = classifier.clf.predict_proba([[x, y]])[0]
            print("{0} prediction: {1}".format(
                type(classifier).__name__,
                ", ".join(CATEGORIES[i] + ": {:2.0f}%".format(100 * p)
                          for i, p in enumerate(proba))))
        print("\n")


    fig.canvas.callbacks.connect('button_press_event', on_click)

    plt.suptitle("Visualization of chosen classification algorithms with number of dimensions reduced to 2")
    plt.tight_layout()
    plt.legend(handles=legend_handles)
    save_current_plot('2d_visualization.svg')
    print ""
    plt.show()
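classifier.visualize_2d above is a method of the project's classifier wrapper and is not shown on this page. A minimal sketch of what such a method typically does, assuming the underlying scikit-learn estimator is stored as self.clf:

import numpy as np

def visualize_2d(self, xx, yy, ax, color_map):
    # hypothetical sketch: predict a label for every meshgrid point and
    # draw the resulting decision regions as a filled contour plot
    grid = np.c_[xx.ravel(), yy.ravel()]
    zz = self.clf.predict(grid).reshape(xx.shape)
    ax.contourf(xx, yy, zz, cmap=color_map, alpha=0.3)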
def analyze_single_parameter(parameter, classifier_class, all_parameters_list):
    # compute the average, max and min performance for each value of the parameter

    average_performances = {}
    min_performances = {}
    max_performances = {}

    for parameters, result in all_parameters_list:
        tested_param_value = parameters[parameter]

        if tested_param_value in average_performances:
            average_performances[tested_param_value] += result
            if result < min_performances[tested_param_value]:
                min_performances[tested_param_value] = result
            if result > max_performances[tested_param_value]:
                max_performances[tested_param_value] = result
        else:
            average_performances[tested_param_value] = result
            min_performances[tested_param_value] = result
            max_performances[tested_param_value] = result

    param_values = sorted(average_performances.keys())
    param_values_count = len(param_values)
    tests_count_per_param_value = len(all_parameters_list) / param_values_count

    for param_value in param_values:
        average_performances[param_value] /= tests_count_per_param_value

    # convert dictionaries to lists sorted by tested param values
    average_performances = [average_performances[key] for key in param_values]
    min_performances = [min_performances[key] for key in param_values]
    max_performances = [max_performances[key] for key in param_values]

    fig, ax = plt.subplots()
    use_log_scale = False

    # if the parameter is numerical, plot lines and ask whether to use a logarithmic scale
    if all(isinstance(x, (int, float)) for x in param_values):
        use_log_answer = input("Use logarithmic scale? [y/n] ").lower()
        use_log_scale = use_log_answer in ('y', 'yes')
        if use_log_scale:
            ax.set_xscale('log')
        lines = ax.plot(param_values, average_performances, 'orange',
                        param_values, min_performances, 'r', param_values,
                        max_performances, 'g')

        ax.scatter(param_values,
                   average_performances,
                   c='orange',
                   s=150,
                   marker='*',
                   edgecolors='black')
        ax.scatter(param_values,
                   min_performances,
                   c='red',
                   s=150,
                   marker='*',
                   edgecolors='black')
        ax.scatter(param_values,
                   max_performances,
                   c='green',
                   s=150,
                   marker='*',
                   edgecolors='black')

        plt.setp(lines, linewidth=2, markersize=8)

    # if parameter is non-numerical, plot a bar chart
    else:
        N, width = param_values_count, 0.15
        ind = np.arange(N)
        ax.bar(ind, average_performances, width, color='orange', label='Avg')
        ax.bar(ind + width, min_performances, width, color='r', label='Min')
        ax.bar(ind + 2 * width,
               max_performances,
               width,
               color='g',
               label='Max')
        plt.xticks(ind + width, param_values)

    avg_legend = mpatches.Patch(color='orange', label="Average performance")
    min_legend = mpatches.Patch(color='r', label="Minimum performance")
    max_legend = mpatches.Patch(color='g', label="Maximum performance")

    plt.legend(handles=[avg_legend, min_legend, max_legend])

    plt.title('{0} performance for different values of {1}'.format(
        classifier_class.__name__, parameter))

    if use_log_scale:
        plt.xlabel('Values of {0} (logarithmic scale)'.format(parameter))
    else:
        plt.xlabel('Values of {0}'.format(parameter))

    plt.ylabel('Cross-validation results')

    save_current_plot('parameters_{0}_{1}.svg'.format(
        classifier_class.__name__, parameter))
    plt.show()
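analyze_single_parameter consumes a flat list of (parameter dict, CV score) pairs, typically collected from a grid search. A toy call with invented values:

class DummySVM(object): pass

all_parameters_list = [
    ({"C": 0.1,  "kernel": "rbf"}, 61.0),
    ({"C": 1.0,  "kernel": "rbf"}, 72.5),
    ({"C": 10.0, "kernel": "rbf"}, 70.2),
]
analyze_single_parameter("C", DummySVM, all_parameters_list)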
def analyze_two_parameters(parameter1, parameter2, classifier_class,
                           all_parameters_list):
    # compute the max performance for each combination of parameter1 and parameter2 values

    max_performances = {}

    for parameters, result in all_parameters_list:
        tested_param1_value = parameters[parameter1]
        tested_param2_value = parameters[parameter2]

        tested_tuple = (tested_param1_value, tested_param2_value)

        if tested_tuple in max_performances:
            if result > max_performances[tested_tuple]:
                max_performances[tested_tuple] = result
        else:
            max_performances[tested_tuple] = result

    param1_values = sorted(set(p1 for p1, p2 in max_performances.keys()))
    param2_values = sorted(set(p2 for p1, p2 in max_performances.keys()))

    # the plot makes sense only if both parameters are numerical
    if not all(isinstance(x, (int, float)) for x in param1_values) or \
       not all(isinstance(x, (int, float)) for x in param2_values):
        print("Tested parameters must be numerical. Non-numerical parameters "
              "can only be analyzed individually.")
        return

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    points = np.zeros((len(param1_values) * len(param2_values), 2))
    values = np.zeros((len(param1_values) * len(param2_values)))

    param1_points = param1_values[:]
    param2_points = param2_values[:]

    use_log_answer = input(
        "Use logarithmic scale for {0}? [y/n] ".format(parameter1)).lower()
    use_log_scale1 = use_log_answer in ('y', 'yes')
    if use_log_scale1:
        param1_points = np.log2(param1_values)

    use_log_answer = input(
        "Use logarithmic scale for {0}? [y/n] ".format(parameter2)).lower()
    use_log_scale2 = use_log_answer in ('y', 'yes')
    if use_log_scale2:
        param2_points = np.log2(param2_values)

    # interpolate for better visual effect
    point_i = 0
    for i, param1_val in enumerate(param1_values):
        for j, param2_val in enumerate(param2_values):
            points[point_i] = [param1_points[i], param2_points[j]]
            values[point_i] = max_performances[(param1_val, param2_val)]
            point_i += 1

    grid_size = 20
    grid_x, grid_y = np.meshgrid(
        np.linspace(param1_points[0], param1_points[-1], num=grid_size),
        np.linspace(param2_points[0], param2_points[-1], num=grid_size))
    grid_z = griddata(points, values, (grid_x, grid_y), method='linear')

    # cap interpolated values at 100%
    grid_z = np.minimum(grid_z, 100)

    ax.plot_surface(grid_x,
                    grid_y,
                    grid_z,
                    cmap=cm.coolwarm,
                    linewidth=0,
                    alpha=0.8)

    # scatter real points
    xs_and_ys = list(itertools.product(param1_points, param2_points))
    xs = [x for x, y in xs_and_ys]
    ys = [y for x, y in xs_and_ys]
    zs = [
        max_performances[(x, y)]
        for (x, y) in itertools.product(param1_values, param2_values)
    ]

    ax.scatter(xs, ys, zs, s=5)

    plt.title('{0} performance for different values of {1} and {2}'.format(
        classifier_class.__name__, parameter1, parameter2))

    if use_log_scale1:
        ax.set_xlabel('Values of {0} (logarithmic scale: 2^x)'.format(parameter1))
    else:
        ax.set_xlabel('Values of {0}'.format(parameter1))

    if use_log_scale2:
        ax.set_ylabel('Values of {0} (logarithmic scale: 2^x)'.format(parameter2))
    else:
        ax.set_ylabel('Values of {0}'.format(parameter2))

    ax.set_zlabel('Cross-validation results')

    save_current_plot('parameters_{0}_{1}_and_{2}.svg'.format(
        classifier_class.__name__, parameter1, parameter2))
    plt.show()
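The two-parameter variant needs a full grid of tested combinations, because it looks up max_performances for every (param1, param2) pair. A toy call with an artificial score surface (all values invented):

import itertools

class DummySVM(object): pass

all_parameters_list = [
    ({"C": c, "gamma": g}, 50.0 + c / 10.0 + 10.0 * g)
    for c, g in itertools.product([1, 10, 100], [0.01, 0.1, 1.0])
]
analyze_two_parameters("C", "gamma", DummySVM, all_parameters_list)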