Beispiel #1
0
def ex1():
    """
    Minimal example.

    Builds one word set per sample text and hands both sets to
    ``venn2_wordcloud`` with default settings.
    """

    first_text = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."

    second_text = "At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."

    def as_word_set(text):
        # Approximate tokenisation: split on single spaces, drop every
        # non-alphanumeric character from each chunk, then lower-case it.
        return {
            ''.join(filter(str.isalnum, chunk)).lower()
            for chunk in text.split(' ')
        }

    sets = [as_word_set(text) for text in (first_text, second_text)]

    # create visualisation
    venn2_wordcloud(sets)

    return
def ex3():
    """
    Answer to
    http://stackoverflow.com/questions/42812083/auto-venn-diagram-text-rendering/42839350#42839350

    Draws a two-set Venn word cloud ("Democrats" vs. "Republicans") in
    which words exclusive to one side are coloured blue or red and shared
    words are coloured off-black. Word sizes come from randomly drawn
    frequencies.
    """

    democrats_only = [
        "sincerely", "women", "service", "newsletter", "program", "families",
        "community", "funding", "important", "million", "department"
    ]
    shared = [
        "country", "make", "support", "state", "people", "jobs", "American",
        "care", "health", "president", "work", "veterans", "tax", "survey",
        "years", "need", "economy"
    ]
    republicans_only = [
        "security", "nation", "Obama", "energy", "law", "spending", "budget",
        "states", "committee", "passed", "job", "business"
    ]

    def color_func(word, *args, **kwargs):
        # Extra positional/keyword arguments are accepted and ignored.
        if word in democrats_only:
            return "#0000ff"  # blue1
        if word in republicans_only:
            return "#ff0000"  # red1
        return "#0f0f0f"  # gray6 (aka off-black)

    all_words = [*democrats_only, *shared, *republicans_only]

    # Word frequencies follow Zipf's law (alpha = 1). The words are assumed
    # to sit among the 10k most frequent ones, with the 1000 most common
    # excluded, hence ranks 1000..9999. One frequency is drawn at random
    # (with replacement) for every word.
    zipf_frequencies = 1. / np.arange(1000, 10000)
    drawn = np.random.choice(zipf_frequencies, size=len(all_words))
    word_to_frequency = {word: freq for word, freq in zip(all_words, drawn)}

    _, ax = plt.subplots(1, 1)
    ax.set_title("Congress says what?", fontsize=36)

    democrat_set = {*democrats_only, *shared}
    republican_set = {*republicans_only, *shared}
    venn2_wordcloud(
        [democrat_set, republican_set],
        set_labels=["Democrats", "Republicans"],
        set_edgecolors=['b', 'r'],
        word_to_frequency=word_to_frequency,
        wordcloud_kwargs={'color_func': color_func, 'relative_scaling': .5},
        ax=ax,
    )

    return
Beispiel #3
0
def get_classification_results(attrs):
    """
    Run every configured classifier and collect one result row per algorithm.

    Reads ``attrs['algorithms']`` (iterated) and ``attrs['labelsType']``
    (compared against ``'binary'``); ``attrs`` is also forwarded to the
    project transformers and to ``ClassifierSelector``.

    Side effects: ``make_plots`` is called once per algorithm, and a Venn
    word-cloud figure is saved under ``static/img/venn/`` with the
    timestamp in its file name.

    Returns a tuple ``(results, timestamp)`` where ``results`` is the list
    of per-algorithm result dicts and ``timestamp`` is the ``time.time()``
    value taken at the start of the call.
    """
    timestamp = time.time()

    df = get_data_from_files()
    preprocessor = PreprocessTransformer(attrs)
    df['processed'] = preprocessor.transform(df['text'])

    feature_pipeline = make_pipeline(
        VectorTransformer(attrs),
        DecompositionTransformer(attrs),
    )
    features = feature_pipeline.transform(df['processed'])

    labels = df['label']
    # Fixed seed and stratification keep the 70/30 split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=42,
        stratify=labels,
    )

    results = []
    for algorithm in attrs['algorithms']:
        selector = ClassifierSelector(attrs, algorithm, X_test)
        model, y_pred, y_prob = selector.transform(X_train, y_train)

        row = {
            'Name': classifier_names[algorithm],
            'Acc. (Train)': '{:.4f}'.format(model.score(X_train, y_train)),
            'Acc. (Test)': '{:.4f}'.format(model.score(X_test, y_test)),
        }

        # Extend the row with binary- or multiclass-specific entries.
        if attrs['labelsType'] == 'binary':
            row = update_binary(row, y_test, y_pred, model.classes_[1])
        else:
            row = update_multiclass(row, y_test, y_pred)
        results.append(row)

        make_plots(y_test, y_pred, y_prob, algorithm, timestamp)

    # Venn word diagram: for each label, the 80 most common
    # whitespace-separated tokens across that label's processed documents.
    # NOTE(review): one set is built per unique label but venn2_wordcloud
    # is a two-set diagram -- confirm behaviour when there are more than
    # two labels.
    word_sets = []
    for label in df['label'].unique():
        label_docs = df[df['label'] == label]['processed'].values
        token_counts = Counter(' '.join(label_docs).split())
        word_sets.append({token for token, _ in token_counts.most_common(80)})

    venn2_wordcloud(word_sets)
    plt.savefig('static/img/venn/venn_words-{}.png'.format(timestamp), dpi=200)
    plt.close('all')

    return results, timestamp
Beispiel #4
0
def draw_word_venn(datasets, dataset_name1, dataset_name2):
    """
    Show a two-set Venn word cloud comparing the names of two datasets.

    Parameters:
        datasets: mapping indexed by dataset name; each value is passed to
            ``get_names_from_files``.
        dataset_name1: key of the first dataset; also used as its set label.
        dataset_name2: key of the second dataset; also used as its set label.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(1, 1, figsize=(30, 30))
    set1 = set(get_names_from_files(datasets[dataset_name1]))
    set2 = set(get_names_from_files(datasets[dataset_name2]))
    v = venn2_wordcloud([set1, set2],
                        (dataset_name1, dataset_name2),
                        ax=ax,
                        set_colors=['blue', 'yellow'])

    def _resize(labels, size):
        # The label lists may be missing or may contain None for regions
        # that were not drawn (e.g. an empty intersection when the two
        # sets are disjoint); skip those instead of raising AttributeError.
        for text in labels or ():
            if text is not None:
                text.set_fontsize(size)

    _resize(v.set_labels, 30)
    _resize(v.subset_labels, 18)
    plt.show()
Beispiel #5
0
def ex6():
    """
    Issue #5:
    https://github.com/paulbrodersen/matplotlib_venn_wordcloud/issues/5

    Allow user to specify max_font_size/min_font_size.

    Draws the same two word sets four times on a 2x2 grid, each time with
    a different combination of font-size limits in ``wordcloud_kwargs``.
    """

    first_text = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua."

    second_text = "At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet."

    def as_word_set(text):
        # Approximate tokenisation: split on single spaces, drop every
        # non-alphanumeric character from each chunk, then lower-case it.
        return {
            ''.join(filter(str.isalnum, chunk)).lower()
            for chunk in text.split(' ')
        }

    sets = [as_word_set(text) for text in (first_text, second_text)]

    # create visualisation
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))

    # One (wordcloud_kwargs, title) pair per panel, in row-major order.
    panels = [
        # Negative control: these parameter values should have no effect,
        # as the given min and max font sizes lie far outside the range the
        # word cloud would choose anyway.
        ({'max_font_size': 1000, 'min_font_size': 0},
         'max_font_size and min_font_size outside range'),
        # Positive controls
        ({'max_font_size': 50},
         'max_font_size=50'),
        ({'min_font_size': 30},
         'min_font_size=30'),
        ({'max_font_size': 50, 'min_font_size': 30},
         'max_font_size=50, min_font_size=30'),
    ]

    for ax, (font_kwargs, title) in zip(axes.ravel(), panels):
        venn2_wordcloud(sets, wordcloud_kwargs=font_kwargs, ax=ax)
        ax.set_title(title)
Beispiel #6
0
def ex4():
    """
    Issue #2:
    https://github.com/paulbrodersen/matplotlib_venn_wordcloud/issues/2

    Calls ``venn2_wordcloud`` on two sets whose entries include
    multi-word strings containing a space.
    """

    from matplotlib import pyplot as plt
    from matplotlib_venn_wordcloud import venn2_wordcloud

    left_words = {'sincerely', 'department', 'usa', 'usa nation'}
    right_words = {'sincerely', 'security', 'usa democracy'}

    # The sets share only 'sincerely'; they are passed as a tuple.
    diagram = venn2_wordcloud((left_words, right_words))
Beispiel #7
0
def ex5():
    """
    Issue #4:
    https://github.com/paulbrodersen/matplotlib_venn_wordcloud/issues/4

    Handle non-overlapping sets gracefully.

    Draws a two-set and then a three-set Venn word cloud from pairwise
    disjoint sets of single letters.
    """
    from matplotlib import pyplot as plt
    # Import both entry points locally: the body calls venn3_wordcloud as
    # well, so importing only venn2_wordcloud would leave this example
    # dependent on the enclosing module providing that name.
    from matplotlib_venn_wordcloud import venn2_wordcloud, venn3_wordcloud

    # set('abcd') yields the single-character set {'a', 'b', 'c', 'd'}.
    x = set('abcd')
    y = set('efgh')
    s2 = (x, y)

    v2 = venn2_wordcloud(s2)

    z = set('ijkl')
    s3 = (x, y, z)
    v3 = venn3_wordcloud(s3)