import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS


def make_word_cloud(color):
    data_arr = get_data_arr()
    df = pd.DataFrame(data_arr, columns=['id', 'word', 'length'])
    # Join the unique words into a single text blob for the word cloud.
    text_raw = " ".join(df['word'].drop_duplicates())
    wordcloud = WordCloud(background_color=color,
                          stopwords=STOPWORDS).generate(text_raw)
    wordcloud.to_file('words_cloud.png')
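
# All of these snippets rely on a helper get_data_arr(), defined elsewhere,
# that returns rows of (id, word, length). A minimal stand-in for running the
# snippets on their own (hypothetical sample data) could look like:
#
# def get_data_arr():
#     words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']
#     return [(i, w, len(w)) for i, w in enumerate(words, start=1)]
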
def get_stat_with_frequency():
    data_arr = get_data_arr()
    df = pd.DataFrame(data_arr, columns=['id', 'word', 'length'])
    # Count occurrences of each word; name the column explicitly so it is
    # 'word' regardless of the pandas version's default naming.
    frequency_df = df.word.value_counts().to_frame(name='word')

    # Center the counts around the mean so the bars diverge left (below
    # average, red) and right (above average, green) of zero.
    counts = frequency_df['word']
    mean_count = counts.mean()
    frequency_df['word'] = (counts - mean_count).round(2)
    frequency_df['colors'] = ['red' if x < 0 else 'green' for x in frequency_df['word']]

    plt.figure(figsize=(200, 200), dpi=80)
    plt.hlines(y=frequency_df.index, xmin=0, xmax=frequency_df.word)
    for x, y, tex in zip(frequency_df.word, frequency_df.index, frequency_df.word):
        plt.text(x, y, tex, horizontalalignment='right' if x < 0 else 'left',
                 verticalalignment='center', fontdict={'color': 'red' if x < 0 else 'green', 'size': 12})

    plt.yticks(frequency_df.index, frequency_df.index, fontsize=12)
    plt.title('Diverging Text Bars of Word Frequency', fontdict={'size': 20})
    plt.grid(linestyle='--', alpha=0.5)
    # plt.xlim(-2.5, 2.5)
    plt.savefig('words_freq_bar.png')
def get_top_using_words():
    data_arr = get_data_arr()
    df = pd.DataFrame(data_arr, columns=['id', 'word', 'length'])
    # print(len(df))
    frequency_df = df.word.value_counts().to_frame(name='word')
    # Split the words into those whose frequency lies strictly within +/-3 of
    # the mean frequency and the outliers ("bad words"); compute the mean once
    # instead of re-evaluating it inside the filter.
    mean_count = frequency_df['word'].mean()
    near_mean = frequency_df['word'].between(mean_count - 3, mean_count + 3, inclusive='neither')
    frequency_df_without_bad_words = frequency_df[near_mean]
    frequency_df_with_bad_words = frequency_df[~near_mean]
    return frequency_df_without_bad_words['word'].to_dict(), frequency_df_with_bad_words['word'].to_dict()
def draw_stat_length():
    data_arr = get_data_arr()
    figure, axes = plt.subplots(3, figsize=(50, 50))
    ax1, ax2, ax3 = axes

    ax1.set_xlabel('Length', fontsize=20)
    ax1.set_ylabel('Number of words', fontsize=20)

    # ax1.xaxis.set_major_locator(ticker.MultipleLocator(1))
    # ax1.yaxis.set_major_locator(ticker.MultipleLocator(10))

    ax2.set_xlabel('Number of words', fontsize=20)
    ax2.set_ylabel('Length', fontsize=20)
    # ax2.xaxis.set_major_locator(ticker.MultipleLocator(10))

    df = pd.DataFrame(data_arr, columns=['id', 'word', 'length'])

    df.length.value_counts().sort_index(ascending=False).plot(ax=ax1,
                                                              grid=True,
                                                              fontsize=20)

    # Save just this subplot: take the axes' bounding box in figure (inch)
    # coordinates and expand it slightly so tick and axis labels are included.
    extent = ax1.get_window_extent().transformed(
        figure.dpi_scale_trans.inverted())
    ax1.figure.savefig('saved_length_statistic1.png',
                       bbox_inches=extent.expanded(1.1, 1.2))

    df.length.value_counts().sort_index(ascending=False).plot(ax=ax2,
                                                              kind='barh',
                                                              grid=True,
                                                              fontsize=20)
    extent = ax2.get_window_extent().transformed(
        figure.dpi_scale_trans.inverted())
    ax2.figure.savefig('saved_length_statistic2.png',
                       bbox_inches=extent.expanded(1.1, 1.2))

    df.length.value_counts().sort_index(ascending=False).plot(
        ax=ax3,
        kind='pie',
        autopct='%1.1f%%',
        shadow=True,
        ylabel='Length',
        grid=True,
        fontsize=20)
    extent = ax3.get_window_extent().transformed(
        figure.dpi_scale_trans.inverted())
    ax3.figure.savefig('saved_length_statistic3.png',
                       bbox_inches=extent.expanded(1.1, 1.2))
    plt.close()
def get_next_prev_words(word):
    data_arr = get_data_arr()
    df = pd.DataFrame(data_arr, columns=['id', 'word', 'length'])
    list_words = df['word'].to_list()
    next_words = []
    prev_words = []
    # Handle the first and last positions separately, then scan the interior.
    if len(list_words) > 1 and list_words[0] == word:
        next_words.append(list_words[1])
    if len(list_words) > 1 and list_words[-1] == word:
        prev_words.append(list_words[-2])

    for i in range(1, len(list_words) - 1):
        if list_words[i] == word:
            next_words.append(list_words[i + 1])
            prev_words.append(list_words[i - 1])
    return prev_words, next_words
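# Illustrative example (hypothetical data): for the word sequence
# ['a', 'b', 'a', 'c'] and word 'a', get_next_prev_words returns
# prev_words == ['b'] and next_words == ['b', 'c'].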
def find_abc_in_df():
    data_arr = get_data_arr()
    df = pd.DataFrame(data_arr, columns=['id', 'word', 'length'])
    # Count how often each Latin letter occurs, split into lowercase and
    # uppercase tallies.
    lang = 'abcdefghijklmnopqrstuvwxyz'
    LANG = lang.upper()
    kol_down = {symb: 0 for symb in lang}
    kol_up = {symb: 0 for symb in LANG}
    for expr in df['word']:
        for symb in expr:
            if symb in lang or symb in LANG:
                if symb.isupper():
                    kol_up[symb] += 1
                else:
                    kol_down[symb] += 1
    return kol_up, kol_down
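

# Rough usage sketch, assuming get_data_arr() is available; the argument
# values ('white', 'apple') are arbitrary examples, not part of the listing.
if __name__ == '__main__':
    make_word_cloud('white')
    get_stat_with_frequency()
    top_words, outlier_words = get_top_using_words()
    draw_stat_length()
    prev_words, next_words = get_next_prev_words('apple')
    upper_counts, lower_counts = find_abc_in_df()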