def tsne_plot(model, search_word, n_neighbors=10, ax=None):
    # Credit for function:
    # https://medium.com/@khulasaandh/word-embeddings-fun-with-word2vec-and-game-of-thrones-ea4c24fcf1b8
    labels = [bidi.get_display(search_word)]
    tokens = [model.wv[search_word]]
    similar = [1]
    close_words = model.wv.similar_by_word(search_word, topn=n_neighbors)
    for word in close_words:
        tokens.append(model.wv[word[0]])
        labels.append(bidi.get_display(word[0]))
        similar.append(word[1])
    tsne_model = TSNE(n_components=2, init='pca')
    coordinates = tsne_model.fit_transform(tokens)
    df = pd.DataFrame({'x': [x for x in coordinates[:, 0]],
                       'y': [y for y in coordinates[:, 1]],
                       'words': labels,
                       'similarity': similar})
    if ax is None:
        _, ax = plt.subplots()
    plot = ax.scatter(df.x, df.y, c=df.similarity, cmap='Reds')
    for i in range(len(df)):
        ax.annotate(" {} ({:.2f})".format(df.words[i].title(),
                                          df.similarity[i]),
                    (df.x[i], df.y[i]))
    plt.colorbar(mappable=plot, ax=ax)
    ax.set_title('t-SNE visualization for {}'.format(
        bidi.get_display(search_word)))
    utils.draw()
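

# A minimal usage sketch for tsne_plot (not part of the original code):
# assumes a gensim Word2Vec model trained on the tokenized article corpus;
# `tokenized_sentences` and the search word below are hypothetical examples.
#
#   from gensim.models import Word2Vec
#   w2v = Word2Vec(tokenized_sentences, min_count=5)
#   fig, ax = plt.subplots()
#   tsne_plot(w2v, search_word='ליכוד', n_neighbors=10, ax=ax)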


def words2sections(G, df, to_plot=False, title=''):
    sections = np.unique(df.section)
    colors = ('cyan', 'red', 'green', 'lime', 'orange', 'gold',
              'grey', 'magenta', 'plum', 'peru')
    # assign to each word the section in which it appears the most
    for w in G.node:
        G.node[w]['section'] = \
            sections[np.argmax(['\n'.join(df[df.section == sec].text).count(w)
                                for sec in sections])]
    sections = np.unique([G.node[w]['section'] for w in G.node])
    if to_plot:
        colors = {sec: colors[i % len(colors)]
                  for i, sec in enumerate(sections)}
        G2 = nx.relabel_nodes(
            G, {w: bidi.get_display(w) for w in G.node}, True)
        pos = nx.spring_layout(G2)
        for sec in sections:
            nx.draw_networkx_nodes(G2, pos=pos, node_color=colors[sec],
                                   with_labels=True,
                                   label=bidi.get_display(sec),
                                   nodelist=[w for w in G2
                                             if G2.node[w]['section'] == sec])
        nx.draw_networkx_edges(G2, pos=pos, edge_color='pink')
        nx.draw_networkx_labels(G2, pos=pos)
        plt.title(title)
        plt.legend()
        utils.draw()
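

# A minimal usage sketch for words2sections (not part of the original code):
# assumes G is a graph whose nodes are words appearing in df.text (e.g. a
# co-occurrence graph of frequent words) and df has 'section' and 'text'
# columns; the edges below are hypothetical.
#
#   G = nx.Graph()
#   G.add_edges_from([('ביבי', 'נתניהו'), ('גנץ', 'ליכוד')])
#   words2sections(G, df, to_plot=True, title='Words by dominant section')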


def perceptron_diagnosis(model, col_names=None, title=None, fig=None,
                         max_features=50):
    # input validation
    if len(model.coef_) <= 2:
        raise NotImplementedError('Binary classification diagnosis is ' +
                                  'currently not supported.')
    if fig is None:
        fig = plt.subplots(1, 1)
    plt.figure(fig[0].number)
    if col_names is None:
        # use string names so bidi.get_display() can handle them
        col_names = [str(i) for i in range(len(model.coef_[0]))]
    col_names = ['intercept'] + [bidi.get_display(nm) for nm in col_names]
    # get std of coefficients
    coef_std = [np.std(model.intercept_)] + \
               [np.std([cfs[i] for cfs in model.coef_])
                for i in range(len(model.coef_[0]))]
    if max_features:
        ids = np.array(coef_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        coef_std = [coef_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title + '\n'
    utils.barplot(fig[1], col_names, coef_std, vertical_xlabs=True,
                  title=pre_title + 'Perceptron Diagnosis ' +
                        f'({model.n_iter_:d} iterations)',
                  xlab='Feature', colors=('black',),
                  ylab='STD(coef) over classes\n' +
                       '(not STD(x*coef)!)')
    utils.draw()
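

# A minimal usage sketch for perceptron_diagnosis (not part of the original
# code): assumes a multiclass sklearn Perceptron fitted on bag-of-words
# features; X, y and `vectorizer` below are hypothetical.
#
#   from sklearn.linear_model import Perceptron
#   clf = Perceptron().fit(X, y)   # y must have more than two classes
#   perceptron_diagnosis(clf, col_names=vectorizer.get_feature_names_out())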


def plot_words_repetitions(tab):
    f, axs = plt.subplots(1, 1)
    axs.plot(list(range(101)),
             utils.dist([t[1] for t in tab], list(range(101)))[2:], 'k-')
    axs.set_yscale('log')
    axs.set_xlim((0, 100))
    axs.set_xlabel(f'Quantile [%]\n(100% = {len(tab):d} words)', fontsize=12)
    axs.set_ylabel('Repetitions', fontsize=12)
    axs.set_title('Frequency of Words in Articles\n' +
                  '(in Hebrew without stopwords)', fontsize=14)
    utils.draw()
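

# A minimal usage sketch for plot_words_repetitions (not part of the original
# code): assumes `tab` is a list of (word, count) pairs, e.g. produced by
# collections.Counter over the corpus tokens; `all_words` is hypothetical.
#
#   from collections import Counter
#   tab = Counter(all_words).most_common()
#   plot_words_repetitions(tab)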


def lengths_analysis(df, by=None):
    f, axs = plt.subplots(3, 3)
    # remove blocked haaretz texts before analysis
    # (copy to avoid pandas SettingWithCopyWarning when adding columns)
    df = df[np.logical_not(df['blocked'])].copy()
    # count units
    df['words_per_text'] = count_words(df.text)
    df['words_per_title'] = count_words(df.title)
    df['words_per_subtitle'] = count_words(df.subtitle)
    df['characters_per_text'] = [len(s) for s in df.text]
    df['sentences_per_text'] = count_sentences(df.text)
    df['paragraphs_per_text'] = count_paragraphs(df.text)
    df['characters_per_title'] = [len(s) for s in df.title]
    df['unique_words_per_100_words'] = \
        [100 * len(np.unique(list(filter(None, re.split(' |\t|\n\r|\n', s)))))
         / len(list(filter(None, re.split(' |\t|\n\r|\n', s))))
         for s in df.text]
    df['characters_per_word'] = \
        [len(s) / len(list(filter(None, re.split(' |\t|\n\r|\n', s))))
         for s in df.text]
    # plot
    columns = ('words_per_text', 'words_per_subtitle', 'words_per_title',
               'characters_per_text', 'sentences_per_text',
               'paragraphs_per_text', 'characters_per_title',
               'unique_words_per_100_words', 'characters_per_word')
    for i, col in enumerate(columns):
        ax = axs[int(i/3), i % 3]
        bp = df.boxplot(ax=ax, column=col,
                        by=['source'] + ([by] if by else []),
                        return_type='both', patch_artist=True)
        colors = np.repeat(('blue', 'red', 'green'),
                           int(len(bp[0][1]['boxes'])/3))
        for box, color in zip(bp[0][1]['boxes'], colors):
            box.set_facecolor(color)
        ax.set_xlabel('')  # 'Source', fontsize=12
        ax.set_ylabel(col.replace('_', ' ').capitalize(), fontsize=12)
        if by:
            ax.set_xticklabels(
                [bidi.get_display(
                    t._text.replace('(', '').replace(')', '').replace(', ', '\n')
                ) for t in ax.get_xticklabels()],
                rotation=90)
        if i == 0:
            ax.set_title('TOKENS COUNT', fontsize=14)
        else:
            ax.set_title('')
    # draw
    utils.draw()


def validity_tests(df):
    sources = np.unique(df['source'])
    blocked_contents = (1 - check_haaretz_blocked_text(df[df['source'] == 'haaretz'])
                        / np.sum(df['source'] == 'haaretz')) * 100
    df = df[np.logical_not(df['blocked'])]
    n = {src: np.sum(df['source'] == src) for src in sources}
    # get anomalies
    bad_types = {src: verify_valid(df[df['source'] == src],
                                   {'date': datetime, 'blocked': np.bool_})
                 for src in sources}
    bad_lengths = {src: check_lengths(df[df['source'] == src])
                   for src in sources}
    bad_tokens = {src: verify_hebrew_words(df[df['source'] == src])
                  for src in sources}
    # plot anomalies
    f, axs = plt.subplots(3, len(sources))
    for i, src in enumerate(sources):
        tit = ('DATA SANITY TESTS\n' if i == int(len(sources)/2) else '\n') + \
              f'[{src:s}] Invalid field types' + \
              (f'\n(out of {blocked_contents:.0f}% unblocked articles)'
               if src == 'haaretz' else '\n')
        utils.barplot(axs[0, i], bad_types[src].keys(),
                      100 * np.array(tuple(bad_types[src].values())) / n[src],
                      vertical_xlabs=True, title=tit,
                      ylab='Having invalid type [%]', ylim=(0, 100))
    sp = inspect.getfullargspec(check_lengths)
    limits = list(itertools.chain.from_iterable(sp[3][0].values()))
    for i, src in enumerate(sources):
        utils.barplot(axs[1, i],
                      [a + f'\n({b:.0f} chars)'
                       for a, b in zip(bad_lengths[src].keys(), limits)],
                      100 * np.array(tuple(bad_lengths[src].values())) / n[src],
                      vertical_xlabs=True,
                      title=f'[{src:s}] Suspicious string-field lengths',
                      ylab='Having invalid length [%]', ylim=(0, 100))
    utils.barplot(axs[2, 0], sources,
                  [100 * (1 - bad_tokens[src][0]) for src in sources],
                  xlab='Source',
                  ylab='Words without numbers\nor Hebrew letters [%]')
    utils.barplot(axs[2, 1], sources,
                  [100 * (1 - bad_tokens[src][1]) for src in sources],
                  xlab='Source', ylab='Words of length <=1 [%]')
    for i in range(2, len(sources)):
        utils.clean_figure(axs[2, i])
    # draw
    utils.draw()


def plot_results(res, axs, title='Test Classification', reference=None):
    for i, test in enumerate(res):
        ax = axs[i]
        n_samples = res[test][0]
        # plot reference
        if reference is not None:
            ax.plot((n_samples[0], n_samples[-1]), 2 * [reference],
                    'k--', label='Random')
        # plot actual results
        for model in res[test][1]:
            accuracy = res[test][1][model]
            ax.plot(n_samples, accuracy, label=model)
        ax.set_title(title + f'\n({test:s})', fontsize=14)
        ax.set_xlabel('Training samples', fontsize=12)
        ax.set_ylabel('Accuracy [%]', fontsize=12)
        ax.set_xlim((n_samples[0], n_samples[-1]))
        ax.set_ylim((0, 101))
        ax.grid(color='k', linestyle=':', linewidth=1)
        ax.legend(loc='upper left')
    utils.draw()
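

# A minimal usage sketch for plot_results (not part of the original code),
# illustrating the assumed structure of `res`:
# {test_name: (training_sample_sizes, {model_name: accuracies_in_percent})}.
# The numbers below are placeholders, not real results.
#
#   res = {'by source': ([100, 500, 1000],
#                        {'Perceptron': [55, 63, 70],
#                         'Naive Bayes': [58, 66, 72]})}
#   fig, axs = plt.subplots(1, len(res))
#   plot_results(res, [axs], title='Test Classification', reference=100/3)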


def data_description(df):
    sources = np.unique(df['source'])
    n = len(sources)
    f, axs = plt.subplots(2, n)
    # counters per source
    bar_per_source(axs[0, 0], df,
                   ylab='Articles\n(black = partially blocked contents)',
                   fun=lambda d: d.shape[0], title='\nArticles per Source')
    bar_per_source(axs[0, 1], df,
                   ylab='Words [x1000]\n(black = partially blocked contents)',
                   fun=lambda d: sum(len(l.split()) for t in d['text'].values
                                     for l in t.split('\n')) / 1e3,
                   title='BASIC DATA DESCRIPTION\nWords per Source')
    # remove blocked haaretz texts before next analysis
    df = df[np.logical_not(df['blocked'])]
    # sections per source
    articles_per_section = \
        [df[np.logical_and(df.source == src, df.section == sec)].shape[0]
         for src in sources for sec in np.unique(df[df.source == src].section)]
    axs[0, 2].pie([df[df.source == src].shape[0] for src in sources],
                  labels=sources, colors=utils.DEF_COLORS[:3],
                  startangle=90, frame=True, counterclock=False)
    patches, _ = axs[0, 2].pie(articles_per_section, radius=0.75,
                               startangle=90, counterclock=False)
    centre_circle = \
        plt.Circle((0, 0), 0.5, color='black', fc='white', linewidth=0)
    axs[0, 2].add_artist(centre_circle)
    axs[0, 2].set_title('\nSources and Sections', fontsize=14)
    axs[0, 2].legend(
        patches,
        [bidi.get_display(sec) for src in sources
         for sec in np.unique(df[df.source == src].section)],
        ncol=5, loc='upper right', bbox_to_anchor=(1, 0.11), fontsize=8
    )
    # dates & authors
    date_hist(axs[1, 0], df)
    author_concentration(axs[1, 1], df)
    top_authors(axs[1, 2], df)
    # draw
    utils.draw()


def naive_bayes_diagnosis(model, col_names=None, title=None, fig=None,
                          max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1, 1)
    plt.figure(fig[0].number)
    if col_names is None:
        # use string names so bidi.get_display() can handle them
        col_names = [str(i) for i in range(len(model.feature_log_prob_[0]))]
    col_names = [bidi.get_display(nm) for nm in col_names]
    # get std of log probabilities
    log_probs_std = [np.std([lp[i] for lp in model.feature_log_prob_])
                     for i in range(len(model.feature_log_prob_[0]))]
    if max_features:
        ids = np.array(log_probs_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        log_probs_std = [log_probs_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title + '\n'
    utils.barplot(fig[1], col_names, log_probs_std, vertical_xlabs=True,
                  title=pre_title + 'Naive Bayes Diagnosis',
                  xlab='Feature', colors=('black',),
                  ylab='STD(log probability)\nover classes')
    utils.draw()


def random_forest_diagnosis(model, col_names=None, title=None, fig=None,
                            max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1, 1)
    plt.figure(fig[0].number)
    if col_names is None:
        # use string names so bidi.get_display() can handle them
        col_names = [str(i) for i in range(len(model.feature_importances_))]
    col_names = [bidi.get_display(nm) for nm in col_names]
    # get importance
    importance = model.feature_importances_
    if max_features:
        ids = np.array(importance).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        importance = [importance[i] for i in ids]
    # plot
    pre_title = '' if title is None else title + '\n'
    utils.barplot(fig[1], col_names, importance, vertical_xlabs=True,
                  title=pre_title + 'Random Forest Diagnosis ' +
                        f'({len(model.estimators_):d} trees)',
                  xlab='Feature', colors=('black',),
                  ylab='Gini importance')
    utils.draw()


def count_parties(
        ax, df, col='text', by='source', binary_per_text=False, logscale=False,
        keys=('ליכוד', ('ביבי', 'נתניהו'), ('כחול לבן', 'כחול-לבן'), 'גנץ',
              'העבודה', 'גבאי', ('חד"ש', 'תע"ל'), 'עודה', 'יהדות התורה',
              'ליצמן', 'איחוד הימין', "סמוטריץ'", 'הימין החדש', 'בנט',
              'זהות', 'פייגלין', 'מרצ', 'זנדברג', 'ש"ס', 'דרעי', 'כולנו',
              'כחלון', ('בל"ד', 'רע"ם'), 'עבאס',
              ('ישראל ביתנו', 'ישראל-ביתנו'), 'ליברמן', 'גשר', 'אורלי לוי')):
    groups = np.unique(df[by])
    sep = SEPARATOR['word']
    count = {grp: len(keys) * [0] for grp in groups}
    for grp in groups:
        for i, txt in enumerate(df[df[by] == grp][col]):
            for j, key in enumerate(keys):
                # multi-word keys
                appears = 0
                if isinstance(key, tuple):
                    for k in key:
                        if ' ' in k:
                            appears = txt.count(k)
                            count[grp][j] += bool(appears) if binary_per_text \
                                else appears
                            if binary_per_text:
                                break
                else:
                    k = key
                    if ' ' in k:
                        appears = txt.count(k)
                        count[grp][j] += bool(appears) if binary_per_text \
                            else appears
                        if binary_per_text and appears:
                            continue
                # one-word keys
                for w in re.split(sep, txt):
                    w = re.sub(r'\.|,|\(|\)|;|:|\t', '', w).strip()
                    if w.endswith(key):
                        count[grp][j] += 1
                        if binary_per_text:
                            break
    keys = tuple(k[0] + ' /\n' + k[1] if isinstance(k, tuple) else k
                 for k in keys)
    keys = tuple(bidi.get_display(k) for k in keys)
    colors = utils.DEF_COLORS
    bottom = np.array([0 for _ in keys])
    ylab = ('Texts with the expression' if binary_per_text
            else 'Total appearances') + '\n(as end of a word)'
    for i, group in enumerate(groups):
        utils.barplot(ax, keys, count[group], bottom=bottom, plot_bottom=False,
                      ylab=ylab, title='Frequency of appearance',
                      vertical_xlabs=True, colors=colors[i % len(colors)],
                      label=bidi.get_display(group))
        bottom += count[group]
    if logscale:
        ax.set_yscale('log')
    ax.legend()
    utils.draw()
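

# A minimal usage sketch for count_parties (not part of the original code):
# assumes `df` has a Hebrew 'text' column and a 'source' column, as in the
# rest of this module.
#
#   fig, ax = plt.subplots()
#   count_parties(ax, df, col='text', by='source',
#                 binary_per_text=True, logscale=True)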