# Standard dependencies of the functions below. The local project modules are
# assumed to provide: utils (barplot, draw, clean_figure, DEF_COLORS),
# SEPARATOR, check_haaretz_blocked_text, verify_valid, check_lengths,
# and verify_hebrew_words.
from datetime import datetime
import inspect
import itertools
import re

import numpy as np
import matplotlib.pyplot as plt
from bidi import algorithm as bidi

import utils  # local plotting helpers


def perceptron_diagnosis(model, col_names=None, title=None, fig=None,
                         max_features=50):
    # input validation: a binary classifier has a single coefficient row,
    # so the STD-over-classes diagnosis below is undefined for it
    if len(model.coef_) < 2:
        raise NotImplementedError('Binary classification diagnosis is '
                                  'currently not supported.')
    if fig is None:
        fig = plt.subplots(1, 1)
    plt.figure(fig[0].number)
    if col_names is None:
        col_names = list(map(str, range(len(model.coef_[0]))))
    col_names = ['intercept'] + [bidi.get_display(nm) for nm in col_names]
    # get std of coefficients over classes (one value per feature)
    coef_std = [np.std(model.intercept_)] + \
               [np.std([cfs[i] for cfs in model.coef_])
                for i in range(len(model.coef_[0]))]
    # keep only the max_features features with the largest STD
    if max_features:
        ids = np.array(coef_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        coef_std = [coef_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title + '\n'
    utils.barplot(fig[1], col_names, coef_std, vertical_xlabs=True,
                  title=pre_title + 'Perceptron Diagnosis ' +
                        f'({model.n_iter_:d} iterations)',
                  xlab='Feature', colors=('black',),
                  ylab='STD(coef) over classes\n(not STD(x*coef)!)')
    utils.draw()
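
# Usage sketch (not part of the original module): assumes scikit-learn is
# installed; the iris dataset and feature names are purely illustrative --
# any fitted multi-class Perceptron works.
def _demo_perceptron_diagnosis():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import Perceptron
    X, y = load_iris(return_X_y=True)  # 3 classes, so len(coef_) >= 2
    clf = Perceptron(max_iter=100).fit(X, y)
    perceptron_diagnosis(clf, col_names=['sepal len', 'sepal wid',
                                         'petal len', 'petal wid'],
                         title='Iris demo')
    plt.show()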
def bar_per_source(ax, df, fun, ylab, title, colors='black',
                   bcolors=utils.DEF_COLORS):
    sources = np.unique(df.source)
    utils.barplot(
        ax, sources,
        [fun(df[np.logical_and(df.source == src, df.blocked)])
         for src in sources],
        bottom=[fun(df[np.logical_and(df.source == src,
                                      np.logical_not(df.blocked))])
                for src in sources],
        ylab=ylab, title=title, colors=colors, bcolors=bcolors
    )
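
# Usage sketch (hypothetical toy data): `fun` can be any aggregator mapping
# a DataFrame subset to a number, e.g. len for article counts.
def _demo_bar_per_source():
    import pandas as pd
    toy = pd.DataFrame({'source': ['haaretz', 'haaretz', 'ynet'],
                        'blocked': [True, False, False]})
    _, ax = plt.subplots(1, 1)
    bar_per_source(ax, toy, len, ylab='Articles', title='Articles per source')
    plt.show()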
def date_hist(ax, df, old_thresh=np.datetime64(datetime(2019, 3, 1))):
    # bucket dates: NaT as-is, pre-threshold dates by year, the rest by day
    dts = [str(dt) if str(dt) == 'NaT'
           else str(dt)[:4] if dt < old_thresh
           else str(dt)[:10]
           for dt in df.date]
    dts_vals = sorted(list(set(dts)))
    sources = np.unique(df.source)
    # note: the builtin sum is used here, since np.sum over a generator
    # is deprecated
    date_count = {src: [sum(sc == src and dt == dt_val
                            for sc, dt in zip(df.source, dts))
                        for dt_val in dts_vals]
                  for src in sources}
    # stacked bars: each source is drawn on top of the previous ones
    bottom = np.array([0 for _ in dts_vals])
    for i, src in enumerate(sources):
        utils.barplot(ax, dts_vals, date_count[src], bottom=bottom,
                      title='Dates', ylab='Articles', vertical_xlabs=True,
                      label=src, colors=('b', 'r', 'g')[i % 3],
                      plot_bottom=False)
        bottom += date_count[src]
    ax.legend(loc='upper left')
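
# Usage sketch (hypothetical mini-dataset): one date before the threshold
# (bucketed by year) and two after it (bucketed by day).
def _demo_date_hist():
    import pandas as pd
    toy = pd.DataFrame({
        'source': ['haaretz', 'ynet', 'ynet'],
        'date': pd.to_datetime(['2018-06-01', '2019-03-05', '2019-03-05']),
    })
    _, ax = plt.subplots(1, 1)
    date_hist(ax, toy)
    plt.show()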
def naive_bayes_diagnosis(model, col_names=None, title=None, fig=None,
                          max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1, 1)
    plt.figure(fig[0].number)
    if col_names is None:
        # naive Bayes exposes feature_log_prob_, not feature_importances
        col_names = list(map(str, range(len(model.feature_log_prob_[0]))))
    col_names = [bidi.get_display(nm) for nm in col_names]
    # get std of log-probabilities over classes (one value per feature)
    log_probs_std = [np.std([lp[i] for lp in model.feature_log_prob_])
                     for i in range(len(model.feature_log_prob_[0]))]
    # keep only the max_features features with the largest STD
    if max_features:
        ids = np.array(log_probs_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        log_probs_std = [log_probs_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title + '\n'
    utils.barplot(fig[1], col_names, log_probs_std, vertical_xlabs=True,
                  title=pre_title + 'Naive Bayes Diagnosis',
                  xlab='Feature', colors=('black',),
                  ylab='STD(log probability)\nover classes')
    utils.draw()
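
# Usage sketch (illustrative toy counts): any fitted naive Bayes model with a
# feature_log_prob_ attribute (e.g. sklearn's MultinomialNB) works.
def _demo_naive_bayes_diagnosis():
    from sklearn.naive_bayes import MultinomialNB
    X = np.array([[3, 0, 1], [0, 2, 4], [1, 1, 0], [0, 3, 2]])
    y = np.array([0, 1, 0, 1])
    nb = MultinomialNB().fit(X, y)
    naive_bayes_diagnosis(nb, col_names=['word_a', 'word_b', 'word_c'])
    plt.show()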
def random_forest_diagnosis(model, col_names=None, title=None, fig=None,
                            max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1, 1)
    plt.figure(fig[0].number)
    if col_names is None:
        col_names = list(map(str, range(len(model.feature_importances_))))
    col_names = [bidi.get_display(nm) for nm in col_names]
    # get importance and keep only the max_features most important features
    importance = model.feature_importances_
    if max_features:
        ids = np.array(importance).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        importance = [importance[i] for i in ids]
    # plot
    pre_title = '' if title is None else title + '\n'
    utils.barplot(fig[1], col_names, importance, vertical_xlabs=True,
                  title=pre_title + 'Random Forest Diagnosis ' +
                        f'({len(model.estimators_):d} trees)',
                  xlab='Feature', colors=('black',),
                  ylab='Gini importance')
    utils.draw()
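
# Usage sketch (illustrative): any fitted sklearn forest exposing
# feature_importances_ and estimators_ works.
def _demo_random_forest_diagnosis():
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    X, y = load_iris(return_X_y=True)
    rf = RandomForestClassifier(n_estimators=30).fit(X, y)
    random_forest_diagnosis(rf, col_names=['sepal len', 'sepal wid',
                                           'petal len', 'petal wid'])
    plt.show()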
def validity_tests(df):
    sources = np.unique(df['source'])
    # share of haaretz articles whose content is actually available
    blocked_contents = (1 - check_haaretz_blocked_text(df[df['source'] == 'haaretz'])
                        / np.sum(df['source'] == 'haaretz')) * 100
    df = df[np.logical_not(df['blocked'])]
    n = {src: np.sum(df['source'] == src) for src in sources}
    # get anomalies
    bad_types = {src: verify_valid(df[df['source'] == src],
                                   {'date': datetime, 'blocked': np.bool_})
                 for src in sources}
    bad_lengths = {src: check_lengths(df[df['source'] == src])
                   for src in sources}
    bad_tokens = {src: verify_hebrew_words(df[df['source'] == src])
                  for src in sources}
    # plot anomalies
    f, axs = plt.subplots(3, len(sources))
    for i, src in enumerate(sources):
        tit = ('DATA SANITY TESTS\n' if i == int(len(sources) / 2) else '\n') + \
              f'[{src:s}] Invalid field types' + \
              (f'\n(out of {blocked_contents:.0f}% unblocked articles)'
               if src == 'haaretz' else '\n')
        utils.barplot(axs[0, i], bad_types[src].keys(),
                      100 * np.array(tuple(bad_types[src].values())) / n[src],
                      vertical_xlabs=True, title=tit,
                      ylab='Having invalid type [%]', ylim=(0, 100))
    # recover the length limits from check_lengths's default argument
    sp = inspect.getfullargspec(check_lengths)
    limits = list(itertools.chain.from_iterable(sp[3][0].values()))
    for i, src in enumerate(sources):
        utils.barplot(axs[1, i],
                      [a + f'\n({b:.0f} chars)'
                       for a, b in zip(bad_lengths[src].keys(), limits)],
                      100 * np.array(tuple(bad_lengths[src].values())) / n[src],
                      vertical_xlabs=True,
                      title=f'[{src:s}] Suspicious string-field lengths',
                      ylab='Having invalid length [%]', ylim=(0, 100))
    utils.barplot(axs[2, 0], sources,
                  [100 * (1 - bad_tokens[src][0]) for src in sources],
                  xlab='Source',
                  ylab='Words without numbers\nor Hebrew letters [%]')
    utils.barplot(axs[2, 1], sources,
                  [100 * (1 - bad_tokens[src][1]) for src in sources],
                  xlab='Source', ylab='Words of length <=1 [%]')
    for i in range(2, len(sources)):
        utils.clean_figure(axs[2, i])
    # draw
    utils.draw()
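
# Usage sketch (hypothetical): 'articles.csv' stands in for the scraped
# dataset; the helpers used above (check_haaretz_blocked_text, verify_valid,
# check_lengths, verify_hebrew_words) must be importable from the project.
def _demo_validity_tests():
    import pandas as pd
    articles = pd.read_csv('articles.csv', parse_dates=['date'])  # hypothetical path
    validity_tests(articles)
    plt.show()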
def count_parties(ax, df, col='text', by='source', binary_per_text=False,
                  logscale=False,
                  # Hebrew names of Israeli parties and politicians; tuples
                  # group alternative spellings / affiliated names
                  keys=('ליכוד', ('ביבי', 'נתניהו'), ('כחול לבן', 'כחול-לבן'),
                        'גנץ', 'העבודה', 'גבאי', ('חד"ש', 'תע"ל'), 'עודה',
                        'יהדות התורה', 'ליצמן', 'איחוד הימין', "סמוטריץ'",
                        'הימין החדש', 'בנט', 'זהות', 'פייגלין', 'מרצ',
                        'זנדברג', 'ש"ס', 'דרעי', 'כולנו', 'כחלון',
                        ('בל"ד', 'רע"ם'), 'עבאס',
                        ('ישראל ביתנו', 'ישראל-ביתנו'), 'ליברמן', 'גשר',
                        'אורלי לוי')):
    groups = np.unique(df[by])
    sep = SEPARATOR['word']
    count = {grp: len(keys) * [0] for grp in groups}
    for grp in groups:
        for txt in df[df[by] == grp][col]:
            for j, key in enumerate(keys):
                # multi-word keys: count substring appearances
                appears = 0
                if isinstance(key, tuple):
                    for k in key:
                        if ' ' in k:
                            appears = txt.count(k)
                            count[grp][j] += bool(appears) if binary_per_text \
                                else appears
                            if binary_per_text and appears:
                                break
                else:
                    k = key
                    if ' ' in k:
                        appears = txt.count(k)
                        count[grp][j] += bool(appears) if binary_per_text \
                            else appears
                if binary_per_text and appears:
                    continue  # already credited this text for this key
                # one-word keys: match as the end of a word
                # (str.endswith also accepts a tuple of suffixes)
                for w in re.split(sep, txt):
                    w = re.sub(r'\.|,|\(|\)|;|:|\t', '', w).strip()
                    if w.endswith(key):
                        count[grp][j] += 1
                        if binary_per_text:
                            break
    keys = tuple(k[0] + ' /\n' + k[1] if isinstance(k, tuple) else k
                 for k in keys)
    keys = tuple(bidi.get_display(k) for k in keys)
    colors = utils.DEF_COLORS
    bottom = np.array([0 for _ in keys])
    ylab = ('Texts with the expression' if binary_per_text
            else 'Total appearances') + '\n(as end of a word)'
    for i, group in enumerate(groups):
        utils.barplot(ax, keys, count[group], bottom=bottom, plot_bottom=False,
                      ylab=ylab, title='Frequency of appearance',
                      vertical_xlabs=True, colors=colors[i % len(colors)],
                      label=bidi.get_display(group))
        bottom += count[group]
    if logscale:
        ax.set_yscale('log')
    ax.legend()
    utils.draw()
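
# Usage sketch (hypothetical mini-corpus, Hebrew headlines meaning
# "Netanyahu met with Gantz" / "Blue and White led by Gantz"); assumes
# SEPARATOR['word'] is the project's word-splitting regex.
def _demo_count_parties():
    import pandas as pd
    toy = pd.DataFrame({'source': ['ynet', 'mako'],
                        'text': ['נתניהו נפגש עם גנץ',
                                 'כחול לבן בראשות גנץ']})
    _, ax = plt.subplots(1, 1)
    count_parties(ax, toy, binary_per_text=True)
    plt.show()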