Example 1
# Shared imports assumed by all the examples below. `utils` is a project-local
# plotting-helpers module (utils.barplot, utils.draw, utils.DEF_COLORS, ...);
# SEPARATOR, check_haaretz_blocked_text, verify_valid, check_lengths and
# verify_hebrew_words are assumed to be defined elsewhere in the project.
import re
import inspect
import itertools
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from bidi import algorithm as bidi

import utils


def perceptron_diagnosis(model, col_names=None, title=None, fig=None,
                         max_features=50):
    # input validation
    if len(model.coef_) <= 2:
        # a binary target leaves a single coefficient row in sklearn,
        # so an STD over classes is meaningless
        raise NotImplementedError('Binary classification diagnosis is ' +
                                  'currently not supported.')
    if fig is None:
        fig = plt.subplots(1,1)
    plt.figure(fig[0].number)
    if col_names is None:
        # stringify default indices so bidi.get_display accepts them
        col_names = [str(i) for i in range(len(model.coef_[0]))]
    col_names = ['intercept'] + [bidi.get_display(nm) for nm in col_names]
    # get std of coefficients
    coef_std = [np.std(model.intercept_)] + \
               [np.std([cfs[i] for cfs in model.coef_])
                for i in range(len(model.coef_[0]))]
    if max_features:
        ids = np.array(coef_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        coef_std = [coef_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title+'\n'
    utils.barplot(fig[1], col_names, coef_std, vertical_xlabs=True,
                  title=pre_title + 'Perceptron Diagnosis ' +
                        f'({model.n_iter_:d} iterations)',
                  xlab='Feature', colors=('black',),
                  ylab='STD(coef) over classes\n' + '(not STD(x*coef)!)')
    utils.draw()
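
A minimal usage sketch, assuming a fitted multi-class sklearn Perceptron and the project's utils module on the path; the Iris data and column names below are illustrative only.

# hypothetical usage sketch -- toy data, not from the project
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

X, y = load_iris(return_X_y=True)          # 3 classes, 4 features
clf = Perceptron(max_iter=1000).fit(X, y)
perceptron_diagnosis(clf,
                     col_names=['sepal_len', 'sepal_wid',
                                'petal_len', 'petal_wid'],
                     title='Iris demo')
plt.show()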
Example 2
def bar_per_source(ax, df, fun, ylab, title,
                   colors='black', bcolors=utils.DEF_COLORS):
    # stacked bar per source: `fun` applied to the unblocked articles forms
    # the bottom of each bar, and `fun` applied to the blocked ones sits on top
    sources = np.unique(df.source)
    utils.barplot(
        ax, sources,
        [fun(df[np.logical_and(df.source == src, df.blocked)])
         for src in sources],
        bottom=[fun(df[np.logical_and(df.source == src,
                                      np.logical_not(df.blocked))])
                for src in sources],
        ylab=ylab, title=title, colors=colors, bcolors=bcolors
    )
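
A sketch of a possible call on a made-up frame; the source/blocked/text column names and the utils helpers are assumptions carried over from the examples above.

# hypothetical usage sketch -- toy DataFrame, made-up values
import pandas as pd

toy = pd.DataFrame({'source':  ['haaretz', 'haaretz', 'ynet', 'ynet'],
                    'blocked': [True, False, False, False],
                    'text':    ['aaa', 'bbbb', 'cc', 'ddddd']})
fig, ax = plt.subplots(1, 1)
bar_per_source(ax, toy, fun=len, ylab='Articles', title='Articles per source')
plt.show()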
Example 3
def date_hist(ax, df, old_thresh=np.datetime64(datetime(2019, 3, 1))):
    # bin dates older than old_thresh by year, newer ones by day; keep NaT as-is
    dts = [str(dt) if str(dt) == 'NaT'
           else str(dt)[:4] if dt < old_thresh else str(dt)[:10]
           for dt in df.date]
    dts_vals = sorted(list(set(dts)))
    sources = np.unique(df.source)
    # count articles per date bin, separately for each source
    # (built-in sum: np.sum over a generator is deprecated)
    date_count = {src: [sum(sc == src and dt == dt_val
                            for sc, dt in zip(df.source, dts))
                        for dt_val in dts_vals]
                  for src in sources}
    bottom = np.zeros(len(dts_vals), dtype=int)
    for i, src in enumerate(sources):
        utils.barplot(ax, dts_vals, date_count[src], bottom=bottom, title='Dates',
                      ylab='Articles', vertical_xlabs=True, label=src,
                      colors=('b', 'r', 'g')[i % 3],  # cycle colors beyond 3 sources
                      plot_bottom=False)
        bottom += date_count[src]
    ax.legend(loc='upper left')
Example 4
def naive_bayes_diagnosis(model, col_names=None, title=None, fig=None,
                          max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1,1)
    plt.figure(fig[0].number)
    if col_names is None:
        # naive Bayes exposes feature_log_prob_ rather than feature_importances;
        # use its width for the default (stringified) column names
        col_names = [str(i) for i in range(len(model.feature_log_prob_[0]))]
    col_names = [bidi.get_display(nm) for nm in col_names]
    # get std of coefficients
    log_probs_std = [np.std([lp[i] for lp in model.feature_log_prob_])
                     for i in range(len(model.feature_log_prob_[0]))]
    if max_features:
        ids = np.array(log_probs_std).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        log_probs_std = [log_probs_std[i] for i in ids]
    # plot
    pre_title = '' if title is None else title+'\n'
    utils.barplot(fig[1], col_names, log_probs_std, vertical_xlabs=True,
                  title=pre_title + 'Naive Bayes Diagnosis',
                  xlab='Feature', colors=('black',),
                  ylab='STD(log probability)\nover classes')
    utils.draw()
Example 5
def random_forest_diagnosis(model, col_names=None, title=None, fig=None,
                            max_features=50):
    # input validation
    if fig is None:
        fig = plt.subplots(1,1)
    plt.figure(fig[0].number)
    if col_names is None:
        # stringify default indices so bidi.get_display accepts them
        col_names = [str(i) for i in range(len(model.feature_importances_))]
    col_names = [bidi.get_display(nm) for nm in col_names]
    # get importance
    importance = model.feature_importances_
    if max_features:
        ids = np.array(importance).argsort()[-max_features:][::-1]
        col_names = [col_names[i] for i in ids]
        importance = [importance[i] for i in ids]
    # plot
    pre_title = '' if title is None else title+'\n'
    utils.barplot(fig[1], col_names, importance, vertical_xlabs=True,
                  title=pre_title + 'Random Forest Diagnosis ' +
                        f'({len(model.estimators_):d} trees)',
                  xlab='Feature', colors=('black',),
                  ylab='Gini importance')
    utils.draw()
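
The fig argument mirrors the (figure, axes) pair returned by plt.subplots, so a caller can manage the figure itself. A sketch under the same toy-data assumptions as above:

# hypothetical usage sketch -- toy data, caller-managed figure
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
rf = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)
f_ax = plt.subplots(1, 1)
random_forest_diagnosis(rf,
                        col_names=['sepal_len', 'sepal_wid',
                                   'petal_len', 'petal_wid'],
                        fig=f_ax, title='Iris demo')
plt.show()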
Example 6
def validity_tests(df):
    sources = np.unique(df['source'])
    # percentage of Haaretz articles whose text is NOT paywall-blocked
    unblocked_pct = (1 - check_haaretz_blocked_text(df[df['source'] == 'haaretz'])
                     / np.sum(df['source'] == 'haaretz')) * 100
    df = df[np.logical_not(df['blocked'])]
    n = {src: np.sum(df['source'] == src) for src in sources}
    # get anomalies
    bad_types = {src: verify_valid(df[df['source']==src],
                                      {'date':datetime,'blocked':np.bool_})
                 for src in sources}
    bad_lengths = {src: check_lengths(df[df['source']==src]) for src in sources}
    bad_tokens = {src: verify_hebrew_words(df[df['source']==src]) for src in sources}
    # plot anomalies
    f, axs = plt.subplots(3, len(sources))
    for i, src in enumerate(sources):
        tit = ('DATA SANITY TESTS\n' if i == len(sources) // 2 else '\n') + \
              f'[{src:s}] Invalid field types' + \
              (f'\n(out of {unblocked_pct:.0f}% unblocked articles)'
               if src == 'haaretz' else '\n')
        utils.barplot(axs[0, i], bad_types[src].keys(),
                      100 * np.array(tuple(bad_types[src].values())) / n[src],
                      vertical_xlabs=True, title=tit,
                      ylab='Having invalid type [%]', ylim=(0, 100))
    # recover the default length limits from check_lengths' signature
    sp = inspect.getfullargspec(check_lengths)
    limits = list(itertools.chain.from_iterable(sp.defaults[0].values()))
    for i, src in enumerate(sources):
        utils.barplot(axs[1, i],
                      [a+f'\n({b:.0f} chars)' for a,b in
                       zip(bad_lengths[src].keys(),limits)],
                      100 * np.array(tuple(bad_lengths[src].values())) / n[src],
                      vertical_xlabs=True,
                      title=f'[{src:s}] Suspicious string-field lengths',
                      ylab='Having invalid length [%]', ylim=(0, 100))
    utils.barplot(axs[2,0], sources, [100*(1-bad_tokens[src][0]) for src in sources],
                  xlab='Source', ylab='Words without numbers\nor Hebrew letters [%]')
    utils.barplot(axs[2,1], sources, [100*(1-bad_tokens[src][1]) for src in sources],
                  xlab='Source', ylab='Words of length <=1 [%]')
    for i in range(2,len(sources)):
        utils.clean_figure(axs[2,i])
    # draw
    utils.draw()
Example 7
def count_parties(
    ax,
    df,
    col='text',
    by='source',
    binary_per_text=False,
    logscale=False,
    keys=('ליכוד', ('ביבי', 'נתניהו'), ('כחול לבן', 'כחול-לבן'), 'גנץ',
          'העבודה', 'גבאי', ('חד"ש', 'תע"ל'), 'עודה', 'יהדות התורה', 'ליצמן',
          'איחוד הימין', "סמוטריץ'", 'הימין החדש', 'בנט', 'זהות', 'פייגלין',
          'מרצ', 'זנדברג', 'ש"ס', 'דרעי', 'כולנו', 'כחלון', ('בל"ד', 'רע"ם'),
          'עבאס', ('ישראל ביתנו',
                   'ישראל-ביתנו'), 'ליברמן', 'גשר', 'אורלי לוי')):
    groups = np.unique(df[by])
    sep = SEPARATOR['word']

    count = {grp: len(keys) * [0] for grp in groups}
    for grp in groups:
        for i, txt in enumerate(df[df[by] == grp][col]):
            for j, key in enumerate(keys):
                # multi-word keys: count substring occurrences directly,
                # since the word-by-word matching below would miss them
                appears = 0
                if isinstance(key, tuple):
                    for k in key:
                        if ' ' in k:
                            appears = txt.count(k)
                            count[grp][j] += bool(
                                appears) if binary_per_text else appears
                            if binary_per_text: break
                else:
                    k = key
                    if ' ' in k:
                        appears = txt.count(k)
                        count[grp][j] += bool(
                            appears) if binary_per_text else appears
                if binary_per_text and appears:
                    # this text is already counted for this key
                    continue
                # one-word keys (str.endswith also accepts a tuple of suffixes)
                for w in re.split(sep, txt):
                    w = re.sub(r'[.,();:\t]', '', w).strip()
                    if w.endswith(key):
                        count[grp][j] += 1
                        if binary_per_text: break

    keys = tuple(k[0] + ' /\n' + k[1] if isinstance(k, tuple) else k
                 for k in keys)
    keys = tuple(bidi.get_display(k) for k in keys)
    colors = utils.DEF_COLORS
    bottom = np.zeros(len(keys), dtype=int)

    ylab = ('Texts with the expression' if binary_per_text else 'Total appearances') +\
           '\n(as end of a word)'
    for i, group in enumerate(groups):
        utils.barplot(ax,
                      keys,
                      count[group],
                      bottom=bottom,
                      plot_bottom=False,
                      ylab=ylab,
                      title='Frequency of appearance',
                      vertical_xlabs=True,
                      colors=colors[i % len(colors)],
                      label=bidi.get_display(group))
        bottom += count[group]
    if logscale:
        ax.set_yscale('log')
    ax.legend()
    utils.draw()
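
A hedged usage sketch on a two-row toy frame; SEPARATOR['word'] and the utils helpers are project-local assumptions, and the Hebrew snippets are made up for illustration.

# hypothetical usage sketch -- toy DataFrame, made-up articles
import pandas as pd

toy = pd.DataFrame({'source': ['haaretz', 'ynet'],
                    'text':   ['ראש הממשלה נתניהו נאם הערב',
                               'גנץ וכחול לבן הגיבו להצהרה']})
fig, ax = plt.subplots(1, 1)
count_parties(ax, toy, col='text', by='source', binary_per_text=True)
plt.show()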