Example #1
0
    def find_subgraphs(self, df, relations, max_size=40000, verbose=False):
        """Build the graph for ``df`` and return it with its subgraphs.

        df, relations: forwarded unchanged to ``self.build_networkx_graph``.
        max_size: accepted for interface compatibility; the active code in
            this method does not read it.
        verbose: when true, progress and timings are reported through
            ``ut.out`` / ``ut.time`` and the subgraph sizes are printed.

        Returns a ``(graph, subgraphs)`` tuple, where ``subgraphs`` is the
        result of ``self._process_components`` over the connected components
        of the graph.
        """
        if verbose:
            ut.out('finding subgraphs...')
            started = ut.out('building networkx graph...')
        graph = self.build_networkx_graph(df, relations)
        components = list(nx.connected_components(graph))

        if verbose:
            ut.time(started)
            started = ut.out('processing connected components...')
        subgraphs = self._process_components(components, graph)

        # Further post-processing steps (filtering redundant subgraphs,
        # removing single edge hubs, compiling single node subgraphs) are
        # currently disabled.
        if verbose:
            ut.time(started)
            self._print_subgraphs_size(subgraphs)

        return graph, subgraphs
Example #2
0
    def consolidate(self, subgraphs, max_size=40000, div=2):
        """Combine subgraphs into larger sets to reduce total number of
        subgraphs to do inference over.

        subgraphs: iterable of ``(ids, hubs, rels, edges)`` tuples, where the
            first three are collections and ``edges`` is a count.
        max_size: size budget for one consolidated subgraph; the size of a
            candidate is ``len(ids) / div`` (per part, truncated) plus the
            number of edges.
        div: divisor applied to the id counts when computing the size.

        Returns a list of ``(ids, hubs, rels, edges)`` tuples. The input
        subgraphs are never modified: the accumulators are always fresh sets.
        """
        t1 = ut.out('consolidating subgraphs...')

        sgs = []
        new_ids, new_hubs = set(), set()
        new_rels, new_edges = set(), 0

        for ids, hubs, rels, edges in subgraphs:
            size = int(len(new_ids) / div) + int(len(ids) / div)
            size += new_edges + edges

            # Merge while there is room, or when the accumulator holds no
            # edges yet and the incoming subgraph alone exceeds the budget.
            fits = size < max_size
            oversized_start = new_edges == 0 and size > max_size

            if fits or oversized_start:
                new_ids.update(ids)
                new_hubs.update(hubs)
                new_rels.update(rels)
                new_edges += edges
            else:  # new is full
                # Same emptiness rule as the final flush below, so no empty
                # consolidated subgraph is ever emitted.
                if len(new_ids) > 0:
                    sgs.append((new_ids, new_hubs, new_rels, new_edges))
                # Copy instead of aliasing: later update() calls must not
                # leak into the caller's subgraph collections.
                new_ids, new_hubs = set(ids), set(hubs)
                new_rels, new_edges = set(rels), edges

        if len(new_ids) > 0:
            sgs.append((new_ids, new_hubs, new_rels, new_edges))

        ut.time(t1)
        self._print_subgraphs_size(sgs)

        return sgs
Example #3
0
    def _evaluate(genome,
                  fitfunc,
                  size,
                  test_size=None,
                  adjuster=None,
                  **kwargs):
        """Build the network for ``genome``, score it and package the result.

        genome: passed to ``util.build_network`` and to
            ``Evaluator.create_genome_evaluation``.
        fitfunc: object whose ``get_evaluator()`` returns the fitness callable.
        size: indexable; ``size[0]`` and ``size[1]`` are forwarded to
            ``Evaluator.predict_np`` and ``size[0]`` also divides the
            prediction time to give the average.
        test_size: same layout as ``size`` for the test inputs, or ``None`` to
            skip the test evaluation.
        adjuster: optional object whose ``get_adjusted_fitness(evaluation)``
            result is stored in ``evaluation.fitness_adj``.
        **kwargs: forwarded to the network builder, the training evaluator and
            ``create_genome_evaluation``.

        Returns the evaluation object created by
        ``Evaluator.create_genome_evaluation``.
        """
        build_time, net = util.time(
            lambda: util.build_network(genome, **kwargs), as_microseconds=True)

        evaluator = fitfunc.get_evaluator()

        pred_time, predictions = util.time(
            lambda: Evaluator.predict_np(net, Evaluator._inputs, size[0],
                                         size[1]),
            as_microseconds=True)
        pred_avg_time = pred_time / size[0]
        fit_time, fitness = util.time(
            lambda: evaluator(Evaluator._targets, predictions, size[0],
                              **kwargs),
            as_microseconds=True)
        if isinstance(fitness, tuple):
            # Unpack both parts from the tuple in one step; reading the extra
            # data after rebinding ``fitness`` would index the score itself.
            fitness, extra = fitness[0], fitness[1]
        else:
            extra = {}

        if test_size is not None:
            predictions_test = Evaluator.predict_np(
                net, Evaluator._test_inputs, test_size[0], test_size[1])
            # NOTE(review): unlike the training call, **kwargs is not
            # forwarded and a tuple result is passed through as-is.
            fitness_test = evaluator(Evaluator._test_targets,
                                     predictions_test, test_size[0])
        else:
            fitness_test = None

        evaluation = Evaluator.create_genome_evaluation(
            genome,
            fitness,
            net=net,
            fitness_test=fitness_test,
            build_time=build_time,
            pred_time=pred_time,
            pred_avg_time=pred_avg_time,
            fit_time=fit_time,
            extra=extra,
            **kwargs)
        if adjuster is not None:
            evaluation.fitness_adj = adjuster.get_adjusted_fitness(evaluation)

        return evaluation
Example #4
0
def sepolicy_common_info():
    """Collect the common report fields into a dict.

    Returns a dict with the keys ``date``, ``platform``, ``python``,
    ``policy`` and ``status``.
    """
    # sestatus() is invoked once up front with its result discarded, then
    # again for the "status" entry.
    sestatus()

    info = {}
    info["date"] = util.time()
    info["platform"] = util.platform_info()
    info["python"] = util.python_info()
    info["policy"] = sepolicy.get_installed_policy()
    info["status"] = sestatus()
    return info
Example #5
0
    def handle(self, *args, **kwargs):
        """Run the monitor loop forever, handing locked checks to the pool.

        Each iteration sleeps for a randomized interval around
        ``config['sleep_interval']``, periodically releases stale locks and
        expires old ``recent_failures`` entries, then locks a batch of checks
        under a fresh uid and queues every locked row on ``pool``.

        ``*args`` / ``**kwargs`` are accepted but not used. This method does
        not return.
        """
        while True:
            safe_print("[monitor] sleeping")
            time.sleep(random.uniform(config['sleep_interval'] * 2 / 3, config['sleep_interval'] * 4 / 3))

            if util.time() - self.last_cleanup >= 60000:
                self.last_cleanup = util.time()
                # Release locks that have been held for 60 seconds or more.
                # This goes through database.query like every other statement
                # here; the previous Check.objects.raw(...) call looks like
                # Django's Manager.raw(), which only builds a lazy queryset
                # and therefore never executed the UPDATE.
                database.query("UPDATE checks SET `lock` = '' WHERE TIMESTAMPDIFF(SECOND, last_locked, NOW()) >= 60", ())

                with recent_failures_lock:
                    # Iterate over a copy so entries can be removed safely.
                    for check_tuple in set(recent_failures):
                        if util.time() - check_tuple[1] >= 60000:
                            recent_failures.remove(check_tuple)

            uid = util.uid(16)
            free_threads = pool.free()

            update_query = "UPDATE checks SET `lock` = %s, last_locked = NOW() WHERE `lock` = '' AND (TIMESTAMPDIFF(SECOND, last_checked, NOW()) >= check_interval OR confirmations > 0)"
            params = [uid]

            with recent_failures_lock:
                if recent_failures:
                    # One placeholder per recently failed check id; the ids
                    # themselves are passed as query parameters.
                    placeholders = ', '.join(['%s'] * len(recent_failures))
                    update_query += " AND id NOT IN (" + placeholders + ")"
                    params.extend(check_tuple[0] for check_tuple in recent_failures)

            update_query += " ORDER BY RAND() LIMIT %d" % (free_threads)
            safe_print("[monitor] fetching up to %d...", (free_threads))
            database.query(update_query, params)

            # Only rows locked with this iteration's uid are picked up.
            result = database.query("SELECT id, name, type, data, status, confirmations FROM checks WHERE `lock` = %s", (uid,))
            safe_print("[monitor] fetched %d checks", (result.rowcount))

            for row in result.fetchall():
                pool.queue(row['id'], row['name'], row['type'], row['data'],
                           row['status'], row['confirmations'], uid)
Example #6
0
def _significance(df, pred, samples=20):
    """Compare ``pred`` against the reference predictions on resamples.

    df: DataFrame with a ``label`` column, a ``ref_pred`` column and the
        column named by ``pred``.
    pred: name of the prediction column to compare with ``ref_pred``.
    samples: number of resamples; each draws half of ``df`` with replacement.

    For every resample the AUPR and AUROC of both columns are computed; the
    per-sample differences (reference minus ``pred``) are tested against zero
    with ``ttest_rel``.

    Returns ``(aupr_pval, auroc_pval)``.
    """
    ref_auprs, pred_auprs = [], []
    ref_aurocs, pred_aurocs = [], []
    lc, rc = 'label', 'ref_pred'

    t1 = ut.out('computing aupr and auroc significance levels...')

    for _ in range(samples):
        s_df = df.sample(frac=0.5, replace=True)
        ref_auprs.append(average_precision_score(s_df[lc], s_df[rc]))
        ref_aurocs.append(roc_auc_score(s_df[lc], s_df[rc]))
        pred_auprs.append(average_precision_score(s_df[lc], s_df[pred]))
        pred_aurocs.append(roc_auc_score(s_df[lc], s_df[pred]))

    auprs = np.subtract(ref_auprs, pred_auprs)
    aurocs = np.subtract(ref_aurocs, pred_aurocs)
    zeros = np.zeros(len(auprs))
    # The test statistics are discarded; they must not overwrite ``t1``,
    # which holds the start marker handed to ut.time below.
    _, aupr_pval = ttest_rel(auprs, zeros)
    _, auroc_pval = ttest_rel(aurocs, zeros)
    ut.time(t1)

    return aupr_pval, auroc_pval
Example #7
0
def _approximations(df, relations=None):
    """Add per-subgraph mean, median and max of ``ind_pred`` to a copy of df.

    df: DataFrame with ``com_id`` and ``ind_pred`` columns; it is copied, not
        modified.
    relations: forwarded to ``Connections.find_subgraphs``; ``None`` is
        treated as an empty list.

    Rows that belong to a subgraph with at least one edge get that subgraph's
    index as ``sg_id``; all other rows get ``sg_id`` -1 and keep their own
    ``ind_pred`` in the three new columns ``sg_mean_pred``,
    ``sg_median_pred`` and ``sg_max_pred``.

    Returns the augmented DataFrame.
    NOTE(review): when no subgraph has edges an empty dict is returned
    instead of a DataFrame; confirm that callers handle this.
    """
    t1 = ut.out('approximating relational with mean, max, median...')
    relations = [] if relations is None else relations
    df = df.copy()

    con_obj = Connections()

    _, sgs = con_obj.find_subgraphs(df, relations, verbose=False)
    approx_dict = {}

    # (com_id, sg_id) pairs for every member of a subgraph with edges.
    sg_list = [(x, i) for i, sg in enumerate(sgs) if sg[3] > 0
               for x in sg[0]]

    if not sg_list:
        ut.time(t1)
        return approx_dict

    sg_df = pd.DataFrame(sg_list, columns=['com_id', 'sg_id'])
    df = df.merge(sg_df, how='left')
    df['sg_id'] = df['sg_id'].fillna(-1).apply(int)

    sg_mean = df.groupby('sg_id')['ind_pred'].mean().reset_index()\
        .rename(columns={'ind_pred': 'sg_mean_pred'})
    sg_median = df.groupby('sg_id')['ind_pred'].median().reset_index()\
        .rename(columns={'ind_pred': 'sg_median_pred'})
    sg_max = df.groupby('sg_id')['ind_pred'].max().reset_index()\
        .rename(columns={'ind_pred': 'sg_max_pred'})
    df = df.merge(sg_mean).merge(sg_median).merge(sg_max)

    # Rows outside any subgraph (sg_id == -1) fall back to their own
    # prediction rather than the aggregate of the catch-all -1 group.
    in_subgraph = df['sg_id'] != -1
    for col in ['sg_mean_pred', 'sg_median_pred', 'sg_max_pred']:
        df[col] = df[col].where(in_subgraph, df['ind_pred'])

    ut.time(t1)

    return df
Example #8
0
	def worker(self):
		"""Worker thread loop: run queued checks and record their outcome.

		Each item taken from ``self.q`` is an 8-tuple ``(check_id,
		check_name, check_type, check_data, status, max_confirmations,
		confirmations, lock_uid)``. The check is run through
		``checks.run_check``; depending on the result and the stored status
		the row in ``checks`` is either flipped via ``self.handle_change``,
		has its confirmation counter increased, or has it reset. Every
		UPDATE also clears the lock and is restricted to rows still locked
		with ``lock_uid``.

		This method does not return.
		"""
		thread_name = threading.currentThread().getName()

		while True:
			check_id, check_name, check_type, check_data, status, max_confirmations, confirmations, lock_uid = self.q.get()

			safe_print("[%s] processing check %d: calling checks.%s", (thread_name, check_id, check_type))
			check_result = checks.run_check(check_type, util.decode(check_data), check_id)

			safe_print("[%s] check %d result: %s", (thread_name, check_id, str(check_result)))

			if not isinstance(check_result, dict) or 'status' not in check_result:
				util.die("[%s] bad check handler [%s]: returned non-dict or missing status" % (thread_name, check_type))
			elif 'message' not in check_result:
				# Supply a default message when the handler gave none.
				if check_result['status'] == 'fail':
					check_result['message'] = "Check offline: %s" % (check_name)
				else:
					check_result['message'] = "Check online: %s" % (check_name)

			if check_result['status'] == 'fail':
				safe_print("[%s] ... got failure!", (thread_name))

				if status == 'online':
					with recent_failures_lock:
						recent_failures.add((check_id, util.time()))

					if confirmations + 1 >= max_confirmations:
						# target has failed
						self.handle_change(thread_name, check_id, check_name, lock_uid, 'offline', check_result)
					else:
						# increase confirmations
						database.query("UPDATE checks SET confirmations = confirmations + 1, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid))
				else:
					database.query("UPDATE checks SET confirmations = 0, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid))
			elif check_result['status'] == 'success':
				safe_print("[%s] ... got success", (thread_name))

				if status == 'offline':
					if confirmations + 1 >= max_confirmations:
						# target has come back online
						self.handle_change(thread_name, check_id, check_name, lock_uid, 'online', check_result)
					else:
						# increase confirmations
						database.query("UPDATE checks SET confirmations = confirmations + 1, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid))
				else:
					database.query("UPDATE checks SET confirmations = 0, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid))
			else:
				# Format the message before handing it to util.die; applying
				# % to die()'s return value never filled in the placeholders.
				util.die("Check handler [%s] returned invalid status code [%s]!" % (check_type, check_result['status']))
Example #9
0
    def worker(self):
        """Worker thread loop: run queued checks and record their outcome.

        Each item taken from ``self.q`` is an 8-tuple ``(check_id,
        check_name, check_type, check_data, status, max_confirmations,
        confirmations, lock_uid)``. The check is run through
        ``checks.run_check``; depending on the result and the stored status
        the row in ``checks`` is either flipped via ``self.handle_change``,
        has its confirmation counter increased, or has it reset. Every
        UPDATE also clears the lock and is restricted to rows still locked
        with ``lock_uid``.

        This method does not return.
        """
        thread_name = threading.currentThread().getName()

        while True:
            (check_id, check_name, check_type, check_data, status,
             max_confirmations, confirmations, lock_uid) = self.q.get()

            safe_print("[%s] processing check %d: calling checks.%s",
                       (thread_name, check_id, check_type))
            check_result = checks.run_check(check_type,
                                            util.decode(check_data), check_id)

            safe_print("[%s] check %d result: %s",
                       (thread_name, check_id, str(check_result)))

            if not isinstance(check_result,
                              dict) or 'status' not in check_result:
                util.die(
                    "[%s] bad check handler [%s]: returned non-dict or missing status"
                    % (thread_name, check_type))
            elif 'message' not in check_result:
                # Supply a default message when the handler gave none.
                if check_result['status'] == 'fail':
                    check_result['message'] = "Check offline: %s" % (
                        check_name)
                else:
                    check_result['message'] = "Check online: %s" % (check_name)

            if check_result['status'] == 'fail':
                safe_print("[%s] ... got failure!", (thread_name))

                if status == 'online':
                    with recent_failures_lock:
                        recent_failures.add((check_id, util.time()))

                    if confirmations + 1 >= max_confirmations:
                        # target has failed
                        self.handle_change(thread_name, check_id, check_name,
                                           lock_uid, 'offline', check_result)
                    else:
                        # increase confirmations
                        database.query(
                            "UPDATE checks SET confirmations = confirmations + 1, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s",
                            (check_id, lock_uid))
                else:
                    database.query(
                        "UPDATE checks SET confirmations = 0, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s",
                        (check_id, lock_uid))
            elif check_result['status'] == 'success':
                safe_print("[%s] ... got success", (thread_name))

                if status == 'offline':
                    if confirmations + 1 >= max_confirmations:
                        # target has come back online
                        self.handle_change(thread_name, check_id, check_name,
                                           lock_uid, 'online', check_result)
                    else:
                        # increase confirmations
                        database.query(
                            "UPDATE checks SET confirmations = confirmations + 1, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s",
                            (check_id, lock_uid))
                else:
                    database.query(
                        "UPDATE checks SET confirmations = 0, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s",
                        (check_id, lock_uid))
            else:
                # Format the message before handing it to util.die; applying
                # % to die()'s return value never filled in the placeholders.
                util.die(
                    "Check handler [%s] returned invalid status code [%s]!"
                    % (check_type, check_result['status']))
Example #10
0
						# increase confirmations
						database.query("UPDATE checks SET confirmations = confirmations + 1, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid))
				else:
					database.query("UPDATE checks SET confirmations = 0, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid))
			else:
				util.die("Check handler [%s] returned invalid status code [%s]!") % (check_type, check_result['status'])

# Create and start the pool that the monitor loop below asks for free threads.
pool = MonitorPool()
pool.start()
# util.time() value of the last cleanup pass; starting at 0 presumably
# triggers a cleanup on the first iteration.
last_cleanup = 0

# Monitor loop (this view of the loop is truncated after the query setup).
while True:
	safe_print("[monitor] sleeping")
	# Sleep between 2/3 and 4/3 of the configured interval.
	time.sleep(random.uniform(config['sleep_interval'] * 2 / 3, config['sleep_interval'] * 4 / 3))

	# Periodic cleanup; 60000 is presumably milliseconds (the SQL below uses
	# 60 seconds) -- TODO confirm the unit of util.time().
	if util.time() - last_cleanup >= 60000:
		last_cleanup = util.time()
		# Release locks that have been held for 60 seconds or more.
		database.query("UPDATE checks SET `lock` = '' WHERE TIMESTAMPDIFF(SECOND, last_locked, NOW()) >= 60")

		with recent_failures_lock:
			# Iterate over a copy so entries can be removed from the set.
			for check_tuple in set(recent_failures):
				# check_tuple[1] is compared against util.time(), i.e. it is
				# the time at which the failure was recorded.
				if util.time() - check_tuple[1] >= 60000:
					recent_failures.remove(check_tuple)

	# Fresh lock id for the batch claimed in this iteration.
	uid = util.uid(16)
	free_threads = pool.free()

	# Claim unlocked checks whose interval has elapsed or that have pending
	# confirmations; uid is bound to the first %s placeholder.
	update_query = "UPDATE checks SET `lock` = %s, last_locked = NOW() WHERE `lock` = '' AND (TIMESTAMPDIFF(SECOND, last_checked, NOW()) >= check_interval OR confirmations > 0)"
	params = [uid]

	with recent_failures_lock:
Example #11
0
def single_relational(in_dir='',
                      out_dir='',
                      gids=['text_gid'],
                      pts=100000,
                      start=0,
                      dom=''):
    """Compute and plot per-group-size label statistics for each relation.

    in_dir: directory prefix holding ``comments.csv``.
    out_dir: directory prefix for the ``sg_<gid>.pdf`` and ``sg_<gid>.csv``
        outputs.
    gids: group-id column names; each is generated with
        ``Generator.gen_group_id`` and analysed separately. The list is only
        read, never modified.
    pts: maximum number of rows to read (``nrows``); afterwards rebound to
        the number of rows actually read.
    start: rows ``1 .. start - 1`` of the file are skipped.
    dom: domain name used in the figure title.

    Rows whose group id is -1 are treated as the "single node" group. When
    more than one gid is given, the overlap ratio between the relations is
    reported at the end. Returns ``None``.
    """
    gen = Generator()

    ut.out('gids: %s' % str(gids), 0)

    t1 = ut.out('reading data...')
    df = pd.read_csv(in_dir + 'comments.csv',
                     skiprows=range(1, start),
                     nrows=pts)
    pts = len(df)
    ut.time(t1)

    # basic statistics
    p_spam = df.label.sum() / len(df)
    p_ham = 1 - p_spam
    ut.out('spam pct: %.2f' % p_spam)

    for gid in gids:
        t1 = ut.out('generating %s...' % gid)
        df = gen.gen_group_id(df, gid)
        ut.time(t1)

    for gid in gids:
        t1 = ut.out('grouping by %s...' % gid)
        g1 = df.groupby(gid)
        ut.time(t1)

        t1 = ut.out('computing stats per group...')
        size = g1.size().reset_index().rename(columns={0: 'size'})
        sum_label = g1['label'].sum().reset_index()\
            .rename(columns={'label': 'sum_label'})
        mean_label = g1['label'].mean().reset_index()\
            .rename(columns={'label': 'mean_label'})
        gf = size.merge(sum_label).merge(mean_label)

        # NOTE(review): assumes a -1 group exists; values[0] raises
        # IndexError otherwise.
        single_cnt = gf[gf[gid] == -1]['size'].values[0]

        # A group has the "same label" when its mean label is exactly 0 or 1.
        same_label = lambda x: 1 if x['mean_label'] in [1.0, 0.0] else 0
        gf['same_label'] = gf.apply(same_label, axis=1)
        gfs = gf[(gf['same_label'] == 1) | (gf[gid] == -1)]
        gfo = gf[(gf['same_label'] == 0) & (gf[gid] != -1)]
        ut.time(t1)

        t1 = ut.out('computing stats per size...')
        sf = compute_stats_per_size(gf,
                                    p_spam,
                                    p_ham,
                                    label='both',
                                    total_pts=pts,
                                    single_cnt=single_cnt)
        sfs = compute_stats_per_size(gfs,
                                     p_spam,
                                     p_ham,
                                     label='spam',
                                     single_cnt=single_cnt)
        sfo = compute_stats_per_size(gfo, p_spam, p_ham, label='ham')

        # compute single node row for the same-label stats
        extract = ['size', 'mean_label']
        vs = gfs[gfs[gid] == -1][extract].values[0]
        row = [(1, single_cnt, single_cnt, vs[1], p_spam)]
        one_line = pd.DataFrame(row, columns=list(sfs))
        sfs = pd.concat([one_line, sfs])
        sfs = sfs.rename(columns={'mean_label': 'mean_lbl_sme_lbl'})
        sfo = sfo.rename(columns={'mean_label': 'mean_lbl_not_sme_lbl'})

        # compute single node row for the overall stats
        v = gf[gf[gid] == -1][['size', 'mean_label']].values[0]
        row = [(1, v[0], v[0], v[1], v[0] / pts, 1, 1)]
        cols = list(sf)
        one_line = pd.DataFrame(row, columns=cols)
        sf = pd.concat([one_line, sf])

        # keep top X% of affected nodes
        pct = 100
        total = sf.cnt.sum()
        for i in range(1, len(sf)):
            if sf[:i].cnt.sum() / total >= pct / float(100):
                sf = sf[:i]
                break
        ut.time(t1)

        t1 = ut.out('plotting...')
        cols = [
            'cnt_rto', 'same_label_rto', 'mean_lbl_sme_lbl',
            'mean_lbl_not_sme_lbl'
        ]

        subtitle_list = ['(a)', '(b)', '(c)', '(d)']
        xlabel_list = [
            '% all messages', '% groups containing same label',
            'mean label over same label groups',
            'mean label over mixed label groups'
        ]
        subtitles = dict(list(zip(cols, subtitle_list)))
        xlabels = dict(list(zip(cols, xlabel_list)))
        fontsize = 24

        fig, axs = plt.subplots(1, 4, figsize=(27, 7))
        axs = axs.flatten()
        for i, col in enumerate(cols):
            # Each panel reads from the frame that carries its column.
            if col == 'mean_lbl_sme_lbl':
                dummy_df = sfs
            elif col == 'mean_lbl_not_sme_lbl':
                dummy_df = sfo
            else:
                dummy_df = sf

            if len(dummy_df) > 0:
                gf = do_log_scale_binning(dummy_df)
                gf.plot.barh('size',
                             col,
                             ax=axs[i],
                             title=subtitles[col],
                             legend=False,
                             fontsize=fontsize)
                if col == 'same_label_rto':
                    gf.plot.barh('size',
                                 'e_sme_lbl_rto',
                                 ax=axs[i],
                                 title=subtitles[col],
                                 legend=False,
                                 fontsize=fontsize,
                                 alpha=0.5,
                                 color='red',
                                 hatch='/')
                elif col in ['mean_lbl_sme_lbl', 'mean_lbl_not_sme_lbl']:
                    axs[i].axvline(p_spam, color='k', linestyle='--')

                axs[i].set_ylabel('group size', fontsize=fontsize)
                axs[i].set_xlabel(xlabels[col], fontsize=fontsize)
                axs[i].set_title(subtitles[col], fontsize=fontsize - 2)

                # Thin out the x ticks when their labels are long or many.
                xt = axs[i].get_xticks()
                tl = len(str(xt[1]))
                if (tl >= 4) or (tl == 3 and len(xt) >= 9):
                    axs[i].set_xticks(axs[i].get_xticks()[::2])

        rel = gid.replace('_gid', '')
        title = '%s: spam: %.2f%%, relation: %s' % (dom, p_spam * 100, rel)
        fig.tight_layout()
        fig.suptitle(title, y=1.08, fontsize=fontsize)
        fig.savefig(out_dir + 'sg_%s.pdf' % str(gid),
                    format='pdf',
                    bbox_inches='tight')
        plt.close('all')
        ut.time(t1)

        sf.to_csv(out_dir + 'sg_%s.csv' % str(gid), index=None)

    spam_rto = df.label.sum() / len(df)
    ut.out('spam ratio: %.2f' % spam_rto)

    if len(gids) > 1:
        rel_nodes = 0
        g = df.groupby(gids).size().reset_index().rename(columns={0: 'size'})
        for gid in gids:
            g = g[g[gid] != -1]
            rel_nodes += len(df[df[gid] != -1])

        # Index the 'size' column explicitly: attribute access (g.size)
        # resolves to DataFrame.size, the element count of the frame.
        overlap_rto = g['size'].sum() / rel_nodes
        ut.out('overlap ratio: %.2f' % overlap_rto)
    ut.out()
Example #12
0
                util.die(
                    "Check handler [%s] returned invalid status code [%s]!"
                ) % (check_type, check_result['status'])


# Create and start the pool that the monitor loop below asks for free threads.
pool = MonitorPool()
pool.start()
# util.time() value of the last cleanup pass; starting at 0 presumably
# triggers a cleanup on the first iteration.
last_cleanup = 0

# Monitor loop (this view of the loop is truncated after the query setup).
while True:
    safe_print("[monitor] sleeping")
    # Sleep between 2/3 and 4/3 of the configured interval.
    time.sleep(
        random.uniform(config['sleep_interval'] * 2 / 3,
                       config['sleep_interval'] * 4 / 3))

    # Periodic cleanup; 60000 is presumably milliseconds (the SQL below uses
    # 60 seconds) -- TODO confirm the unit of util.time().
    if util.time() - last_cleanup >= 60000:
        last_cleanup = util.time()
        # Release locks that have been held for 60 seconds or more.
        database.query(
            "UPDATE checks SET `lock` = '' WHERE TIMESTAMPDIFF(SECOND, last_locked, NOW()) >= 60"
        )

        with recent_failures_lock:
            # Iterate over a copy so entries can be removed from the set.
            for check_tuple in set(recent_failures):
                # check_tuple[1] is compared against util.time(), i.e. it is
                # the time at which the failure was recorded.
                if util.time() - check_tuple[1] >= 60000:
                    recent_failures.remove(check_tuple)

    # Fresh lock id for the batch claimed in this iteration.
    uid = util.uid(16)
    free_threads = pool.free()

    # Claim unlocked checks whose interval has elapsed or that have pending
    # confirmations; uid is bound to the first %s placeholder.
    update_query = "UPDATE checks SET `lock` = %s, last_locked = NOW() WHERE `lock` = '' AND (TIMESTAMPDIFF(SECOND, last_checked, NOW()) >= check_interval OR confirmations > 0)"
    params = [uid]
Example #13
0
#!/usr/bin/env python3

import argparse
import importlib
import logging
import util

if __name__ == "__main__":
    # Executes a particular function many times, collecting timings and
    # showing metrics when finished.
    cli = argparse.ArgumentParser()
    cli.add_argument("name")
    cli.add_argument("runs", type=int)
    cli.add_argument("--func")
    cli.add_argument("--verbose", action="store_true")
    # Arguments the parser does not recognise are kept and passed on to
    # util.time as extra positional arguments.
    known, unknown = cli.parse_known_args()

    if known.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)
    log = logging.getLogger("time")
    log.debug(f"Known args are {known} and unknown args are {unknown}")

    # The implementation under test lives in the module impl.<name>.
    target_module = importlib.import_module(f"impl.{known.name}")

    results = util.time(target_module, known.name, known.runs, *unknown,
                        func=known.func)
    for entry in results:
        print(entry)
Example #14
0
def compute_big_aupr(start_fold=0, ref_start_fold=-1, num_folds=5,
                     domain='twitter', models=None, in_dir='', gids=None):
    """Evaluate model predictions pooled over several test folds.

    start_fold, num_folds: folds ``start_fold .. start_fold + num_folds - 1``
        are read.
    ref_start_fold: first fold of the reference predictions; -1 (default)
        disables the reference comparison.
    domain: selects ``independent/data/<domain>/comments.csv`` for the true
        labels and is passed to the plotting helpers.
    models: model names to evaluate (default ``['ind']``). ``'approx'`` is
        expanded to ``'mean'``, ``'median'`` and ``'max'``. The caller's
        list is not modified.
    in_dir: directory prefix of the per-fold prediction files.
    gids: group-id column names generated before the relational analysis
        (default: none).

    Per-fold statistics are written to ``tw_full_0stk.csv``; PR and ROC
    curves are plotted through ``ut.plot_pr_curve`` / ``ut.plot_roc_curve``.
    Returns ``None``.

    NOTE(review): the consistency asserts for the reference, mrf and psl
    predictions compare against ``ind_df``, so ``'ind'`` appears to be
    required whenever any of those is used.
    """
    # Private copies: the model list is edited below and must not leak back
    # into the caller's list.
    models = ['ind'] if models is None else list(models)
    gids = [] if gids is None else list(gids)

    ind_data_dir = 'independent/data/' + domain + '/'

    lines = {'ind': 'b-', 'mrf': 'g--', 'psl': 'm-.', 'mean': 'r:',
             'median': 'c:', 'max': 'y:'}
    inds, mrfs, psls, approxs, refs = [], [], [], [], []

    gen_obj = Generator()
    relations = _relations_for_gids(gids)

    # Expand 'approx' first, then derive the prediction columns from the
    # final model list so every model is paired with its own column.
    has_approx = 'approx' in models
    if has_approx:
        models.remove('approx')
        models.extend(['mean', 'median', 'max'])
    preds = [(model, model + '_pred') for model in models]

    t1 = ut.out('reading true labels...', 0)
    full_df = pd.read_csv(ind_data_dir + 'comments.csv')
    lbl_df = full_df[['com_id', 'label']]
    ut.time(t1)

    s = '%s: reading model preds from fold %d to %d:'
    ut.out(s % (domain, start_fold, start_fold + num_folds - 1), 1)

    newline = 1 if has_approx else 0

    d = {}
    for i, fold in enumerate(range(start_fold, start_fold + num_folds)):
        ut.out('\nreading preds for fold %d...' % i, newline)
        f_dict = {}

        if ref_start_fold > -1:
            ndx = ref_start_fold + i
            fname = in_dir + 'test_' + str(ndx) + '_preds.csv'
            assert os.path.exists(fname)
            refs.append(pd.read_csv(fname))

        if 'ind' in models:
            fname = in_dir + 'test_' + str(fold) + '_preds.csv'
            assert os.path.exists(fname)
            ind_df = pd.read_csv(fname)
            inds.append(ind_df)
            ind_lbl_df = full_df.merge(ind_df, on='com_id')
            t1 = ut.out('generating group ids...')
            for gid in gids:
                ind_lbl_df = gen_obj.gen_group_id(ind_lbl_df, gid)
            ut.time(t1)
            m_dict = _metrics(ind_lbl_df)
            a_dict = _analyze(ind_lbl_df, relations=relations, col='ind_pred')
            s_dict = _spread(ind_lbl_df, col='ind_pred', relations=relations)
            f_dict.update(a_dict)
            f_dict.update(s_dict)
            f_dict.update(m_dict)

            if 'mean' in models:
                temp_df = full_df.merge(ind_df)

                t1 = ut.out('generating group ids...')
                for gid in gids:
                    temp_df = gen_obj.gen_group_id(temp_df, gid)
                ut.time(t1)

                approx_df = _approximations(temp_df, relations)
                approxs.append(approx_df)

        if 'mrf' in models:
            fname = in_dir + 'mrf_preds_' + str(fold) + '.csv'
            assert os.path.exists(fname)
            mrf_df = pd.read_csv(fname)
            mrfs.append(mrf_df)
            mrf_lbl_df = lbl_df.merge(mrf_df)
            m_dict = _metrics(mrf_lbl_df, col='mrf_pred', model='mrf')
            f_dict.update(m_dict)

        if 'psl' in models:
            fname = in_dir + 'psl_preds_' + str(fold) + '.csv'
            assert os.path.exists(fname)
            psl_df = pd.read_csv(fname)
            psls.append(psl_df)
            psl_lbl_df = lbl_df.merge(psl_df)
            m_dict = _metrics(psl_lbl_df, col='psl_pred', model='psl')
            f_dict.update(m_dict)

        d[i] = f_dict
        print(d)

    dicts = [d[i] for i in range(len(d))]
    stats_df = pd.DataFrame(dicts)
    stats_df = stats_df.reset_index()\
                       .rename(columns={'index': 'test_set'})
    stats_df.to_csv('tw_full_0stk.csv', index=None)

    t1 = ut.out('concatenating test set predictions...')
    df = full_df[['com_id', 'label']]

    if 'ind' in models:
        ind_df = pd.concat(inds)
        df = df.merge(ind_df)

        if 'mean' in models:
            approx_df = pd.concat(approxs)
            assert set(ind_df['com_id']) == set(approx_df['com_id'])
            df = df.merge(approx_df)

    if ref_start_fold > -1:
        ref_df = pd.concat(refs)
        ref_df = full_df[['com_id', 'label']].merge(ref_df)
        ref_df = ref_df[['com_id', 'ind_pred']]
        ref_df = ref_df.rename(columns={'ind_pred': 'ref_pred'})
        assert set(ind_df['com_id']) == set(ref_df['com_id'])
        df = df.merge(ref_df)

    if 'mrf' in models:
        mrf_df = pd.concat(mrfs)
        assert set(ind_df['com_id']) == set(mrf_df['com_id'])
        df = df.merge(mrf_df)

    if 'psl' in models:
        psl_df = pd.concat(psls)
        assert set(ind_df['com_id']) == set(psl_df['com_id'])
        df = df.merge(psl_df)
    ut.time(t1)

    t1 = ut.out('applying noise to predictions...')
    noise = 0.000025
    # Add a small uniform perturbation and clamp the result to [0, 1].
    perturb = lambda x: max(0.0, min(1.0, x + ran.uniform(-noise, noise)))

    if 'ind' in models:
        df['ind_pred'] = df['ind_pred'].apply(perturb)

        if 'mean' in models:
            df['mean_pred'] = df['mean_pred'].apply(perturb)
            df['median_pred'] = df['median_pred'].apply(perturb)
            df['max_pred'] = df['max_pred'].apply(perturb)

    if 'mrf' in models:
        df['mrf_pred'] = df['mrf_pred'].apply(perturb)

    if 'psl' in models:
        df['psl_pred'] = df['psl_pred'].apply(perturb)
    ut.time(t1)

    # compute reference aupr and auroc; the 'ref_pred' column only exists
    # when reference predictions were loaded above.
    if ref_start_fold > -1:
        ref_label, ref_pred = df['label'], df['ref_pred']
        ref_aupr = average_precision_score(ref_label, ref_pred)
        ref_auroc = roc_auc_score(ref_label, ref_pred)
        ref_p, ref_r, ref_t = precision_recall_curve(ref_label, ref_pred)
        ref_fpr, ref_tpr, ref_t2 = roc_curve(ref_label, ref_pred)
        ut.out('%s aupr: %.4f, auroc: %.4f' % ('reference', ref_aupr,
                                               ref_auroc))

        ut.plot_pr_curve('ref', ref_p, ref_r, ref_aupr, domain=domain,
                         line='k-', show_legend=True)
        ut.plot_roc_curve('ref', ref_tpr, ref_fpr, ref_auroc, domain=domain,
                          line='k-', show_legend=True)

    # Significance testing is disabled; the p-values are reported as 0.
    auroc_pval, aupr_pval = 0, 0
    # compute combined test set curves
    for i, (model, pred) in enumerate(preds):
        aupr = average_precision_score(df['label'], df[pred])
        auroc = roc_auc_score(df['label'], df[pred])
        p, r, _ = precision_recall_curve(df['label'], df[pred])
        fpr, tpr, _ = roc_curve(df['label'], df[pred])
        t = (model, aupr, aupr_pval, auroc, auroc_pval)
        ut.out('%s aupr: %.4f (%.4f), auroc: %.4f (%.4f)' % t)

        # Only the last ROC plot call is asked to save.
        save = i == len(preds) - 1
        ut.plot_pr_curve(model, p, r, aupr, domain=domain,
                         line=lines[model], show_legend=True)
        ut.plot_roc_curve(model, tpr, fpr, auroc, save=save, domain=domain,
                          line=lines[model], show_legend=True)
    ut.out()
Example #15
0
def cosine_similarities(df, sim_thresh=0.8, in_col='text',
                        out_col='text_id', approx_datapoints=120000,
                        max_feats=None, k=5, max_id=0, out_dir='',
                        fname='sim.csv'):
    """Group similar strings by tf-idf cosine similarity and write the
    resulting ids to a csv file.

    The data is processed in chunks produced by `_split_data`. For every
    chunk, the distinct values of `in_col` are vectorized with `_tf_idf`,
    all pairwise cosine similarities are computed, and every string opens a
    new group holding its (at most) `k` most similar strings whose
    similarity is at least `sim_thresh`. Groups are then merged, turned into
    ids, and aggregated across chunks by the corresponding private helpers.
    After all chunks are processed the ids are pruned, converted into a
    dataframe and written to `fname` inside `out_dir`.

    df: dataframe containing the `in_col` column.
    sim_thresh: minimum cosine similarity for two strings to be matched.
    in_col: name of the column holding the strings to compare.
    out_col: forwarded to `_ids_to_dataframe`.
    approx_datapoints: forwarded to `_split_data`.
    max_feats: forwarded to `_tf_idf`.
    k: maximum number of matches kept per string.
    max_id: first group id to hand out; incremented once per string.
    out_dir: directory the csv is written to; created via `ut.makedirs`.
    fname: name of the csv file.

    Returns None; the result is only written to disk.
    """
    import os

    ut.makedirs(out_dir)

    group_id = max_id
    all_ids = defaultdict(set)
    dfs = _split_data(df, approx_datapoints=approx_datapoints, in_col=in_col)

    for n, chunk_df in enumerate(dfs):
        t1 = time.time()

        ut.out('\ncreating tf-idf matrix for chunk %d...' % (n + 1))
        groups = defaultdict(set)
        # one row per distinct string in this chunk
        g_df = chunk_df.groupby(in_col).size().reset_index()
        strings = list(g_df[in_col])
        m = _tf_idf(strings, analyzer=_ngrams, max_feats=max_feats)

        v, total = len(m.data), m.shape[0] * m.shape[1]
        ut.out('sparsity: (%d/%d) %.2f%%' % (v, total, 100 * (v / total)))

        ut.out('computing cosine similarities...')
        cos_sim = cosine_similarity(m, dense_output=False)

        ut.out('filtering out simiarities below threshold...')
        scm = cos_sim >= sim_thresh

        ut.out('putting matches into groups...')
        for ndx in range(len(strings)):
            # Slice the row once and index its stored values by column so
            # each match is an O(1) lookup instead of a list.index() scan.
            row = cos_sim[ndx]
            sim_of = dict(zip(row.indices, row.data))
            sims = [(x, sim_of[x]) for x in scm[ndx].indices]
            # stable sort: ties keep the order of scm[ndx].indices
            sims.sort(key=lambda pair: pair[1], reverse=True)
            groups[group_id].update(sim_ndx for sim_ndx, _ in sims[:k])
            group_id += 1

        ut.out('merging identical groups...')
        groups = _merge_identical_groups(groups)

        ut.out('assigning ids to items...')
        ids = _assign_ids_to_items(groups, strings)

        ut.out('aggregating identical keys...')
        all_ids = _aggregate_identical_keys(all_ids, ids)

        ut.out('chunk time: %.4fm' % ((time.time() - t1) / 60.0))

    t1 = time.time()
    ut.out('\nprune single items...')
    all_ids = _prune_single_items(all_ids, df, in_col)
    ut.time(t1)

    t1 = time.time()
    ut.out('prune redundant ids...')
    all_ids = _prune_redundant_ids(all_ids)
    ut.time(t1)

    t1 = time.time()
    ut.out('putting ids into a dataframe...')
    sim_df = _ids_to_dataframe(all_ids, df, in_col=in_col, out_col=out_col)
    ut.out('writing to csv...', 0)
    # join instead of concatenating so an out_dir without a trailing
    # separator still resolves to a file inside that directory
    sim_df.to_csv(os.path.join(out_dir, fname), index=False)
    ut.time(t1)
    ut.out()
Example #16
0
def _analyze(df, col, samples=100, relations=[]):
    gids = [r[2] for r in relations]

    if len(relations) == 0:
        return {}

    t1 = ut.out('computing messages missed most often...')

    p, r, ts = precision_recall_curve(df['label'], df[col])
    aupr = average_precision_score(df['label'], df[col])
    mp = 1.0 - aupr

    corrects = []
    step = int(len(ts) / 100) if len(ts) > 100 else 1
    for i in range(0, len(ts), step):
        t = ts[i]
        df['pred'] = np.where(df[col] > t, 1, 0)
        correct = df['pred'] == df['label']
        corrects.append(correct.apply(int))

    total_corrects = [sum(x) for x in zip(*corrects)]
    df['correct'] = total_corrects

    # extract bottom x% data
    df = df.sort_values('correct', ascending=False)
    ndx = len(df) - int(len(df) * mp)
    qf1, qf2 = df[ndx:], df[:ndx]
    # dfs = df[df['label'] == 1]
    qf1s = qf1[qf1['label'] == 1]  # low performers
    qf1o = qf1[qf1['label'] == 0]  # low performers
    qf2s = qf2[qf2['label'] == 1]  # high performers
    qf2o = qf2[qf2['label'] == 0]  # high performers
    ut.time(t1)

    # ut.out('spam in bot %.2f%%: %d' % (mp * 100, len(qf1s)))
    # ut.out('ham in bot %.2f%%: %d' % (mp * 100, len(qf1o)))

    t1 = ut.out('computing messages with a relation...')
    r1s, r1sf = _msgs_with_rel(qf1s, gids, mp, 'bot', 'spam')
    r1o, r1of = _msgs_with_rel(qf1o, gids, mp, 'bot', 'ham')
    r2s, r2sf = _msgs_with_rel(qf2s, gids, mp, 'top', 'spam')
    r2o, r2of = _msgs_with_rel(qf2o, gids, mp, 'top', 'ham')
    ut.time(t1)

    # ut.out()

    t1 = ut.out('computing messages with an outside relation...')
    rr1sof = _rm_in_sect(df, qf1s, qf2, gids, mp, r1s, 'bot', 'spam')
    rr1oof = _rm_in_sect(df, qf1o, qf2, gids, mp, r1o, 'bot', 'ham')
    rr2sof = _rm_in_sect(df, qf2s, qf1, gids, mp, r2s, 'top', 'spam')
    rr2oof = _rm_in_sect(df, qf2o, qf1, gids, mp, r2o, 'top', 'ham')
    # rr1sif = self._rm_in_sect(df, qf1s, qf1, gids, mp, r1s, 'bot', 'spam',
    #                           'inside')
    # rr1oif = self._rm_in_sect(df, qf1o, qf1, gids, mp, r1o, 'bot', 'ham',
    #                           'inside')

    sd = {}
    sd['bot_spam_rels'] = round(r1sf, 4)
    sd['bot_ham_rels'] = round(r1of, 4)
    sd['top_spam_rels'] = round(r2sf, 4)
    sd['top_ham_rels'] = round(r2of, 4)
    sd['bot_spam_rels_out'] = round(rr1sof, 4)
    sd['bot_ham_rels_out'] = round(rr1oof, 4)
    sd['top_spam_rels_out'] = round(rr2sof, 4)
    sd['top_ham_rels_out'] = round(rr2oof, 4)
    # sd['bot_spam_rels_in'] = rr1sif
    # sd['bot_ham_rels_in'] = rr1oif

    ut.time(t1)
    return sd
Example #17
0
def _spread(df, col='ind_pred', relations=None):
    """This'll give some post-hoc test-set analysis, when running this,
    keep track of the test sets that improved using relational modeling,
    then average those test set statistics together to compare to the test
    sets that did not improve.

    Rows are tagged with the id of the subgraph (with at least one edge)
    they belong to, or -1 if none. They are then scored by the number of
    sampled precision-recall thresholds at which thresholding `col`
    reproduces `label`, sorted by that score, and split into a bottom
    section (the last `int(len(df) * (1 - aupr))` rows) and a top section.
    For each section and label (1 reported as 'spam', 0 as 'ham'), the
    per-subgraph mean, std, median, min, max and spread (max - min) of
    'ind_pred' are averaged over the rows that have at least one relation
    id different from -1.

    df: dataframe with 'com_id', 'label', 'ind_pred', `col` and the
        relation id columns.
    col: prediction column used to rank rows.
    relations: sequence of relation tuples; the third element of each is
        used as the relation id column. None is treated as empty.

    Returns a dict of statistics rounded to 4 decimals, or an empty dict
    when no subgraph has any edges.
    """
    t1 = ut.out('computing subgraph statistics...')
    if relations is None:
        relations = []
    con_obj = Connections()

    gids = [r[2] for r in relations]
    _, sgs = con_obj.find_subgraphs(df, relations, verbose=False)
    spread_dict = {}

    sg_list = []
    for i, sg in enumerate(sgs):
        if sg[3] > 0:  # num edges > 0
            sg_list.extend([(x, i) for x in sg[0]])  # give sg_id

    if len(sg_list) == 0:
        return spread_dict

    sg_df = pd.DataFrame(sg_list, columns=['com_id', 'sg_id'])
    # merges on the columns the two frames share; the result is a new
    # frame, so the caller's dataframe is not modified below
    df = df.merge(sg_df, how='left')
    df['sg_id'] = df['sg_id'].fillna(-1).apply(int)

    _, _, ts = precision_recall_curve(df['label'], df[col])
    aupr = average_precision_score(df['label'], df[col])
    mp = 1.0 - aupr

    # Visit roughly 100 evenly spaced thresholds and count, per row, at how
    # many of them the thresholded prediction matches the label.
    step = len(ts) // 100 if len(ts) > 100 else 1
    labels = df['label'].values
    total_corrects = np.zeros(len(df), dtype=np.int64)
    for t in ts[::step]:
        pred = np.where(df[col] > t, 1, 0)
        df['pred'] = pred
        total_corrects += (pred == labels)
    df['correct'] = total_corrects

    # extract bottom x% data
    df = df.sort_values('correct', ascending=False)
    ndx = len(df) - int(len(df) * mp)
    qfs = df[df['label'] == 1]
    qfo = df[df['label'] == 0]
    qf1, qf2 = df[ndx:], df[:ndx]
    qf1s = qf1[qf1['label'] == 1]  # low performers
    qf1o = qf1[qf1['label'] == 0]  # low performers
    qf2s = qf2[qf2['label'] == 1]  # high performers
    qf2o = qf2[qf2['label'] == 0]  # high performers

    # NOTE(review): the statistics below always read 'ind_pred', even when
    # `col` names a different column -- confirm this is intended.
    spread_dict['spam_mean'] = round(qfs['ind_pred'].mean(), 4)
    spread_dict['spam_median'] = round(qfs['ind_pred'].median(), 4)
    spread_dict['ham_mean'] = round(qfo['ind_pred'].mean(), 4)
    spread_dict['ham_median'] = round(qfo['ind_pred'].median(), 4)

    stat_names = ['mean', 'std', 'median', 'min', 'max']
    for nm, temp_df in [('bot_spam', qf1s), ('bot_ham', qf1o),
                        ('top_spam', qf2s), ('top_ham', qf2o)]:
        # keep rows that take part in at least one relation
        wf = temp_df[(temp_df[gids] != -1).any(axis=1)]

        # all per-subgraph statistics in a single groupby pass
        sg_stats = wf.groupby('sg_id')['ind_pred'].agg(stat_names)
        sg_stats.columns = ['sg_' + s for s in stat_names]
        wf = wf.merge(sg_stats.reset_index(), on='sg_id')
        wf['sg_spread'] = wf['sg_max'] - wf['sg_min']

        # row-level average, so larger subgraphs weigh more
        for s in stat_names + ['spread']:
            stat_col = 'sg_' + s
            spread_dict[nm + '_' + stat_col] = round(np.mean(wf[stat_col]), 4)

    ut.time(t1)
    return spread_dict