def find_subgraphs(self, df, relations, max_size=40000, verbose=False):
    """Build the relational graph for ``df`` and split it into subgraphs.

    The graph comes from ``self.build_networkx_graph``; its connected
    components are handed to ``self._process_components``.

    Args:
        df: data passed through to ``build_networkx_graph``.
        relations: relations passed through to ``build_networkx_graph``.
        max_size: currently unused; only the disabled single-node stage
            (see the note below) consumed it.
        verbose: when true, progress/timing is reported through ``ut`` and
            the subgraph sizes are printed.

    Returns:
        Tuple ``(g, subgraphs)``: the networkx graph and the processed
        components.
    """
    if verbose:
        # The first message is informational only; its handle is replaced
        # immediately by the one for the graph-building stage.
        t1 = ut.out('finding subgraphs...')
        t1 = ut.out('building networkx graph...')

    g = self.build_networkx_graph(df, relations)
    components = list(nx.connected_components(g))

    if verbose:
        ut.time(t1)
        t1 = ut.out('processing connected components...')

    subgraphs = self._process_components(components, g)

    if verbose:
        ut.time(t1)

    # Disabled post-processing stages (kept out of the pipeline on purpose):
    #   * self._filter_redundant_subgraphs(subgraphs, df)
    #   * self._remove_single_edge_hubs(subgraphs, g)
    #   * subgraphs += self._single_node_subgraphs(subgraphs, df, max_size)

    if verbose:
        self._print_subgraphs_size(subgraphs)

    return g, subgraphs
def consolidate(self, subgraphs, max_size=40000, div=2):
    """Combine subgraphs into larger sets to reduce the total number of
    subgraphs to do inference over.

    Subgraphs are greedily merged, in order, into an accumulator until adding
    the next one would reach ``max_size``; the accumulator is then emitted
    and a new one is started from that subgraph.

    Args:
        subgraphs: iterable of ``(ids, hubs, rels, edges)`` tuples, where the
            first three are collections and ``edges`` is a count.
        max_size: size budget for one consolidated subgraph.
        div: divisor applied to the id counts when estimating size
            (``len(ids) / div`` plus the edge count).

    Returns:
        List of consolidated ``(ids, hubs, rels, edges)`` tuples. The sets in
        the result are fresh objects; the input subgraphs are never mutated.
    """
    t1 = ut.out('consolidating subgraphs...')

    sgs = []
    new_ids, new_hubs, new_rels, new_edges = set(), set(), set(), 0

    for ids, hubs, rels, edges in subgraphs:
        size = int(len(new_ids) / div) + int(len(ids) / div)
        size += new_edges + edges

        fits = size < max_size
        # A subgraph that exceeds the budget while the accumulator has no
        # edges is absorbed as-is (it can never fit anywhere else).
        too_big_alone = new_edges == 0 and size > max_size

        if fits or too_big_alone:
            new_ids.update(ids)
            new_hubs.update(hubs)
            new_rels.update(rels)
            new_edges += edges
        else:  # accumulator is full: emit it and start over
            if len(new_ids) > 0:  # never emit an empty subgraph
                sgs.append((new_ids, new_hubs, new_rels, new_edges))
            # Copy instead of aliasing: later ``update`` calls must not
            # modify the caller's sets.
            new_ids, new_hubs, new_rels = set(ids), set(hubs), set(rels)
            new_edges = edges

    if len(new_ids) > 0:
        sgs.append((new_ids, new_hubs, new_rels, new_edges))

    ut.time(t1)
    self._print_subgraphs_size(sgs)
    return sgs
def _evaluate(genome, fitfunc, size, test_size=None, adjuster=None, **kwargs):
    """Build the network for ``genome``, score it and package the result.

    Args:
        genome: genome passed to ``util.build_network``.
        fitfunc: object whose ``get_evaluator()`` returns the fitness callable.
        size: indexable; ``size[0]`` and ``size[1]`` are forwarded to
            ``Evaluator.predict_np`` and ``size[0]`` is also used to average
            the prediction time.
        test_size: same layout as ``size`` for the test data, or ``None`` to
            skip the test evaluation.
        adjuster: optional object providing ``get_adjusted_fitness``.
        **kwargs: forwarded to ``util.build_network``, the evaluator and
            ``Evaluator.create_genome_evaluation``.

    Returns:
        The object created by ``Evaluator.create_genome_evaluation``; its
        ``fitness_adj`` attribute is set when ``adjuster`` is given.
    """
    build_time, net = util.time(
        lambda: util.build_network(genome, **kwargs), as_microseconds=True)

    evaluator = fitfunc.get_evaluator()

    pred_time, predictions = util.time(
        lambda: Evaluator.predict_np(net, Evaluator._inputs, size[0], size[1]),
        as_microseconds=True)
    pred_avg_time = pred_time / size[0]

    fit_time, fitness = util.time(
        lambda: evaluator(Evaluator._targets, predictions, size[0], **kwargs),
        as_microseconds=True)

    # An evaluator may return ``(fitness, extra)``. Both parts have to be
    # taken from the tuple *before* ``fitness`` is rebound to the scalar.
    if isinstance(fitness, tuple):
        fitness, extra = fitness[0], fitness[1]
    else:
        extra = {}

    fitness_test = None
    if test_size is not None:
        predictions_test = Evaluator.predict_np(
            net, Evaluator._test_inputs, test_size[0], test_size[1])
        # NOTE(review): the test evaluation is called without **kwargs and a
        # tuple result is not unwrapped here - kept as in the original.
        fitness_test = evaluator(
            Evaluator._test_targets, predictions_test, test_size[0])

    evaluation = Evaluator.create_genome_evaluation(
        genome, fitness, net=net, fitness_test=fitness_test,
        build_time=build_time, pred_time=pred_time,
        pred_avg_time=pred_avg_time, fit_time=fit_time,
        extra=extra, **kwargs)

    if adjuster is not None:
        evaluation.fitness_adj = adjuster.get_adjusted_fitness(evaluation)

    return evaluation
def sepolicy_common_info():
    """Collect the common environment facts reported alongside a policy.

    Returns:
        dict with the keys ``date``, ``platform``, ``python``, ``policy`` and
        ``status``.
    """
    # The result of this first call is discarded; the call itself is kept.
    sestatus()

    info = {}
    info["date"] = util.time()
    info["platform"] = util.platform_info()
    info["python"] = util.python_info()
    info["policy"] = sepolicy.get_installed_policy()
    info["status"] = sestatus()
    return info
def handle(self, *args, **kwargs):
    """Run the monitor loop forever: lock due checks and queue them.

    Each iteration sleeps for a randomised interval around
    ``config['sleep_interval']``, periodically releases stale locks and
    forgets old failures, then locks up to ``pool.free()`` due checks under a
    fresh uid and hands them to the worker pool.

    Args:
        *args: unused.
        **kwargs: unused.
    """
    while True:
        safe_print("[monitor] sleeping")
        interval = config['sleep_interval']
        time.sleep(random.uniform(interval * 2 / 3, interval * 4 / 3))

        # Housekeeping once the 60000-unit window of util.time() has elapsed.
        if util.time() - self.last_cleanup >= 60000:
            self.last_cleanup = util.time()
            # Must actually execute: a lazy ORM ``raw()`` queryset that is
            # never iterated would not run this UPDATE, so use the same
            # ``database.query`` helper as the rest of the loop.
            database.query(
                "UPDATE checks SET `lock` = '' WHERE TIMESTAMPDIFF(SECOND, last_locked, NOW()) >= 60")

            with recent_failures_lock:
                # Iterate over a copy because entries are removed in the loop.
                for check_tuple in set(recent_failures):
                    if util.time() - check_tuple[1] >= 60000:
                        recent_failures.remove(check_tuple)

        uid = util.uid(16)
        free_threads = pool.free()

        update_query = "UPDATE checks SET `lock` = %s, last_locked = NOW() WHERE `lock` = '' AND (TIMESTAMPDIFF(SECOND, last_checked, NOW()) >= check_interval OR confirmations > 0)"
        params = [uid]

        with recent_failures_lock:
            if recent_failures:
                # Skip checks that failed recently; ids stay parameterised.
                placeholders = ', '.join(['%s'] * len(recent_failures))
                update_query += " AND id NOT IN (" + placeholders + ")"
                params.extend([check_tuple[0] for check_tuple in recent_failures])

        update_query += " ORDER BY RAND() LIMIT %d" % (free_threads)

        safe_print("[monitor] fetching up to %d...", (free_threads))
        database.query(update_query, params)
        result = database.query(
            "SELECT id, name, type, data, status, confirmations FROM checks WHERE `lock` = %s",
            (uid,))
        safe_print("[monitor] fetched %d checks", (result.rowcount))

        for row in result.fetchall():
            pool.queue(row['id'], row['name'], row['type'], row['data'],
                       row['status'], row['confirmations'], uid)
def _significance(df, pred, samples=20):
    """Estimate p-values for the AUPR/AUROC gap between reference and model.

    ``samples`` bootstrap resamples (half the rows, with replacement) are
    drawn from ``df``. For each, AUPR and AUROC are computed for the
    ``ref_pred`` column and for ``pred``; the per-sample differences are then
    compared against zero with a paired t-test.

    Args:
        df: DataFrame with the columns ``label``, ``ref_pred`` and ``pred``.
        pred: name of the prediction column to compare with ``ref_pred``.
        samples: number of bootstrap resamples.

    Returns:
        Tuple ``(aupr_pval, auroc_pval)``.
    """
    ref_auprs, pred_auprs = [], []
    ref_aurocs, pred_aurocs = [], []
    lc, rc = 'label', 'ref_pred'

    t1 = ut.out('computing aupr and auroc significance levels...')

    for _ in range(samples):
        s_df = df.sample(frac=0.5, replace=True)
        ref_auprs.append(average_precision_score(s_df[lc], s_df[rc]))
        ref_aurocs.append(roc_auc_score(s_df[lc], s_df[rc]))
        pred_auprs.append(average_precision_score(s_df[lc], s_df[pred]))
        pred_aurocs.append(roc_auc_score(s_df[lc], s_df[pred]))

    auprs = np.subtract(ref_auprs, pred_auprs)
    aurocs = np.subtract(ref_aurocs, pred_aurocs)
    zeros = np.zeros(len(auprs))

    # Only the p-values are needed. The t-statistics get throwaway names so
    # they cannot clobber the timer handle ``t1`` used just below.
    _aupr_stat, aupr_pval = ttest_rel(auprs, zeros)
    _auroc_stat, auroc_pval = ttest_rel(aurocs, zeros)

    ut.time(t1)
    return aupr_pval, auroc_pval
def _approximations(df, relations=None):
    """Approximate relational predictions by per-subgraph mean/median/max.

    Rows are assigned to the subgraph (with at least one edge) that contains
    their ``com_id``; each row then receives the mean, median and max of
    ``ind_pred`` over its subgraph. Rows outside any such subgraph keep their
    own ``ind_pred`` in all three columns.

    Args:
        df: DataFrame with at least ``com_id`` and ``ind_pred``; it is copied,
            not modified.
        relations: relations forwarded to ``Connections.find_subgraphs``;
            ``None`` is treated as an empty list.

    Returns:
        A copy of ``df`` with the extra columns ``sg_id``, ``sg_mean_pred``,
        ``sg_median_pred`` and ``sg_max_pred`` - or an empty dict when no
        subgraph has any edge (original contract, preserved).
    """
    relations = [] if relations is None else relations

    t1 = ut.out('approximating relational with mean, max, median...')
    df = df.copy()

    con_obj = Connections()
    _, sgs = con_obj.find_subgraphs(df, relations, verbose=False)

    # (com_id, subgraph index) for every member of a subgraph with edges.
    sg_list = []
    for i, sg in enumerate(sgs):
        if sg[3] > 0:  # num edges > 0
            sg_list.extend((x, i) for x in sg[0])

    if len(sg_list) == 0:
        return {}

    sg_df = pd.DataFrame(sg_list, columns=['com_id', 'sg_id'])
    df = df.merge(sg_df, how='left')
    df['sg_id'] = df['sg_id'].fillna(-1).apply(int)  # -1: not in a subgraph

    stats = df.groupby('sg_id')['ind_pred'].agg(['mean', 'median', 'max'])
    stats = stats.rename(columns={'mean': 'sg_mean_pred',
                                  'median': 'sg_median_pred',
                                  'max': 'sg_max_pred'}).reset_index()
    df = df.merge(stats, on='sg_id')

    # The "-1" bucket lumps unrelated rows together, so its aggregates are
    # meaningless: fall back to each row's own prediction there.
    in_subgraph = df['sg_id'] != -1
    for col in ['sg_mean_pred', 'sg_median_pred', 'sg_max_pred']:
        df[col] = df[col].where(in_subgraph, df['ind_pred'])

    ut.time(t1)
    return df
def worker(self):
    """Consume queued checks forever, run each one and record the outcome.

    For every item taken from ``self.q`` the check handler is executed via
    ``checks.run_check``. A result that contradicts the stored ``status``
    either bumps the confirmation counter or, once ``max_confirmations`` is
    reached, triggers ``self.handle_change``. A result that agrees with the
    stored status resets the counter. Every database update also releases
    the row lock identified by ``lock_uid``.
    """
    thread_name = threading.current_thread().name

    bump_sql = (
        "UPDATE checks SET confirmations = confirmations + 1, `lock` = '', "
        "last_checked = NOW() WHERE id = %s AND `lock` = %s")
    reset_sql = (
        "UPDATE checks SET confirmations = 0, `lock` = '', "
        "last_checked = NOW() WHERE id = %s AND `lock` = %s")

    while True:
        (check_id, check_name, check_type, check_data, status,
         max_confirmations, confirmations, lock_uid) = self.q.get()

        safe_print("[%s] processing check %d: calling checks.%s",
                   (thread_name, check_id, check_type))
        check_result = checks.run_check(
            check_type, util.decode(check_data), check_id)
        safe_print("[%s] check %d result: %s",
                   (thread_name, check_id, str(check_result)))

        if not isinstance(check_result, dict) or 'status' not in check_result:
            util.die("[%s] bad check handler [%s]: returned non-dict or missing status"
                     % (thread_name, check_type))
        elif 'message' not in check_result:
            # Supply a default message when the handler did not set one.
            if check_result['status'] == 'fail':
                check_result['message'] = "Check offline: %s" % (check_name)
            else:
                check_result['message'] = "Check online: %s" % (check_name)

        if check_result['status'] == 'fail':
            safe_print("[%s] ... got failure!", (thread_name))

            if status == 'online':
                with recent_failures_lock:
                    recent_failures.add((check_id, util.time()))

                if confirmations + 1 >= max_confirmations:
                    # target has failed
                    self.handle_change(thread_name, check_id, check_name,
                                       lock_uid, 'offline', check_result)
                else:
                    # increase confirmations
                    database.query(bump_sql, (check_id, lock_uid))
            else:
                database.query(reset_sql, (check_id, lock_uid))
        elif check_result['status'] == 'success':
            safe_print("[%s] ... got success", (thread_name))

            if status == 'offline':
                if confirmations + 1 >= max_confirmations:
                    # target has come back online
                    self.handle_change(thread_name, check_id, check_name,
                                       lock_uid, 'online', check_result)
                else:
                    # increase confirmations
                    database.query(bump_sql, (check_id, lock_uid))
            else:
                database.query(reset_sql, (check_id, lock_uid))
        else:
            # Format the message *before* handing it to util.die; applying
            # ``%`` to die()'s return value never produced the text.
            util.die("Check handler [%s] returned invalid status code [%s]!"
                     % (check_type, check_result['status']))
def worker(self):
    """Consume queued checks forever, run each one and record the outcome.

    For every item taken from ``self.q`` the check handler is executed via
    ``checks.run_check``. A result that contradicts the stored ``status``
    either bumps the confirmation counter or, once ``max_confirmations`` is
    reached, triggers ``self.handle_change``. A result that agrees with the
    stored status resets the counter. Every database update also releases
    the row lock identified by ``lock_uid``.
    """
    thread_name = threading.current_thread().name

    bump_sql = (
        "UPDATE checks SET confirmations = confirmations + 1, `lock` = '', "
        "last_checked = NOW() WHERE id = %s AND `lock` = %s")
    reset_sql = (
        "UPDATE checks SET confirmations = 0, `lock` = '', "
        "last_checked = NOW() WHERE id = %s AND `lock` = %s")

    while True:
        (check_id, check_name, check_type, check_data, status,
         max_confirmations, confirmations, lock_uid) = self.q.get()

        safe_print("[%s] processing check %d: calling checks.%s",
                   (thread_name, check_id, check_type))
        check_result = checks.run_check(
            check_type, util.decode(check_data), check_id)
        safe_print("[%s] check %d result: %s",
                   (thread_name, check_id, str(check_result)))

        if not isinstance(check_result, dict) or 'status' not in check_result:
            util.die("[%s] bad check handler [%s]: returned non-dict or missing status"
                     % (thread_name, check_type))
        elif 'message' not in check_result:
            # Supply a default message when the handler did not set one.
            if check_result['status'] == 'fail':
                check_result['message'] = "Check offline: %s" % (check_name)
            else:
                check_result['message'] = "Check online: %s" % (check_name)

        if check_result['status'] == 'fail':
            safe_print("[%s] ... got failure!", (thread_name))

            if status == 'online':
                with recent_failures_lock:
                    recent_failures.add((check_id, util.time()))

                if confirmations + 1 >= max_confirmations:
                    # target has failed
                    self.handle_change(thread_name, check_id, check_name,
                                       lock_uid, 'offline', check_result)
                else:
                    # increase confirmations
                    database.query(bump_sql, (check_id, lock_uid))
            else:
                database.query(reset_sql, (check_id, lock_uid))
        elif check_result['status'] == 'success':
            safe_print("[%s] ... got success", (thread_name))

            if status == 'offline':
                if confirmations + 1 >= max_confirmations:
                    # target has come back online
                    self.handle_change(thread_name, check_id, check_name,
                                       lock_uid, 'online', check_result)
                else:
                    # increase confirmations
                    database.query(bump_sql, (check_id, lock_uid))
            else:
                database.query(reset_sql, (check_id, lock_uid))
        else:
            # Format the message *before* handing it to util.die; applying
            # ``%`` to die()'s return value never produced the text.
            util.die("Check handler [%s] returned invalid status code [%s]!"
                     % (check_type, check_result['status']))
# increase confirmations database.query("UPDATE checks SET confirmations = confirmations + 1, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid)) else: database.query("UPDATE checks SET confirmations = 0, `lock` = '', last_checked = NOW() WHERE id = %s AND `lock` = %s", (check_id, lock_uid)) else: util.die("Check handler [%s] returned invalid status code [%s]!") % (check_type, check_result['status']) pool = MonitorPool() pool.start() last_cleanup = 0 while True: safe_print("[monitor] sleeping") time.sleep(random.uniform(config['sleep_interval'] * 2 / 3, config['sleep_interval'] * 4 / 3)) if util.time() - last_cleanup >= 60000: last_cleanup = util.time() database.query("UPDATE checks SET `lock` = '' WHERE TIMESTAMPDIFF(SECOND, last_locked, NOW()) >= 60") with recent_failures_lock: for check_tuple in set(recent_failures): if util.time() - check_tuple[1] >= 60000: recent_failures.remove(check_tuple) uid = util.uid(16) free_threads = pool.free() update_query = "UPDATE checks SET `lock` = %s, last_locked = NOW() WHERE `lock` = '' AND (TIMESTAMPDIFF(SECOND, last_checked, NOW()) >= check_interval OR confirmations > 0)" params = [uid] with recent_failures_lock:
def single_relational(in_dir='', out_dir='', gids=None, pts=100000, start=0,
                      dom=''):
    """Analyse and plot label statistics per group size for each relation.

    Reads ``comments.csv`` from ``in_dir``, generates the group ids in
    ``gids`` and, for every gid, computes per-size statistics
    (``compute_stats_per_size``), writes them to ``sg_<gid>.csv`` and saves a
    four-panel bar chart to ``sg_<gid>.pdf`` in ``out_dir``. When more than
    one gid is given, an overlap ratio between the relations is reported.

    Args:
        in_dir: directory prefix (string-concatenated) of ``comments.csv``.
        out_dir: directory prefix for the generated csv/pdf files.
        gids: list of group-id column names; defaults to ``['text_gid']``.
        pts: maximum number of rows to read.
        start: rows ``1..start-1`` of the file are skipped.
        dom: domain name used in the plot title.
    """
    gids = ['text_gid'] if gids is None else gids
    gen = Generator()

    ut.out('gids: %s' % str(gids), 0)

    t1 = ut.out('reading data...')
    df = pd.read_csv(in_dir + 'comments.csv', skiprows=range(1, start),
                     nrows=pts)
    pts = len(df)
    ut.time(t1)

    # basic statistics
    p_spam = df.label.sum() / len(df)
    p_ham = 1 - p_spam
    ut.out('spam pct: %.2f' % p_spam)

    for gid in gids:
        t1 = ut.out('generating %s...' % gid)
        df = gen.gen_group_id(df, gid)
        ut.time(t1)

    for gid in gids:
        t1 = ut.out('grouping by %s...' % gid)
        g1 = df.groupby(gid)
        ut.time(t1)

        t1 = ut.out('computing stats per group...')
        size = g1.size().reset_index().rename(columns={0: 'size'})
        sum_label = g1['label'].sum().reset_index()\
            .rename(columns={'label': 'sum_label'})
        mean_label = g1['label'].mean().reset_index()\
            .rename(columns={'label': 'mean_label'})
        gf = size.merge(sum_label).merge(mean_label)

        # gid == -1 collects the rows that belong to no group.
        single_cnt = gf[gf[gid] == -1]['size'].values[0]
        gf['same_label'] = gf['mean_label'].isin([1.0, 0.0]).astype(int)
        gfs = gf[(gf['same_label'] == 1) | (gf[gid] == -1)]
        gfo = gf[(gf['same_label'] == 0) & (gf[gid] != -1)]
        ut.time(t1)

        t1 = ut.out('computing stats per size...')
        sf = compute_stats_per_size(gf, p_spam, p_ham, label='both',
                                    total_pts=pts, single_cnt=single_cnt)
        sfs = compute_stats_per_size(gfs, p_spam, p_ham, label='spam',
                                     single_cnt=single_cnt)
        sfo = compute_stats_per_size(gfo, p_spam, p_ham, label='ham')

        # compute single node row (same-label frame)
        vs = gfs[gfs[gid] == -1][['size', 'mean_label']].values[0]
        row = [(1, single_cnt, single_cnt, vs[1], p_spam)]
        one_line = pd.DataFrame(row, columns=list(sfs))
        sfs = pd.concat([one_line, sfs])
        sfs = sfs.rename(columns={'mean_label': 'mean_lbl_sme_lbl'})
        sfo = sfo.rename(columns={'mean_label': 'mean_lbl_not_sme_lbl'})

        # compute single node row (overall frame)
        v = gf[gf[gid] == -1][['size', 'mean_label']].values[0]
        row = [(1, v[0], v[0], v[1], v[0] / pts, 1, 1)]
        one_line = pd.DataFrame(row, columns=list(sf))
        sf = pd.concat([one_line, sf])

        # keep top X% of affected nodes
        pct = 100
        total = sf.cnt.sum()
        for i in range(1, len(sf)):
            if sf[:i].cnt.sum() / total >= pct / float(100):
                sf = sf[:i]
                break
        ut.time(t1)

        t1 = ut.out('plotting...')
        cols = ['cnt_rto', 'same_label_rto', 'mean_lbl_sme_lbl',
                'mean_lbl_not_sme_lbl']
        subtitle_list = ['(a)', '(b)', '(c)', '(d)']
        xlabel_list = ['% all messages', '% groups containing same label',
                       'mean label over same label groups',
                       'mean label over mixed label groups']
        subtitles = dict(zip(cols, subtitle_list))
        xlabels = dict(zip(cols, xlabel_list))
        fontsize = 24

        fig, axs = plt.subplots(1, 4, figsize=(27, 7))
        axs = axs.flatten()

        for i, col in enumerate(cols):
            # Each panel draws from the frame that owns its column.
            if col == 'mean_lbl_sme_lbl':
                dummy_df = sfs
            elif col == 'mean_lbl_not_sme_lbl':
                dummy_df = sfo
            else:
                dummy_df = sf

            if len(dummy_df) > 0:
                binned = do_log_scale_binning(dummy_df)
                binned.plot.barh('size', col, ax=axs[i],
                                 title=subtitles[col], legend=False,
                                 fontsize=fontsize)

                if col == 'same_label_rto':
                    binned.plot.barh('size', 'e_sme_lbl_rto', ax=axs[i],
                                     title=subtitles[col], legend=False,
                                     fontsize=fontsize, alpha=0.5,
                                     color='red', hatch='/')
                elif col in ['mean_lbl_sme_lbl', 'mean_lbl_not_sme_lbl']:
                    axs[i].axvline(p_spam, color='k', linestyle='--')

            axs[i].set_ylabel('group size', fontsize=fontsize)
            axs[i].set_xlabel(xlabels[col], fontsize=fontsize)
            axs[i].set_title(subtitles[col], fontsize=fontsize - 2)

            # Thin out the x ticks when their labels are long or numerous.
            xt = axs[i].get_xticks()
            tl = len(str(xt[1]))
            if (tl >= 4) or (tl == 3 and len(xt) >= 9):
                axs[i].set_xticks(xt[::2])

        rel = gid.replace('_gid', '')
        title = '%s: spam: %.2f%%, relation: %s' % (dom, p_spam * 100, rel)

        fig.tight_layout()
        fig.suptitle(title, y=1.08, fontsize=fontsize)
        fig.savefig(out_dir + 'sg_%s.pdf' % str(gid), format='pdf',
                    bbox_inches='tight')
        plt.close('all')
        ut.time(t1)

        sf.to_csv(out_dir + 'sg_%s.csv' % str(gid), index=None)

    spam_rto = df.label.sum() / len(df)
    ut.out('spam ratio: %.2f' % spam_rto)

    if len(gids) > 1:
        rel_nodes = 0
        g = df.groupby(gids).size().reset_index().rename(columns={0: 'size'})

        for gid in gids:
            g = g[g[gid] != -1]
            rel_nodes += len(df[df[gid] != -1])

        # Index the 'size' *column*: attribute access (g.size) resolves to
        # DataFrame.size, the element count of the frame.
        overlap_rto = g['size'].sum() / rel_nodes
        ut.out('overlap ratio: %.2f' % overlap_rto)
    ut.out()
util.die( "Check handler [%s] returned invalid status code [%s]!" ) % (check_type, check_result['status']) pool = MonitorPool() pool.start() last_cleanup = 0 while True: safe_print("[monitor] sleeping") time.sleep( random.uniform(config['sleep_interval'] * 2 / 3, config['sleep_interval'] * 4 / 3)) if util.time() - last_cleanup >= 60000: last_cleanup = util.time() database.query( "UPDATE checks SET `lock` = '' WHERE TIMESTAMPDIFF(SECOND, last_locked, NOW()) >= 60" ) with recent_failures_lock: for check_tuple in set(recent_failures): if util.time() - check_tuple[1] >= 60000: recent_failures.remove(check_tuple) uid = util.uid(16) free_threads = pool.free() update_query = "UPDATE checks SET `lock` = %s, last_locked = NOW() WHERE `lock` = '' AND (TIMESTAMPDIFF(SECOND, last_checked, NOW()) >= check_interval OR confirmations > 0)" params = [uid]
#!/usr/bin/env python3
import argparse
import importlib
import logging

import util

if __name__ == "__main__":
    # Executes a particular function many times, collecting timings and
    # showing metrics when finished.
    cli = argparse.ArgumentParser()
    cli.add_argument("name")
    cli.add_argument("runs", type=int)
    cli.add_argument("--func")
    cli.add_argument("--verbose", action="store_true")
    # Anything argparse does not recognise is passed on to util.time below.
    known, unknown = cli.parse_known_args()

    if known.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    logger = logging.getLogger("time")
    logger.debug(f"Known args are {known} and unknown args are {unknown}")

    # The implementation to time lives in the ``impl`` package.
    module = importlib.import_module(f"impl.{known.name}")
    results = util.time(module, known.name, known.runs, *unknown,
                        func=known.func)

    for entry in results:
        print(entry)
def compute_big_aupr(start_fold=0, ref_start_fold=-1, num_folds=5,
                     domain='twitter', models=None, in_dir='', gids=None):
    """Combine per-fold predictions and plot PR/ROC curves over all folds.

    For each fold the prediction files of the requested models are read from
    ``in_dir``. For the ``ind`` model, metrics and relational analyses are
    collected per fold and written to ``tw_full_0stk.csv``. The per-fold
    predictions are then concatenated, slightly perturbed, and AUPR/AUROC
    plus the corresponding curves are produced for every model (and for the
    reference predictions when ``ref_start_fold`` is given).

    Args:
        start_fold: first fold to read.
        ref_start_fold: first fold of the reference predictions; ``-1``
            disables the reference.
        num_folds: number of consecutive folds to read.
        domain: data domain; selects ``independent/data/<domain>/``.
        models: list of model names (``ind``, ``mrf``, ``psl``, ``approx``);
            defaults to ``['ind']``. The list is not modified.
        in_dir: directory prefix (string-concatenated) of the prediction
            files.
        gids: list of group-id names; defaults to an empty list.
    """
    # Work on copies: the caller's lists (and the defaults) stay untouched.
    models = ['ind'] if models is None else list(models)
    gids = [] if gids is None else list(gids)

    ind_data_dir = 'independent/data/' + domain + '/'
    lines = {'ind': 'b-', 'mrf': 'g--', 'psl': 'm-.', 'mean': 'r:',
             'median': 'c:', 'max': 'y:'}
    inds, mrfs, psls, approxs, refs = [], [], [], [], []

    gen_obj = Generator()
    relations = _relations_for_gids(gids)

    # 'approx' is shorthand for the three aggregate approximations.
    if 'approx' in models:
        models.remove('approx')
        models.extend(['mean', 'median', 'max'])
    # Pair each model with its prediction column only after the expansion so
    # the two stay aligned.
    preds = [(model, model + '_pred') for model in models]

    t1 = ut.out('reading true labels...', 0)
    full_df = pd.read_csv(ind_data_dir + 'comments.csv')
    lbl_df = full_df[['com_id', 'label']]
    ut.time(t1)

    s = '%s: reading model preds from fold %d to %d:'
    ut.out(s % (domain, start_fold, start_fold + num_folds - 1), 1)

    # NOTE(review): the original derived this flag from "'approx' in models"
    # after 'approx' had already been replaced, so it was always 0; that
    # effective value is preserved.
    newline = 0

    d = {}
    for i, fold in enumerate(range(start_fold, start_fold + num_folds)):
        ut.out('\nreading preds for fold %d...' % i, newline)
        f_dict = {}

        if ref_start_fold > -1:
            ndx = ref_start_fold + i
            fname = in_dir + 'test_' + str(ndx) + '_preds.csv'
            assert os.path.exists(fname)
            refs.append(pd.read_csv(fname))

        if 'ind' in models:
            fname = in_dir + 'test_' + str(fold) + '_preds.csv'
            assert os.path.exists(fname)
            ind_df = pd.read_csv(fname)
            inds.append(ind_df)
            ind_lbl_df = full_df.merge(ind_df, on='com_id')

            t1 = ut.out('generating group ids...')
            for gid in gids:
                ind_lbl_df = gen_obj.gen_group_id(ind_lbl_df, gid)
            ut.time(t1)

            m_dict = _metrics(ind_lbl_df)
            a_dict = _analyze(ind_lbl_df, relations=relations,
                              col='ind_pred')
            s_dict = _spread(ind_lbl_df, col='ind_pred', relations=relations)
            f_dict.update(a_dict)
            f_dict.update(s_dict)
            f_dict.update(m_dict)

        if 'mean' in models:
            # NOTE(review): relies on ``ind_df`` from the 'ind' branch above.
            temp_df = full_df.merge(ind_df)

            t1 = ut.out('generating group ids...')
            for gid in gids:
                temp_df = gen_obj.gen_group_id(temp_df, gid)
            ut.time(t1)

            approx_df = _approximations(temp_df, relations)
            approxs.append(approx_df)

        if 'mrf' in models:
            fname = in_dir + 'mrf_preds_' + str(fold) + '.csv'
            assert os.path.exists(fname)
            mrf_df = pd.read_csv(fname)
            mrfs.append(mrf_df)
            mrf_lbl_df = lbl_df.merge(mrf_df)
            m_dict = _metrics(mrf_lbl_df, col='mrf_pred', model='mrf')
            f_dict.update(m_dict)

        if 'psl' in models:
            fname = in_dir + 'psl_preds_' + str(fold) + '.csv'
            assert os.path.exists(fname)
            psl_df = pd.read_csv(fname)
            psls.append(psl_df)
            psl_lbl_df = lbl_df.merge(psl_df)
            m_dict = _metrics(psl_lbl_df, col='psl_pred', model='psl')
            f_dict.update(m_dict)

        d[i] = f_dict

    print(d)

    dicts = [d[i] for i in range(len(d))]
    stats_df = pd.DataFrame(dicts)
    stats_df = stats_df.reset_index()\
        .rename(columns={'index': 'test_set'})
    stats_df.to_csv('tw_full_0stk.csv', index=None)

    t1 = ut.out('concatenating test set predictions...')
    df = full_df[['com_id', 'label']]

    if 'ind' in models:
        ind_df = pd.concat(inds)
        df = df.merge(ind_df)

    if 'mean' in models:
        approx_df = pd.concat(approxs)
        assert set(ind_df['com_id']) == set(approx_df['com_id'])
        df = df.merge(approx_df)

    if ref_start_fold > -1:
        ref_df = pd.concat(refs)
        ref_df = full_df[['com_id', 'label']].merge(ref_df)
        ref_df = ref_df[['com_id', 'ind_pred']]
        ref_df = ref_df.rename(columns={'ind_pred': 'ref_pred'})
        assert set(ind_df['com_id']) == set(ref_df['com_id'])
        df = df.merge(ref_df)

    if 'mrf' in models:
        mrf_df = pd.concat(mrfs)
        assert set(ind_df['com_id']) == set(mrf_df['com_id'])
        df = df.merge(mrf_df)

    if 'psl' in models:
        psl_df = pd.concat(psls)
        assert set(ind_df['com_id']) == set(psl_df['com_id'])
        df = df.merge(psl_df)
    ut.time(t1)

    t1 = ut.out('applying noise to predictions...')
    noise = 0.000025
    # Small uniform jitter, clipped to [0, 1].
    perturb = lambda x: max(0.0, min(1.0, x + ran.uniform(-noise, noise)))

    if 'ind' in models:
        df['ind_pred'] = df['ind_pred'].apply(perturb)
    if 'mean' in models:
        df['mean_pred'] = df['mean_pred'].apply(perturb)
        df['median_pred'] = df['median_pred'].apply(perturb)
        df['max_pred'] = df['max_pred'].apply(perturb)
    if 'mrf' in models:
        df['mrf_pred'] = df['mrf_pred'].apply(perturb)
    if 'psl' in models:
        df['psl_pred'] = df['psl_pred'].apply(perturb)
    ut.time(t1)

    # compute reference aupr and auroc - only possible when reference
    # predictions were loaded; otherwise the 'ref_pred' column does not exist
    if ref_start_fold > -1:
        ref_label, ref_pred = df['label'], df['ref_pred']
        ref_aupr = average_precision_score(ref_label, ref_pred)
        ref_auroc = roc_auc_score(ref_label, ref_pred)
        ref_p, ref_r, _ = precision_recall_curve(ref_label, ref_pred)
        ref_fpr, ref_tpr, _ = roc_curve(ref_label, ref_pred)
        ut.out('%s aupr: %.4f, auroc: %.4f' % ('reference', ref_aupr,
                                               ref_auroc))
        ut.plot_pr_curve('ref', ref_p, ref_r, ref_aupr, domain=domain,
                         line='k-', show_legend=True)
        ut.plot_roc_curve('ref', ref_tpr, ref_fpr, ref_auroc, domain=domain,
                          line='k-', show_legend=True)

    # Significance testing is disabled; zeros are reported as placeholders.
    auroc_pval, aupr_pval = 0, 0

    # compute combined test set curves
    for i, (model, pred) in enumerate(preds):
        aupr = average_precision_score(df['label'], df[pred])
        auroc = roc_auc_score(df['label'], df[pred])
        p, r, _ = precision_recall_curve(df['label'], df[pred])
        fpr, tpr, _ = roc_curve(df['label'], df[pred])

        t = (model, aupr, aupr_pval, auroc, auroc_pval)
        ut.out('%s aupr: %.4f (%.4f), auroc: %.4f (%.4f)' % t)

        # Only the last model's ROC call is asked to save.
        save = i == len(preds) - 1
        ut.plot_pr_curve(model, p, r, aupr, domain=domain,
                         line=lines[model], show_legend=True)
        ut.plot_roc_curve(model, tpr, fpr, auroc, save=save, domain=domain,
                          line=lines[model], show_legend=True)
    ut.out()
def cosine_similarities(df, sim_thresh=0.8, in_col='text', out_col='text_id',
                        approx_datapoints=120000, max_feats=None, k=5,
                        max_id=0, out_dir='', fname='sim.csv'):
    """Group similar strings by tf-idf cosine similarity and write the ids.

    The data is processed in chunks (``_split_data``). In each chunk every
    distinct string is matched with up to ``k`` strings whose cosine
    similarity is at least ``sim_thresh``; the matches are merged into groups
    and aggregated across chunks. The final ids are pruned, converted to a
    DataFrame and written to ``out_dir + fname``.

    Args:
        df: DataFrame containing the column ``in_col``.
        sim_thresh: minimum cosine similarity for two strings to match.
        in_col: name of the text column.
        out_col: name of the id column in the output.
        approx_datapoints: approximate chunk size passed to ``_split_data``.
        max_feats: feature limit passed to ``_tf_idf``.
        k: maximum number of matches kept per string.
        max_id: first group id to assign.
        out_dir: output directory prefix (string-concatenated with ``fname``).
        fname: output file name.
    """
    ut.makedirs(out_dir)

    group_id = max_id
    all_ids = defaultdict(set)
    dfs = _split_data(df, approx_datapoints=approx_datapoints, in_col=in_col)

    for n, chunk_df in enumerate(dfs):
        t1 = time.time()
        ut.out('\ncreating tf-idf matrix for chunk %d...' % (n + 1))
        groups = defaultdict(set)
        g_df = chunk_df.groupby(in_col).size().reset_index()
        strings = list(g_df[in_col])
        m = _tf_idf(strings, analyzer=_ngrams, max_feats=max_feats)

        v, total = len(m.data), m.shape[0] * m.shape[1]
        ut.out('sparsity: (%d/%d) %.2f%%' % (v, total, 100 * (v / total)))

        ut.out('computing cosine similarities...')
        cos_sim = cosine_similarity(m, dense_output=False)

        ut.out('filtering out simiarities below threshold...')
        scm = cos_sim >= sim_thresh

        ut.out('putting matches into groups...')
        for ndx in range(len(strings)):
            # Slice the row once and index its scores by column, instead of
            # a linear ``list.index`` search for every match.
            row = cos_sim[ndx]
            score_of = dict(zip(row.indices, row.data))

            sims = [(x, score_of[x]) for x in scm[ndx].indices]
            sims = sorted(sims, key=lambda x: x[1], reverse=True)
            sim_ids = [sim_ndx for sim_ndx, _ in sims[:k]]  # top-k matches

            groups[group_id].update(sim_ids)
            group_id += 1

        ut.out('merging identical groups...')
        groups = _merge_identical_groups(groups)

        ut.out('assigning ids to items...')
        ids = _assign_ids_to_items(groups, strings)

        ut.out('aggregating identical keys...')
        all_ids = _aggregate_identical_keys(all_ids, ids)
        ut.out('chunk time: %.4fm' % ((time.time() - t1) / 60.0))

    t1 = time.time()
    ut.out('\nprune single items...')
    all_ids = _prune_single_items(all_ids, df, in_col)
    ut.time(t1)

    t1 = time.time()
    ut.out('prune redundant ids...')
    all_ids = _prune_redundant_ids(all_ids)
    ut.time(t1)

    t1 = time.time()
    ut.out('putting ids into a dataframe...')
    sim_df = _ids_to_dataframe(all_ids, df, in_col=in_col, out_col=out_col)
    ut.out('writing to csv...', 0)
    sim_df.to_csv(out_dir + fname, index=None)
    ut.time(t1)
    ut.out()
def _analyze(df, col, samples=100, relations=None):
    """Measure how often the hardest and easiest messages have relations.

    Every message is scored by the number of decision thresholds (about 100,
    sampled from the precision-recall curve) at which it is classified
    correctly. The frame is split into a "bottom" part - the lowest-scoring
    ``1 - AUPR`` fraction - and a "top" part, each further split into spam
    and ham, and the relation statistics of each part are computed via
    ``_msgs_with_rel`` and ``_rm_in_sect``.

    Args:
        df: DataFrame with ``label``, ``col`` and the gid columns; it is
            copied, not modified.
        col: name of the prediction column.
        samples: unused.
        relations: sequence whose items carry the gid name at index 2;
            ``None`` is treated as an empty list.

    Returns:
        dict of rounded ratios keyed ``{bot,top}_{spam,ham}_rels`` and
        ``..._rels_out``; an empty dict when there are no relations.
    """
    relations = [] if relations is None else relations
    if len(relations) == 0:
        return {}

    gids = [r[2] for r in relations]

    t1 = ut.out('computing messages missed most often...')

    # Work on a copy so the helper columns below never leak into the
    # caller's frame.
    df = df.copy()

    _, _, ts = precision_recall_curve(df['label'], df[col])
    aupr = average_precision_score(df['label'], df[col])
    mp = 1.0 - aupr

    # Count, per message, at how many sampled thresholds it is correct.
    step = int(len(ts) / 100) if len(ts) > 100 else 1
    total_corrects = np.zeros(len(df), dtype=int)
    for i in range(0, len(ts), step):
        df['pred'] = np.where(df[col] > ts[i], 1, 0)
        total_corrects += (df['pred'] == df['label']).values
    df['correct'] = total_corrects

    # extract bottom x% data
    df = df.sort_values('correct', ascending=False)
    ndx = len(df) - int(len(df) * mp)
    qf1, qf2 = df[ndx:], df[:ndx]
    qf1s = qf1[qf1['label'] == 1]  # low performers
    qf1o = qf1[qf1['label'] == 0]  # low performers
    qf2s = qf2[qf2['label'] == 1]  # high performers
    qf2o = qf2[qf2['label'] == 0]  # high performers
    ut.time(t1)

    t1 = ut.out('computing messages with a relation...')
    r1s, r1sf = _msgs_with_rel(qf1s, gids, mp, 'bot', 'spam')
    r1o, r1of = _msgs_with_rel(qf1o, gids, mp, 'bot', 'ham')
    r2s, r2sf = _msgs_with_rel(qf2s, gids, mp, 'top', 'spam')
    r2o, r2of = _msgs_with_rel(qf2o, gids, mp, 'top', 'ham')
    ut.time(t1)

    t1 = ut.out('computing messages with an outside relation...')
    rr1sof = _rm_in_sect(df, qf1s, qf2, gids, mp, r1s, 'bot', 'spam')
    rr1oof = _rm_in_sect(df, qf1o, qf2, gids, mp, r1o, 'bot', 'ham')
    rr2sof = _rm_in_sect(df, qf2s, qf1, gids, mp, r2s, 'top', 'spam')
    rr2oof = _rm_in_sect(df, qf2o, qf1, gids, mp, r2o, 'top', 'ham')

    sd = {}
    sd['bot_spam_rels'] = round(r1sf, 4)
    sd['bot_ham_rels'] = round(r1of, 4)
    sd['top_spam_rels'] = round(r2sf, 4)
    sd['top_ham_rels'] = round(r2of, 4)
    sd['bot_spam_rels_out'] = round(rr1sof, 4)
    sd['bot_ham_rels_out'] = round(rr1oof, 4)
    sd['top_spam_rels_out'] = round(rr2sof, 4)
    sd['top_ham_rels_out'] = round(rr2oof, 4)
    ut.time(t1)

    return sd
def _spread(df, col='ind_pred', relations=None):
    """This'll give some post-hoc test-set analysis, when running this, keep
    track of the test sets that improved using relational modeling, then
    average those test set statistics together to compare to the test sets
    that did not improve.

    Messages are assigned to subgraphs (``Connections.find_subgraphs``) and
    split into bottom/top performers x spam/ham, exactly as in the
    threshold-count ranking: a message's score is the number of sampled
    precision-recall thresholds at which it is classified correctly, and the
    bottom part is the lowest-scoring ``1 - AUPR`` fraction. For each part,
    per-subgraph mean/std/median/min/max/spread of ``col`` are averaged.

    Args:
        df: DataFrame with ``com_id``, ``label``, ``col`` and the gid
            columns; the caller's frame is not modified.
        col: name of the prediction column to analyse.
        relations: sequence whose items carry the gid name at index 2;
            ``None`` is treated as an empty list.

    Returns:
        dict of rounded statistics; empty when no subgraph has any edge.
    """
    relations = [] if relations is None else relations

    t1 = ut.out('computing subgraph statistics...')
    con_obj = Connections()

    gids = [r[2] for r in relations]
    _, sgs = con_obj.find_subgraphs(df, relations, verbose=False)

    spread_dict = {}

    # (com_id, subgraph index) for every member of a subgraph with edges.
    sg_list = []
    for i, sg in enumerate(sgs):
        if sg[3] > 0:  # num edges > 0
            sg_list.extend((x, i) for x in sg[0])

    if len(sg_list) == 0:
        return spread_dict

    sg_df = pd.DataFrame(sg_list, columns=['com_id', 'sg_id'])
    df = df.merge(sg_df, how='left')  # new frame; input stays untouched
    df['sg_id'] = df['sg_id'].fillna(-1).apply(int)

    _, _, ts = precision_recall_curve(df['label'], df[col])
    aupr = average_precision_score(df['label'], df[col])
    mp = 1.0 - aupr

    # Count, per message, at how many sampled thresholds it is correct.
    step = int(len(ts) / 100) if len(ts) > 100 else 1
    total_corrects = np.zeros(len(df), dtype=int)
    for i in range(0, len(ts), step):
        df['pred'] = np.where(df[col] > ts[i], 1, 0)
        total_corrects += (df['pred'] == df['label']).values
    df['correct'] = total_corrects

    # extract bottom x% data
    df = df.sort_values('correct', ascending=False)
    ndx = len(df) - int(len(df) * mp)
    qfs = df[df['label'] == 1]
    qfo = df[df['label'] == 0]
    qf1, qf2 = df[ndx:], df[:ndx]
    qf1s = qf1[qf1['label'] == 1]  # low performers
    qf1o = qf1[qf1['label'] == 0]  # low performers
    qf2s = qf2[qf2['label'] == 1]  # high performers
    qf2o = qf2[qf2['label'] == 0]  # high performers

    # Use the requested prediction column throughout (was hard-coded to
    # 'ind_pred', which is also the default of ``col``).
    spread_dict['spam_mean'] = round(qfs[col].mean(), 4)
    spread_dict['spam_median'] = round(qfs[col].median(), 4)
    spread_dict['ham_mean'] = round(qfo[col].mean(), 4)
    spread_dict['ham_median'] = round(qfo[col].median(), 4)

    stat_names = ['mean', 'std', 'median', 'min', 'max']
    for nm, temp_df in [('bot_spam', qf1s), ('bot_ham', qf1o),
                        ('top_spam', qf2s), ('top_ham', qf2o)]:
        # Keep only messages that take part in at least one relation.
        wf = temp_df[(temp_df[gids] != -1).any(axis=1)]

        # All statistics are computed from the filtered frame before any of
        # them is merged back in.
        grouped = wf.groupby('sg_id')[col]
        stat_frames = [
            getattr(grouped, stat)().reset_index()
            .rename(columns={col: 'sg_' + stat})
            for stat in stat_names
        ]
        for stat_df in stat_frames:
            wf = wf.merge(stat_df)
        wf['sg_spread'] = wf['sg_max'] - wf['sg_min']

        for stat in stat_names + ['spread']:
            key = 'sg_' + stat
            spread_dict[nm + '_' + key] = round(np.mean(wf[key]), 4)

    ut.time(t1)
    return spread_dict