def t_typecast(sep=','):
    """Exercise `convert_dtype` on the `mrn` column of a sample training set.

    The CSV is loaded twice. The first copy is used to report whether every
    distinct `mrn` value satisfies `predicate.isNumber2`; the second copy is
    cast to int64 through `convert_dtype`, after which the function reports
    how many distinct values satisfy `predicate.isNumber` and whether they
    are all `int` instances.

    Args:
        sep: column separator passed to `load_df`.
    """
    import predicate

    key = 'mrn'
    path = '/phi/proj/poc7002/bulk_training/data-exp/cdr/lab/cerner/cerner_microbio_112.0_tset_bt_100.csv'

    raw = load_df(_file=path, from_csv=True, sep=sep, verbose=False)
    distinct = set(raw[key].values)
    numeric_flags = [predicate.isNumber2(value) for value in distinct]
    print('> normal.' if all(numeric_flags) else '> non-numerical values found.')

    reloaded = load_df(_file=path, from_csv=True, sep=sep, verbose=False)
    # A plain astype() cannot convert the column while some entries are NaN,
    # so the cast is delegated to convert_dtype. Earlier attempts, kept for
    # reference:
    #   df = df.dropna(thresh=1)
    #   df = df[pd.notnull(df['mrn'])]
    casted = convert_dtype(reloaded, col='mrn', typ='int64', _debug=1)

    before = set(casted[key].values)
    kept = set([value for value in casted[key].values
                if predicate.isNumber(value)])
    print('> filter nan rows after convert_dtype: %d >=? %d' %
          (len(before), len(kept)))

    idx2 = set(kept)
    div(message='> size of idx: %d' % len(idx2))
    int_flags = [isinstance(value, int) for value in idx2]
    # Both outcomes print the same sample; only the verdict line differs.
    print('> normal.' if all(int_flags) else '> non int found.')
    print('> examples: %s' % list(idx2)[:5])
    return
def log(model, i):
    """Print, and optionally record, per-loader metrics for iteration `i`.

    For each of the module-level loaders (`a_loader`, `b_loader`,
    `c_loader`) the model is run through `infer`, and the confusion counts
    from `utils.tp/fp/fn/tn` are divided by the number of targets. One row
    of `[tp, fp, fn, tn, a, p, r, <args.metric>]` is built per loader; the
    rows are printed column-wise as `a/b/c` triples and, when `args.tb` is
    truthy, written to the matching writers with `add_scalar`.

    Args:
        model: model handed to `infer`.
        i: iteration number; zero-padded to the width of `args.ni` in the
            printed line and used as the step for `add_scalar`.
    """
    rows = []
    for loader in (a_loader, b_loader, c_loader):
        y, y_bar = infer(loader, model)
        total = len(y)
        tp = utils.tp(y, y_bar) / total
        fp = utils.fp(y, y_bar) / total
        fn = utils.fn(y, y_bar) / total
        tn = utils.tn(y, y_bar) / total
        accuracy = tp + tn
        precision = utils.div(tp, tp + fp)
        # NOTE(review): `p1` is not defined anywhere in this function; it is
        # presumably a module-level value (positive-class fraction?) --
        # confirm, otherwise this line raises NameError.
        recall = utils.div(tp, p1)
        score = metric(p1, fn, fp)
        rows.append([tp, fp, fn, tn, accuracy, precision, recall, score])

    tags = ['tp', 'fp', 'fn', 'tn', 'a', 'p', 'r', args.metric]
    pad = '0' * (len(str(args.ni)) - len(str(i)))

    # Transpose so each entry groups one metric across the three loaders.
    columns = []
    for column in zip(*rows):
        columns.append('/'.join('%0.2f' % value for value in column))
    summary = ' | '.join('%s %s' % pair for pair in zip(tags, columns))
    print('[iteration %s%d]%s' % (pad, i, summary))

    if not args.tb:
        return
    for writer, row in zip([a_writer, b_writer, c_writer], rows):
        for tag, value in zip(tags, row):
            writer.add_scalar(tag, value, i)
def __add__(self, rhs):
    """Add two points.

    Following the description from
    https://en.wikipedia.org/wiki/Elliptic_curve#The_group_law

    Coordinates are compared modulo ``ECpoint.p``. Results are returned
    reduced mod p, so the inverse of ``(x, y)`` is normally stored as
    ``(x, p - y)``; a plain ``yp == -yq`` test would miss that case and
    wrongly apply the doubling formula to ``P + (-P)``.

    Args:
        rhs: the point to add to ``self``.

    Returns:
        A new ``ECpoint`` holding the sum (``ECpoint.infty()`` when the two
        points are inverses of each other).
    """
    p = ECpoint.p
    a = ECpoint.a

    # Addition of identity
    if rhs.isInfty():
        return ECpoint(self.x, self.y)
    if self.isInfty():
        return ECpoint(rhs.x, rhs.y)

    # Following notation from wikipedia
    xp, yp = self.x, self.y
    xq, yq = rhs.x, rhs.y

    if (xp - xq) % p != 0:
        # Distinct x-coordinates: slope of the chord through P and Q.
        s = div((yp - yq), (xp - xq), p)
        xr = s**2 - xp - xq
        yr = s * (xp - xr) - yp
        return ECpoint(xr % p, yr % p)

    # Same x-coordinate: either Q == -P (sum is the point at infinity) or
    # Q == P (use the tangent). yp == 0 also lands in the first case, which
    # keeps the tangent slope from dividing by zero.
    if (yp + yq) % p == 0:
        return ECpoint.infty()

    s = div((3 * xp**2 + a), (yp << 1), p)
    xr = s**2 - (xp << 1)
    yr = s * (xp - xr) - yp
    return ECpoint(xr % p, yr % p)
def display(adict, title=None, msg_per_entry=None):
    """Print every entry of `adict` as `[key] value`.

    Args:
        adict: mapping whose items are printed in iteration order.
        title: when truthy, a banner `icd9utils: <title>` is emitted first
            through `div`.
        msg_per_entry: when truthy, appended in parentheses to each line.
    """
    if title:
        div(message='icd9utils: %s' % title, symbol='*')

    annotate = bool(msg_per_entry)
    for key, value in adict.items():
        if annotate:
            line = '[%s] %s (%s)' % (key, value, msg_per_entry)
        else:
            line = '[%s] %s' % (key, value)
        print(line)
    return
def t_hierarchy2():
    """Check the fixed code list against the annotated codes and sample roots.

    Reads the `icd9` column of the two gold-candidate files under
    `data-gold`, asserts that every annotated code appears in the
    hard-coded `codes` list, evaluates the roots of that list with
    `evalRoot`, prints them with `display`, and draws a sample of root
    candidates with `utils.sample_hashtable`.
    """
    import utils

    # get annotated codes
    gfiles = [
        'gold_candidates_neg_random_gh.csv',
        'gold_candidates_pos_random_gh.csv',
    ]
    acodes = set()
    for f in gfiles:
        fp = os.path.join('data-gold', f)
        # `error_bad_lines=True` used to be passed here; it was only the
        # default (raise on malformed rows) and the keyword no longer exists
        # in pandas >= 2.0, so it is omitted.
        df = pd.read_csv(fp, sep='|', header=0, index_col=False,
                         dtype={'icd9': str})
        acodes.update(df['icd9'].values)

    n_annotated = len(acodes)
    print('info> n_annotated: %d' % n_annotated)  # 54

    codes = [
        '112.3', '047.9', '038.10', '038.11', '112.5', '031.0', '038.19',
        '031.9', '031.8', '090.9', '041.09', '135', '041.9', '041.6',
        '090.1', '138', '033.9', '049.9', '031.2', '003.0', '001.1',
        '017.00', '011.93', '041.00', '079.0', '079.6', '123.1', '079.4',
        '112.4', '009.0', '112.2', '070.51', '034.0', '007.1', '061',
        '070.32', '070.30', '054.79', '054.2', '054.3', '054.10', '046.3',
        '052.7', '038.42', '038.40', '088.81', '053.19', '010.10', '133.0',
        '110.0', '110.3', '137.0', '040.82', '008.45', '098.0', '075',
        '057.9', '112.89', '041.7', '112.84', '027.0', '097.1', '078.5',
        '136.9', '078.0', '009.1', '070.70', '131.01', '070.71', '099.9',
        '041.89', '127.4', '041.85', '097.9', '005.9', '054.13', '053.9',
        '054.11', '047.8', '009.3', '083.2', '054.19', '481', '117.3',
        '091.3', '117.5', '130.7', '038.8', '117.9', '036.0', '094.9',
        '130.0', '136.3', '008.69', '053.79', '087.9', '041.10', '041.11',
        '008.61', '111.9',
    ]
    # Every annotated code must be part of the fixed list.
    assert len(set(acodes) - set(codes)) == 0
    print('info> size: %d' % len(codes))

    cur, freespots = evalRoot(codes, scope=None, verbose=True)
    print('> n_roots:%d, current roots:\n%s\n' % (len(cur), cur.keys()))
    utils.div()
    display(cur)

    n = 100  # setting too high may take time for UpSetR to finish
    candidates = utils.sample_hashtable(cur, n_sample=n)
    print('> sample existing %d=?=%d candidates:\n%s\n' %
          (n, len(candidates), list(candidates)))
    return
def train(model, loss_func, dictionary, epoch, train_data, dev_data,
          identity_mat, stop_counter):
    """Run one training epoch, evaluate on the dev set and checkpoint.

    Each batch from `utils.getBatch` is pushed through `model.forward`; when
    `USE_ATTENTION` is set, a penalty `Frobenius(A A^T - I)` scaled by
    `PENALIZATION_COEFF` is added to the loss. Gradients are clipped to
    `CLIP` before the module-level `optimizer` steps.

    After the epoch the model is evaluated with `evaluate`. If the dev loss
    improved (or no best loss is recorded yet) the model is saved to
    `MODEL_PATH % (dev_loss, acc)`, `best_dev_loss` is updated and the stop
    counter is reset. Otherwise every learning rate is multiplied by 0.2 and
    the counter is incremented when `EARLY_STOP` is non-zero.

    Returns:
        The updated `stop_counter`.
    """
    global best_dev_loss, best_acc

    model.train()
    total_loss = 0
    batches = utils.getBatch(data=train_data, dictionary=dictionary,
                             maxlen=MAX_LEN, batch_size=BATCH_SIZE)
    for texts, labels, masks, bsz in batches:
        hidden = model.init_hidden(bsz)
        fc, outh, pred, attention = model.forward(sents=texts, mask=masks,
                                                  init_hc=hidden)
        loss = loss_func(pred.view(texts.size(0), -1), labels)
        if USE_ATTENTION:
            attention_t = torch.transpose(attention, 1, 2).contiguous()
            gram = torch.bmm(attention, attention_t)
            # identity_mat is sliced so a smaller final batch still lines up.
            penalty = Frobenius(gram - identity_mat[:attention.size(0)])
            loss += PENALIZATION_COEFF * penalty

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        total_loss += loss.data

    res, dev_loss, acc = evaluate(model, loss_func, dictionary, dev_data)
    print(res)
    utils.saveLog(LOG_PATH, res)
    epoch_line = 'epoch: %d, dev loss: %f, acc: %f' % (epoch + 1, dev_loss, acc)
    print(epoch_line)
    utils.saveLog(LOG_PATH, epoch_line)
    utils.div('-')

    improved = (not best_dev_loss) or dev_loss < best_dev_loss
    if not improved:
        # No progress on the dev set: decay every learning rate.
        for group in optimizer.param_groups:
            group['lr'] = group['lr'] * 0.2
        if EARLY_STOP != 0:
            stop_counter += 1
        return stop_counter

    with open(MODEL_PATH % (dev_loss, acc), 'wb') as checkpoint:
        torch.save(model, checkpoint)
    best_dev_loss = dev_loss
    stop_counter = 0
    return stop_counter
def t_select():
    """Demonstrate common ways of selecting rows from a DataFrame.

    Memo
    ----
    1. df.iloc[i] returns the ith row of df. i does not refer to the index
       value, i is a 0-based index. In contrast, the attribute index is
       returning index values.
    """
    def show(df, prompt=''):
        """Print `df`, preceded by `prompt` when one is given."""
        if prompt:
            print('> %s\n%s\n' % (prompt, df))
        else:
            print('\n%s\n' % df)
        return

    div(message='Use case #1: select rows using row indices')
    df = pd.DataFrame({'BoolCol': [True, False, False, True, True]},
                      index=[10, 20, 30, 40, 50])
    idx = df[df['BoolCol']].index.tolist()
    # select the rows
    # print() with a single argument prints the same on Python 2 and 3; the
    # former `print df.loc[idx]` statement was a SyntaxError on Python 3.
    print(df.loc[idx])

    div(message='Use case #2: select rows where columns match certain values.')
    df = pd.DataFrame({
        'A': 'foo bar foo bar foo bar foo foo'.split(),
        'B': 'one one two three two two one three'.split(),
        'C': np.arange(8),
        'D': np.arange(8) * 2,
    })
    show(df)
    show(df.loc[df['A'] == 'foo'])
    print('\nVS\n')
    criteria = {'A': 'foo'}
    show(match(df, criteria))

    print('\n> for multiple values?')
    show(df.loc[df['B'].isin(['one', 'three'])])

    print('> another way')
    df = df.set_index(['B'])
    show(df.loc['one'])

    print('> multiple values ...')
    show(df.loc[df.index.isin(['one', 'two'])])
    return
def t_dtype(sep=','):
    """Show column dtypes before and after casting a string column to float.

    First prints a few distinct `mrn` values from a sample training set
    loaded with `load_df`, then builds a small all-string DataFrame and
    prints it together with its dtypes before and after column `two` is
    cast to float.

    Args:
        sep: column separator passed to `load_df`.
    """
    import predicate

    def report(frame):
        """Print the frame followed by its dtypes."""
        print("> df:\n%s\n" % frame)
        print("> dtypes:\n%s" % frame.dtypes)

    key = 'mrn'
    path = '/phi/proj/poc7002/bulk_training/data-exp/cdr/lab/cerner/cerner_urine_009.2_tset_bt_77.csv'
    loaded = load_df(_file=path, from_csv=True, sep=sep, verbose=False)
    distinct = set(loaded[key].values)
    print("> Should expect to see integer types for idx: %s" % list(distinct)[:5])

    rows = [['a', '1.2', '4.2'], ['b', '70', '0.03'], ['x', '5', '0']]
    df = pd.DataFrame(rows, columns=['one', 'two', 'three'])
    report(df)
    div()

    # Only column 'two' is converted; casting both numeric-looking columns
    # would be:
    #   df[['two', 'three']] = df[['two', 'three']].astype(float)
    df[['two']] = df[['two']].astype(float)
    report(df)
    return
def log_r_X_z(self):
    """Build the symbolic expression named 'log_rX_z'.

    The difference `Xz - tau` is reshaped into a column of length `B * R`.
    The quadratic term is `trace(v.T . w)`, where `w` is `v` divided
    element-wise by `Tau` when `Tau_isDiagonal` is set, and `iTau . v`
    otherwise.

    NOTE(review): the expression has the shape of a Gaussian log-density
    with (inverse) covariance `Tau`/`iTau` -- confirm against the model
    definition.

    Returns:
        The expression, with its `name` set to 'log_rX_z'.
    """
    residual = minus(self.Xz, self.tau)
    X_m_tau_vec = T.reshape(residual, [self.B * self.R, 1])
    X_m_tau_vec.name = 'X_m_tau_vec'

    # Only the whitening of the residual differs between the two cases.
    if self.Tau_isDiagonal:
        whitened = div(X_m_tau_vec, self.Tau)
    else:
        whitened = dot(self.iTau, X_m_tau_vec)
    quadratic = trace(dot(X_m_tau_vec.T, whitened))

    log_rX_z = -0.5 * self.R * self.B * log2pi - 0.5 * self.R * self.logDetTau \
        - 0.5 * quadratic
    log_rX_z.name = 'log_rX_z'
    return log_rX_z
def make_1d_E_B_plots(bfrange,
                      y_databdl,
                      colors,
                      mu_pos=None,
                      enrange=None,
                      figsize=DEFAULT_FIGURE_SIZE,
                      linewidth=DEFAULT_LW,
                      ax=None,
                      plotrange=None,
                      legend=True):
    """Plot energy (in units of `e0`) against magnetic field for each band.

    Args:
        bfrange: list of x values shared by every curve.
        y_databdl: list with one entry per band; each entry is a list of
            curves, and every curve is drawn against `bfrange`.
        colors: list of colors, one per band.
        mu_pos: optional curve drawn in black; only plotted when its length
            equals that of `bfrange`.
        enrange: optional energy limits; `min`/`max` of it, divided by `e0`,
            become the y-limits.
        figsize: tuple used when a new canvas has to be created.
        linewidth: line width for every curve.
        ax: axes to draw on; a new canvas is made when None.
        plotrange: accepted for interface compatibility; not used here.
        legend: whether to add the legend.

    Returns:
        The axes that were drawn on.

    Raises:
        TypeError: if `bfrange`, `y_databdl` or `colors` is not a list, if
            `y_databdl` contains no list, or if `figsize` is not a tuple.
        ValueError: if `y_databdl` and `colors` differ in length.
    """
    inputs_are_lists = (isinstance(bfrange, list)
                        and isinstance(y_databdl, list)
                        and isinstance(colors, list))
    if not inputs_are_lists:
        # The message used to mention a non-existent `x_databdl` parameter.
        raise TypeError('either bfrange or y_databdl or colors is not list')
    if len(y_databdl) != len(colors):
        raise ValueError('y_databdl, colors are not of the same length')
    if not any(isinstance(el, list) for el in y_databdl):
        raise TypeError('y_databdl is not nested list')
    if not isinstance(figsize, tuple):
        raise TypeError('figsize should be a tuple like (10,10)')

    if ax is None:
        ax = make_canvas(figsize=figsize)

    for n_band, (y_data, color) in enumerate(zip(y_databdl, colors)):
        line = None
        for y in y_data:
            line, = ax.plot(bfrange,
                            div(y, e0),
                            linewidth=linewidth,
                            color=color)
        # One legend entry per band (all its curves share a color); a band
        # without curves has nothing to label.
        if line is not None:
            line.set_label(f'Band{n_band}')

    if legend:
        ax.legend(loc=DEFAULT_LEGEND_LOC, bbox_to_anchor=DEFAULT_LEGEND_POS)
    if mu_pos and len(mu_pos) == len(bfrange):
        ax.plot(bfrange, div(mu_pos, e0), linewidth=linewidth, color='k')
    if enrange is not None:
        ax.set_ylim(min(enrange) / e0, max(enrange) / e0)
    ax.set_xlabel(DEFAULT_XLABEL)
    ax.set_ylabel(DEFAULT_EBPLOT_YLABEL)
    return ax
def test_query():
    """Exercise the code-query helpers and print what they return.

    Covers `getInfectiousParasiticCodes` (plain, with `diff=`, with
    `n_samples=` and with `filter_=`), `Code.min`/`Code.max`, `getName`
    and `getCode`.

    Every print uses the single-argument `print(...)` form, which produces
    the same output on Python 2 and Python 3; the former print statements
    were a SyntaxError on Python 3.
    """
    def summarize(found):
        """Print the size and the extreme codes of `found`."""
        print("how many? %d" % len(found))
        print("max? %s" % Code.max(found))
        print("min? %s" % Code.min(found))

    codes = getInfectiousParasiticCodes()
    summarize(codes)

    except_ = [481, '005', '039.1', 'V42.0', '010.2', '010.01']  # 3 not valid
    print("min max of except_: %s << %s" %
          (Code.min(except_), Code.max(except_)))
    codes = getInfectiousParasiticCodes(diff=except_, verbose=True)
    summarize(codes)

    n = 100
    print('> randomly select %d infectious diseases' % n)
    codes = getInfectiousParasiticCodes(n_samples=n)
    print("how many? %d | they are: %s" % (len(codes), codes))
    print("max? %s" % Code.max(codes))
    print("min? %s" % Code.min(codes))
    print("type: %s" % type(codes))

    div(message='> mapping from codes to names ...')
    for i, code in enumerate(codes):
        print("[%d] %s -> %s" % (i, code, getName(code)))
    print("-" * 60)

    regex = 'meningitis'  # 'mening.*'
    print("Getting %s-related codes ..." % regex)
    codes = getCode(regex)
    div(message='Found %d codes with %s as a keyword.' % (len(codes), regex))
    for code in codes:
        print(" + %s -> %s" % (code, getName(code)))
    print("-" * 60)

    codes = getInfectiousParasiticCodes(filter_=isTuberculosis)
    codes2 = getCode('tubercu')
    print("> size %d =?= %d" % (len(codes), len(codes2)))
def t_preproc(**kargs):
    """Run `preproc_code` on a sample code string and look up the results.

    The code string is parsed twice with `preproc_code` (the second time
    with `no_ve_code=True`), both results are printed, and `lookup2` is then
    called on the leading codes of the first result.

    Args:
        **kargs: accepted but not used.
    """
    # Inputs used in earlier runs, kept for reference:
    # code_str = '24900 25000 25001 7902 79021 79022 79029 7915 7916 V4585 V5391 V6546'
    # code_str += ' ' + """24901 24910 24911 24920 24921 24930 24931 24940 24941 24950 24951 24960 24961 24970 24971 24980 24981 24990 24991 25002
    #                  25003 25010 25011 25012 25013 25020 25021 25022 25023 25030 25031 25032 25033 25040 25041 25042 25043 25050 25051 25052
    #                  25053 25060 25061 25062 25063 25070 25071 25072 25073 25080 25081 25082 25083 25090 25091 25092 25093"""
    # code_str += ' ' + "64800 64801 64802 64803 64804 64880 64881 64882 64883 64884"
    code_str = '585 5851 5852 5853 5854 5855 5856 5859 7925 V420 V451 V4511 V4512 V560 V561 V562 V5631 V5632 V568'

    # [params]
    base_only = False

    summary = '> n_codes: %d\n> codes:\n%s\n'
    codes = preproc_code(code_str, base_only=base_only)
    print(summary % (len(codes), codes))
    codes_minus_ve = preproc_code(code_str, base_only=base_only,
                                  no_ve_code=True)
    print(summary % (len(codes_minus_ve), codes_minus_ve))

    # [status] ok
    # print('\nNow, do base only\n')
    # codes = preproc_code(code_str, base_only=False)
    # print('> n_codes: %d\n> codes:\n%s\n' % (len(codes), codes))
    # print('> codeset:\n%s\n' % set(codes))
    # [log] set(['791', '790', 'V65', 'V45', 'V53', '648', '250', '249'])

    div(message='Now, testing lookup ...')
    n_limit = 100
    # At most the first 20 codes are looked up, and never past position
    # n_limit (inclusive).
    for code in codes[:20][:n_limit + 1]:
        print('+ code: %s => %s' % (code, lookup2(code)))
    return
def test_manipulate(**kargs):
    """Load a training set, profile it and demonstrate feature indexing.

    Args:
        **kargs: `file_` may override the CSV path; it defaults to the
            microbio training set under `ProjDir` listed below.

    Log
    ---
    * ./data-exp/cdr/lab/cerner/cerner_blood_481_tset_mixed.csv
    * 'data-lab/cerner/cerner_blood_481_tset_mixed.csv'
    * 'data-meds/cerner_antibiotic_481_tset_mixed.csv'

    Data
    ----
    * ./data-exp/cdr/lab/cerner/cerner_microbio_tset_mixed_infections_bt.csv
    """
    import os
    from utils import div
    from learner import Group, Feature
    from pprint import pprint

    default_path = os.path.join(
        ProjDir,
        'data-exp/cdr/lab/cerner/cerner_microbio_tset_mixed_infections_bt.csv')
    file_ = kargs.get('file_', default_path)
    print('test> path: %s' % file_)
    df = load_df(_file=file_, from_csv=True, sep=',')

    # profiling
    params = profile(df)
    div()
    pprint(params)
    div()

    # df.columns is an index object
    df = Group.canonicalize(df)  # [w1][1]
    fg = Feature(df.columns)
    # print("> total feature set: %s" % fg.total())
    feature_line = "> number of features: %d =?= %d, type: %s" % (
        len(fg.total()), len(fg.active()), type(fg.total()))
    print(feature_line)
    column_line = "> number of columns:%s type: %s, examples: %s" % (
        len(df.columns), type(df.columns), df.columns)
    print(column_line)
    div()

    # check support and indexing: the same columns are selected once by
    # position and once by an equivalent boolean mask
    columns = Series(list(df.columns[:10]))
    print("> ncols: %d, type: %s, ex: %s" %
          (len(columns), type(columns), columns))
    idx = [1, 3, 5]
    support = [False] * len(columns)
    for position in idx:
        support[position] = True
    print("> idx: %s -> features:\n %s" % (idx, columns[idx]))
    print("> support: %s -> features:\n %s" % (support, columns[support]))
    return
def t_filter():
    """Demonstrate dropping and filtering rows that contain NaN.

    Part (1) plants NaNs in a random frame and drops the affected rows from
    its first six rows. Part (2) filters out rows whose `name` is an
    'N/A'-style marker and then shows three ways of dropping rows with
    missing values.
    """
    from numpy.random import randn

    div('(1) demo dropping rows with nan ...', border=1)
    df = DataFrame(randn(10, 3),
                   columns=list('ABC'),
                   index=pd.date_range('20130101', periods=10))
    # `.ix` was removed in pandas 1.0. The rows are addressed by position,
    # so translate the position into its index label and assign via `.loc`.
    for position, column in ((6, 'A'), (6, 'B'), (2, 'A'), (4, 'B')):
        df.loc[df.index[position], column] = np.nan
    print("> show row 0-5:\n%s\n" % df.iloc[0:6])
    df2 = df.iloc[0:6].dropna()
    print("> after dropping rows with nan:\n%s\n" % df2)

    div('(2) filtering out data without NaN ...', border=1)
    df = pd.DataFrame({
        'movie': ['thg', 'thg', 'mol', 'mol', 'lob', 'lob'],
        'rating': [3., 4., 5., np.nan, np.nan, np.nan],
        'name': ['John', np.nan, 'N/A', 'Graham', np.nan, np.nan]
    })
    print("> df:\n%s\n" % df)

    # standardize nan data
    # expand=False keeps the result a Series; newer pandas would otherwise
    # return a DataFrame, which cannot be compared against df['name'] below.
    nbs = df['name'].str.extract('^(N/A|NA|na|n/a)', expand=False)
    nms = df[(df['name'] != nbs)]
    print('> nms:\n%s\n' % nms)

    # dropna(thresh=k) keeps the rows holding at least k non-NaN values.
    thresh = 2
    nms = nms.dropna(thresh=thresh)
    print('> after dropping rows with at least 2 %d nan:\n%s\n' %
          (thresh, nms))
    div()

    nms01 = nms[nms.name.notnull()]
    print('> dropped rows with na in name col:\n%s\n' % nms01)

    # nms02 = nms[np.isfinite(nms['name'])]
    # => error: ufunc 'isfinite' not supported for the input types, and the
    #    inputs could not be safely coerced to any supported types according
    #    to the casting rule ''safe''
    # print('> dropped rows with na in name col:\n%s\n' % nms02)

    nms03 = nms[pd.notnull(nms['name'])]
    print('> dropped rows with na in name col:\n%s\n' % nms03)
    return
def plot_from_csv(path, ax=None, cmap=None, legend=True):
    """Re-plot a previously saved csv file and save the figure.

    The kind of plot is chosen from the columns that are present:
    'System([band density])' (scatter of 'den'), 'den', 'E' or 'dos_at_mu'
    (line plots). Rows are accumulated into one curve until the 'Band'
    value (and the 'N' value, when that column exists) changes. The figure
    is saved through `super_save` as '<csv name>-replot'.

    Problems are reported on stderr. When the input cannot be used at all
    (`path` is not a string, the file cannot be read, or it has none of the
    recognised columns) the function returns right after the message
    instead of failing later with an unrelated NameError/AttributeError.

    Args:
        path: path of the csv file.
        ax: axes to draw on; a new canvas is created when None.
        cmap: colormap for the band colors; `DEFAULT_CMAP` when None.
        legend: whether to add a legend (never added for scatter plots).

    Returns:
        None.
    """
    if not isinstance(path, str):
        sys.stderr.write(f'the path {path} is not a string\n')
        return
    if not os.path.isfile(path):
        sys.stderr.write(f'the file {path} does not exist\n')
    if not path.endswith('.csv'):
        sys.stderr.write(f'the file {path} is not a csv file\n')
    try:
        df = pd.read_csv(path)
    except Exception:
        sys.stderr.write('Failed to read the csv file\n')
        return

    # define plot parameter and type
    if 'System([band density])' in df.columns:
        ind, plottype = 'den', 'scatter'
    elif 'den' in df.columns:
        ind, plottype = 'den', 'plot'
    elif 'E' in df.columns:
        ind, plottype = 'E', 'plot'
    elif 'dos_at_mu' in df.columns:
        ind, plottype = 'dos_at_mu', 'plot'
    else:
        sys.stderr.write('This file is not readable by toybands\n')
        return

    if cmap is None:
        cmap = DEFAULT_CMAP
    colors = make_n_colors(len(df.Band.unique()), cmap, DEFAULT_CMAP_VMIN,
                           DEFAULT_CMAP_VMAX)
    if ax is None:
        ax = make_canvas()
    if 'N' in df.columns:
        x, y, N, iBand = [], [], df.iloc[0].N, df.iloc[0].Band
    else:
        x, y, iBand = [], [], df.iloc[0].Band

    # plot
    for i in range(len(df)):
        if 'N' in df.columns:
            if df.iloc[i].N != N or df.iloc[i].Band != iBand:
                # A new level or band starts: flush the accumulated curve.
                N = df.iloc[i].N
                if plottype == 'plot':
                    if ind == 'E':
                        line, = ax.plot(x,
                                        div(y, e0),
                                        color=colors[int(iBand)],
                                        linewidth=DEFAULT_LW)
                    else:
                        line, = ax.plot(x,
                                        y,
                                        color=colors[int(iBand)],
                                        linewidth=DEFAULT_LW)
                else:
                    ax.scatter(x, y, color=colors[int(iBand)])
                if df.iloc[i].Band != iBand:
                    # Only the last curve of a band carries the legend label.
                    if plottype == 'plot':
                        line.set_label(f'Band{int(iBand)}')
                    iBand = df.iloc[i].Band
                x, y = [], []
            x.append(df.iloc[i].B)
            y.append(df.iloc[i][ind])
        else:
            if df.iloc[i].Band != iBand:
                line, = ax.plot(x,
                                y,
                                color=colors[int(iBand)],
                                linewidth=DEFAULT_LW)
                line.set_label(f'Band{int(iBand)}')
                iBand = df.iloc[i].Band
                x, y = [], []
                x.append(df.iloc[i].B)
                y.append(df.iloc[i][ind])
            else:
                x.append(df.iloc[i].B)
                y.append(df.iloc[i][ind])

    # plot the stored last curve
    if 'System([band density])' in df.columns:
        ax.scatter(x, y, color=colors[int(iBand)])
    else:
        line, = ax.plot(x,
                        y,
                        color=colors[int(iBand)],
                        linewidth=DEFAULT_LW)
        line.set_label(f'Band{int(iBand)}')

    # label and legend
    ax.set_xlabel(DEFAULT_XLABEL)
    if ind == 'den':
        ax.set_ylabel(DEFAULT_NBPLOT_YLABEL)
    elif ind == 'E':
        ax.set_ylabel(DEFAULT_EBPLOT_YLABEL)
    elif ind == 'dos_at_mu':
        ax.set_ylabel(DEFAULT_DOSBPLOT_YLABEL)
    if legend and not 'System([band density])' in df.columns:
        ax.legend(loc=DEFAULT_LEGEND_LOC, bbox_to_anchor=DEFAULT_LEGEND_POS)
    super_save(filename=path.split('/')[-1].split('.')[-2] + '-replot')
def leaderboard(request):
    """Render the leaderboard page.

    The context receives the top words (played 10+ times), the top players
    by rounds played, total score and average score, and, for an
    authenticated user, that user's own totals and ranks.

    Args:
        request: the incoming HTTP request.

    Returns:
        The rendered 'leaderboard.html' response.
    """
    context = dict()
    rnds_played_set = UserStat.objects.order_by('-rounds_played')
    total_score_set = UserStat.objects.order_by('-total_score')
    word_score_set = WordStat.objects.order_by('avg_score')

    # Only words played 10+ times are allowed on the leaderboard
    word_score_list = [w for w in word_score_set if w.rounds_played >= 10]

    # Only players who have played 10+ rounds are allowed on the leaderboard
    # Furthermore players who have not played in a month are not ranked for avg score.
    # A real list is built here: on Python 3 `filter()` returns an iterator,
    # which supports neither `.sort()` nor `len()`.
    today = date.today()
    average_score_list = [
        stat for stat in UserStat.objects.all()
        if stat.rounds_played >= 10
        and abs((today - stat.last_login).days) <= 30
    ]
    average_score_list.sort(
        key=lambda x: utils.div(x.total_score, x.rounds_played), reverse=True)

    # `is_authenticated` is a method on old Django versions and a plain
    # attribute on newer ones; support both.
    authenticated = request.user.is_authenticated
    if callable(authenticated):
        authenticated = authenticated()

    # If the user is authenticated, gather data on them in addition to just top players
    if authenticated:
        user_stat = utils.get_or_create_user_stat(request.user)
        context['rounds_played'] = user_stat.rounds_played
        context['total_score'] = round(user_stat.total_score, 2)
        context['avg_score'] = round(
            utils.div(user_stat.total_score, user_stat.rounds_played), 2)

        rounds_played_rank = utils.rank(rnds_played_set, user_stat,
                                        lambda x: x.rounds_played, 0,
                                        len(rnds_played_set) - 1)
        total_score_rank = utils.rank(total_score_set, user_stat,
                                      lambda x: x.total_score, 0,
                                      len(total_score_set) - 1)
        avg_score_rank = utils.rank(
            average_score_list, user_stat,
            lambda x: utils.div(x.total_score, x.rounds_played), 0,
            len(average_score_list) - 1)

        # A negative rank is shown as '-'.
        context['total_score_rank'] = str(total_score_rank +
                                          1) if total_score_rank >= 0 else '-'
        context['avg_score_rank'] = str(avg_score_rank +
                                        1) if avg_score_rank >= 0 else '-'
        context['rounds_played_rank'] = str(
            rounds_played_rank + 1) if rounds_played_rank >= 0 else '-'

    context['num_players'] = len(total_score_set)

    # Adding top x scores per category to context
    for i in range(min(5, len(word_score_list))):
        stat = word_score_list[i]
        context['word_rank' + str(i + 1)] = stat.word
        context['word_score' + str(i + 1)] = round(stat.avg_score, 2)
        context['sem_rel' + str(i + 1)] = ("Synonyms of " if stat.sem_rel
                                           == 'synonyms' else "Antonyms of "
                                           if stat.sem_rel == 'antonyms' else
                                           rel_a_map[stat.sem_rel] +
                                           utils.get_det(stat.word, determiners))

    for i in range(min(5, len(rnds_played_set))):
        stat = rnds_played_set[i]
        context['rnd_rank' + str(i + 1)] = stat.user.username
        context['rnd_score' + str(i + 1)] = stat.rounds_played

    for i in range(min(5, len(total_score_set))):
        stat = total_score_set[i]
        context['ttl_rank' + str(i + 1)] = stat.user.username
        context['ttl_score' + str(i + 1)] = round(stat.total_score, 2)

    for i in range(min(10, len(average_score_list))):
        stat = average_score_list[i]
        context['avg_rank' + str(i + 1)] = stat.user.username
        val = 0
        if stat.rounds_played != 0:
            val = stat.total_score / stat.rounds_played
        context['avg_score' + str(i + 1)] = round(val, 2)

    return render(request, 'leaderboard.html', context)
df[col] = df[col].astype(typ) return df except KeyError, e: msg = "convert_dtype> column %s does not exist" % col if not no_op: raise KeyError, msg except Exception, e: try: try: n0 = df.shape[0] msg = "convert_dtype> 'df[%s]' may contain NaN values. Try removing rows with NaNs\n" % col df = df[pd.notnull(df[col])] msg += "convert_dtype> size of df %d -> %d\n" % (n0, df.shape[0]) df[col] = df[col].astype(typ) if _debug: div(message=msg, symbol='*', adaptive=False) return df except: msg = "convert_dtype> Could not cast 'df[%s]' to type: %s\n" % ( col, typ) # if _debug > 0: msg += " + %s\n" % e msg += " + value:\n%s" % df.head(3) except Exception, e: msg = "convert_dtype> %s\n" % e msg += " + Invalid dataframe:\n%s" % str(df.head()) if _debug: print msg if no_op: # do nothing return df
def subset3(path,
            n_samples=None,
            balanced=True,
            ratio=0.8,
            min_ratio=0.05,
            sep=',',
            shuffle=True,
            verbose=False,
            save_=True,
            verify_=False):
    """ Take a (balanced) subset of a training set data.

    The file is loaded and profiled, and each label then contributes the
    same number of rows: `n_samples / n_labels`, or the size of the smallest
    label group when `balanced` is set and some label has fewer rows than
    that.

    Args:
        path: path of the training-set file.
        n_samples: requested subset size; defaults to `ratio` of all rows.
        balanced: fall back to the smallest label group when a label cannot
            supply its share of `n_samples`.
        ratio: fraction of rows used when `n_samples` is None.
        min_ratio: lower bound on `n_samples` as a fraction of all rows
            (None disables the bound).
        sep: column separator of the input file.
        shuffle: shuffle rows within each label and again after combining.
        verbose: print progress messages.
        save_: save the subset next to the input as `<name>-<n_samples><ext>`.
        verify_: profile the result and, when `balanced`, assert that all
            labels ended up with the same count.

    Returns:
        The subset as a DataFrame.
    """
    from utils import div
    import sys
    from pprint import pprint

    if sep != Params.sep_tset:
        div(message="Warning: a training set data are usually '%s'-separated."
            % Params.sep_tset,
            symbol='~')

    root, fname = os.path.dirname(path), os.path.basename(path)
    df = load_df(_file=path, from_csv=True, sep=sep)
    params = profile(df)
    nrow, ncol = params['nrow'], params['ncol']
    assert (nrow, ncol) == df.shape and nrow >= 2

    # determine n_samples
    # math.floor returns a float on Python 2; int() keeps the counts (and the
    # '-<n_samples>' file name suffix) integral on every version.
    n_total = nrow
    if n_samples is None:
        n_samples = int(math.floor(n_total * ratio))
    min_samples = int(math.floor(n_total *
                                 min_ratio)) if min_ratio is not None else 0
    if n_samples < min_samples:
        if verbose:
            print('subset3> n_samples:%d may be too small; adjusting to %d' %
                  (n_samples, min_samples))
        n_samples = min_samples

    # how many instances should a label have?
    labels = params['labels']
    dfl = []
    min_nrow, max_nrow = (np.inf, -np.inf)
    n_avg = int(
        n_samples /
        (len(labels) +
         0.0))  # each label should have this many instances if obtainable
    use_min_nrow = False
    for label in labels:
        n = params[str(label)]  # each label has n instances
        if balanced and n < n_avg:
            # if any of the num. of instances does not meet the average,
            # then 'optimal' balanced set is not possible
            use_min_nrow = True
            if verbose:
                msg = 'Warning: %s-labeled data are not sufficient to reach a balanced data set given n_samples=%d' % (
                    label, n_samples)
                div(message=msg, symbol='%')
        if n <= min_nrow:
            min_nrow = n
        if n >= max_nrow:
            max_nrow = n
    if verbose:
        msg = "subset3> n_avg: %d, min_nrow: %d, max_nrow: %d" % (
            n_avg, min_nrow, max_nrow)
        div(message=msg, symbol='~')

    for label in labels:
        try:
            subdf = df[df[Params.target_field] == label]
        except KeyError:
            # Only a missing target column is reported this way; any other
            # error propagates with its own traceback.
            print('subset3> label %s is not in %s?' %
                  (label, [c for c in df.columns]))
            sys.exit(1)
        if verbose:
            print(
                'subset3> prior to slicing, subset of df with label=%s has dim: %s'
                % (label, str(subdf.shape)))
        if shuffle:
            subdf = subdf.reindex(np.random.permutation(subdf.index))
        if use_min_nrow:
            subdf = subdf[:min_nrow]
        else:
            subdf = subdf[:n_avg]
        dfl.append(subdf)

    df = pd.concat(dfl, ignore_index=True)
    if shuffle:
        df = df.reindex(np.random.permutation(df.index))
    if verbose:
        print('subset3> slicing and combining completed, dim of new tset: %s' %
              str(df.shape))

    if verify_:
        params = profile(df)
        print("subset3> profile of new training set:")
        div()
        pprint(params)
        div()
        if balanced:
            nref = params[str(labels[0])]
            for label in labels[1:]:
                assert nref == params[str(
                    label)], "imbalanced training data. see profile above."

    if save_:
        # rename file
        fname_prefix, ext = os.path.splitext(fname)
        fname = (fname_prefix + '-%s' % n_samples) + ext
        path = os.path.join(root, fname)
        if verbose:
            print('subset3> saving new tset of dim %s to %s' %
                  (str(df.shape), path))
        save_df(df, _file=path, is_data=True, to_csv=True, sep=Params.sep_tset)

    return df
def t_hierarchy():
    """Pick additional target codes by root, starting from the annotated set.

    Steps: a quick `isA` check on three sample pairs; read the annotated
    `icd9` codes from the two gold-candidate files; evaluate the roots of
    the configured codes that are not annotated yet (`evalRoot`); sample as
    many root candidates as are still needed to reach 100 targets; finally
    assign the extra `otra` codes to the free root spots (`assignRoot`) and
    sample 10 of those.
    """
    import utils, configure

    X = [481.01, '112', '130.09']
    Y = [481.0, '112.0', '130.9']
    for i, x in enumerate(X):
        if isA(x, Y[i]):
            print('> %s is a %s' % (x, Y[i]))

    # root analysis
    # targets = ['047.8','112.2','038.10','038.11','112.5','047.9','038.19','090.9','135','041.9','041.6',
    #            '090.1','138','041.3','001.1','017.00','011.93','112.4','003.0','094.9','008.45',
    #            '054.2','070.71','052.7','088.81','041.7','027.0','131.01','041.89','041.85','049.9',
    #            '046.3','009.2','009.3','009.0','009.1','038.2','117.3','038.0','091.3','117.5',
    #            '038.8','117.9','054.10','041.19','136.3','041.10','041.11','031.2','031.0','031.9',
    #            '031.8','112.3','033.9','041.02','041.01','041.00','079.0','079.6','041.09','079.4',
    #            '054.13','070.51','007.1','070.32','070.30','038.3','038.49','038.43','038.42','038.40',
    #            '054.79','053.19','110.0','110.3','137.0','075','057.9','112.89','112.84','097.9',
    #            '097.1','078.5','078.0','070.70','054.3','099.9','127.4','005.9','136.9','053.9',
    #            '054.11','083.2','054.19','481','130.7','036.0','130.0','008.69','053.79','087.9',
    #            '008.61','111.9']
    otra = '112.1,112.0,112.9,072.9,096,056.9,041.8,098.86,041.4,041.5,041.2,041.0,011.12,091.0,026.9,001.9,091.9,123.1,003.1,074.0,003.9,074.8,077.99,098.0,008.6,098.2,054.0,054.6,008.8,099.40,099.41,052.9,129,088.82,057.0,039.9,008.43,010.10,131.9,039.1,133.0,079.53,040.82,099.50,099.53,099.55,099.54,039.8,090.2,035,092.9,010.01,010.00,041.1,094.0,131.00,079.51,079.83,041.86,131.09,079.88,079.89,049.8,048,042,038.1,038.9,094.89,136.1,136.8,031.1,079.98,066.3,139.8,033.0,070.54,041.04,041.03,074.3,079.2,079.1,070.22,054.40,054.43,007.4,045.90,007.2,070.59,061,078.19,077.8,070.31,078.10,078.11,004.9,046.1,038.44,038.41,058.10,053.12,053.11,084.0,084.6,110.1,070.41,110.2,110.5,110.4,110.9,110.8,054.8,134.0,054.9,010.90,057.8,078.89,078.88,040.0,055.9,112.81,078.8,097.0,078.2,078.1,111.0,002.0,127.2,099.1,099.0,099.3,054.12,053.21,070.3,053.0,034.0,034.1,130.9,111.8,036.2,132.9,088.8,008.62,132.2,132.1,132.0,088.0'
    otra = otra.split(',')

    # get annotated codes
    gfiles = [
        'gold_candidates_neg_random_gh.csv',
        'gold_candidates_pos_random_gh.csv',
    ]
    acodes = set()
    sdict = {}
    for i, f in enumerate(gfiles):
        fp = os.path.join('data-gold', f)
        # `error_bad_lines=True` used to be passed here; it was only the
        # default (raise on malformed rows) and the keyword no longer exists
        # in pandas >= 2.0, so it is omitted.
        df = pd.read_csv(fp, sep='|', header=0, index_col=False,
                         dtype={'icd9': str})
        codes = df['icd9'].values
        sdict[i] = codes
        acodes.update(codes)

    n_annotated = len(acodes)
    print('info> n_annotated: %d' % n_annotated)  # 54
    overlap = set(sdict[0]).intersection(sdict[1])
    print('info> overlap? size: %d, %s' % (len(overlap), overlap))

    total_set = configure.Params.code_set
    print('info> size of total targets: %d' % len(total_set))
    targets = list(set(total_set) - set(acodes))
    n_remaining = len(targets)
    n_targets = 100
    n_to_draw = n_targets - n_annotated
    print('info> number of remaining: %d but only need %d' %
          (n_remaining, n_to_draw))

    # otra = configure.Params.otra
    cur, freespots = evalRoot(targets, scope=None, verbose=True)
    print('> n_roots:%d, current roots:\n%s\n' % (len(cur), cur.keys()))
    utils.div()
    display(cur)

    n = n_to_draw  # setting too high may take time for UpSetR to finish
    candidates = utils.sample_hashtable(cur, n_sample=n)
    print('> sample existing %d=?=%d candidates:\n%s\n' %
          (n, len(candidates), list(candidates)))
    print('-' * 100)

    acodes.update(candidates)
    wanted = list(acodes)
    print('info> %d candidates:\n%s\n' % (len(wanted), wanted))
    print('-' * 100)

    newcodes = assignRoot(otra, freespots)
    print('> suggested pick:\n%s\n' % newcodes)
    n = 10
    candidates = utils.sample_hashtable(newcodes, n_sample=n)
    print('> sample %d=?=%d candidates:\n%s\n' %
          (n, len(candidates), list(candidates)))
    return
def random_forest(N, M, F, table, attr_indexes, attr_domains, class_index,
                  strat_index):
    """Train N trees, keep the best M, and score them on a held-out test set.

    `table` is shuffled in place and split with `test_remainder_stratified`.
    Each tree gets a random attribute subset of size F and a bootstrap
    sample of the remainder; the rows left out of the bootstrap form its
    validation set, and the tree's validation accuracy ranks it. The M
    highest-ranked trees then vote on every test row; the test rows are
    grouped with `utils.groupBy(test, 1)` and accuracy is reported per group.

    Returns:
        A list of `[minute, accuracy, correct, instances]` entries sorted by
        minute.
    """
    random.shuffle(table)
    test, remainder = test_remainder_stratified(table, strat_index)

    # setup boot straps
    attr_subsets = []
    boot_samples = []
    for _ in range(N):
        attr_subsets.append(utils.rand_attributes(attr_indexes, F))
        boot = utils.bootstrap(remainder)
        # build validator set: whatever the bootstrap did not draw
        valid = [row for row in remainder if row not in boot]
        boot_samples.append([boot, valid])

    # build trees
    trees = []
    for (boot, valid), subset in zip(boot_samples, attr_subsets):
        # returns predictions, tree
        pred, tree = train_test_tree(boot, valid, subset, attr_domains,
                                     class_index)
        hits = sum(1 for j, row in enumerate(valid)
                   if row[class_index] == pred[j])
        trees.append([tree, utils.div(hits, len(valid))])
    trees.sort(key=lambda entry: entry[1])
    mtrees = trees[len(trees) - M:]

    # predict and determine accuracy
    print("   grouping test set")
    minutes, groups = utils.groupBy(test, 1)
    print("   running classifier")
    accuracies = []
    overall_correct = 0
    total_instance = len(test)
    for position, minute in enumerate(minutes):
        group = groups[position]
        correct = 0
        for item in group:
            votes = [classify_tdidt(entry[0], item) for entry in mtrees]
            if item[class_index] == utils.majority_vote(votes):
                correct += 1
                overall_correct += 1
        accuracies.append([minute, correct / len(group), correct, len(group)])

    print("Sorting accuracies")
    accuracies.sort(key=lambda entry: entry[0])
    for minute, accuracy, correct, instances in accuracies:
        print('Minute: ', minute)
        print('   Accuracy:  ', accuracy)
        print('   Correct:   ', correct)
        print('   Instances: ', instances)
        print()
    print("Overll Accurracy: ", overall_correct / total_instance)
    print("Instances: ", total_instance)
    print("Correct: ", overall_correct)
    return accuracies
def single_match(raw, mid):
    """Build a `Match` and its `PlayerMatch` rows from a raw payload.

    Reads four sections of `raw`: `raw[0][0]` (the 'officl'/'cas' flags that
    decide the mode), `raw[1]` (inventory slots per account), `raw[2]`
    (per-player statistics) and `raw[3][0]` (version, map, length, date).

    Args:
        raw: the raw match data, indexed as described above.
        mid: id given to the new `Match`.

    Returns:
        The populated `Match`.
    """
    m = Match(id=mid)
    match_incr()

    # mode: 2 = "cs", 1 = "rnk", 3 = "acc"
    flags = raw[0][0]
    if flags['officl'] == "1" and flags['cas'] in ("1", "0"):
        m.mode = 2 if flags['cas'] == "1" else 1
    else:
        m.mode = 3

    summary = raw[3][0]
    m.version = summary['version']
    m.map_used = summary['map']
    m.length = summary['time_played']
    # timestamps look like '2014-07-27 01:31:18'
    naive_date = datetime.strptime(summary['mdt'], '%Y-%m-%d %H:%M:%S')
    m.date = utc.localize(naive_date)

    # Items per account, taken from inventory slots 1..6 (empty slots skipped).
    pitems = {}
    for inventory in raw[1]:
        slot_keys = ['slot_' + str(number) for number in range(1, 7)]
        pitems[inventory['account_id']] = [
            int(inventory[key]) for key in slot_keys if inventory[key]
        ]

    for p in raw[2]:
        account = p['account_id']
        # Players without an inventory record get an empty item list.
        if account not in pitems:
            pitems[account] = []
        player = PlayerMatch(
            player_id=int(account),
            nickname=p['nickname'],
            clan_id=int(p['clan_id']),
            hero_id=int(p['hero_id']),
            position=int(p['position']),
            items=pitems[account],
            team=int(p['team']),
            level=int(p['level']),
            win=bool(int(p['wins'])),
            concedes=int(p['concedes']),
            concedevotes=int(p['concedevotes']),
            buybacks=int(p['buybacks']),
            discos=int(p['discos']),
            kicked=int(p['kicked']),
            mmr_change=float(p['amm_team_rating']),
            herodmg=int(p['herodmg']),
            kills=int(p['herokills']),
            assists=int(p['heroassists']),
            deaths=int(p['deaths']),
            kdr=div(p['herokills'], p['deaths']),
            goldlost2death=int(p['goldlost2death']),
            secs_dead=int(p['secs_dead']),
            cs=int(p['teamcreepkills']) + int(p['neutralcreepkills']),
            bdmg=p['bdmg'],
            denies=p['denies'],
            exp_denied=p['exp_denied'],
            gpm=divmin(p['gold'], m.length),
            xpm=divmin(p['exp'], m.length),
            apm=divmin(p['actions'], m.length),
            consumables=int(p['consumables']),
            wards=int(p['wards']),
        )
        m.players.append(player)
    return m
def _update_mode_stats(p, raw, prefix, mmr_key):
    """Fill in the derived statistics of one game mode on player ``p``.

    Reads the raw counters already stored on ``p`` as ``<prefix><counter>``
    and writes ``<prefix>mmr``, the ``<prefix>avg_*`` values,
    ``<prefix>winpercent``, ``<prefix>kdr``, ``<prefix>kadr`` and
    ``<prefix>tsr``.

    Args:
        p: Player object carrying the ``<prefix>*`` counters.
        raw: Raw statistics mapping; only ``raw[mmr_key]`` is read here.
        prefix: Attribute prefix of the mode, e.g. ``'rnk_'``.
        mmr_key: Key in ``raw`` holding the mode's rating.
    """
    def stat(name):
        return getattr(p, prefix + name)

    def store(name, value):
        setattr(p, prefix + name, value)

    store('mmr', float(raw[mmr_key]))

    games = stat('games_played')
    kills = stat('herokills')
    deaths = stat('deaths')
    assists = stat('heroassists')
    team_creeps = stat('teamcreepkills')
    denies = stat('denies')
    wards = stat('wards')
    wins = stat('wins')
    secs = stat('secs')
    exp = stat('exp')
    gold = stat('gold')

    store('avg_kills', div(kills, games))
    store('avg_deaths', div(deaths, games))
    store('avg_assists', div(assists, games))
    store('avg_creeps', div((stat('neutralcreepkills') + team_creeps), games))
    store('avg_denies', div(denies, games))
    minutes = div(secs, 60)
    store('avg_xpm', div(exp, minutes))
    store('avg_apm', div(stat('actions'), minutes))
    store('avg_gpm', div(gold, minutes))
    store('avg_consumables', div(stat('consumables'), games))
    store('avg_time', div(minutes, games))
    store('winpercent', div(wins, games))
    store('kdr', div(kills, deaths))
    store('avg_wards', div(wards, games))
    store('kadr', div((kills + assists), deaths))

    losses = stat('losses')
    earning_secs = stat('time_earning_exp')
    try:
        # The formula uses plain division, so any zero counter (no deaths,
        # no games, no seconds played, ...) raises; such players get 0.
        tsr = (
            ((kills / deaths / 1.15) * 0.65)
            + ((assists / deaths / 1.55) * 1.20)
            + (((wins / (wins + losses)) / 0.55) * 0.9)
            + (((gold / secs * 60) / 230) * 0.35)
            + (((exp / earning_secs * 60) / 380) * 0.40)
            + (
                ((((denies / games) / 12) * 0.70)
                 + (((team_creeps / games) / 93) * 0.50)
                 + ((wards / games) / 1.45 * 0.30))
                * (37.5 / (secs / games / 60))
            )
        )
    except ZeroDivisionError:
        tsr = 0
    store('tsr', tsr)


def update_player(nickname, p=None):
    """Refresh (or create) a player's statistics from the remote API.

    Fetches ``/player_statistics/all/nickname/<nickname>`` through
    ``get_json``, copies the raw counters onto the player as integers, then
    derives the per-mode averages, ratios and TSR for the ``rnk_``, ``cs_``
    and ``acc_`` prefixes.

    Args:
        nickname: Nickname appended to the statistics URL.
        p: Existing player object to update. When ``None`` a new ``Player``
            is created with the account id from the response, added to the
            session, committed, and an avatar task is queued for it.

    Returns:
        The updated player, or ``None`` when the lookup returned nothing or
        an account id of 0.
    """
    raw = get_json('/player_statistics/all/nickname/' + nickname)
    if raw is None or int(raw['account_id']) == 0:
        return None

    not_exists = p is None
    if not_exists:
        p = Player(id=int(raw['account_id']))

    p.nickname = raw['nickname'].lower()

    # Raw counters copied verbatim (as ints) from the response.
    # NOTE(review): each mode lists '<prefix>teamcreepexp' twice; the second
    # occurrence may have been meant to be '<prefix>neutralcreepexp' --
    # confirm against the API payload and the Player model before changing.
    raw_fields = [
        'rnk_games_played', 'rnk_wins', 'rnk_losses', 'rnk_concedes',
        'rnk_concedevotes', 'rnk_buybacks', 'rnk_discos', 'rnk_kicked',
        'rnk_herokills', 'rnk_herodmg', 'rnk_heroexp', 'rnk_herokillsgold',
        'rnk_heroassists', 'rnk_deaths', 'rnk_goldlost2death',
        'rnk_secs_dead', 'rnk_teamcreepkills', 'rnk_teamcreepdmg',
        'rnk_teamcreepexp', 'rnk_teamcreepgold', 'rnk_neutralcreepkills',
        'rnk_neutralcreepdmg', 'rnk_teamcreepexp', 'rnk_neutralcreepgold',
        'rnk_bdmg', 'rnk_razed', 'rnk_bgold', 'rnk_denies', 'rnk_exp_denied',
        'rnk_gold', 'rnk_gold_spent', 'rnk_exp', 'rnk_actions', 'rnk_secs',
        'rnk_consumables', 'rnk_wards', 'rnk_level', 'rnk_level_exp',
        'rnk_time_earning_exp', 'rnk_bloodlust', 'rnk_doublekill',
        'rnk_triplekill', 'rnk_quadkill', 'rnk_annihilation', 'rnk_ks3',
        'rnk_ks4', 'rnk_ks5', 'rnk_ks6', 'rnk_ks7', 'rnk_ks8', 'rnk_ks9',
        'rnk_ks10', 'rnk_ks15', 'rnk_smackdown', 'rnk_humiliation',
        'rnk_nemesis', 'rnk_retribution',
        'cs_games_played', 'cs_wins', 'cs_losses', 'cs_concedes',
        'cs_concedevotes', 'cs_buybacks', 'cs_discos', 'cs_kicked',
        'cs_herokills', 'cs_herodmg', 'cs_heroexp', 'cs_herokillsgold',
        'cs_heroassists', 'cs_deaths', 'cs_goldlost2death', 'cs_secs_dead',
        'cs_teamcreepkills', 'cs_teamcreepdmg', 'cs_teamcreepexp',
        'cs_teamcreepgold', 'cs_neutralcreepkills', 'cs_neutralcreepdmg',
        'cs_teamcreepexp', 'cs_neutralcreepgold', 'cs_bdmg', 'cs_bdmgexp',
        'cs_razed', 'cs_bgold', 'cs_denies', 'cs_exp_denied', 'cs_gold',
        'cs_gold_spent', 'cs_exp', 'cs_actions', 'cs_secs',
        'cs_consumables', 'cs_wards', 'cs_level', 'cs_level_exp',
        'cs_time_earning_exp', 'cs_bloodlust', 'cs_doublekill',
        'cs_triplekill', 'cs_quadkill', 'cs_annihilation', 'cs_ks3',
        'cs_ks4', 'cs_ks5', 'cs_ks6', 'cs_ks7', 'cs_ks8', 'cs_ks9',
        'cs_ks10', 'cs_ks15', 'cs_smackdown', 'cs_humiliation',
        'cs_nemesis', 'cs_retribution',
        'acc_games_played', 'acc_wins', 'acc_losses', 'acc_concedes',
        'acc_concedevotes', 'acc_buybacks', 'acc_discos', 'acc_kicked',
        'acc_herokills', 'acc_herodmg', 'acc_heroexp', 'acc_herokillsgold',
        'acc_heroassists', 'acc_deaths', 'acc_goldlost2death',
        'acc_secs_dead', 'acc_teamcreepkills', 'acc_teamcreepdmg',
        'acc_teamcreepexp', 'acc_teamcreepgold', 'acc_neutralcreepkills',
        'acc_neutralcreepdmg', 'acc_teamcreepexp', 'acc_neutralcreepgold',
        'acc_bdmg', 'acc_bdmgexp', 'acc_razed', 'acc_bgold', 'acc_denies',
        'acc_exp_denied', 'acc_gold', 'acc_gold_spent', 'acc_exp',
        'acc_actions', 'acc_secs', 'acc_consumables', 'acc_wards',
        'acc_time_earning_exp', 'acc_bloodlust', 'acc_doublekill',
        'acc_triplekill', 'acc_quadkill', 'acc_annihilation', 'acc_ks3',
        'acc_ks4', 'acc_ks5', 'acc_ks6', 'acc_ks7', 'acc_ks8', 'acc_ks9',
        'acc_ks10', 'acc_ks15', 'acc_smackdown', 'acc_humiliation',
        'acc_nemesis', 'acc_retribution'
    ]
    for field in raw_fields:
        setattr(p, field, int(raw[field]))
    p.updated = datetime.utcnow()

    # Same derivations for every mode; only the prefix and rating key differ.
    _update_mode_stats(p, raw, 'rnk_', 'rnk_amm_team_rating')
    _update_mode_stats(p, raw, 'cs_', 'cs_amm_team_rating')
    _update_mode_stats(p, raw, 'acc_', 'acc_pub_skill')

    if not_exists:
        db.session.add(p)
        # Committed before queueing the avatar task -- presumably so the row
        # exists by the time the task looks the player up (confirm).
        db.session.commit()
        avatar.delay(p.id)
        player_incr()
    db.session.commit()
    return p