def fit(self, train_x, train_y, evals, top):
    m, n = train_x.shape
    # build user -> item and item -> user rating maps
    for i in range(m):
        uid, iid, rating = train_x[i][0], train_x[i][1], train_y[i]
        self.__user_item.setdefault(uid, {})
        self.__user_item[uid][iid] = rating
        self.__item_user.setdefault(iid, {})
        self.__item_user[iid][uid] = rating
        self.items_pool.append(iid)
    # initialize latent factor vectors with small random values
    for u in self.__user_item:
        self.p.setdefault(u, np_random(self.__f) / np.sqrt(self.__f))
    for i in self.__item_user:
        self.q.setdefault(i, np_random(self.__f) / np.sqrt(self.__f))
    # SGD over all observed (user, item, rating) triples
    for step in range(self.__steps):
        for u in self.__user_item:
            dict_items = self.__user_item[u]
            for i, r in dict_items.items():
                e = r - np.dot(self.p[u], self.q[i])
                # copy() is required here: "+=" on a numpy array mutates it
                # in place, so without the copy the p-update below would see
                # the already-updated q instead of the old one
                tmp = self.q[i].copy()
                self.q[i] += self.__lr * (e * self.p[u] - self.__lambda * tmp)
                self.p[u] += self.__lr * (e * tmp - self.__lambda * self.p[u])
        self.__lr *= 0.9  # decay the learning rate each pass
        print("Iteration %d finished!" % (step + 1))
        self.__loss()
        if evals and top:
            self.evals(evals[0], evals[1], top)
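
# A minimal self-contained sketch of the regularized SGD update used in
# fit() above, assuming only numpy. The class attributes (self.p, self.q,
# self.__lr, self.__lambda, self.__f) are replaced by local names; this
# illustrates the update rule, not the class's actual API.
import numpy as np

def sgd_step(p_u, q_i, rating, lr=0.02, lam=0.01):
    """One latent-factor update for a single (user, item, rating) triple."""
    e = rating - np.dot(p_u, q_i)   # prediction error
    q_old = q_i.copy()              # cache q before the in-place update
    q_i += lr * (e * p_u - lam * q_i)
    p_u += lr * (e * q_old - lam * p_u)
    return e

f = 8                               # number of latent factors
p_u = np.random.random(f) / np.sqrt(f)
q_i = np.random.random(f) / np.sqrt(f)
for _ in range(10):
    sgd_step(p_u, q_i, rating=4.0)  # error shrinks as the factors converge
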
def main():
    import contextlib
    import time

    @contextlib.contextmanager
    def measure(title=""):
        if title:
            title = " " + title
        print("start%s" % title)
        started = time.time()
        yield
        needed = time.time() - started
        print("running%s needed %.2f seconds" % (title, needed))

    import emzed

    with measure("load pm"):
        pm = emzed.io.loadPeakMap("141208_pos001.mzXML")

    import copy

    # create modified copy
    pm2 = copy.deepcopy(pm)
    pm2.spectra = pm2.spectra[1:]

    pms = [pm, pm2]

    n = 10000
    integers = list(reversed(range(n)))
    for k in range(0, n, 10):
        integers[k] = None

    from numpy.random import randint, random as np_random
    import random

    tuples = [tuple(randint(0, 1000, size=10)) for _ in range(100)]

    with measure("create table"):
        t = emzed.utils.toTable("integers", integers, type_=int)
        t.addColumn("mzmin", t.apply(lambda: 100 + 900 * np_random() + np_random(), ()),
                    type_=float)
        t.addColumn("mzmax", t.apply(lambda mzmin: mzmin + 0.1 * np_random(), (t.mzmin,)),
                    type_=float)
        t.addColumn("rtmin", t.apply(lambda: 50 + 1000 * np_random(), ()), type_=float)
        t.addColumn("rtmax", t.apply(lambda rtmin: rtmin + 10 + 60 * np_random(), (t.rtmin,)),
                    type_=float)
        t.addColumn("peakmap", t.apply(lambda: random.choice(pms), ()), type_=object)
        for i in range(30):
            t.addColumn("floats_%d" % i, t.integers + 1.1, type_=float)
            t.addColumn("strings_%d" % i, t.integers.apply(str) * (i % 3), type_=str)
            t.addColumn("tuples_%d" % i, t.apply(lambda: random.choice(tuples), ()),
                        type_=object)
            t.addColumn("peakmaps_%d" % i, pms[i % 2], type_=object)

    with measure("write hdf5 table with %d rows and %d cols" % t.shape):
        # to_hdf5 is assumed to be provided by the surrounding module
        to_hdf5(t, "test.hdf5")
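
# The measure() helper above is the only piece of this script that is not
# emzed-specific and can be reused standalone. A variant sketch using
# time.perf_counter, which is monotonic and better suited to benchmarking
# than time.time, with a try/finally so the timing is reported even when
# the timed block raises:
import contextlib
import time

@contextlib.contextmanager
def measure(title=""):
    label = " " + title if title else ""
    print("start%s" % label)
    started = time.perf_counter()
    try:
        yield
    finally:
        needed = time.perf_counter() - started
        print("running%s needed %.2f seconds" % (label, needed))

with measure("sum 10**6 ints"):
    total = sum(range(10**6))
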
def main():
    # relies on the module-level equivalents of the imports used above
    # (emzed, copy, random, randint, np_random) plus measure, CheckState,
    # Table and atomic_hdf5_writer from the surrounding module
    with measure("load pm"):
        # pm = emzed.io.loadPeakMap("141208_pos001.mzXML")
        pm = emzed.io.loadPeakMap("Danu.mzML")

    # create modified copy
    pm2 = copy.deepcopy(pm)
    pm2.spectra = pm2.spectra[1:]

    rtmin, rtmax = pm.rtRange()

    pms = [pm, pm2]

    n = 10000
    integers = list(reversed(range(n)))
    for k in range(0, n, 10):
        integers[k] = None

    flags = [i % 2 == 0 for i in range(n)]

    tuples = [tuple(randint(0, 1000, size=10)) for _ in range(100)]

    tsub = emzed.utils.toTable("a", (1, 2, 3), type_=int)

    with measure("create table"):
        t = emzed.utils.toTable("integers", integers, type_=int)
        t.addColumn("check", flags, type_=CheckState)
        t.addColumn("sub_table", tsub, type_=Table, format_="%r")
        t.addColumn("mzmin", t.apply(lambda: 100 + 900 * np_random() + np_random(), ()),
                    type_=float)
        t.addColumn("mzmax", t.apply(lambda mzmin: mzmin + 0.1 * np_random(), (t.mzmin,)),
                    type_=float)
        t.addColumn("rtmin", t.apply(lambda: rtmin + (rtmax - rtmin) * np_random(), ()),
                    type_=float)
        t.addColumn("rtmax", t.apply(lambda rtmin: rtmin + 30 + 300 * np_random(), (t.rtmin,)),
                    type_=float)
        t.addColumn("peakmap", t.apply(lambda: random.choice(pms), ()), type_=object)
        for i in range(10):
            print(i)
            t.addColumn("floats_%d" % i, t.integers + 1.1, type_=float)
            t.addColumn("strings_%d" % i, t.integers.apply(str) * (i % 3), type_=str)
            t.addColumn("tuples_%d" % i, t.apply(lambda: random.choice(tuples), ()),
                        type_=object)
            t.addColumn("peakmaps_%d" % i, pms[i % 2], type_=object)

        # every 100th row starts a block of four rows sharing one target id
        target_ids = [None] * n
        for i in range(0, n, 100):
            for j in range(4):
                target_ids[i + j] = "target_%d" % i

        t.addColumn("target_id", target_ids, type_=str)

    n, m = t.shape

    # for fac in (1, 10):
    for fac in (1,):
        n0 = n * fac
        with measure("write hdf5 table with %d rows and %d cols" % (n0, m)):
            path = "test_%d.hdf5" % n0
            with atomic_hdf5_writer(path) as add:
                for i in range(fac):
                    print(i, "out of", fac)
                    add(t)
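
# atomic_hdf5_writer is used above but not defined here. A plausible sketch
# of the pattern its name suggests -- write to a temporary file and rename
# it into place only on success, so an aborted run never leaves a truncated
# file behind. The actual emzed implementation may differ; the names below
# are illustrative only.
import contextlib
import os

@contextlib.contextmanager
def atomic_writer_sketch(path):
    tmp_path = path + ".part"
    try:
        with open(tmp_path, "wb") as fh:
            yield fh.write              # the 'add' callable, mirroring the usage above
        os.replace(tmp_path, path)      # atomic rename once writing succeeded
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)         # drop the partial file on failure
        raise

with atomic_writer_sketch("test.bin") as add:
    add(b"payload")
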
def bayesian_pattern_based(self, y, r_matrix, init_rules):
    # |A| : min((rule_space)/2, (rule_space + beta_l - alpha_l)/2)
    self.Asize = [[
        min(self.pattern_space[l] / 2,
            0.5 * (self.pattern_space[l] + self.beta_l[l] - self.alpha_l[l]))
        for l in range(self.maxlen + 1)
    ]]
    # support threshold
    self.C = [1]
    self.maps = defaultdict(list)
    T0 = 1000

    rules_curr = init_rules
    pt_curr = -1000000000
    # now only consider 1 chain; it should have been maps[chain]
    self.maps[0].append([
        -1, [pt_curr / 3, pt_curr / 3, pt_curr / 3], rules_curr,
        [self.rules[i] for i in rules_curr], []
    ])

    alpha = np.inf
    for ith_iter in range(self.max_iter):
        rules_new = self.propose(rules_curr, y, r_matrix)
        cfmatrix, prob = self.compute_prob(r_matrix, y, rules_new)
        # annealing temperature: decays from T0 towards 1 over max_iter steps
        T = T0 ** (1 - ith_iter / self.max_iter)
        pt_new = sum(prob)
        # logger.debug("pt_new: %f, pt_curr: %f, T: %f, float(pt_new - pt_curr): %f" %
        #              (pt_new, pt_curr, T, float(pt_new - pt_curr)))
        if ith_iter > 0:
            # The original Wang et al. code did not have this check and raised
            # a RuntimeWarning because np.exp() was passed a very large number
            # (-pt_curr = 1000000000 in the 0-th iteration). We do not expect
            # the algorithm's performance to change with this check, and it
            # avoids the RuntimeWarning.
            alpha = np.exp(float(pt_new - pt_curr) / T)
        if pt_new > sum(self.maps[0][-1][1]):
            if False:
                logger.debug(
                    '\n** chain = {}, max at iter = {} ** \n accuracy = {}, TP = {},FP = {}, TN = {}, FN = {}\n '
                    'old is {}, pt_new is {}, prior_ChsRules={}, likelihood_1 = {}, likelihood_2 = {}\n '
                    .format(self.chains, ith_iter,
                            (cfmatrix[0] + cfmatrix[2] + 0.0) / len(y),
                            cfmatrix[0], cfmatrix[1], cfmatrix[2], cfmatrix[3],
                            sum(self.maps[0][-1][1]) + 0.1, sum(prob),
                            prob[0], prob[1], prob[2]))
            # logger.debug("rules_new: %s" % str(rules_new))
            # logger.debug("const_denominator: %s" % str(self.const_denominator))
            self.Asize.append([
                np.floor(
                    min(self.Asize[-1][l],
                        (-pt_new + self.Lup + self.P0) /
                        max(1., self.const_denominator[l])))
                for l in range(self.maxlen + 1)
            ])
            self.const_denominator = [
                np.log(
                    np.true_divide(
                        max(1., self.pattern_space[l] + self.beta_l[l] - 1),
                        max(1., self.Asize[-1][l] + self.alpha_l[l] - 1)))
                for l in range(self.maxlen + 1)
            ]
            self.maps[0].append([
                ith_iter, prob, rules_new,
                [self.rules[i] for i in rules_new], cfmatrix
            ])
            new_supp = np.ceil(
                np.log(
                    max([
                        np.true_divide(
                            self.pattern_space[l] - self.Asize[-1][l] + self.beta_l[l],
                            max(1., self.Asize[-1][l] - 1 + self.alpha_l[l]))
                        for l in range(1, self.maxlen + 1, 1)
                    ])))
            self.C.append(new_supp)
            self.predicted_rules = rules_new
        # Metropolis acceptance: always accept improvements, accept worse
        # proposals with probability alpha
        if np_random() <= alpha:
            rules_curr, pt_curr = rules_new[:], pt_new
    return self.maps[0]
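
# The accept step above is the Metropolis criterion under simulated
# annealing: T decays from T0 toward 1, and a worse proposal
# (pt_new < pt_curr) is accepted with probability exp((pt_new - pt_curr) / T).
# A minimal standalone sketch with a toy score function (independent of the
# class above; all names here are illustrative):
import numpy as np
from numpy.random import random as np_random

def anneal(score, init, perturb, max_iter=1000, T0=1000.0):
    curr, pt_curr = init, score(init)
    best, pt_best = curr, pt_curr
    for i in range(max_iter):
        T = T0 ** (1 - i / max_iter)    # same cooling schedule as above
        cand = perturb(curr)
        pt_new = score(cand)
        # min(0., .) caps the acceptance probability at 1 and avoids overflow
        if np_random() <= np.exp(min(0., pt_new - pt_curr) / T):
            curr, pt_curr = cand, pt_new
        if pt_curr > pt_best:
            best, pt_best = curr, pt_curr
    return best, pt_best

# maximize -(x - 3)^2, i.e. find x close to 3
best, _ = anneal(lambda x: -(x - 3.0) ** 2, 0.0,
                 lambda x: x + np.random.normal(scale=0.5))
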
def propose(self, rules_curr, y, r_matrix):
    """
    Propose a modification to the current set of rules.

    :param rules_curr: np.array
        indexes of rules currently in play
    :param y: np.array
    :param r_matrix: np.ndarray
        satisfaction matrix for all the rules in play
    :return: np.array
        proposed set of rules
    """
    # ex is an instance selected at random among the misclassified ones
    ex = None
    yhat = self.check_satisfies_at_least_one_rule(r_matrix, rules_curr)
    incorr = np.where(y != yhat)[0]
    rules_curr_len = len(rules_curr)

    move = ['clean']
    if len(incorr) > 0:
        ex = sample(list(incorr), 1)[0]
        t = np_random()
        if y[ex] == 1 or rules_curr_len == 1:
            if t < 1.0 / 2 or rules_curr_len == 1:
                move = ['add']
            else:
                move = ['cut', 'add']
        else:
            if t < 1.0 / 2:
                move = ['cut']
            else:
                move = ['cut', 'add']
    # logger.debug("move: %s" % str(move))

    # 'cut' a rule
    if move[0] == 'cut':
        try:
            if np_random() < self.propose_threshold:
                # uniform cut among rules that cover the misclassified instance
                candidate = []
                for rule in rules_curr:
                    if r_matrix[ex, rule]:
                        candidate.append(rule)
                if len(candidate) == 0:
                    candidate = rules_curr
                cut_rule = sample(candidate, 1)[0]
            else:
                # cut with probability proportional to the precision of the
                # rule set with each rule removed in turn
                p = []
                all_sum = np.zeros(r_matrix.shape[0], dtype=int)
                for rule in rules_curr:
                    all_sum = all_sum + r_matrix[:, rule]
                for ith_rule, rule in enumerate(rules_curr):
                    yhat = (all_sum - r_matrix[:, rule]) > 0
                    TP, FP, TN, FN = get_confusion(yhat, y)
                    p.append(TP.astype(float) / (TP + FP + 1))
                p = [x - min(p) for x in p]
                p = np.exp(p)
                p = np.insert(p, 0, 0)
                p = np.array(list(accumulate(p)))
                if p[-1] == 0:
                    index = sample(list(range(len(rules_curr))), 1)[0]
                else:
                    p = p / p[-1]
                    index = find_lt(p, np_random())
                cut_rule = rules_curr[index]
            rules_curr.remove(cut_rule)
            move.remove('cut')
        except:
            move.remove('cut')

    # 'add' a rule
    if len(move) > 0 and move[0] == 'add':
        # note: '&' binds tighter than '>', so the parentheses below make
        # the original grouping explicit without changing it
        if y[ex] == 1:
            select = np.where(((self.supp > self.C[-1]) & ~r_matrix[ex]) > 0)[0]
        else:
            select = np.where(((self.supp > self.C[-1]) & r_matrix[ex]) > 0)[0]
        if len(select) > 0:
            if np_random() < self.propose_threshold:
                add_rule = sample(select.tolist(), 1)[0]
            else:
                Yhat_neg_index = np.where(
                    ~self.check_satisfies_at_least_one_rule(r_matrix, rules_curr))[0]
                # In case Yhat_neg_index is []
                if Yhat_neg_index.shape[0] == 0:
                    return rules_curr
                mat = (r_matrix[Yhat_neg_index.reshape(-1, 1), select].transpose()
                       & y[Yhat_neg_index].astype(int))
                TP = np.sum(mat, axis=1)
                FP = np.array(
                    np.sum(r_matrix[Yhat_neg_index.reshape(-1, 1), select], axis=0) - TP)
                p = (TP.astype(float) / (TP + FP + 1))
                add_rule = select[sample(list(np.where(p == max(p))[0]), 1)[0]]
            try:
                if add_rule not in rules_curr:
                    rules_curr.append(add_rule)
            except:
                pass

    return rules_curr
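
# The non-uniform 'cut' branch above samples a rule in proportion to
# exp-shifted precision scores by building a cumulative-sum array and doing
# an inverse-CDF lookup (find_lt). A standalone sketch of that sampling
# step, using bisect from the standard library in place of find_lt:
import bisect
from itertools import accumulate

import numpy as np
from numpy.random import random as np_random

def sample_index(scores):
    """Sample index i with probability proportional to exp(scores[i] - min(scores))."""
    shifted = np.exp(np.asarray(scores, dtype=float) - min(scores))
    cdf = list(accumulate(shifted))
    # inverse-CDF sampling: draw u in [0, total) and find its bucket
    return bisect.bisect_right(cdf, np_random() * cdf[-1])

counts = [0, 0, 0]
for _ in range(10000):
    counts[sample_index([0.1, 0.5, 0.2])] += 1
print(counts)   # index 1 is drawn most often
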
def decide_to_connect(self):
    # Bernoulli trial: True with probability self.prob
    return np_random() < self.prob
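
# decide_to_connect() is a single Bernoulli trial: with np_random drawing
# uniformly from [0, 1), the comparison is True with probability self.prob.
# A quick empirical check of that behavior:
from numpy.random import random as np_random

prob, trials = 0.3, 100000
hits = sum(np_random() < prob for _ in range(trials))
print(hits / trials)   # close to 0.3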