def rij(self, x_df, ntop):
    """Compute and print the pairwise r statistics for the columns of x_df.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Descriptor matrix; column labels identify the descriptors.
    ntop : int or None
        Number of top pairs to display; None shows all nC2 pairs.
    """
    x, labs_list, m, n = self.prepare(x_df)
    # nC2 pairs; hoisted so it is computed once (it was computed twice before).
    ncomb = int(st.mCn(n, 2))
    if ntop is None:  # identity comparison with None (was `== None`)
        ntop = ncomb
    rij_ar = st.rij(x)
    # st.rij returns index pairs plus two values per pair; map indices back
    # to the original column labels for display.
    rij_list = [
        [labs_list[int(rij_ar[i, 0])],
         labs_list[int(rij_ar[i, 1])],
         rij_ar[i, 2],
         rij_ar[i, 3]]
        for i in range(ncomb)
    ]
    rij_df = pd.DataFrame(rij_list, columns=["i", "j", "rij", "rij2"])
    print(rij_df[:ntop])
def gij(self, x_df, ntop):
    """Compute and print the pairwise g statistics for the columns of x_df.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Descriptor matrix; column labels identify the descriptors.
    ntop : int or None
        Number of top pairs to display; None shows all nC2 pairs.
    """
    x, labs_list, m, n = self.prepare(x_df)
    self.mc_check(x)  # pre-check on x before computing pair statistics
    # nC2 pairs; hoisted so it is computed once (it was computed twice before).
    ncomb = int(st.mCn(n, 2))
    if ntop is None:  # identity comparison with None (was `== None`)
        ntop = ncomb
    gij_ar = st.gij(x)
    # st.gij returns index pairs plus two values per pair; map indices back
    # to the original column labels for display.
    gij_list = [
        [labs_list[int(gij_ar[i, 0])],
         labs_list[int(gij_ar[i, 1])],
         gij_ar[i, 2],
         gij_ar[i, 3]]
        for i in range(ncomb)
    ]
    gij_df = pd.DataFrame(gij_list, columns=["i", "j", "gij", "gij2"])
    print(gij_df[:ntop])
def __init__(self, x_df, y_se, n_len, n_one, score, n_ktop, n_gen, nmax_pop, n_tou, p_gap, p_crs, p_mut, n_mut):
    """Set up a genetic-algorithm search over descriptor subsets.

    Copies the descriptor matrix and target values, stores the gene
    encoding and GA hyper-parameters, and announces the size of the
    combinatorial search space.
    """
    # Data: keep private copies so later mutation cannot affect the caller.
    self.labs = x_df.columns.tolist()
    self.x = x_df.values.copy()
    self.y = y_se.values.copy()

    # Gene encoding.
    self.n_len = n_len        # gene length
    self.n_one = n_one        # count of "1" bits per gene (selected descriptors)
    self.max_flag = False     # not a maximum-search problem
    self.score = score        # which score the fitness is based on
    self.n_ktop = n_ktop      # size of the globally kept top ranking

    # GA hyper-parameters.
    self.n_gen = n_gen        # number of generations to run
    self.nmax_pop = nmax_pop  # maximum population per generation
    self.n_tou = n_tou        # tournament size (larger -> stronger selection)
    self.p_gap = p_gap        # generation gap: (1-p_gap)*nmax_pop elites survive
    self.p_crs = p_crs        # crossover ratio
    self.p_mut = p_mut        # mutation ratio
    self.n_mut = n_mut        # mutation strength (number of swaps)

    # "e2" reports R2; any other score reports Q2.
    self.r_or_q = "R2" if score == "e2" else "Q2"

    ncomb = int(st.mCn(self.n_len, self.n_one))
    print(f"Genetic Algorithm for {self.n_one} descriptors: {self.n_len}C{self.n_one} = {ncomb}")
def hypervolume_comb(df, defined_list, candidate_list, k, ntop):
    """Rank all k-sized combinations of candidate columns by hypervolume.

    For each combination, the always-included `defined_list` columns are
    concatenated with the combination's columns and the hypervolume of the
    resulting matrix is computed; results are printed sorted descending.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data frame the column subsets are drawn from.
    defined_list : list
        Column labels included in every combination.
    candidate_list : list
        Column labels the k-combinations are drawn from.
    k : int
        Combination size.
    ntop : int or None
        Number of top rows to display; None shows all mCk combinations.
    """
    if ntop is None:  # identity comparison with None (was `== None`)
        ntop = int(st.mCn(len(candidate_list), k))
    def_df = make_sub_df(df, defined_list)
    # NOTE: the original also built make_sub_df(df, candidate_list) into an
    # unused local (can_df); that dead computation has been removed.
    use_list, unuse_list = make_comb_list(candidate_list, k)
    vol_list = []
    for used, unused in zip(use_list, unuse_list):
        base_df = pd.concat([def_df, make_sub_df(df, used)], axis=1)
        vol = hypervolume(base_df.values.copy())
        vol_list.append([used, unused, vol])
    vol_df = pd.DataFrame(vol_list, columns=["Used", "Unused", "Volume"])
    sorted_vol_df = vol_df.sort_values(by="Volume", ascending=False).reset_index()
    print(sorted_vol_df[:ntop], "\n")
def exhaustive_search(x_df, y_se, k, ntop, const):
    """Exhaustively evaluate every k-descriptor least-squares model.

    For each combination of k columns of x_df, fits y by QR projection and
    records the residual sum of squares (e2) and its leave-one-out analogue
    (eq2, via PRESS residuals).  Prints the top `ntop` models sorted by e2
    and by eq2, plots each ranking, and returns the descriptor label lists.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Descriptor matrix; if `const` is true, column 0 must be the constant
        term and is forced into every model.
    y_se : pandas.Series
        Target values.
    k : int
        Number of descriptors per model (including the constant when `const`).
    ntop : int or None
        Number of top models to display; None shows all.
    const : bool
        Whether a constant term occupies the first column.

    Returns
    -------
    (e_des_lis, q_des_lis) : tuple of list
        Descriptor labels of the top models ranked by e2 and by eq2.

    Bug fixed: the eq2-ranked table previously wrote its R2/Q2/Descriptors
    columns into the e2-sorted frame, printing a table whose score columns
    and descriptor column came from different sort orders.
    """
    labs_list = x_df.columns.tolist()
    n = len(labs_list)
    x = x_df.values.copy()
    y = y_se.values.copy()
    syy = np.sum((y - np.mean(y))**2)  # total sum of squares, for R2/Q2
    if const:
        # Constant term is fixed in every model, so search (n-1)C(k-1)
        # combinations over the remaining columns.
        k = k - 1
        n = n - 1
        print("Caution!:")
        print("  Constant term is always included in the search.")
        print("  The number of descriptors contained in each combination is k.")
        print("  (constant term is also regarded as a descriptor.)")
        print("  The total number of combinations should be searched is now (n-1)C(k-1).")
        print("  (constant term should be in the first column of span.)")
        print()
    ncomb = int(st.mCn(n, k))
    print(f"Exhaustive search for {k} descriptors: {n}C{k} = {ncomb}")
    di = 0.05 * ncomb  # progress display interval (every 5 %)
    cnt = 1
    e_list = []
    t0 = time.time()

    def _residuals(comb):
        """Return (e2, eq2) for the model using columns `comb` of x."""
        q, r = np.linalg.qr(x[:, comb], mode="reduced")
        qty = np.dot(q.T, y)
        e = y - np.dot(q, qty)       # OLS residuals via QR projection
        lev = np.sum(q * q, axis=1)  # leverage (diagonal of the hat matrix)
        eq = e / (1. - lev)          # PRESS (leave-one-out) residuals
        return np.dot(e, e), np.dot(eq, eq)

    # Single scoring loop; only the combination generator differs by `const`.
    if const:
        comb_iter = ([0] + list(c) for c in itertools.combinations(range(1, n + 1), k))
    else:
        comb_iter = (list(c) for c in itertools.combinations(range(n), k))
    for i, comb in enumerate(comb_iter, start=1):
        e2, eq2 = _residuals(comb)
        e_list.append([e2, eq2, comb])
        if i >= di * cnt:
            print(f"{int(i/ncomb*100): >4} % ({i}/{ncomb})")
            cnt += 1
    print()
    print(f"Exhaustive search is finished.{int(i/ncomb*100): >4} % ({i}/{ncomb})")
    print()
    t1 = time.time()
    print(f"Elapsed time for score calculatons")
    print(f"time: {t1-t0} [s]")
    print(f"time/{n}C{k} : {(t1-t0)/ncomb} [s/ncomb]")
    print()
    df = pd.DataFrame(e_list, columns=["e2", "eq2", "comb"])

    def _report(sort_col, score_name):
        """Sort by `sort_col`, print the annotated top table, plot, and
        return the descriptor label lists of the top models."""
        ts = time.time()
        sort_df = df.sort_values(by=sort_col, ascending=True).reset_index()[:ntop]
        te = time.time()
        print(f"Elapsed time for sorting (by {sort_col})")
        print(f"time : {te-ts} [s]")
        print(f"time/{n}C{k} : {(te-ts)/ncomb} [s/ncomb]")
        print()
        des_lis = []
        r2_lis = []
        q2_lis = []
        print(f"Displayed only top {ntop} (sorted by {sort_col})")
        for et, qt, cmb in zip(sort_df["e2"], sort_df["eq2"], sort_df["comb"]):
            r2_lis.append(1. - et/syy)
            q2_lis.append(1. - qt/syy)
            des_lis.append([labs_list[j] for j in cmb])
        # Annotate the SAME frame that was sorted (this was the bug: the
        # eq2 pass previously wrote into the e2-sorted frame).
        sort_df["R2"] = r2_lis
        sort_df["Q2"] = q2_lis
        sort_df["Descriptors"] = des_lis
        print(sort_df[["e2", "R2", "eq2", "Q2", "Descriptors"]])
        pt.ex_plot(x, y, sort_df["comb"], score_name)
        return des_lis

    # === e2 sort ===
    e_des_lis = _report("e2", "R2")
    print()
    # === eq2 sort ===
    q_des_lis = _report("eq2", "Q2")
    return e_des_lis, q_des_lis