Ejemplo n.º 1
0
 def rij(self, x_df, ntop):
     """Print the first ``ntop`` rows of the pairwise ``rij`` table.

     ``x_df`` is passed through ``self.prepare`` to obtain the data array,
     the list of labels and the size ``n``.  ``st.rij`` is then evaluated on
     the array and its first ``st.mCn(n, 2)`` rows are turned into a
     DataFrame with columns ``i``, ``j``, ``rij`` and ``rij2``.

     Args:
         x_df: Input accepted by ``self.prepare``.
         ntop: Number of rows to print; ``None`` prints one row per pair.

     Returns:
         None.  The table is only printed.
     """
     x, labs_list, m, n = self.prepare(x_df)
     # Number of result rows that are read (presumably the number of
     # unordered column pairs, nC2 -- confirm against st.mCn).
     n_pairs = int(st.mCn(n, 2))
     if ntop is None:
         ntop = n_pairs
     rij_ar = st.rij(x)
     # Columns 0 and 1 of each row index into labs_list; columns 2 and 3
     # carry the two values reported as "rij" and "rij2".
     rij_list = [
         [
             labs_list[int(rij_ar[i, 0])],
             labs_list[int(rij_ar[i, 1])],
             rij_ar[i, 2],
             rij_ar[i, 3],
         ]
         for i in range(n_pairs)
     ]
     rij_df = pd.DataFrame(rij_list, columns=["i", "j", "rij", "rij2"])
     print(rij_df[:ntop])
Ejemplo n.º 2
0
 def gij(self, x_df, ntop):
     """Print the first ``ntop`` rows of the pairwise ``gij`` table.

     ``x_df`` is passed through ``self.prepare`` and the resulting array is
     checked with ``self.mc_check`` before ``st.gij`` is evaluated on it.
     The first ``st.mCn(n, 2)`` rows of the result are turned into a
     DataFrame with columns ``i``, ``j``, ``gij`` and ``gij2``.

     Args:
         x_df: Input accepted by ``self.prepare``.
         ntop: Number of rows to print; ``None`` prints one row per pair.

     Returns:
         None.  The table is only printed.
     """
     x, labs_list, m, n = self.prepare(x_df)
     self.mc_check(x)
     # Number of result rows that are read (presumably the number of
     # unordered column pairs, nC2 -- confirm against st.mCn).
     n_pairs = int(st.mCn(n, 2))
     if ntop is None:
         ntop = n_pairs
     gij_ar = st.gij(x)
     # Columns 0 and 1 of each row index into labs_list; columns 2 and 3
     # carry the two values reported as "gij" and "gij2".
     gij_list = [
         [
             labs_list[int(gij_ar[i, 0])],
             labs_list[int(gij_ar[i, 1])],
             gij_ar[i, 2],
             gij_ar[i, 3],
         ]
         for i in range(n_pairs)
     ]
     gij_df = pd.DataFrame(gij_list, columns=["i", "j", "gij", "gij2"])
     print(gij_df[:ntop])
Ejemplo n.º 3
0
    def __init__(
        self, x_df, y_se, n_len, n_one, score, n_ktop,
        n_gen, nmax_pop, n_tou, p_gap, p_crs, p_mut, n_mut,
    ):
        """Store the data set and the genetic-algorithm settings.

        Args:
            x_df: Descriptor table; its column labels and a copy of its
                values are kept.
            y_se: Target values; a copy of ``y_se.values`` is kept.
            n_len: Length of a gene.
            n_one: Number of "1" entries in a gene.
            score: Score name; ``"e2"`` is reported as "R2", any other
                value as "Q2".
            n_ktop: Size of the kept top list (in the global ranking).
            n_gen: Number of generations.
            nmax_pop: Maximum number of populations in each generation.
            n_tou: Tournament size for selection (large: strong selection).
            p_gap: Generation gap; ``(1 - p_gap) * nmax_pop`` is the number
                of elites.
            p_crs: Crossover ratio.
            p_mut: Mutation ratio.
            n_mut: Strength of the mutation (the number of swaps).
        """
        # --- data ---
        self.labs = x_df.columns.tolist()
        self.x = x_df.values.copy()
        self.y = y_se.values.copy()

        # --- gene layout and scoring ---
        # NOTE: the "func" (score calculator) and "sw_size" (scale window
        # size for fitness calculation) attributes are currently disabled.
        self.n_len = n_len
        self.n_one = n_one
        self.max_flag = False  # maximum search problem or not
        self.score = score
        self.n_ktop = n_ktop

        # --- ga parameters ---
        self.n_gen = n_gen
        self.nmax_pop = nmax_pop
        self.n_tou = n_tou
        self.p_gap = p_gap
        self.p_crs = p_crs
        self.p_mut = p_mut
        self.n_mut = n_mut

        # Label used when reporting the selected score.
        self.r_or_q = "R2" if score == "e2" else "Q2"

        # Announce the size of the search space.
        ncomb = int(st.mCn(self.n_len, self.n_one))
        print(f"Genetic Algorithm for {self.n_one} descriptors:  {self.n_len}C{self.n_one} = {ncomb}")
Ejemplo n.º 4
0
def hypervolume_comb(df,defined_list,candidate_list,k,ntop):
    """Print candidate combinations ranked by hypervolume, largest first.

    ``make_comb_list(candidate_list, k)`` supplies the used/unused splits of
    the candidates.  For each split, the sub-frame of ``defined_list`` is
    concatenated column-wise with the sub-frame of the used candidates and
    ``hypervolume`` is evaluated on a copy of the resulting values.

    Args:
        df: Source table handed to ``make_sub_df``.
        defined_list: Selection that is part of every evaluated base.
        candidate_list: Selection from which combinations are drawn.
        k: Combination size passed to ``make_comb_list``.
        ntop: Number of ranked rows to print; ``None`` uses
            ``st.mCn(len(candidate_list), k)``.

    Returns:
        None.  The ranking is only printed.
    """
    if ntop is None:
        ntop = int(st.mCn(len(candidate_list),k))
    def_df = make_sub_df(df,defined_list)
    use_list,unuse_list = make_comb_list(candidate_list,k)
    vol_list = []
    for i,used in enumerate(use_list):
        use_df = make_sub_df(df,used)
        base_df = pd.concat([def_df,use_df],axis=1)
        # Work on a copy of the values rather than on the frame's own data.
        vol = hypervolume(base_df.values.copy())
        vol_list.append([used,unuse_list[i],vol])
    vol_df = pd.DataFrame(vol_list,columns=["Used","Unused","Volume"])
    sorted_vol_df = vol_df.sort_values(by="Volume",ascending=False).reset_index()
    print(sorted_vol_df[:ntop],"\n")
Ejemplo n.º 5
0
def _exhaustive_fit_scores(x, y, cols):
    """Return ``(e2, eq2)`` for the least-squares fit of ``y`` on ``x[:, cols]``.

    ``e2`` is the squared norm of the residual ``e``; ``eq2`` is the squared
    norm of ``e / (1 - leverage)``.
    """
    q, _ = np.linalg.qr(x[:, cols], mode="reduced")
    e = y - np.dot(q, np.dot(q.T, y))
    lev = np.sum(q * q, axis=1)  # leverage (diagonal part of hat matrix)
    eq = e / (1. - lev)
    return np.dot(e, e), np.dot(eq, eq)


def _exhaustive_rank_and_report(df, key, ntop, syy, labs_list, n, k, ncomb):
    """Sort ``df`` by ``key``, keep the top ``ntop`` rows and print them.

    Returns the top-rows DataFrame (with "R2", "Q2" and "Descriptors"
    columns added) and the list of descriptor-label lists.
    """
    t_start = time.time()
    top_df = df.sort_values(by=key, ascending=True).reset_index()[:ntop]
    t_end = time.time()

    print(f"Elapsed time for sorting (by {key})")
    print(f"time : {t_end-t_start} [s]")
    print(f"time/{n}C{k} : {(t_end-t_start)/ncomb} [s/ncomb]")
    print()

    # Own the data before adding columns, so the assignments below do not
    # write into a slice of the sorted frame.
    top_df = top_df.copy()
    des_lis = [[labs_list[j] for j in cmb] for cmb in top_df["comb"]]
    print(f"Displayed only top {ntop} (sorted by {key})")
    top_df["R2"] = [1. - et / syy for et in top_df["e2"]]
    top_df["Q2"] = [1. - qt / syy for qt in top_df["eq2"]]
    top_df["Descriptors"] = des_lis
    print(top_df[["e2", "R2", "eq2", "Q2", "Descriptors"]])
    return top_df, des_lis


def exhaustive_search(x_df,y_se,k,ntop,const):
    """Score every ``k``-column subset of ``x_df`` and report the best ones.

    Each column combination is fitted to ``y_se`` by least squares (via a
    reduced QR factorisation).  Two scores are stored per combination:
    ``e2`` (residual sum of squares) and ``eq2`` (sum of squares of the
    residuals divided by ``1 - leverage``).  The combinations are then
    ranked once by ``e2`` and once by ``eq2``; each ranking is printed
    together with ``R2 = 1 - e2/syy`` and ``Q2 = 1 - eq2/syy`` and passed
    to ``pt.ex_plot``.

    Args:
        x_df: Descriptor table; column labels are used in the report.
        y_se: Target values (``y_se.values`` is used).
        k: Number of columns per combination.  With ``const`` true this
            count includes the constant column.
        ntop: Number of top-ranked rows to keep for each ranking.
        const: If true, column 0 of ``x_df`` is included in every
            combination and only the remaining ``k - 1`` columns are
            searched over columns ``1 .. n - 1``.

    Returns:
        A pair ``(e_des_lis, q_des_lis)``: the descriptor-label lists of the
        top rows sorted by ``e2`` and by ``eq2`` respectively.

    Raises:
        ValueError: If no column combination exists for the given ``k``.
    """
    labs_list = x_df.columns.tolist()
    n = len(labs_list)
    x = x_df.values.copy()
    y = y_se.values.copy()
    syy = np.sum((y-np.mean(y))**2)
    if const:
        k = k-1
        n = n-1
        print("Caution!:")
        print("  Constant term is always included in the search.")
        print("  The number of descriptors contained in each combination is k.")
        print("  (constant term is also regarded as a descriptor.)")
        print("  The total number of combinations should be searched is now (n-1)C(k-1).")
        print("  (constant term should be in the first column of span.)")
        print()
    ncomb = int(st.mCn(n,k))
    print(f"Exhaustive search for {k} descriptors:  {n}C{k} = {ncomb}")

    di = 0.05*ncomb # display interval
    cnt = 1
    e_list = []

    t0 = time.time()

    if const:
        # Column 0 (the constant term) is prepended to every combination.
        combos = ([0, *c] for c in itertools.combinations(range(1,n+1),k))
    else:
        combos = itertools.combinations(range(n),k)

    n_done = 0
    for n_done,comb in enumerate(combos, start=1):
        e2,eq2 = _exhaustive_fit_scores(x,y,comb)
        e_list.append([e2,eq2,comb])

        if n_done >= di*cnt:
            print(f"{int(n_done/ncomb*100): >4} %  ({n_done}/{ncomb})")
            cnt += 1

    if not e_list:
        raise ValueError(
            f"no combinations to search: cannot choose {k} columns out of {n}"
        )

    print()
    print(f"Exhaustive search is finished.{int(n_done/ncomb*100): >4} %  ({n_done}/{ncomb})")
    print()
    t1 = time.time()
    print(f"Elapsed time for score calculatons")
    print(f"time: {t1-t0} [s]")
    print(f"time/{n}C{k} : {(t1-t0)/ncomb} [s/ncomb]")
    print()

    df = pd.DataFrame(e_list,columns=["e2","eq2","comb"])

    # === e2 sort ===
    e_sort_df,e_des_lis = _exhaustive_rank_and_report(
        df,"e2",ntop,syy,labs_list,n,k,ncomb)
    pt.ex_plot(x,y,e_sort_df["comb"],"R2")
    print()

    # === q2 sort ===
    # The report is built from the eq2-sorted frame itself, so the printed
    # e2/eq2 values match the R2/Q2/Descriptors columns row by row.
    q_sort_df,q_des_lis = _exhaustive_rank_and_report(
        df,"eq2",ntop,syy,labs_list,n,k,ncomb)
    pt.ex_plot(x,y,q_sort_df["comb"],"Q2")

    return e_des_lis,q_des_lis