def __init__(self, csv_name='sheet/ws496.csv', y_id='exp', n_iter=100, graph=True, offline=False):
    """
    If offline is True, the data does not need to be written again;
    the data saved earlier can be reused from now on.
    fname_data_prefix is set to the CSV file name without the '.csv' suffix,
    which is convenient to use later on. Hence, there is no need to define it
    explicitly: the prefix value is determined automatically.
    """
    fname_data_prefix = csv_name[:-4]

    if offline:
        self.fname_data_prefix = fname_data_prefix
    else:
        pdr = pd.read_csv(csv_name)
        xM_r = jpd.pd_get_xM(pdr, smiles_id='R-SMILES')
        xM_o = jpd.pd_get_xM(pdr, smiles_id='SMILES')
        xM = np.concatenate([xM_r, xM_o], axis=1)
        print("Shape of combined xM is", xM.shape)

        yV = jpd.pd_get_yV(pdr, y_id=y_id)
        if graph:
            hist(yV)
            show()

        self.kFold(xM, yV, fname_data_prefix=fname_data_prefix)

    self.n_iter = n_iter
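# Usage sketch for the constructor above. The enclosing class is not named in
# this file, so 'SolubilityRunner' below is a hypothetical placeholder; replace
# it with the actual class name. The first call reads the CSV, builds and saves
# the k-fold split; the second reuses the saved files via offline=True.
def demo_init_online_then_offline():
    runner = SolubilityRunner(csv_name='sheet/ws496.csv', y_id='exp',
                              n_iter=100, graph=False, offline=False)
    runner_cached = SolubilityRunner(csv_name='sheet/ws496.csv', offline=True)
    return runner, runner_cached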
def read_data_sets_csv(fname, validation_rate=0, test_rate=0, disp=False):
    class DataSets(object):
        pass
    data_sets = DataSets()

    pdr = pd.read_csv(fname)
    xM = jpd.pd_get_xM(pdr)
    yV = jpd.pd_get_yV(pdr, y_id='exp')

    X, Y = list(map(np.array, [xM, yV]))
    assert X.shape[0] == Y.shape[0]

    if test_rate > 0:
        X, Y, X_test, Y_test = XY_split(X, Y, test_rate)
        data_sets.test = DataSet_CSV(X_test, Y_test, disp=disp)
    if validation_rate > 0:
        X, Y, X_val, Y_val = XY_split(X, Y, validation_rate)
        data_sets.validation = DataSet_CSV(X_val, Y_val, disp=disp)

    # If test_rate and validation_rate are both zero,
    # all data is allocated to the train dataset.
    data_sets.train = DataSet_CSV(X, Y, disp=disp)

    return data_sets
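# XY_split is called above but not defined in this file. The sketch below shows
# one plausible implementation, assuming 'rate' is the fraction of samples moved
# into the held-out part and the split is a uniform random one; the project's
# actual XY_split may differ (e.g. stratified or deterministic splitting).
def XY_split_sketch(X, Y, rate):
    n = X.shape[0]
    idx = np.random.permutation(n)
    n_out = int(n * rate)
    held_idx, kept_idx = idx[:n_out], idx[n_out:]
    # Return order matches the call sites above: remaining X, Y first,
    # then the held-out X, Y.
    return X[kept_idx], Y[kept_idx], X[held_idx], Y[held_idx]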
def get_xMyV(fname, y_id):
    pdr = pd.read_csv(fname)
    xM_fp = jpd.pd_get_xM(pdr)
    xM = xM_fp
    yV = jpd.pd_get_yV(pdr, y_id=y_id)
    return xM, yV
def read_data_sets_mol(fname, validation_rate=0, test_rate=0, disp=False):
    class DataSets(object):
        pass
    data_sets = DataSets()

    pdr = pd.read_csv(fname)
    xM_fp = jpd.pd_get_xM(pdr)
    xM_key = jpd.pd_get_xM_MACCSkeys(pdr)
    xM_molw = jpd.pd_get_xM_molw(pdr)
    xM_lasa = jpd.pd_get_xM_lasa(pdr)
    xM = np.concatenate([xM_fp, xM_key, xM_molw, xM_lasa], axis=1)

    yV = jpd.pd_get_yV(pdr, y_id='exp').A1
    yV = [1 if y > 0 else 0 for y in yV]  # classification is performed

    X, Y = list(map(np.array, [xM, yV]))
    assert X.shape[0] == Y.shape[0]

    if test_rate > 0:
        X, Y, X_test, Y_test = XY_split(X, Y, test_rate)
        data_sets.test = DataSet_CSV(X_test, Y_test, disp=disp)
    if validation_rate > 0:
        X, Y, X_val, Y_val = XY_split(X, Y, validation_rate)
        data_sets.validation = DataSet_CSV(X_val, Y_val, disp=disp)

    # If test_rate and validation_rate are both zero,
    # all data is allocated to the train dataset.
    data_sets.train = DataSet_CSV(X, Y, disp=disp)
    data_sets.IMAGE_PIXELS = xM.shape[1]

    return data_sets
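# DataSet_CSV is used by both readers above but not defined in this file. A
# minimal sketch of the container they assume is given below; the real class
# likely adds TensorFlow-style helpers such as next_batch(), so treat this only
# as an illustration of the expected constructor signature.
class DataSetSketch(object):
    def __init__(self, X, Y, disp=False):
        assert X.shape[0] == Y.shape[0]
        self.images = X                  # feature matrix, one row per molecule
        self.labels = Y                  # regression targets or 0/1 class labels
        self.num_examples = X.shape[0]
        if disp:
            print("X shape:", X.shape, "Y shape:", Y.shape)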
def grid_BIKE2(pdr, alphas_log, y_id='Solubility_log_mol_l'):
    print("BIKE with (A+B)+W")

    xM1 = jpd.pd_get_xM(pdr, radius=6, nBits=4096)
    xM2 = jpd.pd_get_xM_MACCSkeys(pdr)
    yV = jpd.pd_get_yV(pdr, y_id=y_id)

    # A1 = jpyx.calc_tm_sim_M( xM1)
    # A2 = jpyx.calc_tm_sim_M( xM2)
    # A = np.concatenate( ( A1, A2), axis = 1)
    xM = np.concatenate((xM1, xM2), axis=1)
    A = jpyx.calc_tm_sim_M(xM1)
    print(A.shape)

    molw_l = jchem.rdkit_molwt(pdr.SMILES.tolist())
    print(np.shape(molw_l))

    # Append molecular weight as an extra descriptor column.
    A_molw = jchem.add_new_descriptor(A, molw_l)
    print(A_molw.shape)

    gs = jgrid.gs_Ridge(A_molw, yV, alphas_log=alphas_log)
    jutil.show_gs_alpha(gs.grid_scores_)
    jgrid.cv('Ridge', A_molw, yV, alpha=gs.best_params_['alpha'])

    return gs
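# Usage sketch for grid_BIKE2 above. 'sheet/solubility.csv' is a hypothetical
# file name and the alpha grid is an illustrative choice; alphas_log is assumed
# to hold log10 exponents of the Ridge regularization strength, as its name and
# the other gs_Ridge call sites in this file suggest.
def demo_grid_BIKE2():
    pdr = pd.read_csv('sheet/solubility.csv')
    alphas_log = np.linspace(-3, 2, 6)
    gs = grid_BIKE2(pdr, alphas_log, y_id='Solubility_log_mol_l')
    return gs.best_params_['alpha']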
def grid_MLR_A(pdr, alphas_log, y_id='Solubility_log_mol_l'):
    print("MLR with A")

    xM1 = jpd.pd_get_xM(pdr, radius=6, nBits=4096)
    xM_molw = xM1
    yV = jpd.pd_get_yV(pdr, y_id=y_id)

    gs = jgrid.gs_Ridge(xM_molw, yV, alphas_log=alphas_log)
    jutil.show_gs_alpha(gs.grid_scores_)
    jgrid.cv('Ridge', xM_molw, yV, alpha=gs.best_params_['alpha'])

    return gs
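# Side-by-side sketch comparing the plain-fingerprint model (grid_MLR_A) with
# the similarity-plus-molecular-weight model (grid_BIKE2) on the same sheet.
# The file name is hypothetical, and best_score_ is a scikit-learn GridSearchCV
# attribute; this assumes jgrid.gs_Ridge returns such an object, which the use
# of grid_scores_ above suggests but does not guarantee.
def demo_compare_MLR_vs_BIKE(csv_name='sheet/solubility.csv'):
    pdr = pd.read_csv(csv_name)
    alphas_log = np.linspace(-3, 2, 6)
    gs_mlr = grid_MLR_A(pdr, alphas_log)
    gs_bike = grid_BIKE2(pdr, alphas_log)
    return gs_mlr.best_score_, gs_bike.best_score_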
def __init__(self, csv_name='sheet/ws496.csv', y_id='exp', n_iter=100, graph=True, offline=False):
    fname_data_prefix = csv_name[:-4]

    if offline:
        self.fname_data_prefix = fname_data_prefix
    else:
        pdr = pd.read_csv(csv_name)
        xM = jpd.pd_get_xM(pdr)
        yV = jpd.pd_get_yV(pdr, y_id=y_id)
        if graph:
            hist(yV)
            show()
        self.kFold(xM, yV, fname_data_prefix=fname_data_prefix)

    self.n_iter = n_iter