Esempio n. 1
0
def read_data_sets_csv( fname, validation_rate = 0, test_rate = 0, disp = False):
	"""Load a CSV file and package it into train/validation/test DataSet_CSV holders.

	fname : path of the CSV file; features come from jpd.pd_get_xM and the
	target from the 'exp' column.
	validation_rate, test_rate : fractions handed to XY_split; a split is
	created only when the corresponding rate is positive.
	disp : forwarded to each DataSet_CSV constructor.

	Returns an ad-hoc container with .train and, optionally, .validation
	and .test attributes.
	"""
	class DataSets(object):
		pass
	data_sets = DataSets()

	frame = pd.read_csv( fname)
	features = jpd.pd_get_xM( frame)
	targets = jpd.pd_get_yV( frame, y_id = 'exp')

	X = np.array( features)
	Y = np.array( targets)
	assert X.shape[0] == Y.shape[0]

	if test_rate > 0:
		X, Y, X_test, Y_test = XY_split( X, Y, test_rate)
		data_sets.test = DataSet_CSV( X_test, Y_test, disp = disp)

	if validation_rate > 0:
		X, Y, X_val, Y_val = XY_split( X, Y, validation_rate)
		data_sets.validation = DataSet_CSV( X_val, Y_val, disp = disp)

	# Whatever remains after the optional splits becomes the training set;
	# with both rates at zero, every sample lands here.
	data_sets.train = DataSet_CSV( X, Y, disp = disp)

	return data_sets
Esempio n. 2
0
def read_data_sets_mol_molw( fname, validation_rate = 0, test_rate = 0, disp = False):
	"""Build train/validation/test data sets from the molecular-weight descriptor only.

	The molw descriptor is divided by its per-column standard deviation
	before splitting, and the descriptor length is exposed on the returned
	container as IMAGE_PIXELS for the network definition.
	"""
	class DataSets(object):
		pass
	data_sets = DataSets()

	frame = pd.read_csv( fname)
	molw = jpd.pd_get_xM_molw( frame)

	# Scale each descriptor column to unit standard deviation.
	xM = np.divide( molw, np.std( molw, axis = 0))
	yV = jpd.pd_get_yV( frame, y_id = 'exp').A1

	X = np.array( xM)
	Y = np.array( yV)
	assert X.shape[0] == Y.shape[0]

	if test_rate > 0:
		X, Y, X_test, Y_test = XY_split( X, Y, test_rate)
		data_sets.test = DataSet_CSV( X_test, Y_test, disp = disp)

	if validation_rate > 0:
		X, Y, X_val, Y_val = XY_split( X, Y, validation_rate)
		data_sets.validation = DataSet_CSV( X_val, Y_val, disp = disp)

	# Remaining samples form the training set (all of them when both rates are 0).
	data_sets.train = DataSet_CSV( X, Y, disp = disp)

	# The length of descriptors is fed back to the caller.
	data_sets.IMAGE_PIXELS = xM.shape[1]

	return data_sets
Esempio n. 3
0
	def get_xMyV( fname, y_id):
		"""Read *fname* and return the fingerprint matrix and the target vector *y_id*."""
		frame = pd.read_csv( fname)
		xM = jpd.pd_get_xM( frame)
		yV = jpd.pd_get_yV( frame, y_id = y_id)
		return xM, yV
Esempio n. 4
0
	def __init__(self, csv_name = 'sheet/ws496.csv', y_id = 'exp', n_iter = 100, graph = True, offline = False):
		"""
		Build combined fingerprint data from a CSV and set up k-fold files.

		csv_name : input CSV; must contain 'R-SMILES' and 'SMILES' columns.
		y_id : name of the target column passed to jpd.pd_get_yV.
		n_iter : iteration count stored on the instance for later use.
		graph : if True, show a histogram of the target values.
		offline : if True, skip data generation entirely and only record
			the file prefix so previously saved data can be reused.

		The data-file prefix is derived from csv_name by stripping the
		'.csv' extension, so it never needs to be supplied explicitly.
		"""

		# Strip the 4-character '.csv' suffix to get the data-file prefix.
		fname_data_prefix = csv_name[:-4]

		if offline:
			self.fname_data_prefix = fname_data_prefix

		else:
			pdr = pd.read_csv( csv_name)

			# Fingerprints from both SMILES columns, concatenated feature-wise.
			xM_r = jpd.pd_get_xM( pdr, smiles_id = 'R-SMILES')
			xM_o = jpd.pd_get_xM( pdr, smiles_id = 'SMILES')
			xM = np.concatenate( [xM_r, xM_o], axis = 1)
			print "Shape of combined xM is", xM.shape

			yV = jpd.pd_get_yV( pdr, y_id = y_id)

			if graph:
				hist( yV)
				show()

			# kFold presumably writes the fold split files using the prefix
			# (defined elsewhere in this class).
			self.kFold( xM, yV, fname_data_prefix = fname_data_prefix)

			self.n_iter = n_iter
Esempio n. 5
0
def read_data_sets_mol( fname, validation_rate = 0, test_rate = 0, disp = False):
	"""Build binary-classification data sets from several molecular descriptors.

	Fingerprints, MACCS keys, molecular weight and LASA are concatenated
	feature-wise; the 'exp' target is binarized (1 when positive, else 0).
	The descriptor length is exposed as IMAGE_PIXELS on the result.
	"""
	class DataSets(object):
		pass
	data_sets = DataSets()

	frame = pd.read_csv( fname)
	descriptor_list = [
		jpd.pd_get_xM( frame),
		jpd.pd_get_xM_MACCSkeys( frame),
		jpd.pd_get_xM_molw( frame),
		jpd.pd_get_xM_lasa( frame)]
	xM = np.concatenate( descriptor_list, axis = 1)

	continuous = jpd.pd_get_yV( frame, y_id = 'exp').A1
	# Binarize the target: classification is performed.
	yV = [int( v > 0) for v in continuous]

	X = np.array( xM)
	Y = np.array( yV)
	assert X.shape[0] == Y.shape[0]

	if test_rate > 0:
		X, Y, X_test, Y_test = XY_split( X, Y, test_rate)
		data_sets.test = DataSet_CSV( X_test, Y_test, disp = disp)

	if validation_rate > 0:
		X, Y, X_val, Y_val = XY_split( X, Y, validation_rate)
		data_sets.validation = DataSet_CSV( X_val, Y_val, disp = disp)

	# With neither split requested, every sample is training data.
	data_sets.train = DataSet_CSV( X, Y, disp = disp)

	data_sets.IMAGE_PIXELS = xM.shape[1]
	return data_sets
Esempio n. 6
0
def grid_BIKE2(pdr, alphas_log, y_id = 'Solubility_log_mol_l'):
	"""Grid-search Ridge on a similarity kernel ("BIKE") plus molecular weight.

	pdr : pandas DataFrame with a SMILES column and the target column.
	alphas_log : log-scale alpha range handed to jgrid.gs_Ridge.
	y_id : name of the target column.

	Returns the fitted grid-search object.
	"""
	print "BIKE with (A+B)+W"

	# Two descriptor families: Morgan-style fingerprints (radius 6, 4096 bits)
	# and MACCS keys.
	xM1 = jpd.pd_get_xM( pdr, radius=6, nBits=4096)
	xM2 = jpd.pd_get_xM_MACCSkeys( pdr)

	yV = jpd.pd_get_yV( pdr, y_id = y_id)

	#A1 = jpyx.calc_tm_sim_M( xM1)
	#A2 = jpyx.calc_tm_sim_M( xM2)
	#A = np.concatenate( ( A1, A2), axis = 1)
	xM = np.concatenate( ( xM1, xM2), axis = 1)
	# NOTE(review): the kernel below is computed from xM1 only, so the
	# concatenated xM above is never used — confirm whether
	# calc_tm_sim_M( xM) was intended here.
	A = jpyx.calc_tm_sim_M( xM1)
	print A.shape

	# Append molecular weight as an extra descriptor column on the kernel.
	molw_l = jchem.rdkit_molwt( pdr.SMILES.tolist())
	print np.shape( molw_l)
	A_molw = jchem.add_new_descriptor( A, molw_l)
	print A_molw.shape

	gs = jgrid.gs_Ridge( A_molw, yV, alphas_log=alphas_log)
	jutil.show_gs_alpha( gs.grid_scores_)
	
	# Cross-validate with the best alpha found by the grid search.
	jgrid.cv( 'Ridge', A_molw, yV, alpha = gs.best_params_['alpha'])
	
	return gs
Esempio n. 7
0
def read_data_sets_csv(fname, validation_rate=0, test_rate=0, disp=False):
    """Read a CSV and package it into train/validation/test DataSet_CSV objects."""
    class DataSets(object):
        pass

    container = DataSets()

    df = pd.read_csv(fname)
    X = np.array(jpd.pd_get_xM(df))
    Y = np.array(jpd.pd_get_yV(df, y_id='exp'))
    assert X.shape[0] == Y.shape[0]

    if test_rate > 0:
        X, Y, X_hold, Y_hold = XY_split(X, Y, test_rate)
        container.test = DataSet_CSV(X_hold, Y_hold, disp=disp)

    if validation_rate > 0:
        X, Y, X_hold, Y_hold = XY_split(X, Y, validation_rate)
        container.validation = DataSet_CSV(X_hold, Y_hold, disp=disp)

    # Everything left over is training data; with both rates at zero,
    # the whole file becomes the training set.
    container.train = DataSet_CSV(X, Y, disp=disp)

    return container
Esempio n. 8
0
    def get_xMyV(fname, y_id):
        """Return the fingerprint feature matrix and the *y_id* target vector."""
        df = pd.read_csv(fname)
        return jpd.pd_get_xM(df), jpd.pd_get_yV(df, y_id=y_id)
Esempio n. 9
0
def read_data_sets_mol(fname, validation_rate=0, test_rate=0, disp=False):
    """Assemble classification data sets from concatenated molecular descriptors.

    Four descriptor families (fingerprints, MACCS keys, molecular weight,
    LASA) are joined feature-wise; the 'exp' column is binarized so that
    positive values map to class 1 and the rest to class 0.
    """
    class DataSets(object):
        pass

    bundle = DataSets()

    df = pd.read_csv(fname)
    xM = np.concatenate(
        [jpd.pd_get_xM(df),
         jpd.pd_get_xM_MACCSkeys(df),
         jpd.pd_get_xM_molw(df),
         jpd.pd_get_xM_lasa(df)],
        axis=1)

    raw_y = jpd.pd_get_yV(df, y_id='exp').A1
    labels = [1 if v > 0 else 0 for v in raw_y]  # classification is performed

    X, Y = np.array(xM), np.array(labels)
    assert X.shape[0] == Y.shape[0]

    if test_rate > 0:
        X, Y, X_t, Y_t = XY_split(X, Y, test_rate)
        bundle.test = DataSet_CSV(X_t, Y_t, disp=disp)

    if validation_rate > 0:
        X, Y, X_v, Y_v = XY_split(X, Y, validation_rate)
        bundle.validation = DataSet_CSV(X_v, Y_v, disp=disp)

    # With neither split requested, every sample is training data.
    bundle.train = DataSet_CSV(X, Y, disp=disp)

    bundle.IMAGE_PIXELS = xM.shape[1]
    return bundle
Esempio n. 10
0
    def set_xy(self):
        """Populate SMILES list, logP feature matrix and target vector from self.pdr.

        Sets self.s_l, self.xM_logP and self.yV, and returns self so that
        calls can be chained (fluent style).
        """
        # Bug fix: the local alias was assigned but never used — the body kept
        # re-reading self.pdr. Use the alias consistently instead.
        pdr = self.pdr

        self.s_l = pdr.SMILES.tolist()
        # Column vector of logP values (np.mat keeps the matrix semantics
        # used elsewhere in this codebase).
        self.xM_logP = np.mat(pdr.logP.values).T
        self.yV = jpd.pd_get_yV(pdr, y_id="exp")

        return self
Esempio n. 11
0
def grid_MLR_B(pdr, alphas_log, y_id = 'Solubility_log_mol_l'):
	"""Grid-search a Ridge (MLR) model on the MACCS-keys descriptor ("B").

	pdr : pandas DataFrame holding the molecules and the target column.
	alphas_log : log-scale alpha range handed to jgrid.gs_Ridge.
	y_id : name of the target column.

	Returns the fitted grid-search object.
	"""
	print "MLR with B"

	xM2 = jpd.pd_get_xM_MACCSkeys( pdr)

	# NOTE(review): despite the name, xM_molw here is just the MACCS-keys
	# matrix — the name seems copied from a sibling function.
	xM_molw = xM2
	yV = jpd.pd_get_yV( pdr, y_id = y_id)

	gs = jgrid.gs_Ridge( xM_molw, yV, alphas_log=alphas_log)
	jutil.show_gs_alpha( gs.grid_scores_)
	
	# Cross-validate with the best alpha found by the grid search.
	jgrid.cv( 'Ridge', xM_molw, yV, alpha = gs.best_params_['alpha'])
	
	return gs
Esempio n. 12
0
def grid_MLR_A(pdr, alphas_log, y_id = 'Solubility_log_mol_l'):
	"""Grid-search a Ridge (MLR) model on the fingerprint descriptor ("A").

	pdr : pandas DataFrame holding the molecules and the target column.
	alphas_log : log-scale alpha range handed to jgrid.gs_Ridge.
	y_id : name of the target column.

	Returns the fitted grid-search object.
	"""
	print "MLR with A"

	# Fingerprints with radius 6 and 4096 bits.
	xM1 = jpd.pd_get_xM( pdr, radius=6, nBits=4096)

	# NOTE(review): despite the name, xM_molw here is just the fingerprint
	# matrix — the name seems copied from a sibling function.
	xM_molw = xM1
	yV = jpd.pd_get_yV( pdr, y_id = y_id)

	gs = jgrid.gs_Ridge( xM_molw, yV, alphas_log=alphas_log)
	jutil.show_gs_alpha( gs.grid_scores_)
	
	# Cross-validate with the best alpha found by the grid search.
	jgrid.cv( 'Ridge', xM_molw, yV, alpha = gs.best_params_['alpha'])
	
	return gs
Esempio n. 13
0
	def __init__(self, csv_name = 'sheet/ws496.csv', y_id = 'exp', n_iter = 100, graph = True, offline = False):
		"""Prepare k-fold data from *csv_name*, or reuse saved data when offline."""

		# The csv file name minus its '.csv' extension serves as the data prefix.
		fname_data_prefix = csv_name[:-4]

		if offline:
			# Saved fold files already exist; just remember where to find them.
			self.fname_data_prefix = fname_data_prefix
		else:
			frame = pd.read_csv( csv_name)

			xM = jpd.pd_get_xM( frame)
			yV = jpd.pd_get_yV( frame, y_id = y_id)

			if graph:
				# Visual sanity check of the target distribution.
				hist( yV)
				show()

			self.kFold( xM, yV, fname_data_prefix = fname_data_prefix)

			self.n_iter = n_iter
Esempio n. 14
0
    def __init__(self,
                 csv_name='sheet/ws496.csv',
                 y_id='exp',
                 n_iter=100,
                 graph=True,
                 offline=False):
        """
        Build combined fingerprint data from a CSV and set up k-fold files.

        csv_name : input CSV; must contain 'R-SMILES' and 'SMILES' columns.
        y_id : name of the target column passed to jpd.pd_get_yV.
        n_iter : iteration count stored on the instance for later use.
        graph : if True, show a histogram of the target values.
        offline : if True, skip data generation and only record the file
            prefix so previously saved data can be reused.

        The data-file prefix is derived from csv_name by stripping the
        '.csv' extension, so it never needs to be supplied explicitly.
        """

        # Strip the 4-character '.csv' suffix to get the data-file prefix.
        fname_data_prefix = csv_name[:-4]

        if offline:
            self.fname_data_prefix = fname_data_prefix

        else:
            pdr = pd.read_csv(csv_name)

            # Fingerprints from both SMILES columns, concatenated feature-wise.
            xM_r = jpd.pd_get_xM(pdr, smiles_id='R-SMILES')
            xM_o = jpd.pd_get_xM(pdr, smiles_id='SMILES')
            xM = np.concatenate([xM_r, xM_o], axis=1)
            print "Shape of combined xM is", xM.shape

            yV = jpd.pd_get_yV(pdr, y_id=y_id)

            if graph:
                hist(yV)
                show()

            # kFold presumably writes the fold split files using the prefix
            # (defined elsewhere in this class).
            self.kFold(xM, yV, fname_data_prefix=fname_data_prefix)

            self.n_iter = n_iter
Esempio n. 15
0
    def __init__(self,
                 csv_name='sheet/ws496.csv',
                 y_id='exp',
                 n_iter=100,
                 graph=True,
                 offline=False):
        """Set up k-fold training data, or just record the prefix when offline."""

        # Strip the trailing '.csv' to get the prefix for saved data files.
        fname_data_prefix = csv_name[:-4]

        if offline:
            # Reuse previously saved fold files; nothing else to do.
            self.fname_data_prefix = fname_data_prefix
            return

        df = pd.read_csv(csv_name)

        xM = jpd.pd_get_xM(df)
        yV = jpd.pd_get_yV(df, y_id=y_id)

        if graph:
            # Quick look at the target distribution.
            hist(yV)
            show()

        self.kFold(xM, yV, fname_data_prefix=fname_data_prefix)

        self.n_iter = n_iter
Esempio n. 16
0
def read_data_sets_mol_molw(fname, validation_rate=0, test_rate=0, disp=False):
    """Build data sets from the molecular-weight descriptor only.

    The descriptor is divided by its per-column standard deviation, then the
    data is optionally split into test and validation sets; whatever remains
    is the training set.  The descriptor length is exposed as IMAGE_PIXELS.
    """
    class DataSets(object):
        pass

    bundle = DataSets()

    df = pd.read_csv(fname)
    raw = jpd.pd_get_xM_molw(df)

    # Normalize each feature column to unit standard deviation.
    xM = np.divide(raw, np.std(raw, axis=0))
    yV = jpd.pd_get_yV(df, y_id='exp').A1

    X, Y = np.array(xM), np.array(yV)
    assert X.shape[0] == Y.shape[0]

    if test_rate > 0:
        X, Y, X_t, Y_t = XY_split(X, Y, test_rate)
        bundle.test = DataSet_CSV(X_t, Y_t, disp=disp)

    if validation_rate > 0:
        X, Y, X_v, Y_v = XY_split(X, Y, validation_rate)
        bundle.validation = DataSet_CSV(X_v, Y_v, disp=disp)

    # With both rates zero, all samples go to training.
    bundle.train = DataSet_CSV(X, Y, disp=disp)

    # The length of the descriptor vector is fed back to the caller.
    bundle.IMAGE_PIXELS = xM.shape[1]

    return bundle