# Default parameters handed to BernoulliNB. When an ML parameter file is
# given on the command line (options.ml), the dict returned by
# ml_func.readMLFile replaces these defaults.
ml_dict = dict(alpha=1.0, binarize=None, fit_prior=True)
if options.ml:
    ml_dict = ml_func.readMLFile(ml_dict, read_dict, path+options.ml)

# initialize machine-learning method
ml = BernoulliNB(alpha=ml_dict['alpha'],
                 binarize=ml_dict['binarize'],
                 fit_prior=ml_dict['fit_prior'])

# loop over targets
for target in conf.set_data:
    # single-argument print(...) gives the same output under Python 2
    print(target)

    # read in training actives and calculate fps
    # NOTE(review): the file mode 'r' is kept from the original; pickles
    # written with a binary protocol need 'rb' on some platforms - confirm.
    # NOTE(review): unpickling executes arbitrary code; only load files
    # that come from a trusted source.
    train_path = inpath_cmp+'ChEMBL_II/Target_no_'+str(target)+'.pkl'
    with open(train_path, 'r') as train_file:
        actives = cPickle.load(train_file)
    for k in actives:
        mols = actives[k]
        for i, m in enumerate(mols):
            # m[1] is what getFP consumes (presumably the SMILES - confirm)
            fp_dict = scor.getFP(fp_build, m[1])
            # replace the entry in place by [generated ID, dict with fps];
            # the ID is <target>_<key>_A_<1-based position>
            mols[i] = [str(target)+'_'+str(k)+'_A_'+str(i+1), fp_dict]

    # read in test actives and calculate fps
    div_actives = []
    test_path = (inpath_cmp+'ChEMBL/cmp_list_ChEMBL_'+str(target)
                 + '_actives.dat.gz')
    # the with-block closes the gzip handle even if getFP raises
    with gzip.open(test_path, 'r') as test_file:
        for line in test_file:
            # lines starting with '#' are comments
            if line.startswith('#'):
                continue
            # structure of line: [external ID, internal ID, SMILES]
            fields = line.split()
            # skip blank lines instead of failing on fields[2]
            if not fields:
                continue
            fp_dict = scor.getFP(fp_build, fields[2])
            # store: [internal ID, dict with fps]
            div_actives.append([fields[1], fp_dict])
    # taken from the configuration, not from len(div_actives)
    num_test_actives = conf.num_div_act - 1
    # convert fps to numpy arrays
    np_fps_div_act = ml_func.getNumpy(div_actives)
# loop over data-set sources for dataset in conf.set_data.keys(): print dataset # loop over targets for target in conf.set_data[dataset]['ids']: print target # read in actives and calculate fps actives = [] for line in gzip.open( inpath_cmp + dataset + '/cmp_list_' + dataset + '_' + str(target) + '_actives.dat.gz', 'r'): if line[0] != '#': # structure of line: [external ID, internal ID, SMILES]] line = line.rstrip().split() fp_dict = scor.getFP(fp_build, line[2]) # store: [internal ID, dict with fps] actives.append([line[1], fp_dict]) num_actives = len(actives) num_test_actives = num_actives - num_query_mols # convert fps to numpy arrays np_fps_act = ml_func.getNumpy(actives) # read in decoys and calculate fps if dataset == 'ChEMBL': if firstchembl: decoys = [] for line in gzip.open( inpath_cmp + dataset + '/cmp_list_' + dataset + '_zinc_decoys.dat.gz', 'r'): if line[0] != '#':