# Load the perceptual descriptors data.
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data('training')
loading.format_leaderboard_perceptual_data()

# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)

# Show the metadata and perceptual descriptor values for the first compound.
# NOTE(review): this prints row index 1, not 0 -- confirm which observation
# row is actually intended here.
print(perceptual_obs_data[1])

# Everything from the seventh header field onward is counted as a perceptual
# descriptor (presumably the first six fields are metadata -- TODO confirm).
num_descriptors = len(perceptual_headers[6:])
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print('There are %d different perceptual descriptors and %d different subjects'
      % (num_descriptors, num_subjects))

# Load the molecular descriptors data.
molecular_headers, molecular_data = loading.load_molecular_data()
print("First ten molecular descriptor types are %s" % molecular_headers[:10])
print("First ten descriptor values for the first compound are %s"
      % molecular_data[0][:10])

# The first field of every row is read as the molecule identifier; count the
# distinct identifiers in each data set.
total_size = len({int(row[0]) for row in molecular_data})
print("We have molecular descriptors for %d unique molecules" % total_size)

training_size = len({int(row[0]) for row in perceptual_obs_data})
print("We have perceptual data for %d unique molecules" % training_size)

# Molecules that have molecular descriptors but no perceptual training data.
remaining_size = total_size - training_size
loading.format_leaderboard_perceptual_data()

# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)

# Show the metadata and perceptual descriptor values for the first compound.
# NOTE(review): this prints row index 1, not 0 -- confirm which observation
# row is actually intended here.
print(perceptual_obs_data[1])

# Everything from the seventh header field onward is counted as a perceptual
# descriptor (presumably the first six fields are metadata -- TODO confirm).
num_descriptors = len(perceptual_headers[6:])
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print(
    'There are %d different perceptual descriptors and %d different subjects'
    % (num_descriptors, num_subjects))

# Load the molecular descriptors data.
molecular_headers, molecular_data = loading.load_molecular_data()
print("First ten molecular descriptor types are %s" % molecular_headers[:10])
print("First ten descriptor values for the first compound are %s"
      % molecular_data[0][:10])

# The first field of every row is read as the molecule identifier; count the
# distinct identifiers in each data set.
total_size = len({int(row[0]) for row in molecular_data})
print("We have molecular descriptors for %d unique molecules" % total_size)

training_size = len({int(row[0]) for row in perceptual_obs_data})
def get_molecular_data(sources, CIDs):
    """Load the molecular descriptors and append extra feature sets to each row.

    The base descriptor rows come from ``loading.load_molecular_data()`` and
    are always loaded, whatever ``sources`` contains. For every row whose
    first field (read as an integer CID) appears in ``CIDs``, the features of
    each requested source are appended to that row in place.

    Args:
        sources: Container of source names. Recognized names are
            ``'episuite'``, ``'morgan'``, ``'nspdk'`` and ``'gramian'``.
        CIDs: Sequence of molecule identifiers. The position of a CID in this
            sequence is used as the row index into every extra feature table,
            so the feature files are assumed to be ordered like ``CIDs``
            (TODO confirm against the data files).

    Returns:
        The list returned by ``loading.load_molecular_data()``; rows whose CID
        is in ``CIDs`` have been extended with the requested features.
    """
    import pandas

    DATA = '../../data/'

    # Position of the first occurrence of each CID. This mirrors what
    # ``CIDs.index(cid)`` returns, without rescanning the sequence per lookup.
    cid_to_index = {}
    for position, cid in enumerate(CIDs):
        cid_to_index.setdefault(cid, position)

    use_episuite = 'episuite' in sources
    use_morgan = 'morgan' in sources
    use_nspdk = 'nspdk' in sources
    use_gramian = 'gramian' in sources

    # The base descriptors are loaded unconditionally (the original guard was
    # ``if 1 or ('dragon' in sources)``, which is always true).
    molecular_headers, molecular_data = loading.load_molecular_data()

    if use_episuite:
        episuite = pandas.read_table('%s/DREAM_episuite_descriptors.txt' % DATA)
        # Column 49 holds 'YES ' (with a trailing space) or something else;
        # encode it as 1/0 so it can be used as a numeric feature.
        episuite.iloc[:, 49] = 1 * (episuite.iloc[:, 49] == 'YES ')
        # ``.values`` replaces ``DataFrame.as_matrix()``, removed in pandas 1.0.
        episuite = episuite.iloc[:, 2:].values
        print("Episuite has %d features for %d molecules."
              % (episuite.shape[1], episuite.shape[0]))

    # The 'verbal' source is disabled; it was previously loaded like this:
    # if 'verbal' in sources:
    #     verbal = pandas.read_table('%s/name_features.txt' % DATA,
    #                                sep='\t', header=None)
    #     verbal = verbal.values[:, 1:]
    #     print("Verbal has %d features for %d molecules."
    #           % (verbal.shape[1], verbal.shape[0]))

    if use_morgan:
        morgan = pandas.read_csv('%s/morgan_sim.csv' % DATA)
        morgan = morgan.values[:, 1:]
        print("Morgan has %d features for %d molecules."
              % (morgan.shape[1], morgan.shape[0]))

    if use_nspdk:
        # Start to load the NSPDK features. Line i of the file is paired with
        # CIDs[i]; the first space-separated token is skipped and the rest
        # are parsed as ``key:value`` pairs. Reading stops at whichever of
        # the file or CIDs runs out first.
        nspdk_dict = {}
        with open('%s/derived/nspdk_r3_d4_unaug.svm' % DATA) as f:
            for cid, record in zip(CIDs, f):
                for key_val in record.split(' ')[1:]:
                    key, val = key_val.split(':')
                    nspdk_dict.setdefault(key, {})[cid] = val
        # Keep only features that were seen for more than one molecule.
        nspdk_dict = {key: facts for key, facts in nspdk_dict.items()
                      if len(facts) > 1}
        # Get the NSPDK features into the right format: one row per CID and
        # one column per retained feature, zero where a feature is absent.
        nspdk = np.zeros((len(CIDs), len(nspdk_dict)))
        for j, facts in enumerate(nspdk_dict.values()):
            for cid, value in facts.items():
                nspdk[cid_to_index[cid], j] = float(value)
        print("NSPDK has %d features for %d molecules."
              % (nspdk.shape[1], nspdk.shape[0]))

    if use_gramian:
        # These require a large file that is not on GitHub, but can be
        # obtained separately.
        nspdk_gramian = pandas.read_table(
            '%s/derived/nspdk_r3_d4_unaug_gramian.mtx' % DATA,
            delimiter=' ', header=None)
        nspdk_gramian = nspdk_gramian.values[:len(CIDs), :]
        print("NSPDK Gramian has %d features for %d molecules."
              % (nspdk_gramian.shape[1], nspdk_gramian.shape[0]))

    # Add all these new features to the molecular data rows.
    mdx = []
    for line in molecular_data:
        index = cid_to_index.get(int(line[0]))
        if index is None:
            # No extra features are looked up for molecules outside CIDs.
            continue
        # ``+=`` extends list rows in place, so the rows inside
        # ``molecular_data`` (the returned object) are updated too.
        if use_episuite:
            line += list(episuite[index])
        if use_morgan:
            line += list(morgan[index])
        if use_nspdk:
            line += list(nspdk[index])
        if use_gramian:
            line += list(nspdk_gramian[index])
        mdx.append(line)

    # Guard against an empty match set instead of failing on ``mdx[0]``.
    print("There are now %d total features." % (len(mdx[0]) if mdx else 0))

    # NOTE(review): the full ``molecular_data`` list is returned, not the
    # filtered ``mdx``; rows whose CID is not in ``CIDs`` are returned
    # unextended. Kept as-is for caller compatibility -- confirm whether
    # ``mdx`` was the intended return value.
    return molecular_data