Python load_molecular_data Examples, opc_python.utils.loading.load_molecular_data Python Examples

Example #1

0

Show file

File: prepare.py Project: bence-szalai/olfaction-prediction

# Load the perceptual descriptors data.
perceptual_headers, perceptual_obs_data = loading.load_perceptual_data('training')
loading.format_leaderboard_perceptual_data()
# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)

# Show the metadata and perceptual descriptor values for the first compound.
# NOTE(review): this prints index 1, while the molecular data below prints
# index 0 for "the first compound" -- confirm which row is intended.
print(perceptual_obs_data[1])

# Everything from the seventh header onward is counted as a descriptor
# (presumably the first six entries are metadata columns -- TODO confirm).
num_descriptors = len(perceptual_headers[6:])
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print('There are %d different perceptual descriptors and %d different subjects' % (num_descriptors,num_subjects))

# Load the molecular descriptors data.  This is done once; the original
# script repeated this whole section verbatim, re-reading the same data and
# printing the same summary twice.
molecular_headers, molecular_data = loading.load_molecular_data()
print("First ten molecular descriptor types are %s" % molecular_headers[:10])
print("First ten descriptor values for the first compound are %s" % molecular_data[0][:10])
# The first column of each row is used as the molecule identifier.
total_size = len({int(row[0]) for row in molecular_data})
print("We have molecular descriptors for %d unique molecules" % total_size)

training_size = len({int(row[0]) for row in perceptual_obs_data})
print("We have perceptual data for %d unique molecules" % training_size)
# Molecules that have molecular descriptors but no training perceptual data.
remaining_size = total_size - training_size

Example #2

0

Show file

loading.format_leaderboard_perceptual_data()
# Show the perceptual metadata types and perceptual descriptor names.
print(perceptual_headers)

# Show the metadata and perceptual descriptor values for the first compound.
# NOTE(review): this prints index 1, while the molecular data below prints
# index 0 for "the first compound" -- confirm which row is intended.
print(perceptual_obs_data[1])

# Everything from the seventh header onward is counted as a descriptor
# (presumably the first six entries are metadata columns -- TODO confirm).
num_descriptors = len(perceptual_headers[6:])
assert num_descriptors == dream.NUM_DESCRIPTORS
num_subjects = dream.NUM_SUBJECTS
print(
    'There are %d different perceptual descriptors and %d different subjects' %
    (num_descriptors, num_subjects))

# Load the molecular descriptors data.  This is done once; the original
# script repeated this whole section verbatim, re-reading the same data and
# printing the same summary twice.
molecular_headers, molecular_data = loading.load_molecular_data()
print("First ten molecular descriptor types are %s" % molecular_headers[:10])
print("First ten descriptor values for the first compound are %s" %
      molecular_data[0][:10])
# The first column of each row is used as the molecule identifier.
total_size = len({int(row[0]) for row in molecular_data})
print("We have molecular descriptors for %d unique molecules" % total_size)

training_size = len({int(row[0]) for row in perceptual_obs_data})

Example #3

0

Show file

File: dream.py Project: bence-szalai/olfaction-prediction

def get_molecular_data(sources,CIDs):
    """Load the base molecular descriptors and append optional feature sets.

    The base descriptors are always loaded via
    ``loading.load_molecular_data()``.  For every row whose first element
    (converted to ``int``) appears in ``CIDs``, the features of each
    requested source are appended to that row in place.

    Args:
        sources: Container of source names.  Recognized names are
            ``'episuite'``, ``'morgan'``, ``'nspdk'`` and ``'gramian'``.
            ``'dragon'`` is accepted but has no effect because the base
            descriptors are loaded unconditionally.
        CIDs: Sequence of compound identifiers.  The position of a CID in
            this sequence selects the row taken from each extra feature
            table (assumes those tables are ordered like ``CIDs`` --
            TODO confirm against the data files).

    Returns:
        The full list returned by ``loading.load_molecular_data()``; rows
        whose CID is in ``CIDs`` have been extended with the extra features.
    """
    import pandas
    DATA = '../../data/'

    # Position of the first occurrence of each CID, i.e. what CIDs.index()
    # returns, computed once so the loops below use O(1) lookups.
    cid_to_index = {}
    for position, cid in enumerate(CIDs):
        cid_to_index.setdefault(cid, position)

    # The base descriptors are needed by the merge loop below regardless of
    # which sources were requested (the original guard was `if 1 or ...`).
    molecular_headers, molecular_data = loading.load_molecular_data()

    if 'episuite' in sources:
        episuite = pandas.read_table('%s/DREAM_episuite_descriptors.txt' % DATA)
        # Column 49 holds 'YES ' flags (trailing space included); make it 0/1.
        episuite.iloc[:,49] = 1*(episuite.iloc[:,49]=='YES ')
        # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
        episuite = episuite.iloc[:,2:].values
        print("Episuite has %d features for %d molecules." % (episuite.shape[1],episuite.shape[0]))

    # The 'verbal' source (name_features.txt) is currently disabled.

    if 'morgan' in sources:
        morgan = pandas.read_csv('%s/morgan_sim.csv' % DATA)
        morgan = morgan.values[:,1:]
        print("Morgan has %d features for %d molecules." % (morgan.shape[1],morgan.shape[0]))

    if 'nspdk' in sources:
        # Each line is "<label> key:val key:val ..."; line i belongs to
        # CIDs[i].  At most len(CIDs) lines are consumed.
        nspdk_dict = {}
        with open('%s/derived/nspdk_r3_d4_unaug.svm' % DATA) as f:
            for x, cid in zip(f, CIDs):
                for key_val in x.split(' ')[1:]:
                    key,val = key_val.split(':')
                    nspdk_dict.setdefault(key, {})[cid] = val
        # Keep only features that occur for more than one molecule.
        nspdk_dict = {key:value for key,value in nspdk_dict.items() if len(value)>1}
        # Get the NSPDK features into the right format.
        nspdk = np.zeros((len(CIDs),len(nspdk_dict)))
        for j,facts in enumerate(nspdk_dict.values()):
            for cid,value in facts.items():
                # `value` is still the text read from the file; assigning it
                # into the float array converts it.
                nspdk[cid_to_index[cid],j] = value
        print("NSPDK has %d features for %d molecules." % (nspdk.shape[1],nspdk.shape[0]))

    if 'gramian' in sources:
        # These require a large file that is not on GitHub, but can be obtained separately.
        nspdk_gramian = pandas.read_table('%s/derived/nspdk_r3_d4_unaug_gramian.mtx' % DATA, delimiter=' ', header=None)
        nspdk_gramian = nspdk_gramian.values[:len(CIDs),:]
        print("NSPDK Gramian has %d features for %d molecules." %
              (nspdk_gramian.shape[1],nspdk_gramian.shape[0]))

    # Add all these new features to the molecular data rows.
    mdx = []
    for line in molecular_data:
        index = cid_to_index.get(int(line[0]))
        if index is None:
            # No extra features were requested for this molecule.
            continue
        if 'episuite' in sources:
            line += list(episuite[index])
        if 'morgan' in sources:
            line += list(morgan[index])
        if 'nspdk' in sources:
            line += list(nspdk[index])
        if 'gramian' in sources:
            line += list(nspdk_gramian[index])
        mdx.append(line)
    # Guard against the case where none of the rows matched CIDs.
    print("There are now %d total features." % (len(mdx[0]) if mdx else 0))
    # NOTE(review): the filtered list `mdx` is built but the unfiltered
    # `molecular_data` is returned (rows outside CIDs keep their original
    # length).  Kept as is for backward compatibility; confirm whether
    # callers actually want `mdx`.
    return molecular_data