Ejemplo n.º 1
0
    def get_mols(self, files, type='', label_part=-1, fallback=False):
        """Build ``self.mols`` from a list of input files.

        Stores ``files`` on ``self.files`` and resets ``self.mols``. Each file
        is given a molecule id and is then either parsed into an ``nmrmol``
        or, when ``self.big_data`` is set, recorded as ``[file, id, type]``.

        Args:
            files: Paths of the files to load.
            type: Format string handed to ``read_nmr``. When empty, the format
                is obtained per file from ``get_type``.
            label_part: Index of the ``_``-separated piece of the file stem
                (base name up to the first ``.``) used as the molecule id.
                ``-1`` asks ``get_unique_part`` to choose it.
            fallback: When true, the position of the file in ``files`` is
                used as the id instead of a piece of the file name. Also
                switched on when ``get_unique_part`` returns ``-1``.
        """
        self.mols = []
        self.files = files

        # NOTE(review): the flag is only ever switched on here, never reset,
        # so it stays set for later calls with fewer files.
        if len(files) > 2000:
            self.big_data = True

        if label_part == -1:
            label_part = get_unique_part(files)
            if label_part == -1:
                fallback = True

        for index, file in enumerate(files):
            if fallback:
                molid = str(index)
            else:
                stem = file.split('/')[-1].split('.')[0]
                molid = stem.split('_')[label_part]

            # Purely numeric labels are prefixed with 'nmrmol'. ValueError is
            # what int() raises for a non-numeric string; nothing else
            # (e.g. KeyboardInterrupt) should be swallowed here.
            try:
                int(molid)
            except ValueError:
                pass
            else:
                molid = 'nmrmol' + molid

            if self.big_data:
                # Defer parsing: keep only what is needed to load the file later.
                self.mols.append([file, molid, type])
                continue

            mol = nmrmol(molid=molid)
            ftype = get_type(file) if type == '' else type
            mol.read_nmr(file, ftype)
            self.mols.append(mol)
Ejemplo n.º 2
0
def predict(args):
    """Run every model over every test set and write the results to disk.

    For each glob pattern in ``args['test_sets']`` the matching files are
    loaded into a ``dataset``. Each model in ``args['models']`` is unpickled,
    features are generated with that model's own ``args``/``params``, and the
    predictions are assigned back onto the dataset. Optional variance models
    (``<model>_1.pkl`` ... ``<model>_<var>.pkl``) supply a per-prediction
    variance. Finally each molecule is written with ``nmrmol_to_nmredata``.

    Args:
        args: Dict with the keys read here:
            ``test_sets`` (iterable of glob patterns),
            ``models`` (iterable of pickle file paths),
            ``var`` (number of variance models to look for per model),
            ``store_datasets`` (truthy to pickle the dataset to
            ``OPT_testing_set.pkl``),
            ``output_dir`` (prefix for output file names; concatenated
            as-is, so it needs its own trailing separator).

    Raises:
        AssertionError: If no features are produced, or a variance model's
            settings differ from those of the main model.
        SystemExit: If a test set yields no molecules.
    """
    from autoenrich.molecule.dataset import dataset
    from autoenrich.file_creation.structure_formats.nmredata import nmrmol_to_nmredata

    for files_set in args['test_sets']:
        files = glob.glob(files_set)
        dset = dataset()

        label_part = get_unique_part(files)
        dset.get_mols(files, type='nmredata', label_part=label_part)
        if len(dset.mols) == 0:
            print('No molecules loaded. . .')
            sys.exit(0)

        for m, model_file in enumerate(args['models']):

            print('Predicting from model: ', model_file)

            # NOTE(security): unpickling executes arbitrary code; model files
            # must come from a trusted source.
            with open(model_file, 'rb') as handle:
                model = pickle.load(handle)

            print(model.args["targetflag"])
            dset.get_features_frommols(model.args, params=model.params, training=False)
            assert len(dset.x) > 0, 'No features made. . . '

            if args['store_datasets']:
                with open('OPT_testing_set.pkl', 'wb') as handle:
                    pickle.dump(dset, handle)

            # NOTE(review): the main model is given dset.x[0] and unpacked as
            # a pair, whereas the variance models below receive dset.x and
            # their return value is stored whole. Confirm which is intended.
            _, y_pred = model.predict(dset.x[0])

            v_preds = []
            for i in range(args['var']):
                var_model_file = model_file.split('.pkl')[0] + '_' + str(i+1) + '.pkl'

                # Best effort: a missing or unreadable variance model is
                # reported and skipped rather than aborting the run.
                try:
                    with open(var_model_file, 'rb') as handle:
                        var_model = pickle.load(handle)
                except Exception as e:
                    print(e)
                    continue

                assert model.args['featureflag'] == var_model.args['featureflag']
                assert model.args['targetflag'] == var_model.args['targetflag']
                assert model.args['max_size'] == var_model.args['max_size']
                assert model.params == var_model.params, (model.params, var_model.params)

                print('\tPredicting from ', var_model_file)
                tmp_preds = var_model.predict(dset.x)
                v_preds.append(tmp_preds)

            if v_preds:
                var = np.var(np.asarray(v_preds), axis=0)
            else:
                # Either no variance models were requested or none could be
                # loaded; np.var over an empty list would only yield NaN.
                if args['var'] > 0:
                    print('No variance models loaded, variance set to zero')
                var = np.zeros(len(y_pred), dtype=np.float64)

            # Only the first model is applied with zero=True.
            dset.assign_from_ml(y_pred, var, zero=(m == 0))

        for mol in dset.mols:
            outname = args['output_dir'] + 'IMP_' + mol.molid + '.nmredata.sdf'
            nmrmol_to_nmredata(mol, outname)

    print('Done')
Ejemplo n.º 3
0
def _refs_differ(ref_a, ref_b):
    """Return True if any entry after the first differs between two refs."""
    return any(ref_a[pos] != ref_b[pos] for pos in range(1, len(ref_a)))


def compare_datasets(args):
    """Match molecules across two or three datasets and compare a target.

    Each glob pattern in ``args['comp_sets']`` is loaded into a ``dataset``.
    Molecules of the first set are grouped with their counterparts in the
    second (and, when present, third) set. For every flag in
    ``args['comp_targets']`` the values of matching entries are collected,
    written to ``Comparison_<flag>.csv`` via ``print_mol_csv`` and the mean
    absolute error between the first two sets is printed.

    Args:
        args: Dict with the keys read here:
            ``comp_sets`` (glob patterns, one per dataset),
            ``match_criteria`` (``'id'`` matches on ``molid``; anything else
            matches with ``mol_isequal``),
            ``comp_targets`` (target flags to compare),
            ``comp_labels`` (labels passed to ``print_mol_csv`` and used in
            the MAE line),
            ``output_path`` (optional, non-empty directory for the CSVs).

    Raises:
        AssertionError: If fewer than two sets are given, the first two sets
            hold different numbers of molecules, the generated reference
            lists differ in length, or ``output_path`` is present but empty.
    """
    sets = []
    for set_list in args['comp_sets']:
        print('Getting molecules from ', set_list)
        # Expand the pattern once so label detection, loading and the
        # report below all work from the same file list.
        files = glob.glob(set_list)
        dset = dataset()
        label_part = get_unique_part(files)
        dset.get_mols(files, label_part=label_part)
        print(len(dset.mols), ' molecules found from ',
              len(files), ' files')

        sets.append(dset)

    assert len(sets) > 1, 'Only one set found. . .'
    assert len(sets[0].mols) == len(sets[1].mols), \
        'Different numbers of molecules in sets'

    three_way = len(sets) > 2

    # Every matching combination is recorded; there is no early exit after
    # the first match for a molecule.
    att_mols = []
    for mol1 in sets[0].mols:
        for mol2 in sets[1].mols:

            if args['match_criteria'] == 'id':
                # NOTE(review): id matching only ever records pairs, so with
                # three sets the group[2] lookup further down would fail.
                if mol1.molid == mol2.molid:
                    att_mols.append([mol1, mol2])
                continue

            if not mol_isequal(mol1, mol2):
                continue

            if [mol1, mol2] in att_mols:
                continue

            if three_way:
                for mol3 in sets[2].mols:
                    if not mol_isequal(mol1, mol3):
                        continue

                    if [mol1, mol2, mol3] in att_mols:
                        continue

                    att_mols.append([mol1, mol2, mol3])
            else:
                att_mols.append([mol1, mol2])

    print(len(att_mols), ' molecules matched, out of ', len(sets[0].mols))

    for targetflag in args['comp_targets']:
        print(targetflag)
        # Result is not used below; the call is kept in case it validates
        # the flag -- TODO confirm and drop if it has no effect.
        flag_to_target(targetflag)

        for dset in sets:
            dset.get_features_frommols({
                'featureflag': 'dummy',
                'targetflag': targetflag,
                'max_size': 0
            })

        values = []
        refs = []
        typerefs = []

        assert len(sets[0].r) == len(sets[1].r)
        if three_way:
            assert len(sets[2].r) == len(sets[1].r)

        # All sets are indexed over the length of the first set's reference
        # list, which the assertions above require to be shared.
        n_refs = len(sets[0].r)

        for group in att_mols:
            for i in range(n_refs):
                ref1 = sets[0].r[i]
                # The first entry of a ref is compared against the molid.
                if ref1[0] != group[0].molid:
                    continue
                val1 = sets[0].y[i]
                typeref1 = [group[0].types[row] for row in ref1[1:]]

                for j in range(n_refs):
                    ref2 = sets[1].r[j]
                    if ref2[0] != group[1].molid:
                        continue
                    val2 = sets[1].y[j]
                    typeref2 = [group[1].types[row] for row in ref2[1:]]

                    if _refs_differ(ref1, ref2) or typeref1 != typeref2:
                        continue

                    if not three_way:
                        refs.append([ref1, ref2])
                        values.append([val1, val2])
                        typerefs.append([typeref1, typeref2])
                        continue

                    for k in range(n_refs):
                        ref3 = sets[2].r[k]
                        if ref3[0] != group[2].molid:
                            continue
                        val3 = sets[2].y[k]
                        typeref3 = [
                            group[2].types[row] for row in ref3[1:]
                        ]

                        # NOTE(review): unlike the second set, types are not
                        # compared for the third set -- confirm intent.
                        if _refs_differ(ref1, ref3):
                            continue

                        refs.append([ref1, ref2, ref3])
                        values.append([val1, val2, val3])
                        typerefs.append([typeref1, typeref2, typeref3])

        if 'output_path' in args:
            assert len(args['output_path']) != 0
            outname = args['output_path'] + '/Comparison_' + str(
                targetflag) + '.csv'
        else:
            outname = 'Comparison_' + str(targetflag) + '.csv'

        print_mol_csv(outname, refs, typerefs, values, args['comp_labels'])

        # The MAE is reported for the first two sets only.
        x = [row[0] for row in values]
        y = [row[1] for row in values]

        MAE = np.mean(np.absolute(np.asarray(x) - np.asarray(y)))
        MAEstring = '{0:<6.3f}'.format(MAE)
        print('MAE between ', args['comp_labels'][0], args['comp_labels'][1],
              ' = ', MAEstring, '   no. of envs. ', len(x))