Ejemplo n.º 1
0
def existing_training_model(args, seed):
    '''
    Prepopulate the model from pickled data written by a prior run.

    The file named by args.datain is unpickled and handled by its suffix:
      * .cluster  - a pickled clustering object; its .model attribute is
                    used as the trained model
      * .model    - a pickled trained model; a clustering is rebuilt from it
                    when args.cluster is set
      * .features - a pickled feature set; it is re-processed with ``seed``
                    and a new model is trained from it
    Any other suffix leaves the returned container unpopulated.

    :param args parsed command-line arguments; reads verbose, datain,
    cluster, random and cv
    :param seed random seed used when re-processing features and for the
    cross-validation analysis
    :returns a Training instance whose model (and, where available,
    cluster) attributes have been filled in
    '''
    verbose_print(args.verbose, "Using existing model")
    trained_model = Training()
    # Bound up front so the cross-validation step below never hits an
    # unbound name, whichever branch (if any) is taken.
    model = None
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only files from a trusted source should ever reach this function.
    if args.datain.endswith('.cluster'):
        with open(args.datain, 'rb') as fid:
            cluster = pickle.load(fid)
        model = cluster.model
        trained_model.cluster = cluster
        trained_model.model = model
    elif args.datain.endswith('.model'):
        with open(args.datain, 'rb') as fid:
            model = pickle.load(fid)
        trained_model.model = model
        if args.cluster:
            cluster = cl.Clustering(model.training_data.compound,
                                    seed=args.random)
            cluster.cluster_training(model)
            trained_model.cluster = cluster
    elif args.datain.endswith('.features'):
        with open(args.datain, 'rb') as fid:
            train = pickle.load(fid)
        train = bt.Process(train, seed=seed)
        model = tm.Train(train)
        model.train_model()
        # Previously the freshly trained model was dropped here, so the
        # caller received an empty container.
        trained_model.model = model
    if args.cv and model is not None:
        verbose_print(args.verbose, "Running cross-validational analysis")
        cv.Analysis(model, seed)
    return trained_model
Ejemplo n.º 2
0
 def testTrainModel(self):
     # Train on the mean-imputed fixture data, then check the size of the
     # fitted ensemble, the feature count and the out-of-bag statistics.
     print("Testing the training of model")
     np.random.seed(_random)
     processed = bt.Process(self.test_data, split_value=_split_value)
     features = processed.train
     # Replace missing values with the per-column mean before training.
     imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
     imputer.fit(features)
     processed.train = imputer.transform(features)
     trainer = tm.Train(processed)
     trainer.train_model()
     self.assertEqual(512, len(trainer.clf.estimators_))
     self.assertEqual(16, len(trainer.features))
     self.assertAlmostEqual(0.333, trainer.clf.oob_score_, 2)
     first_oob = trainer.clf.oob_decision_function_[0][0]
     self.assertAlmostEqual(0.760, first_oob, 2)
     # NOTE: preprocess_model hasn't been tested
 def testCrossValidate(self):
     # Train on the mean-imputed fixture data, run the cross-validation
     # analysis and check its summary metrics and top feature importance.
     np.random.seed(seed=_random)
     processed = bt.Process(self.test_data, split_value=_split_value)
     features = processed.train
     # Replace missing values with the per-column mean before training.
     imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
     imputer.fit(features)
     processed.train = imputer.transform(features)
     trainer = tm.Train(processed)
     trainer.train_model()
     print("Test cross_validate")
     analysis = cv.Analysis(trainer, seed=_random, verbose=True)
     analysis.cross_validate(n_iter=_n_iter)
     # (expected value, attribute on the analysis object), checked in order.
     expected_metrics = (
         (0.95, 'acc'),
         (0.95, 'prec'),
         (1.0, 'recall'),
         (0.95, 'roc'),
     )
     for expected, metric in expected_metrics:
         self.assertEqual(expected, getattr(analysis, metric))
     analysis.feature_importances()
     self.assertAlmostEqual(0.0586, analysis.feats[0][0], 3)
 def testBuildTraining(self):
     # Part 1: imputation. The expected value for row 1, column 0 is the
     # mean of rows 0 and 2 (column 0), each weighted by the reciprocal of
     # its entry in the module-level distance fixture.
     train = bt.Process(self.test_data, split_value=_split_value)
     distance = _distance
     a = train.train[0][0] * (1. / distance[0][1])
     b = train.train[2][0] * (1. / distance[2][1])
     denominator = (1. / distance[0][1]) + (1. / distance[2][1])
     value = (a + b) / denominator
     print("Test imputing")
     train.impute_values(distance, verbose=_verbose, k=2)
     # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
     # which was removed in Python 3.12.
     self.assertAlmostEqual(value, train.train[1][0])
     # Part 2: feature selection on a larger, self-contained compound set.
     print("Testing Boruta")
     compound = {
         '19502': {
             'predictor': '57.6',
             'experimentalhash': {
                 u'Rotatable Bond Count': 1.0,
                 u'XLogP3-AA': 3.8,
                 u'Heavy Atom Count': 8.0,
                 u'Undefined Atom Stereocenter Count': 3.0,
                 u'Molecular Weight': 112.21264,
                 u'Complexity': 66.4,
                 u'Exact Mass': 112.125201,
                 u'Monoisotopic Mass': 112.125201
             }
         },
         '7459': {
             'predictor': '67.3',
             'experimentalhash': {
                 u'Density': 0.8039,
                 u'Boiling Point': 170.9,
                 u'Rotatable Bond Count': 1.0,
                 u'Melting Point': -89.84,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Molecular Weight': 140.2658,
                 u'XLogP3-AA': 4.5,
                 u'Complexity': 86.2,
                 u'Heavy Atom Count': 10.0,
                 u'Exact Mass': 140.156501,
                 u'Monoisotopic Mass': 140.156501
             }
         },
         '8004': {
             'predictor': '87.9',
             'experimentalhash': {
                 u'Density': 0.6405,
                 u'Vapor Density': 2.42,
                 u'Boiling Point': 29.9,
                 u'Rotatable Bond Count': 2.0,
                 u'Melting Point': -165.2,
                 u'Flash Point': -18.0,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 527.0,
                 u'Molecular Weight': 70.1329,
                 u'XLogP3-AA': 2.4,
                 u'Complexity': 21.2,
                 u'Vapor Pressure': 635.0,
                 u'Heavy Atom Count': 5.0,
                 u'Exact Mass': 70.07825,
                 u'Monoisotopic Mass': 70.07825
             }
         },
         '11597': {
             'predictor': '76.4',
             'experimentalhash': {
                 u'Density': 0.6731,
                 u'Vapor Density': 3.0,
                 u'Boiling Point': 63.4,
                 u'Rotatable Bond Count': 3.0,
                 u'XLogP3': 3.4,
                 u'Melting Point': -139.7,
                 u'Flash Point': 20.0,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 487.0,
                 u'Molecular Weight': 84.15948,
                 u'LogP': 3.39,
                 u'Complexity': 29.0,
                 u'Vapor Pressure': 183.7,
                 u'Heavy Atom Count': 6.0,
                 u'Exact Mass': 84.0939,
                 u'Monoisotopic Mass': 84.0939
             }
         },
         '107252': {
             'predictor': '29.9',
             'experimentalhash': {
                 u'XLogP3-AA': 4.8,
                 u'Exact Mass': 140.156501,
                 u'Monoisotopic Mass': 140.156501,
                 u'Undefined Atom Stereocenter Count': 2.0,
                 u'Complexity': 86.0,
                 u'Rotatable Bond Count': 2.0,
                 u'Molecular Weight': 140.2658,
                 u'Heavy Atom Count': 10.0
             }
         },
         '11610': {
             'predictor': '54.5',
             'experimentalhash': {
                 u'Density': 0.697,
                 u'Vapor Density': 0.7,
                 u'Boiling Point': 93.6,
                 u'Rotatable Bond Count': 4.0,
                 u'XLogP3': 4.0,
                 u'Melting Point': -119.7,
                 u'Flash Point': 32.0,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 500.0,
                 u'Molecular Weight': 98.18606,
                 u'LogP': 3.99,
                 u'Complexity': 37.3,
                 u'Vapor Pressure': 59.3,
                 u'Heavy Atom Count': 7.0,
                 u'Exact Mass': 98.10955,
                 u'Monoisotopic Mass': 98.10955
             }
         },
         '35411': {
             'predictor': '68.7',
             'experimentalhash': {
                 u'XLogP3-AA': 4.4,
                 u'Exact Mass': 126.140851,
                 u'Monoisotopic Mass': 126.140851,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Complexity': 78.0,
                 u'Rotatable Bond Count': 1.0,
                 u'Molecular Weight': 126.23922,
                 u'Heavy Atom Count': 9.0
             }
         },
         '136729': {
             'predictor': '57.6',
             'experimentalhash': {
                 u'Rotatable Bond Count': 1.0,
                 u'XLogP3-AA': 3.8,
                 u'Heavy Atom Count': 8.0,
                 u'Undefined Atom Stereocenter Count': 2.0,
                 u'Molecular Weight': 112.21264,
                 u'Complexity': 66.4,
                 u'Exact Mass': 112.125201,
                 u'Monoisotopic Mass': 112.125201
             }
         },
         '7844': {
             'predictor': '98.8',
             'experimentalhash': {
                 u'Density': 0.577,
                 u'Vapor Density': 1.93,
                 u'Boiling Point': -6.47,
                 u'Rotatable Bond Count': 1.0,
                 u'XLogP3': 2.4,
                 u'Melting Point': -185.3,
                 u'Flash Point': False,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 725.0,
                 u'Molecular Weight': 56.10632,
                 u'LogP': 2.4,
                 u'Complexity': 14.0,
                 u'Vapor Pressure': 2253.0,
                 u'Heavy Atom Count': 4.0,
                 u'Exact Mass': 56.0626,
                 u'Monoisotopic Mass': 56.0626
             }
         },
         '8125': {
             'predictor': '28.7',
             'experimentalhash': {
                 u'Density': 0.7149,
                 u'Vapor Density': 3.87,
                 u'Boiling Point': 121.2,
                 u'Rotatable Bond Count': 5.0,
                 u'XLogP3': 4.6,
                 u'Melting Point': -101.7,
                 u'Flash Point': 70.0,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 446.0,
                 u'Molecular Weight': 112.21264,
                 u'LogP': 4.57,
                 u'Complexity': 46.0,
                 u'Vapor Pressure': 17.4,
                 u'Heavy Atom Count': 8.0,
                 u'Exact Mass': 112.125201,
                 u'Monoisotopic Mass': 112.125201
             }
         },
         '11549': {
             'predictor': '87.3',
             'experimentalhash': {
                 u'XLogP3-AA': 3.9,
                 u'Exact Mass': 112.125201,
                 u'Monoisotopic Mass': 112.125201,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Complexity': 64.1,
                 u'Rotatable Bond Count': 0.0,
                 u'Molecular Weight': 112.21264,
                 u'Heavy Atom Count': 8.0
             }
         }
     }
     test_data = Object()
     # Deep copy so processing cannot mutate the literal above.
     test_data.compound = deepcopy(compound)
     train = bt.Process(test_data, split_value=_split_value)
     train.feature_selection(seed=12345, verbose=False)
     # Expected selection mask: only the feature at index 2 is retained.
     support = [
         False, False, True, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False
     ]
     # list_check is defined elsewhere; presumably an element-wise
     # comparison of the two lists - verify against its definition.
     self.assertTrue(list_check(support, train.feature_support.tolist()))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual('Complexity', train.feature_names[0])
     '''
Ejemplo n.º 5
0
 def setUp(self):
     """Build the fixtures shared by the tests.

     Stores on the instance: the fingerprint vectors and their keys (in
     sorted key order), the distance result computed from them, the
     compound test data, and a model trained on the mean-imputed data.
     """
     print("Initializing test")
     fingerprints = {
         '7844': {
             'CC1CCC(C)CC1': '0',
             '[#1]-C=C-[#1]': '0',
             'C-C-C-C-C-C': '1',
             'C-C-C-C-C-C(C)-C': '1',
             '>= 8 C': '1',
             'C-C(C)-C-C-C': '1',
             'C(~C)(~C)(~C)': '1',
             '>= 16 H': '1',
             'C-C-C(C)-C-C': '1',
             'CC1CC(C)CC1': '1',
             'CC1C(C)CCCC1': '0',
             'C-C-C=C': '0',
             'C-C(C)-C-C': '1',
             'C(~C)(~C)(~C)(~H)': '1',
             'C-C-C-C-C': '1',
             'C(~C)(~C)(~C)(~C)': '0',
             'C-C(C)-C(C)-C': '0',
             'C=C-C-C-C': '0',
             'C=C': '0',
             'CC1C(C)CCC1': '0',
             'C(-C)(-H)(=C)': '0',
             'C-C(C)(C)-C-C': '0',
             '[#1]-C-C=C-[#1]': '0',
             '>= 1 saturated or aromatic carbon-only ring size 5': '1',
             '>= 1 saturated or aromatic carbon-only ring size 6': '0',
             'C=C-C-C': '0',
             'C-C-C-C-C-C-C-C': '0',
             'C-C-C-C-C-C-C': '1',
             'C(-H)(=C)': '0',
             '>= 1 any ring size 5': '1',
             '>= 1 any ring size 6': '0',
             'C(-C)(=C)': '0'
         },
         '19502': {
             'CC1CCC(C)CC1': '1',
             '[#1]-C=C-[#1]': '0',
             'C-C-C-C-C-C': '1',
             'C-C-C-C-C-C(C)-C': '1',
             '>= 8 C': '1',
             'C-C(C)-C-C-C': '1',
             'C(~C)(~C)(~C)': '1',
             '>= 16 H': '1',
             'C-C-C(C)-C-C': '1',
             'CC1CC(C)CC1': '0',
             'CC1C(C)CCCC1': '0',
             'C-C-C=C': '0',
             'C-C(C)-C-C': '1',
             'C(~C)(~C)(~C)(~H)': '1',
             'C-C-C-C-C': '1',
             'C(~C)(~C)(~C)(~C)': '0',
             'C-C(C)-C(C)-C': '1',
             'C=C-C-C-C': '0',
             'C=C': '0',
             'CC1C(C)CCC1': '0',
             'C(-C)(-H)(=C)': '0',
             'C-C(C)(C)-C-C': '0',
             '[#1]-C-C=C-[#1]': '0',
             '>= 1 saturated or aromatic carbon-only ring size 5': '0',
             '>= 1 saturated or aromatic carbon-only ring size 6': '1',
             'C=C-C-C': '0',
             'C-C-C-C-C-C-C-C': '1',
             'C-C-C-C-C-C-C': '1',
             'C(-H)(=C)': '0',
             '>= 1 any ring size 5': '0',
             '>= 1 any ring size 6': '1',
             'C(-C)(=C)': '0'
         },
         '11610': {
             'CC1CCC(C)CC1': '0',
             '[#1]-C=C-[#1]': '0',
             'C-C-C-C-C-C': '1',
             'C-C-C-C-C-C(C)-C': '0',
             '>= 8 C': '1',
             'C-C(C)-C-C-C': '1',
             'C(~C)(~C)(~C)': '1',
             '>= 16 H': '1',
             'C-C-C(C)-C-C': '1',
             'CC1CC(C)CC1': '0',
             'CC1C(C)CCCC1': '0',
             'C-C-C=C': '0',
             'C-C(C)-C-C': '1',
             'C(~C)(~C)(~C)(~H)': '1',
             'C-C-C-C-C': '1',
             'C(~C)(~C)(~C)(~C)': '0',
             'C-C(C)-C(C)-C': '1',
             'C=C-C-C-C': '0',
             'C=C': '0',
             'CC1C(C)CCC1': '1',
             'C(-C)(-H)(=C)': '0',
             'C-C(C)(C)-C-C': '0',
             '[#1]-C-C=C-[#1]': '0',
             '>= 1 saturated or aromatic carbon-only ring size 5': '1',
             '>= 1 saturated or aromatic carbon-only ring size 6': '0',
             'C=C-C-C': '0',
             'C-C-C-C-C-C-C-C': '1',
             'C-C-C-C-C-C-C': '1',
             'C(-H)(=C)': '0',
             '>= 1 any ring size 5': '1',
             '>= 1 any ring size 6': '0',
             'C(-C)(=C)': '0'
         }
     }
     # Sort by compound id so the vector/key ordering is deterministic.
     ordered_fingerprints = OrderedDict(
         sorted(fingerprints.items(), key=lambda pair: pair[0]))
     np.random.seed(seed=_random)
     entries = list(dictitems(ordered_fingerprints))
     self.fingerprint_vector = [bits for (_, bits) in entries]
     self.key_list = [compound_id for (compound_id, _) in entries]
     distance_builder = ds.Distance(self.fingerprint_vector, self.key_list)
     self.distance = distance_builder.distance
     compounds = {
         '7844': {
             'predictor': '98.8',
             'experimentalhash': {
                 u'Density': 0.577,
                 u'Vapor Density': 1.93,
                 u'Boiling Point': -6.47,
                 u'Rotatable Bond Count': 1.0,
                 u'XLogP3': 2.4,
                 u'Melting Point': -185.3,
                 u'Flash Point': False,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 725.0,
                 u'Molecular Weight': 56.10632,
                 u'LogP': 2.4,
                 u'Complexity': 14.0,
                 u'Vapor Pressure': 2253.0,
                 u'Heavy Atom Count': 4.0,
                 u'Exact Mass': 56.0626,
                 u'Monoisotopic Mass': 56.0626
             }
         },
         '19502': {
             'predictor': '57.6',
             'experimentalhash': {
                 u'Rotatable Bond Count': 1.0,
                 u'Heavy Atom Count': 8.0,
                 u'Undefined Atom Stereocenter Count': 3.0,
                 u'Molecular Weight': 112.21264,
                 u'Complexity': 66.4,
                 u'Exact Mass': 112.125201,
                 u'Monoisotopic Mass': 112.125201
             }
         },
         '11610': {
             'predictor': '54.5',
             'experimentalhash': {
                 u'Density': 0.697,
                 u'Vapor Density': 0.7,
                 u'Boiling Point': 93.6,
                 u'Rotatable Bond Count': 4.0,
                 u'XLogP3': 4.0,
                 u'Melting Point': -119.7,
                 u'Flash Point': 32.0,
                 u'Undefined Atom Stereocenter Count': 0.0,
                 u'Auto-Ignition': 500.0,
                 u'Molecular Weight': 98.18606,
                 u'LogP': 3.99,
                 u'Complexity': 37.3,
                 u'Vapor Pressure': 59.3,
                 u'Heavy Atom Count': 7.0,
                 u'Exact Mass': 98.10955,
                 u'Monoisotopic Mass': 98.10955
             }
         }
     }
     self.test_data = Object()
     # Deep copy so processing cannot mutate the literal above.
     self.test_data.compound = deepcopy(compounds)
     processed = bt.Process(self.test_data, split_value=_split_value)
     features = processed.train
     # Replace missing values with the per-column mean before training.
     imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
     imputer.fit(features)
     processed.train = imputer.transform(features)
     self.model = tm.Train(processed)
     self.model.train_model()
Ejemplo n.º 6
0
def train_model(args, seed, proxy, pred):
    '''
    Train a model (or load a previously pickled one) in training mode.

    The program runs in one of two mutually exclusive modes (training or
    test). When args.train is not set this returns False. Otherwise it
    either loads an existing model via args.datain (after an interactive
    confirmation, because pickle files are unsafe), or reads the training
    set, collects features, trains a new model and optionally pickles the
    features/model/cluster to files prefixed with args.dataout.

    :param args parsed command-line arguments; reads train, verbose,
    datain, distance, cluster, impute, input, weight, split_value,
    selection, random, dataout and cv
    :param seed random seed; only forwarded to existing_training_model
    (feature selection and clustering on the fresh-training path are
    seeded from args.random instead)
    :param proxy forwarded to add_pubchem_features
    :param pred forwarded to rt.Read
    :returns False when args.train is not set, otherwise a Training
    instance; its model/cluster attributes are only populated when the
    corresponding steps ran
    '''
    if not args.train:
        return False

    trained_model = Training()
    verbose_print(args.verbose, "Training model")

    if args.datain:
        # warnings.warn is the stdlib API (the module has no
        # warnings.warning). Adjacent literals are used instead of
        # backslash continuations so the message does not embed the
        # source indentation.
        warnings.warn(
            "WARNING: The pickle datatype is inherently insecure. "
            "A quick question: do you trust the source of your model? "
            "Pickle files can contain corrupt code and executable commands. "
            "They can take over your computer and install malicious code on "
            "your computer or server. Use caution! Your best bet is to train "
            "your own models and run those! Use --datain at your own risk")
        # NOTE(review): raw_input is the Python 2 builtin; presumably a
        # compatibility shim provides it under Python 3 - confirm.
        continue_program = raw_input("Press [Y/y] if you want to continue")
        if continue_program not in ['Y', 'y']:
            # Same effect as the interactive exit() helper, without
            # depending on the site module having installed it.
            raise SystemExit()
        return existing_training_model(args, seed)

    distance = False
    verbose_print(args.verbose, "Reading training set")
    (user, experimental, chemofeatures, fingerprint) = check_features(args)
    # Distance, clustering and imputation all require a distance matrix,
    # which is best collected using the fingerprint data.
    needs_distance = ((args.distance is True) or (args.cluster is True)
                      or (args.impute is True))
    if needs_distance:
        fingerprint = True
    training = rt.Read(args.input, pred, user=user, id_name=_id,
                       weights=args.weight)
    # This block of code generally works on feature collection and
    # parsing, including the removal of fully redundant features. The
    # difference between remove_static=True and False is whether or not
    # to get rid of fully redundant features. Since the distance matrix
    # is the same, regardless, it is run using original data.
    training_data = add_pubchem_features(training, args, user=user,
                                         proxy=proxy,
                                         fingerprint=fingerprint,
                                         experimental=experimental,
                                         chemofeatures=chemofeatures,
                                         id_name=_id, chunks=_chunks)
    if needs_distance:
        verbose_print(args.verbose, "Creating distance matrix")
        # Collect distance matrix using the original dataset
        distance = collect_distance_matrix(training_data)
    # Extract features from the user and PubChem data
    verbose_print(args.verbose, "Extracting features")
    training_data = extract_features(training_data, args, user=user,
                                     fingerprint=fingerprint,
                                     experimental=experimental,
                                     chemofeatures=chemofeatures,
                                     remove_static=True)
    if not training_data.compound:
        # Nothing to train on: hand back the unpopulated container.
        return trained_model

    # Discretize the y-values for the classification process. If no split
    # value is provided then the default for the program is to break the
    # value at the median.
    train = bt.Process(training_data, split_value=args.split_value,
                       verbose=args.verbose)
    if args.impute is True:
        train.impute_values(distance=distance, verbose=args.verbose)
    if args.selection is True:
        train.feature_selection(verbose=args.verbose, seed=args.random)
    # If dataout parameter is set, it prints to pickle a file containing
    # the features that were extracted. In later runs this can be
    # specified as the data input using the datain parameter.
    if args.dataout:
        features_file = args.dataout + ".features"
        with open(features_file, 'wb') as fid:
            pickle.dump(train, fid)
    # This is where the model is actually trained in the tm module
    model = tm.Train(train)
    model.train_model()
    trained_model.model = model
    # If dataout parameter is set, it prints to pickle a file containing
    # the RF model. In later runs this can be specified as the data input
    # using the datain parameter.
    if args.dataout:
        model_file = args.dataout + ".model"
        with open(model_file, 'wb') as fid:
            pickle.dump(model, fid)
    if args.cv:
        report_model_validation(model, args)
    if args.cluster:
        cluster = cl.Clustering(training_data.compound, seed=args.random)
        cluster.cluster_training(model)
        trained_model.cluster = cluster
        if args.dataout:
            cluster_file = args.dataout + ".cluster"
            with open(cluster_file, 'wb') as fid:
                pickle.dump(cluster, fid)
    return trained_model