# Example #1
# 0
def build_2ffnn(inp, h1, h2, out):
    """Assemble a feed-forward network with two tanh hidden layers.

    The layers are chained input -> hidden 1 -> hidden 2 -> output with one
    FullConnection per consecutive pair. The network is sorted and its
    parameters are randomized before it is returned.

    inp: size of the linear input layer
    h1:  size of the first tanh hidden layer
    h2:  size of the second tanh hidden layer
    out: size of the linear output layer

    Returns: the FeedForwardNetwork that was built.
    """
    net = FeedForwardNetwork()

    # Layers in forward order.
    source = LinearLayer(inp)
    first_hidden = TanhLayer(h1)
    second_hidden = TanhLayer(h2)
    sink = LinearLayer(out)  # alternative output layer: SoftmaxLayer(out)
    chain = [source, first_hidden, second_hidden, sink]

    net.addInputModule(source)
    for hidden in chain[1:-1]:
        net.addModule(hidden)
    net.addOutputModule(sink)

    # Create every link between neighbouring layers, then register them.
    links = [FullConnection(upstream, downstream)
             for upstream, downstream in zip(chain, chain[1:])]
    for link in links:
        net.addConnection(link)

    net.sortModules()
    net.randomize()
    return net
# Example #2
# 0
class MP_Pybrain(Regression):
    """
    Fully connected multilayer perceptron using pybrain library.

    The network is a linear input layer followed by sigmoid hidden layers and a sigmoid
    output layer (see ``_build_net``); it is trained with pybrain's ``BackpropTrainer``.
    """

    def __init__(self, train_data, hyper, n_targets=None, label_targets=None):
        """
    Builds the network described by ``hyper`` and stores the training hyperparameters.

    ------------

    train_data: pandas DataFrame
                Contains columns for features and for target variables. The names of the target variables ends
                with the suffix "_tau"
    hyper:      dictionary
                It contains the hyperparameters necessary to run all the functionalities of the model.
                The keys read here are the following:
                "structure" is a list of integers determining the number of neurons in each hidden layer
                "epochs" an integer specifying the maximum number of epochs to run during every training session
                "learning_rate" a float giving the learning rate of the gradient descent
                "momentum" a float giving the value of the momentum for the algorithm
                "batch" a bool. If True the method performs full batch learning, i.e. updates of the weights is done
                using all the instances of the training set. Else, normal online method is performed
                "train_fraction" the fraction of the rows used for training in ``learn``; the remaining rows
                form the validation set
                "seed" the value used to seed numpy's random generator before the rows are shuffled
                Other parameters regarding cross validation are explained in the base class
    n_targets, label_targets:
                Forwarded unchanged to the base class constructor.

        """
        Regression.__init__(self, train_data, hyper, n_targets=n_targets, label_targets=label_targets)

        self.N = FeedForwardNetwork()
        self.structure = [self.n_feature] + hyper['structure'] + [self.n_target]

        self._build_net(self.structure)
        # Snapshot of the parameter values right after the network has been built.
        self.res_params = list(self.N.params)

        self.train_fraction = hyper['train_fraction']
        self.seed = hyper['seed']
        self.epochs = hyper['epochs']
        self.learning_rate = hyper['learning_rate']
        self.momentum = hyper['momentum']
        self.batch = bool(hyper['batch'])

    def learn(self, train_data=None, seed=None):
        """
    Performs single run training, and it is designed to be called after network instantiation.

    ----------

    train_data: pandas Dataframe
            It needs to contain datetime objects on index, and both features and target variables.
            The target variables need to end with the suffix "_tau". If None the self.train_set
            variable passed at the moment of instantiation will be used. When given, it replaces
            self.train_set and the network weights are randomized before training.

    seed:   accepted for interface compatibility but currently ignored; the shuffling uses self.seed.

    Returns: tuple(MP_Pybrain object, float, float)
            A deep copy of the model at the epoch with the lowest training error, that training
            error, and the validation error measured at the same epoch.

        """
        if train_data is not None:
            self.train_set = train_data
            self.randomize()
        ds_train, ds_valid = self._build_dataset(self.train_set)
        trainer = BackpropTrainer(self.N, ds_train, learningrate=self.learning_rate,
                                  momentum=self.momentum, batchlearning=self.batch)
        trainer.train()
        fin_error_train = self._error(ds_train)
        fin_error_valid = self._error(ds_valid)
        # A deep copy is required: a shallow copy would share self.N, so the snapshot
        # would silently follow the weights of every later epoch.
        final_model = deepcopy(self)
        for i in range(1, self.epochs):
            if i % 10 == 0:
                print(" ".join(["epoch: ", str(i)]))
            trainer.train()
            e_train = self._error(ds_train)
            e_valid = self._error(ds_valid)
            if e_train < fin_error_train:
                final_model = deepcopy(self)
                fin_error_train = e_train
                fin_error_valid = e_valid
        return final_model, fin_error_train, fin_error_valid

    def xvalidate(self, train_data=None, folds=None):
        """
    Performs n-folds cross-validation on the a data set. The network weights are randomized every
    time a new training is started. The purpose is to make model comparison and returning an average
    error given a specific data set and collection of hyper-parameters. At the moment training and
    validation sets are chosen based on the input sequence of data, i.e. there is no random shuffling
    of the instances of the data set.

    ----------

    train_data: pandas Dataframe
            It needs to contain datetime objects on index, and both features and target variables.
            The target variables need to end with the suffix "_tau". If None the self.train_set
            variable passed at the moment of instantiation will be used.

    folds: integer
            The number of training/validation partition used in the method. If None it needs to be
            passed in the constructor when instantiating the object for the first time.

    Returns: list, float, float
            A list of all the models trained for each fold, the mean train error and the cross-validation error,
            i.e. the average over the folds of the summed per-target RMSE on the validation partition.

        """
        if train_data is not None:
            self.train_set = train_data
        if folds is not None:
            self.cv_folds = folds
        train, validation = self._build_folds(random=False)
        models = []
        train_error = []
        cv_error = []
        # NOTE(review): learn() overwrites self.train_set with each fold's training part, so after
        # this method returns self.train_set holds the last fold only -- confirm this is intended.
        for i in range(self.cv_folds):
            print(" ".join(["Cross-validation Fold: ", str(i + 1)]))
            self.randomize()
            model, error, _ = self.learn(train_data=train[i])
            models.append(deepcopy(model))
            train_error.append(error)
            # Score the fold with the model that learn() selected, so that the reported
            # training error and validation error refer to the same set of weights.
            predicted, actual = model.test(validation[i])
            e = 0
            for k in predicted.columns:
                e += errors.RMSE(np.array(actual[k]), np.array(predicted[k]))
            cv_error.append(e)
        return models, np.mean(train_error), np.mean(cv_error)

    def test(self, data):
        """
    Tests the trained model on data. The usage is two fold: 1) Internal usage to calculate errors on validation
    sets. 2) For external usage when a test set is provided. Both the validation and test set need to contain target
    columns. For prediction, where target variables are unknown, please refer to the function self.predict below.
    ----------

    data:       pandas Dataframe
                Only the feature columns are fed to the network; the target columns are returned untouched.
                Due to the functionality of the pyBrain library we require (at the moment) that the order of the
                columns is the same as the one of the training set used for training.

    Returns:    tuple(pandas Dataframe, pandas Dataframe)
                The first Dataframe has one column per target variable containing the predictions, with the same
                index as the input DataFrame. The second one holds the actual target columns of ``data``.

        """
        data_y = data[self.targets]
        return self._predict_frame(data[self.features]), data_y

    def predict(self, data):
        """
    It returns target variables given a set of features, using the model trained and saved.
    ---------

    data: pandas Dataframe
         It must contain all the feature columns used for training of the model

    Returns: pandas Dataframe
         It contains the prediction on the target variables. The name of the variables is the same as the one
         provided at the moment of instantiation of object.

        """
        return self._predict_frame(data[self.features])

    def randomize(self):
        """Randomizes the parameters of the underlying network."""
        self.N.randomize()

    ### Private functions ###
    def _predict_frame(self, data_x):
        """
    Activates the network on every row of ``data_x`` and returns the outputs as a Dataframe.

    data_x: pandas Dataframe holding only the feature columns

    Returns: pandas Dataframe
        One row per input row (same index) and one column per target variable.

        """
        rows = [list(self.N.activate(x)) for x in data_x.values]
        # Keep the outputs two-dimensional (rows x targets); flattening them breaks the
        # DataFrame construction as soon as there is more than one target.
        predicted = np.array(rows, dtype=float).reshape(len(data_x), len(self.targets))
        return pd.DataFrame(predicted, index=data_x.index, columns=self.targets)

    def _error(self, ds):
        """
    Calculates the RMSE over an input dataset, given the current state of the network.

    ds: Supervised dataset pybrain style

    Returns: float
        The sum over the target variables of the RMSE between prediction and actual values.

        """
        predicted = np.array([list(self.N.activate(x)) for x in ds['input']]).transpose()
        actual = np.array([list(x) for x in ds['target']]).transpose()
        total_error = [errors.RMSE(np.array(actual[i]), np.array(predicted[i])) for i in range(len(actual))]
        return sum(total_error)

    def _build_net(self, s):
        """
    Adds the layers to self.N and connects them.

    s: list of integers, the size of every layer from input to output

        """
        # NOTE(review): the output layer is a SigmoidLayer, which presumably bounds the
        # predictions to (0, 1); targets may need to be scaled accordingly -- confirm.
        layers = [LinearLayer(s[0])]
        self.N.addInputModule(layers[0])
        for size in s[1:-1]:
            hidden = SigmoidLayer(size)
            layers.append(hidden)
            self.N.addModule(hidden)
        layers.append(SigmoidLayer(s[-1]))
        self.N.addOutputModule(layers[-1])
        self._build_connections(layers)

    def _build_connections(self, l):
        """
    Fully connects every layer of ``l`` to the next one and sorts the modules of self.N.

    l: list of the layers in forward order

        """
        for i, j in zip(l, l[1:]):
            self.N.addConnection(FullConnection(i, j))
        self.N.sortModules()

    def _build_dataset(self, data):
        """
    Given a input training Dataframe with features and targets it returns the formatted training and validation
    datasets for pybrain usage, and randomly shuffled according to the self.seed given at instantiation.
    Note that numpy's global random generator is re-seeded by this call.

    ----------

    data: pandas Dataframe
        It must contains both features and target columns

    Returns: (pybrain dataset, pybrain dataset)
        The first is the training dataset and the second is the validation dataset

        """
        np.random.seed(self.seed)
        permutation = np.random.permutation(np.arange(len(data)))
        sep = int(self.train_fraction * len(data))
        x_values = data[self.features].values
        y_values = data[self.targets].values
        ds_train = SupervisedDataSet(self.n_feature, self.n_target)
        ds_valid = SupervisedDataSet(self.n_feature, self.n_target)
        for i in permutation[:sep]:
            ds_train.addSample(x_values[i], y_values[i])
        for i in permutation[sep:]:
            ds_valid.addSample(x_values[i], y_values[i])
        return ds_train, ds_valid
class ANN:
    """Wrapper around a pybrain feed-forward network with one sigmoid hidden layer.

    Call ``create_network`` first; ``train`` and ``predict`` use the network and
    the datasets that it creates.
    """

    def __init__(self):
        self.name = "ANN"

    def getParams(self):
        """Return the parameters of the input->hidden and hidden->output connections."""
        return self.in_to_hidden.params, self.hidden_to_out.params

    def create_network(self, nFeatures, hidden1Size=20, nClasses=1):
        """Build the network and the (empty) training/validation datasets.

        nFeatures:   size of the linear input layer
        hidden1Size: size of the sigmoid hidden layer
        nClasses:    size of the linear output layer
        """
        # create network object
        self.ffn = FeedForwardNetwork()

        # create layer objects
        inLayer = LinearLayer(nFeatures, name="input")
        hiddenLayer = SigmoidLayer(hidden1Size, name="hidden1")
        outLayer = LinearLayer(nClasses, name="output")

        # add layers to feed forward network
        self.ffn.addInputModule(inLayer)
        self.ffn.addModule(hiddenLayer)
        self.ffn.addOutputModule(outLayer)

        # add bias unit to the network
        # NOTE(review): no connection from this bias unit to any layer is created in
        # this method, so it presumably has no effect on the output -- confirm.
        self.ffn.addModule(BiasUnit(name='bias'))

        # establish connections between layers
        self.in_to_hidden = FullConnection(inLayer, hiddenLayer)
        self.hidden_to_out = FullConnection(hiddenLayer, outLayer)

        # add connections to network
        self.ffn.addConnection(self.in_to_hidden)
        self.ffn.addConnection(self.hidden_to_out)

        # necessary, sort layers into correct/certain order
        self.ffn.sortModules()

        # dataset objects
        self.train_ds = SupervisedDataSet(nFeatures, nClasses)
        self.validate_ds = SupervisedDataSet(nFeatures, nClasses)

    # train network
    def train(self, TrainX, TrainY, ValidateX, ValidateY):
        """Refill the datasets, re-randomize the weights and train until convergence.

        TrainX, TrainY:       training inputs and targets, indexed row by row
        ValidateX, ValidateY: validation inputs and targets, indexed row by row

        On success the per-epoch errors are stored in ``self.train_errors`` and
        ``self.val_errors``. A failure during training is reported on stdout and
        otherwise ignored.

        Returns: the string 'ANN'.
        """
        # clear old dataset
        self.train_ds.clear()
        self.validate_ds.clear()

        # add data to dataset object (ds)
        for i in range(TrainX.shape[0]):
            self.train_ds.addSample(TrainX[i], TrainY[i])

        for i in range(ValidateX.shape[0]):
            self.validate_ds.addSample(ValidateX[i], ValidateY[i])

        # randomize weights
        self.ffn.randomize()

        # Backprop trainer object
        self.trainer = BackpropTrainer(self.ffn,
                                       learningrate=.0775,
                                       momentum=.1)
        try:
            with Timer():
                self.train_errors, self.val_errors = self.trainer.trainUntilConvergence(
                    trainingData=self.train_ds,
                    validationData=self.validate_ds,
                    maxEpochs=500,
                    continueEpochs=10)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and SystemExit
            # must still be able to stop the program.
            print("Error occured while training model in ANN.")

        return 'ANN'

    # predict dependent variable for dataset
    def predict(self, data):
        """Activate the network on one sample or on every row of ``data``.

        data: array with a ``shape`` attribute. A one-dimensional array is treated
              as a single sample, anything else as one sample per row.

        Returns: the raw network output for a single sample; for several samples an
                 array of shape (n_samples,) when the network has one output and of
                 shape (n_samples, n_outputs) otherwise.
        """
        # if only make prediction for one sample
        if len(data.shape) == 1:
            return self.ffn.activate(data)

        outputs = np.array([self.ffn.activate(data[i]) for i in range(data.shape[0])],
                           dtype=float)
        # Keep the flat (n_samples,) result for single-output networks.
        if outputs.ndim == 2 and outputs.shape[1] == 1:
            outputs = outputs[:, 0]
        return outputs
def ANN_blind_analysis(a_network, a_gene, a_dataset, boot_val):
    """Creates and trains a network that is created to reflect the structure of the hypothesized network.

    The nodes linked to ``a_gene`` are the inputs, the gene itself is the single
    output, and one sigmoid hidden layer with one more unit than there are inputs
    sits in between. The network is trained ``boot_val`` times from re-randomized
    weights; every round that improves on the best error so far is written to
    ``<gene>_trained_net.xml`` and exported through ``export_to_gml``.

    a_network: network description, passed to ``get_sub_list_from_network``
    a_gene:    name of the target gene (compared in lower case)
    a_dataset: sequence of experiments, each one a mapping from lower-case gene
               name to its measured value
    boot_val:  number of training rounds

    Returns: ``[gene, best_error, number_of_inputs]``, or ``[gene, '0', '0']`` when
             the gene has no usable connections or is missing from the dataset.
    """
    regulatory_network = FeedForwardNetwork()

    # Retrieve the nodes linked to the gene from the input network.
    upper_case_data_node_list = get_sub_list_from_network(a_network, a_gene, "gene,TF", 1)

    # Everything is compared in lower case.
    a_gene = a_gene.lower()
    # If the target gene is also a TF, leave it out of the inputs: it is the output.
    data_node_list = [x.lower() for x in upper_case_data_node_list if x.lower() != a_gene]

    print('what is in data_node_list:')
    print(data_node_list)

    if len(data_node_list) == 0:
        print("No connections to " + a_gene + " found.")
        return [a_gene, '0', '0']

    # Check for missing entries in the dataset (DS), using the first experiment.
    known_genes = set(a_dataset[0].keys())

    # For the main gene
    if a_gene not in known_genes:
        return [a_gene, '0', '0']

    # For the linked genes: build a new list instead of removing entries from the
    # list being iterated, which skips the element following every removal.
    data_node_list = [each_gene for each_gene in data_node_list if each_gene in known_genes]

    if len(data_node_list) == 0:
        print("No connections to " + a_gene + " found.")
        return [a_gene, '0', '0']

    print(len(data_node_list))
    print(data_node_list)

    # Describing network modules to be used
    inLayer = LinearLayer(len(data_node_list), name="Input_layer")
    hiddenLayer = SigmoidLayer(len(data_node_list) + 1, name="Hidden_sigmoid_layer_1")
    outLayer = LinearLayer(1, name="Output_layer")

    # Adding layers to network
    regulatory_network.addInputModule(inLayer)
    regulatory_network.addModule(hiddenLayer)
    regulatory_network.addOutputModule(outLayer)

    # Adding connections between layers
    in_to_hidden = FullConnection(inLayer, hiddenLayer)
    hidden_to_out = FullConnection(hiddenLayer, outLayer)
    regulatory_network.addConnection(in_to_hidden)
    regulatory_network.addConnection(hidden_to_out)

    get_nn_details(regulatory_network)

    regulatory_network.sortModules()

    # Formatting the dataset
    input_dimention = len(data_node_list)
    print(" ".join(["in_dimention = ", str(input_dimention)]))

    DS = SupervisedDataSet(input_dimention, 1)

    # The input labels are the linked genes, in the order of the input nodes.
    tf_labels = list(data_node_list)

    data_node_list.append(a_gene)
    print('node list contains: ')
    print(data_node_list)

    for experiment in a_dataset:
        tf_list = [experiment[TF] for TF in tf_labels]
        gene_list = [experiment[a_gene]]

        # View the input data sets
        print(tf_labels)
        print(tf_list)
        print(gene_list)

        # Only experiments without missing values are added to the dataset.
        if (check_missing_experiments(tf_list) == True) and (check_missing_experiments(gene_list) == True):
            float_tf_list = [float(i) for i in tf_list]
            float_gene_list = [float(i) for i in gene_list]
            DS.appendLinked(float_tf_list, float_gene_list)

    print("......")
    print('Network before training')
    print(regulatory_network)

    pesos_conexiones(regulatory_network)
    print(regulatory_network.outputerror)

    # Training
    trainer = RPropMinusTrainer_Evolved(regulatory_network, verbose=False)
    trainer.setData(DS)

    # Rounds whose absolute error is not below this value are never saved.
    best_run_error = 1000

    boot_count = 0
    while boot_count < boot_val:
        print('\n')
        print('Bootstrap round ' + str(boot_count + 1))
        trainer.trainEpochs(500)
        get_nn_details(regulatory_network)

        print(trainer.total_error)
        current_run_error = trainer.total_error

        print('Bootstrap round ' + str(boot_count + 1) + ' error: ' + str(current_run_error))

        if abs(current_run_error) < abs(best_run_error):
            best_run_error = current_run_error
            trained_net_filename = a_gene + '_trained_net.xml'
            NetworkWriter.writeToFile(regulatory_network, trained_net_filename)

            export_to_gml(regulatory_network, tf_labels, a_gene)

        # Start the next round from fresh random weights with a new trainer.
        regulatory_network.reset()
        regulatory_network.randomize()
        trainer = RPropMinusTrainer_Evolved(regulatory_network, verbose=False)
        trainer.setData(DS)
        boot_count += 1

    return [a_gene, best_run_error, len(tf_labels)]
class BlondieBrain:
    """Game-playing agent backed by a pybrain feed-forward network.

    The network is either read from a parameter file or built from scratch with
    two sigmoid hidden layers that are as wide as the input layer.
    """

    def __init__(self, datadir, insize=None, outsize=None, paramfile=None):
        """Load the network from ``paramfile`` or build a new one.

        datadir:   directory used to read ``paramfile`` and to write saved networks
        insize:    size of the input layer; when None it is the length of the
                   encoding of a fresh ``runner.Game()``
        outsize:   size of the output layer; defaults to 1
        paramfile: file name (inside ``datadir``) of a saved network. The part of
                   the file name before "-bestof-" becomes the agent's name.
        """
        self.datadir = datadir
        if insize is None:
            self.insize = len(self._game2input(runner.Game()))
        else:
            self.insize = insize
        self.outsize = 1 if outsize is None else outsize

        if paramfile:
            self.nn = NetworkReader.readFrom(os.path.join(self.datadir, paramfile))
            match = re.search("(.*)-bestof-(.*)", paramfile)
            # Fall back to a generated name; it is sanitised like the name of a
            # newly built network because save() uses it as a file name.
            self.name = match.group(1) if match else self._fresh_name()
        else:
            self.nn = FeedForwardNetwork()
            self.name = self._fresh_name()

            inLayer = LinearLayer(self.insize)
            hiddenLayer1 = SigmoidLayer(self.insize)
            hiddenLayer2 = SigmoidLayer(self.insize)
            outLayer = LinearLayer(self.outsize)

            self.nn.addInputModule(inLayer)
            self.nn.addModule(hiddenLayer1)
            self.nn.addModule(hiddenLayer2)
            self.nn.addOutputModule(outLayer)

            in_to_hidden1 = FullConnection(inLayer, hiddenLayer1)
            hidden1_to_hidden2 = FullConnection(hiddenLayer1, hiddenLayer2)
            hidden2_to_out = FullConnection(hiddenLayer2, outLayer)

            self.nn.addConnection(in_to_hidden1)
            self.nn.addConnection(hidden1_to_hidden2)
            self.nn.addConnection(hidden2_to_out)

            self.nn.sortModules()

    @staticmethod
    def _fresh_name():
        """Return a timestamp-based name with '.', ':' and ' ' replaced by '-'."""
        return re.sub("[.: ]", "-", "blondie-%s" % (datetime.datetime.now()))

    def nextmove(self, game):
        """Return the move chosen by the network for ``game``.

        With a single output the output value is truncated to an int; otherwise
        the index of the largest output is returned.
        """
        inputdata = self._game2input(game)
        if self.outsize == 1:
            # Index the single element explicitly: int() on a one-element array
            # is deprecated in recent NumPy versions.
            return int(self.nn.activate(inputdata)[0])
        return self.nn.activate(inputdata).argmax()

    def save(self, suffix=""):
        """Write the network to ``<datadir>/<name><suffix>.xml``."""
        f = os.path.join(self.datadir, self.name + suffix + ".xml")
        NetworkWriter.writeToFile(self.nn, f)

    def mutate(self):
        """Mutate the underlying network in place."""
        self.nn.mutate()

    def randomize(self):
        """Randomize the parameters of the underlying network."""
        self.nn.randomize()

    def copy(self):
        """Return a deep copy of this agent whose network is a copy as well."""
        x = copy.deepcopy(self)
        x.nn = x.nn.copy()
        return x

    @classmethod
    def _game2input(cls, game):
        """Encode ``game`` as a flat list of numbers for the network.

        The list holds one value per cell (columns concatenated), followed by one
        sum per column, per row and per diagonal. The side to move is taken from
        the parity of ``len(game.moves)``.
        """
        mysymbol = len(game.moves) % 2
        cells = [cls._trsymb(x, mysymbol) for x in itertools.chain.from_iterable(game.grid_columns)]
        cols = [cls._trsum(c, mysymbol) for c in game.grid_columns]
        rows = [cls._trsum(r, mysymbol) for r in game.grid_rows]
        diags = [cls._trsum(d, mysymbol) for d in game.diags]
        return cells + cols + rows + diags

    @classmethod
    def _trsymb(cls, piece, mysymbol):
        """Transform a symbol: 0 for an empty cell, 1 for ``mysymbol``, -1 otherwise."""
        if piece is None:
            return 0
        if piece == mysymbol:
            return 1
        return -1

    @classmethod
    def _trsum(cls, l, mysymbol):
        """Transform every symbol of ``l`` and return the sum."""
        return sum(cls._trsymb(piece, mysymbol) for piece in l)