def createModel(pathToTrainingData):
    """Train a small feed-forward binder classifier from a peptide table.

    The file at *pathToTrainingData* is read line by line and split on
    whitespace: column 0 is the peptide sequence and column 2 is the binder
    flag ("1" marks a binder, anything else a non-binder). The header row,
    recognised by "Peptide" in column 0, and blank lines are skipped.

    Every residue is described by five properties taken from
    ``aminoAcidDict`` (weight, iep, hydrophobicity, polarity, area), so a
    9-residue peptide is flattened into 9 * 5 = 45 network inputs.

    Args:
        pathToTrainingData: Path of the whitespace-separated training file.

    Returns:
        The fitted Keras ``Sequential`` model.

    Raises:
        ValueError: If the file contains no peptide rows.
    """
    residues_per_peptide = 9
    properties_per_residue = 5
    input_size = residues_per_peptide * properties_per_residue

    # Collect the sequences and a parallel 0/1 list: entry i is 1 if
    # sequence i is a binder and 0 if it is not.
    sequences = []
    binder_flags = []
    with open(pathToTrainingData, "r") as training_file:
        for line in training_file:
            fields = line.split()
            # Skip blank lines and the header row.
            if not fields or fields[0] == "Peptide":
                continue
            sequences.append(fields[0])
            # Compare by value: "is" tests object identity and only matched
            # by accident of string caching in the interpreter.
            binder_flags.append(1 if fields[2] == "1" else 0)

    if not sequences:
        raise ValueError(
            "no peptide rows found in training data: %s" % pathToTrainingData
        )

    # Build the numeric feature array of shape
    # (number of sequences, residues per sequence, 5 properties).
    # The one-letter code is not included because it is not an ANN input,
    # and leaving it out keeps the array purely numeric.
    features = np.array(
        [
            [
                (aa.weight, aa.iep, aa.hydrophobicity, aa.polarity, aa.area)
                for aa in map(aminoAcidDict.get, sequence)
            ]
            for sequence in sequences
        ],
        dtype=float,
    )

    # Flatten each peptide into one row; the row count follows the file
    # instead of being fixed to one particular training set size.
    annInput = features.reshape(len(sequences), input_size)

    # Setup model
    model = Sequential()
    # Input layer with 9*5=45 input nodes - each property at every position
    model.add(Dense(input_size, kernel_initializer='uniform',
                    activation='softplus', input_shape=(input_size,)))
    # Hidden layer with 17 nodes
    model.add(Dense(17, kernel_initializer='uniform', activation='softplus'))
    # Output layer with 1 node
    model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    # Labels are handed over as an array so they are treated as one target
    # vector rather than as a list of separate outputs.
    model.fit(annInput, np.asarray(binder_flags), epochs=100, verbose=0,
              batch_size=100)
    return model
sequencesList = [] isBinderList = [] with open(args.input, "r") as i: for e in i: lineSplit = e.split() if lineSplit[0] == "Peptide": # skip the first line pass else: sequencesList.append(lineSplit[0]) isBinderList.append(1 if lineSplit[2] is "1" else 0) # Create a 3D array of all the data necessary for the ANN # Its shape is 726*10*6, since there are 726 sequences in the training set, each sequence consists of 9 residues plus # a boolean telling us if it's a binder and # 6 properties are considered for each residue: # Its one-letter-code, weight, iep, hydrophobicity, polarity, and its area data = np.array([[(aminoAcidDict.get(residue).one_letter_code, aminoAcidDict.get(residue).weight, aminoAcidDict.get(residue).iep, aminoAcidDict.get(residue).hydrophobicity, aminoAcidDict.get(residue).polarity, aminoAcidDict.get(residue).area) for residue in sequence] for sequence in sequencesList]) # Generate actual ANN input: one-letter-code is cut away, now there are only five numbers for each residue annInput = data[:, :, 1:6].reshape(726, 45) # Split data up into training and test sets x_train, x_test, y_train, y_test = train_test_split(annInput, isBinderList, test_size=0.8, random_state=0) # Function for creating a model - needed for the keras classifier def create_model(neurons=1, activation='relu'):