def main_train(pos_sequences=None, neg_sequences=None, prefix=None, model_file=None, weights_file=None):
    """Train a SequenceDNN on positive vs. negative FASTA sequences and save it.

    If both model_file and weights_file are given, training resumes from that
    saved model; otherwise a fresh SequenceDNN is initialized from the encoded
    sequence length. The trained model is written to <prefix>.model.json and
    <prefix>.weights.hd5.
    """
    # one-hot encode both sequence sets and attach binary labels
    print("loading sequence data...")
    encoded_pos = encode_fasta_sequences(pos_sequences)
    labels_pos = np.array([[True]] * len(encoded_pos))
    encoded_neg = encode_fasta_sequences(neg_sequences)
    labels_neg = np.array([[False]] * len(encoded_neg))
    X = np.concatenate((encoded_pos, encoded_neg))
    y = np.concatenate((labels_pos, labels_neg))
    # hold out 20% of the data for validation
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if model_file is not None and weights_file is not None:
        # resume from a previously saved architecture + weights
        print("loading model...")
        model = SequenceDNN.load(model_file, weights_file)
    else:
        # fresh model sized to the encoded sequence length (last axis)
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1])
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    print("saving model files..")
    model.save("%s.model.json" % (prefix), "%s.weights.hd5" % (prefix))
    print("Done!")
def main_train(pos_sequences=None, neg_sequences=None, prefix=None, arch_file=None, weights_file=None, **kwargs):
    """Train a SequenceDNN and save it under `prefix`.

    Keyword arguments whose value is not None are forwarded to the SequenceDNN
    constructor when no architecture file is supplied; arch_file (plus
    weights_file) instead resumes a previously saved model.
    """
    # drop unset CLI options so the constructor's own defaults apply
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas into one-hot arrays with binary labels
    print("loading sequence data...")
    pos_encoded = encode_fasta_sequences(pos_sequences)
    neg_encoded = encode_fasta_sequences(neg_sequences)
    X = np.concatenate((pos_encoded, neg_encoded))
    y = np.concatenate((np.array([[True]] * len(pos_encoded)),
                        np.array([[False]] * len(neg_encoded))))
    # 80/20 train/validation split
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is not None:
        print("loading model...")
        model = SequenceDNN.load(arch_file, weights_file)
    else:
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    print("saving model files..")
    model.save(prefix)
    print("Done!")
def main_train(pos_sequences=None, neg_sequences=None, prefix=None, arch_file=None, weights_file=None, **kwargs):
    """Train a SequenceDNN classifier and save the result under `prefix`.

    When arch_file is given a saved model is reloaded (with weights_file);
    otherwise a new model is built, forwarding any non-None keyword options
    to the SequenceDNN constructor.
    """
    # filter out options the caller left unset
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    # carve off 20% for validation
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is None:
        # no saved architecture: build a model sized to the sequences
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    else:
        print("loading model...")
        model = SequenceDNN.load(arch_file, weights_file)
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    print("saving model files..")
    model.save(prefix)
    print("Done!")
def main_train(pos_sequences=None, neg_sequences=None, pos_validation_sequences=None, neg_validation_sequences=None, prefix=None, arch_file=None, weights_file=None, **kwargs):
    """Train a SequenceDNN, optionally with a user-supplied validation set.

    If pos_validation_sequences / neg_validation_sequences are given (both are
    required together), they form the validation set and ALL of the primary
    data is used for training; otherwise 20% of the primary data is held out.
    Extra non-None keyword arguments are forwarded to the SequenceDNN
    constructor when no architecture file is supplied. The trained model is
    saved under `prefix`.

    Raises:
        ValueError: if only one of the two validation FASTA files is given.
    """
    # drop unset CLI options so constructor defaults apply
    kwargs = {key: value for key, value in kwargs.items() if value is not None}
    # encode fastas
    print("loading sequence data...")
    X_pos = encode_fasta_sequences(pos_sequences)
    y_pos = np.array([[True]] * len(X_pos))
    X_neg = encode_fasta_sequences(neg_sequences)
    y_neg = np.array([[False]] * len(X_neg))
    X = np.concatenate((X_pos, X_neg))
    y = np.concatenate((y_pos, y_neg))
    # if a validation set is provided by the user, encode that as well
    if pos_validation_sequences is not None or neg_validation_sequences is not None:
        # both positive and negative validation sequences must be provided;
        # raise (not assert) so the check survives `python -O`
        if pos_validation_sequences is None or neg_validation_sequences is None:
            raise ValueError(
                "both positive and negative validation sequences must be provided")
        X_valid_pos = encode_fasta_sequences(pos_validation_sequences)
        X_valid_neg = encode_fasta_sequences(neg_validation_sequences)
        # BUG FIX: replicate the label row BEFORE building the array;
        # the original `np.array([[True]]) * n` multiplied the values,
        # yielding a single-row numeric array instead of n label rows.
        y_valid_pos = np.array([[True]] * len(X_valid_pos))
        y_valid_neg = np.array([[False]] * len(X_valid_neg))
        X_valid = np.concatenate((X_valid_pos, X_valid_neg))
        y_valid = np.concatenate((y_valid_pos, y_valid_neg))
        # BUG FIX: the original never set X_train/y_train in this branch,
        # causing a NameError below; train on all of the primary data.
        X_train, y_train = X, y
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
    if arch_file is not None:
        # load model
        print("loading model...")
        # BUG FIX: original passed an undefined `model_hdf5_file`; load takes
        # (arch_file, weights_file) as in the sibling main_train variants.
        model = SequenceDNN.load(arch_file, weights_file)
    else:
        # initialize model
        print("initializing model...")
        model = SequenceDNN(seq_length=X_train.shape[-1], **kwargs)
    # train
    print("starting model training...")
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    valid_result = model.test(X_valid, y_valid)
    print("final validation metrics:")
    print(valid_result)
    # save
    print("saving model files..")
    model.save(prefix)
    print("Done!")
def main_test(pos_sequences=None, neg_sequences=None, arch_file=None, weights_file=None):
    """Evaluate a saved SequenceDNN on labeled positive/negative FASTA files.

    Prints the model's test metrics on the combined encoded sequences.
    """
    # encode fastas and build the matching binary labels
    print("loading sequence data...")
    encoded_pos = encode_fasta_sequences(pos_sequences)
    encoded_neg = encode_fasta_sequences(neg_sequences)
    X_test = np.concatenate((encoded_pos, encoded_neg))
    y_test = np.concatenate((np.array([[True]] * len(encoded_pos)),
                             np.array([[False]] * len(encoded_neg))))
    # reload the trained model from its architecture + weights files
    print("loading model...")
    model = SequenceDNN.load(arch_file, weights_file)
    # evaluate and report
    print("testing model...")
    test_result = model.test(X_test, y_test)
    print(test_result)
def main_test(pos_sequences=None, neg_sequences=None, model_file=None, weights_file=None):
    """Evaluate a saved SequenceDNN on labeled test FASTA sequences.

    Loads the model from model_file/weights_file and prints its metrics on
    the concatenated positive + negative test set.
    """
    # encode fastas
    print("loading sequence data...")
    X_test_pos = encode_fasta_sequences(pos_sequences)
    X_test_neg = encode_fasta_sequences(neg_sequences)
    y_test_pos = np.array([[True]] * len(X_test_pos))
    y_test_neg = np.array([[False]] * len(X_test_neg))
    X_test = np.concatenate((X_test_pos, X_test_neg))
    y_test = np.concatenate((y_test_pos, y_test_neg))
    # load model
    print("loading model...")
    model = SequenceDNN.load(model_file, weights_file)
    # test
    print("testing model...")
    test_result = model.test(X_test, y_test)
    print(test_result)
def main(args):
    """Demo driver: predict, then visualize a trained SequenceDNN.

    args - parsed arguments that include pos_sequences, neg_sequences,
    arch_file, and weights_file. Prints per-sequence binding probabilities,
    then plots deeplift scores, known motifs, the architecture, and the
    convolutional filters.
    """
    # encode fasta
    print('Loading sequence data...')
    pos_seq = encode_fasta_sequences(args.pos_sequences)
    print('{} positive test sequences'.format(len(pos_seq)))
    neg_seq = encode_fasta_sequences(args.neg_sequences)
    print('{} negative test sequences\n'.format(len(neg_seq)))
    # load model; the prefix doubles as the name stem for all output files
    prefix = args.arch_file.replace('.arch.json', '')
    print('Loading {} model...'.format(prefix))
    model = SequenceDNN.load(args.arch_file, args.weights_file)
    # predict binding probability on test sequences
    print('Getting predictions...')
    for label, sequences in (('positive', pos_seq), ('negative', neg_seq)):
        for index, pred in enumerate(model.predict(sequences)):
            print('{}_test_{}\tP(bound)={}'.format(label, index, pred[0]))
        print('')
    # visualize trained model and motifs
    print('Plotting deeplift scores on positive sequences...')
    model.plot_deeplift(pos_seq, '{}_deeplift_positive'.format(prefix))
    print('Plotting true motifs...')
    # the two ground-truth motifs embedded in the simulated data
    for index, motif in enumerate(['IRF_known1', 'NFKB_known1']):
        fig = plot_motif(motif, figsize=(10, 4), ylab=motif)
        fig.savefig('motif{}.png'.format(index + 1), bbox_inches='tight')
    print('Plotting architecture...')
    model.plot_architecture('{}_architecture.png'.format(prefix))
    print('Plotting convolutional filters...')
    plot_sequence_filters(model, prefix)
def main_predict(sequences=None, arch_file=None, weights_file=None, output_file=None):
    """Predict with a saved SequenceDNN and write results to output_file.

    Encodes the FASTA sequences, loads the model from arch_file/weights_file,
    and saves the prediction matrix with np.savetxt.
    """
    # one-hot encode the input FASTA
    print("loading sequence data...")
    encoded = encode_fasta_sequences(sequences)
    # reload the trained model
    print("loading model...")
    model = SequenceDNN.load(arch_file, weights_file)
    # run inference
    print("getting predictions...")
    predictions = model.predict(encoded)
    # persist predictions as plain text
    print("saving predictions to output file...")
    np.savetxt(output_file, predictions)
    print("Done!")
def main_predict(sequences=None, model_file=None, weights_file=None, output_file=None):
    """Run a saved SequenceDNN over FASTA sequences, saving predictions.

    Predictions are written to output_file via np.savetxt.
    """
    # encode fasta
    print("loading sequence data...")
    X = encode_fasta_sequences(sequences)
    # load model
    print("loading model...")
    model = SequenceDNN.load(model_file, weights_file)
    # predict
    print("getting predictions...")
    predictions = model.predict(X)
    # save predictions
    print("saving predictions to output file...")
    np.savetxt(output_file, predictions)
    print("Done!")
def main_interpret(sequences=None, arch_file=None, weights_file=None, pos_threshold=None, peak_width=10, prefix=None):
    """Extract the most important subsequences per task via deeplift scores.

    For each sequence predicted above pos_threshold, finds the base with the
    highest deeplift score and writes the surrounding 2*peak_width window to
    <prefix>.task_<i>.important_sequences.txt (position -1 and a zero window
    mark sequences below threshold).
    """
    # encode fasta
    print("loading sequence data...")
    X = encode_fasta_sequences(sequences)
    # load model
    print("loading model...")
    model = SequenceDNN.load(arch_file, weights_file)
    # predict
    print("getting predictions...")
    predictions = model.predict(X)
    # deeplift
    print("getting deeplift scores...")
    deeplift_scores = model.deeplift(X)
    # get important sequences and write to file
    print("extracting important sequences and writing to file...")
    seq_length = X.shape[-1]
    for task_index, task_scores in enumerate(deeplift_scores):
        peak_positions = []
        peak_sequences = []
        for sequence_index, sequence_scores in enumerate(task_scores):
            if predictions[sequence_index, task_index] > pos_threshold:
                # collapse channel axes to a per-base importance score
                basewise_sequence_scores = sequence_scores.max(axis=(0, 1))
                peak_position = basewise_sequence_scores.argmax()
                peak_positions.append(peak_position)
                # BUG FIX: clamp the window inside the sequence; the original
                # slice `peak_position - peak_width : peak_position + peak_width`
                # wrapped around for peaks near the left edge (negative start)
                # and truncated near the right edge, producing windows of the
                # wrong width and breaking the fixed-shape concatenate below.
                start = min(max(peak_position - peak_width, 0),
                            max(seq_length - 2 * peak_width, 0))
                peak_sequences.append(
                    X[sequence_index:sequence_index + 1, :, :,
                      start:start + 2 * peak_width])
            else:
                # below threshold: sentinel position and an all-zero window
                peak_positions.append(-1)
                peak_sequences.append(np.zeros((1, 1, 4, 2 * peak_width)))
        peak_sequences = np.concatenate(peak_sequences)
        peak_sequence_strings = get_sequence_strings(peak_sequences)
        # write important sequences to file
        ofname = "%s.task_%i.important_sequences.txt" % (prefix, task_index)
        with open(ofname, "w") as wf:
            for i, peak_position in enumerate(peak_positions):
                wf.write("> sequence_%i\n" % (i))
                wf.write("%i: %s\n" % (peak_position, peak_sequence_strings[i]))
    print("Done!")
def main_interpret(sequences=None, arch_file=None, weights_file=None, pos_threshold=None, peak_width=10, prefix=None):
    """Write the highest-scoring deeplift window per sequence, per task.

    Sequences predicted above pos_threshold contribute a 2*peak_width window
    centered on their top-scoring base; the rest get position -1 and a zero
    window. Results go to <prefix>.task_<i>.important_sequences.txt.
    """
    # encode fasta
    print("loading sequence data...")
    X = encode_fasta_sequences(sequences)
    # load model
    print("loading model...")
    model = SequenceDNN.load(arch_file, weights_file)
    # predict
    print("getting predictions...")
    predictions = model.predict(X)
    # deeplift
    print("getting deeplift scores...")
    deeplift_scores = model.deeplift(X)
    # get important sequences and write to file
    print("extracting important sequences and writing to file...")
    for task_idx, scores_for_task in enumerate(deeplift_scores):
        positions = []
        windows = []
        for seq_idx, seq_scores in enumerate(scores_for_task):
            if predictions[seq_idx, task_idx] <= pos_threshold:
                # below threshold: sentinel position, all-zero window
                positions.append(-1)
                windows.append(np.zeros((1, 1, 4, 2 * peak_width)))
                continue
            # reduce channel axes to one importance value per base
            per_base = seq_scores.max(axis=(0, 1))
            peak = per_base.argmax()
            positions.append(peak)
            windows.append(
                X[seq_idx:seq_idx + 1, :, :,
                  peak - peak_width:peak + peak_width])
        window_strings = get_sequence_strings(np.concatenate(windows))
        # write important sequences to file
        ofname = "%s.task_%i.important_sequences.txt" % (prefix, task_idx)
        with open(ofname, "w") as wf:
            for i, peak in enumerate(positions):
                wf.write("> sequence_%i\n" % (i))
                wf.write("%i: %s\n" % (peak, window_strings[i]))
    print("Done!")