# --- Remaining command-line options -----------------------------------------
parser.add_argument(
    '--out_dir',
    type=str,
    default="",
    help="Directory to save the models state dict (No default)")
# Integer options use real int defaults; argparse would also convert a string
# default through `type`, so the parsed values are unchanged.
parser.add_argument('--k', type=int, default=1, help="k for k-mers")
parser.add_argument('--l2_penalty', type=float, default=0.0005)
parser.add_argument(
    '--save_every',
    type=int,
    default=20,
    help="Num of iterations required to save the model")

opt = parser.parse_args()
print(opt)

# --- Load data and create train/validation/test sets -------------------------
# Imported here so this section is self-contained; harmless if `os` is already
# imported at the top of the file.
import os

# os.path.join keeps the paths valid whether or not --main_data_dir ends with
# a path separator (plain string concatenation silently produced a wrong path
# when it did not).
human_sequences, _ = dp.load_FASTA(
    os.path.join(opt.main_data_dir, 'human_sequences.fasta'))
(
    human_train_idx,
    human_valid_idx,
    human_test_idx,
    human_train_labels,
    human_valid_labels,
    human_test_labels,
    human_GO_terms,
) = dp.load_test_sets(
    os.path.join(opt.main_data_dir,
                 'human_annotations_temporal_holdout.mat'))

# Create train, validation, and test sets from the full list of human proteins
human_train_sequences = [human_sequences[i] for i in human_train_idx]
human_valid_sequences = [human_sequences[i] for i in human_valid_idx]
human_test_sequences = [human_sequences[i] for i in human_test_idx]

# Truncate longest sequences: cap two training entries at their first 5000
# elements.
# NOTE(review): positions 6640 and 6613 are hard-coded and presumably specific
# to this dataset split -- confirm they still point at the longest sequences
# if the data changes.
human_train_sequences[6640] = human_train_sequences[6640][:5000]
human_train_sequences[6613] = human_train_sequences[6613][:5000]

# Create lengths for sequence representation averaging in FastText
human_train_lengths = dp.sequence_lengths_with_kmers(human_train_sequences,
                                                     opt.k)
human_valid_lengths = dp.sequence_lengths_with_kmers(human_valid_sequences,
                                                     opt.k)
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

# In[2]:

# ## Load Data and Create Train/Dev/Test Sets
# ### Human sequences

# In[2]:

# Read the human sequences; the second value returned by load_FASTA is unused.
human_sequences, _ = dp.load_FASTA('../capstone_data/human_sequences.fasta')

# load_test_sets yields the split indices, the per-split labels and the GO
# terms, in that order.
(
    human_train_idx,
    human_valid_idx,
    human_test_idx,
    human_train_labels,
    human_valid_labels,
    human_test_labels,
    human_GO_terms,
) = dp.load_test_sets(
    '../capstone_data/human_annotations_temporal_holdout.mat')

# Pick each split's sequences out of the full list of human proteins.
human_train_sequences = [human_sequences[idx] for idx in human_train_idx]
human_valid_sequences = [human_sequences[idx] for idx in human_valid_idx]
human_test_sequences = [human_sequences[idx] for idx in human_test_idx]

# Turn each split's labels into a torch LongTensor.
human_train_labels = torch.from_numpy(human_train_labels).type(
    torch.LongTensor)
human_valid_labels = torch.from_numpy(human_valid_labels).type(
    torch.LongTensor)
human_test_labels = torch.from_numpy(human_test_labels).type(
    torch.LongTensor)

# Create lengths for sequence representation averaging in FastText
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

output_file = 'lstm_gpu_output/human/baseline'

# Hyper-parameters
emb_dim = 150
hidden_dim = 750
L2_penalty = 0.0005

# ## Load Data and Create Train/Dev/Test Sets

# Read the human sequences; the second value returned by load_FASTA is unused.
human_sequences, _ = dp.load_FASTA('data/human_sequences.fasta')

# load_test_sets yields the split indices, the per-split labels and the GO
# terms, in that order.
(
    human_train_idx,
    human_valid_idx,
    human_test_idx,
    human_train_labels,
    human_valid_labels,
    human_test_labels,
    human_GO_terms,
) = dp.load_test_sets('data/human_annotations_temporal_holdout.mat')

# Pick each split's sequences out of the full list of human proteins.
human_train_sequences = [human_sequences[idx] for idx in human_train_idx]
human_valid_sequences = [human_sequences[idx] for idx in human_valid_idx]
human_test_sequences = [human_sequences[idx] for idx in human_test_idx]

# Turn each split's labels into a torch LongTensor.
human_train_labels = torch.from_numpy(human_train_labels).type(
    torch.LongTensor)
human_valid_labels = torch.from_numpy(human_valid_labels).type(
    torch.LongTensor)
human_test_labels = torch.from_numpy(human_test_labels).type(
    torch.LongTensor)

# Create lengths for sequence representation averaging in FastText
from torch.autograd import Variable
import torch.nn.functional as F

output_file = 'lstm_gpu_output/yeast/baseline/'

# Set Hyper-parameters:
emb_dim = 250  # dimension for n-gram embedding
hidden_dim = 750
L2_penalty = 0.0005

# ## Load Data and Create Train/Dev/Test Sets

# Load yeast sequences and training data
yeast_sequences, yeast_protein_names = dp.load_FASTA(
    'data/yeast_sequences.fasta')

# load_test_sets yields the split indices, the per-split labels and the GO
# terms, in that order.
(
    yeast_train_idx,
    yeast_valid_idx,
    yeast_test_idx,
    yeast_train_labels,
    yeast_valid_labels,
    yeast_test_labels,
    yeast_GO_terms,
) = dp.load_test_sets('data/yeast_MF_temporal_holdout.mat')

# Pick each split's sequences out of the full list of yeast proteins.
yeast_train_sequences = [yeast_sequences[idx] for idx in yeast_train_idx]
yeast_valid_sequences = [yeast_sequences[idx] for idx in yeast_valid_idx]
yeast_test_sequences = [yeast_sequences[idx] for idx in yeast_test_idx]

# Turn each split's labels into a torch LongTensor.
yeast_train_labels = torch.from_numpy(yeast_train_labels).type(
    torch.LongTensor)
yeast_valid_labels = torch.from_numpy(yeast_valid_labels).type(
    torch.LongTensor)
yeast_test_labels = torch.from_numpy(yeast_test_labels).type(
    torch.LongTensor)

# Create lengths for sequence representation averaging in FastText
yeast_train_lengths = dp.sequence_lengths(yeast_train_sequences)