# create a new label encoder
le = LabelEncoder()
# encode train set labels (fit learns the label -> integer mapping)
y_train = le.fit_transform(y_train)  # EX1
# encode test set labels, reusing the mapping learned on the training labels
y_test = le.transform(y_test)  # EX1
# compute number of classes made by the encoder
n_classes = le.classes_.size

# print("EX1 answer printing")
# print("First 10 unencoded labels from the training set are: ")
# print(le.inverse_transform(y_train[:10]))
# print("First 10 encoded labels from the training set are: ")
# print(y_train[:10])

# Define our PyTorch-based Dataset
train_set = SentenceDataset(X_train, y_train, word2idx)

# ------------ #
# EX2
# ------------ #
# EX2: print first 10 tokenized training examples
# print("EX2 printing")
# for i in range(10):
#     print(train_set.data[i])

# ------------ #
# EX3
# ------------ #
# EX3
# print("EX3 printing")
# for i in range(5):
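
# ------------ #
# Hedged sketch (illustration only, not part of the assignment code): shows
# why the test labels above use le.transform() rather than le.fit_transform().
# The encoder learns one fixed label -> integer mapping from the training
# labels, and that same mapping must be reused for the test labels. The
# label strings below are made-up Semeval-style examples.
# ------------ #
# from sklearn.preprocessing import LabelEncoder
# le_demo = LabelEncoder()
# le_demo.fit_transform(["negative", "neutral", "positive", "neutral"])
# # -> array([0, 1, 2, 1]); le_demo.classes_ is sorted alphabetically
# le_demo.transform(["positive", "negative"])
# # -> array([2, 0]), reusing the mapping learned by fit_transform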
# Load the raw data
if DATASET == "Semeval2017A":
    _, _, X_test, y_test = load_Semeval2017A()
else:
    raise ValueError("Invalid dataset")

# Convert data labels from strings to integers
# create a new label encoder
le = LabelEncoder()
# encode test set labels
y_test = le.fit_transform(y_test)  # EX1
# compute number of classes made by the encoder
n_classes = le.classes_.size

# Define our PyTorch-based Dataset
test_set = SentenceDataset(X_test, y_test, word2idx)

tweetToken = TweetTokenizer()
text = [tweetToken.tokenize(example) for example in X_test]

# Define our PyTorch-based DataLoader
# Batch size is 1.
test_loader = DataLoader(test_set)

# Load user model.
model = torch.load(sys.argv[1]).to(DEVICE)

# Define criterion for evaluation.
loss_function = torch.nn.CrossEntropyLoss()

model.eval()

# IMPORTANT: in evaluation mode, we don't want to keep the gradients,
# so we do everything under torch.no_grad()
data = []
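
# ------------ #
# Hedged sketch (illustration only, not the original script): a typical
# torch.no_grad() evaluation loop over test_loader. It assumes each batch
# yields (inputs, labels, lengths) and that the model is called as
# model(inputs, lengths); both are assumptions about SentenceDataset and
# the saved model, not facts taken from the code above.
# ------------ #
# y_true, y_pred, total_loss = [], [], 0.0
# with torch.no_grad():
#     for inputs, labels, lengths in test_loader:
#         inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
#         outputs = model(inputs, lengths)           # logits: (1, n_classes)
#         total_loss += loss_function(outputs, labels).item()
#         y_pred.append(outputs.argmax(dim=1).item())
#         y_true.append(labels.item())
# print("average test loss:", total_loss / len(test_loader))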