def load_pickle_data(self):
    """Load the pickled dialogue dataset and return tensorized splits.

    Reads the raw pickle at ``self.data_path``, builds padded
    train/test/dev splits (cached under ``self.fp_prefix + '_<split>.pkl'``
    so later runs skip the expensive padding), converts features and labels
    to ``torch.float32`` tensors, and scrubs spurious ``-inf`` values.

    Side effects: may set ``self.emotion_dic``; sets ``self.speaker_num``,
    ``self.embedding_enabled`` (False) and ``self.sentiment_dic`` (None);
    calls ``self.get_max_seq_len`` on the concatenated texts.

    Returns:
        tuple: ``(X_train, X_test, X_dev, y_train, y_test, y_dev)`` where
        each ``X_*`` is a list of float32 tensors (one per modality) and
        each ``y_*`` is a float32 label tensor.
    """
    # Context manager closes the handle promptly (the original left every
    # pickle file handle open — a resource leak, and unflushed on writes).
    # NOTE(review): pickle.load on untrusted files is unsafe; these paths
    # are assumed to be trusted local caches.
    with open(self.data_path, 'rb') as f:
        data = pickle.load(f)

    X_train_path = self.fp_prefix + '_train.pkl'
    X_test_path = self.fp_prefix + '_test.pkl'
    X_dev_path = self.fp_prefix + '_valid.pkl'

    if 'emotion_dic' in data and self.label == 'emotion':
        self.emotion_dic = data['emotion_dic']

    self.get_max_seq_len(data['train']['text'] + data['test']['text'] +
                         data['valid']['text'])
    self.speaker_num = data['speaker_num']

    def _split(key, name, cache_path):
        # Build one (features, labels) split and cache it; reuse otherwise.
        # `name` is the user-facing label ("dev" for the 'valid' key).
        if not os.path.exists(cache_path):
            print("Creating new {} data!".format(name))
            X, y = self.pad_dialogue(data[key])
            if self.dialogue_context:
                X, y = self.extract_context(X, y)
            with open(cache_path, 'wb') as f:
                pickle.dump([*X, y], f)
        else:
            print(" - Found cached {} data".format(name))
            with open(cache_path, 'rb') as f:
                cached = pickle.load(f)
            X, y = cached[:-1], cached[-1]
        return X, y

    X_train, y_train = _split('train', 'train', X_train_path)
    X_test, y_test = _split('test', 'test', X_test_path)
    X_dev, y_dev = _split('valid', 'dev', X_dev_path)

    X_train = [torch.tensor(x, dtype=torch.float32) for x in X_train]
    X_test = [torch.tensor(x, dtype=torch.float32) for x in X_test]
    X_dev = [torch.tensor(x, dtype=torch.float32) for x in X_dev]

    self.embedding_enabled = False
    self.sentiment_dic = None

    # Remove spurious values (-inf)
    for x in X_train:
        clean_tensor(x)
    for x in X_test:
        clean_tensor(x)
    for x in X_dev:
        clean_tensor(x)

    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    y_dev = torch.tensor(y_dev, dtype=torch.float32)

    # Collapse a trailing singleton label dimension, if present.
    if y_train.dim() == 3:
        y_train = y_train.squeeze(dim=-1)
        y_test = y_test.squeeze(dim=-1)
        y_dev = y_dev.squeeze(dim=-1)

    return X_train, X_test, X_dev, y_train, y_test, y_dev
def load_pickle_data(self):
    """Load the pickled multimodal dataset with cached word embeddings.

    Reads the raw pickle at ``self.data_path``, builds or reuses a cached
    ``Embedding`` (vocabulary from all lowercased tokens plus 'UNK'),
    builds or reuses cached train/test/valid splits, then converts each
    split to tensors: modality 0 (token-id sequences) as ``int64``,
    vision/audio as ``float32``. Spurious ``-inf`` values are scrubbed.

    Side effects: may set ``self.dictionary``; sets ``self.embedding``;
    calls ``self.get_max_seq_len`` on the concatenated texts.

    Returns:
        tuple: ``(X_train, X_test, X_dev, y_train, y_test, y_dev)`` where
        each ``X_*`` is ``[text_ids, vision, audio]`` tensors and each
        ``y_*`` is a float32 label tensor.
    """
    # Context manager closes the handle promptly (the original left every
    # pickle file handle open — a resource leak, and unflushed on writes).
    # NOTE(review): pickle.load on untrusted files is unsafe; these paths
    # are assumed to be trusted local caches.
    with open(self.data_path, 'rb') as f:
        data = pickle.load(f)

    X_train_path = self.fp_prefix + '_train.pkl'
    X_test_path = self.fp_prefix + '_test.pkl'
    X_dev_path = self.fp_prefix + '_valid.pkl'

    self.get_max_seq_len(data['train']['text'] + data['test']['text'] +
                         data['valid']['text'])

    # Load embedding (built once from the full vocabulary, then cached).
    embedding_path = self.fp_prefix + '_embedding.pkl'
    if not os.path.exists(embedding_path):
        print("Creating new embeddings!")
        self.dictionary = Dictionary(start_feature_id=0)
        self.dictionary.add('UNK')
        textual_features = (data['train']['text'] + data['test']['text'] +
                            data['valid']['text'])
        for tokens in textual_features:
            for token in tokens:
                self.dictionary.add(str(token.lower()))
        self.embedding = Embedding(self.dictionary, self.max_seq_len)
        self.embedding.get_embedding(dataset_name=self.dataset_name,
                                     fname=self.wordvec_path)
        with open(embedding_path, 'wb') as f:
            pickle.dump(self.embedding, f)
    else:
        print(" - Found cached embeddings")
        with open(embedding_path, 'rb') as f:
            self.embedding = pickle.load(f)

    def _split(key, cache_path):
        # Build one ([text_ids, vision, audio], labels) split and cache it;
        # reuse the cache on later runs.
        if not os.path.exists(cache_path):
            print("Creating new {} data!".format(key))
            X = [[self.embedding.text_to_sequence(seq)
                  for seq in data[key]['text']],
                 data[key]['vision'], data[key]['audio']]
            y = data[key]['labels']
            with open(cache_path, 'wb') as f:
                pickle.dump([*X, y], f)
        else:
            print(" - Found cached {} data".format(key))
            with open(cache_path, 'rb') as f:
                cached = pickle.load(f)
            X, y = cached[:-1], cached[-1]
        return X, y

    X_train, y_train = _split('train', X_train_path)
    X_test, y_test = _split('test', X_test_path)
    X_dev, y_dev = _split('valid', X_dev_path)

    # Convert data to tensor format
    def _to_tensors(X):
        # Modality 0 holds token ids (int64); vision/audio are float32.
        return [torch.tensor(x, dtype=torch.int64) if i == 0 else
                torch.tensor(x, dtype=torch.float32)
                for i, x in enumerate(X)]

    X_train = _to_tensors(X_train)
    X_test = _to_tensors(X_test)
    X_dev = _to_tensors(X_dev)

    # Remove spurious values (-inf)
    for x in X_train:
        clean_tensor(x)
    for x in X_test:
        clean_tensor(x)
    for x in X_dev:
        clean_tensor(x)

    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    y_dev = torch.tensor(y_dev, dtype=torch.float32)

    # Collapse a trailing singleton label dimension, if present.
    if y_train.dim() == 3:
        y_train = y_train.squeeze(dim=-1)
        y_test = y_test.squeeze(dim=-1)
        y_dev = y_dev.squeeze(dim=-1)

    return X_train, X_test, X_dev, y_train, y_test, y_dev