import torch

from torchnlp.datasets import snli_dataset       # assumed import; the script calls snli_dataset() below
from torchnlp.text_encoders import WhitespaceEncoder, IdentityEncoder
from torchnlp.utils import datasets_iterator     # assumed import; the script iterates with datasets_iterator()
from torchnlp import word_to_vector

from model import SNLIClassifier
from util import get_args, makedirs, collate_fn

args = get_args()
if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)

# Load dataset
train, dev, test = snli_dataset(train=True, dev=True, test=True)

# Preprocess
for row in datasets_iterator(train, dev, test):
    row['premise'] = row['premise'].lower()
    row['hypothesis'] = row['hypothesis'].lower()

# Make encoders
sentence_corpus = [row['premise'] for row in datasets_iterator(train, dev, test)]
sentence_corpus += [row['hypothesis'] for row in datasets_iterator(train, dev, test)]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in datasets_iterator(train, dev, test)]
label_encoder = IdentityEncoder(label_corpus)

# Encode
for row in datasets_iterator(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
             append_sos=False, append_eos=False, clean_txt=False, max_seq_len_prior=None):
    super().__init__(root)

    self.n_classes = 2  # 0: normal, 1: outlier
    classes = ['earn', 'acq', 'crude', 'trade', 'money-fx', 'interest', 'ship']
    # classes_full_list = [
    #     'acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee',
    #     'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk',
    #     'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing',
    #     'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei',
    #     'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel',
    #     'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum',
    #     'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye',
    #     'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal',
    #     'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc'
    # ]

    self.normal_classes = [classes[normal_class]]
    del classes[normal_class]
    self.outlier_classes = classes

    # Load the Reuters dataset
    self.train_set, self.test_set = reuters_dataset(directory=root, train=True, test=True, clean_txt=clean_txt)

    # Pre-process
    self.train_set.columns.add('index')
    self.test_set.columns.add('index')
    self.train_set.columns.add('weight')
    self.test_set.columns.add('weight')

    train_idx_normal = []  # for subsetting train_set to normal class
    for i, row in enumerate(self.train_set):
        if any(label in self.normal_classes for label in row['label']) and (len(row['label']) == 1):
            train_idx_normal.append(i)
            row['label'] = torch.tensor(0)
        else:
            row['label'] = torch.tensor(1)
        row['text'] = row['text'].lower()

    test_idx = []    # for subsetting test_set to selected normal and anomalous classes
    test_n_idx = []  # for subsetting test_set to selected normal classes
    test_a_idx = []  # for subsetting test_set to selected anomalous classes
    for i, row in enumerate(self.test_set):
        if any(label in self.normal_classes for label in row['label']) and (len(row['label']) == 1):
            test_idx.append(i)
            test_n_idx.append(i)
            row['label'] = torch.tensor(0)
        elif any(label in self.outlier_classes for label in row['label']) and (len(row['label']) == 1):
            test_idx.append(i)
            test_a_idx.append(i)
            row['label'] = torch.tensor(1)
        else:
            row['label'] = torch.tensor(1)
        row['text'] = row['text'].lower()

    # Subset train_set to normal class
    self.train_set = Subset(self.train_set, train_idx_normal)
    # Subset test_set to selected normal classes and to selected anomalous classes.
    # These subsets are built from the full test set first, since the collected indices refer to it.
    self.test_n_set = Subset(self.test_set, test_n_idx)
    self.test_a_set = Subset(self.test_set, test_a_idx)
    # Subset test_set to selected normal and anomalous classes
    self.test_set = Subset(self.test_set, test_idx)

    # Make corpus and set encoder
    text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
    if tokenizer == 'spacy':
        self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
    if tokenizer == 'bert':
        self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

    # Encode
    self.max_seq_len = 0
    for row in datasets_iterator(self.train_set, self.test_set):
        if append_sos:
            sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
            row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
        else:
            row['text'] = self.encoder.encode(row['text'])
        if len(row['text']) > self.max_seq_len:
            self.max_seq_len = len(row['text'])

    # Compute tf-idf weights
    if use_tfidf_weights:
        compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
    else:
        for row in datasets_iterator(self.train_set, self.test_set):
            row['weight'] = torch.empty(0)

    # Get indices after pre-processing
    for i, row in enumerate(self.train_set):
        row['index'] = i
    for i, row in enumerate(self.test_set):
        row['index'] = i

    # Length prior: log-probability of each sentence length, estimated from the training set
    sent_lengths = [len(row['text']) for row in self.train_set]
    sent_lengths_freq = np.bincount(np.array(sent_lengths))
    sent_lengths_freq = np.concatenate(
        (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
    sent_lengths_freq = sent_lengths_freq + 1
    self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
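# The constructor above ends with a "length prior" over sentence lengths. A minimal self-contained
# sketch of that computation (bincount over training lengths, zero-padding up to max_seq_len_prior,
# add-one smoothing, log-normalisation); the numbers below are made up for illustration only.
import numpy as np

sent_lengths = [3, 5, 5, 7]    # pretend token counts of the training sentences
max_seq_len_prior = 10

freq = np.bincount(np.array(sent_lengths))                                 # counts for lengths 0..7
freq = np.concatenate((freq, np.zeros(max_seq_len_prior - max(sent_lengths), dtype=freq.dtype)))
freq = freq + 1                                                            # add-one smoothing
length_prior = np.log(freq) - np.log(freq.sum())                           # log P(length)
assert np.isclose(np.exp(length_prior).sum(), 1.0)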
def load_data(data_type, preprocessing=False, fine_grained=False, verbose=False, text_length=5000,
              encode=True, load_SLE=False):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing, verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                                verbose=verbose, text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                             verbose=verbose, text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                               verbose=verbose, text_length=text_length)
    elif data_type == 'custom':
        test_data = custom_dataset(preprocessing=preprocessing, fine_grained=fine_grained, verbose=verbose,
                                   text_length=text_length)
        # Encode the custom test set with the previously saved sentence / label encoders
        with open('epochs/sentence_encoder', 'rb') as f:
            sentence_encoder = pickle.load(f)
        with open('epochs/label_encoder', 'rb') as f:
            label_encoder = pickle.load(f)
        for row in datasets_iterator(test_data):
            row['text'] = sentence_encoder.encode(' '.join(row['text']))
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, test_data
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        if load_SLE:
            # Load previously saved sentence / label encoders
            with open('epochs/sentence_encoder', 'rb') as f:
                sentence_encoder = pickle.load(f)
            with open('epochs/label_encoder', 'rb') as f:
                label_encoder = pickle.load(f)
        else:
            # Build encoders from the training corpus and persist them
            sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
            label_corpus = [row['label'] for row in datasets_iterator(train_data)]
            sentence_encoder = WhitespaceEncoder(sentence_corpus, reserved_tokens=[PADDING_TOKEN, UNKNOWN_TOKEN])
            label_encoder = IdentityEncoder(label_corpus, reserved_tokens=[])
            with open('epochs/sentence_encoder', 'wb') as f:
                pickle.dump(sentence_encoder, f)
            with open('epochs/label_encoder', 'wb') as f:
                pickle.dump(label_encoder, f)

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder.vocab_size, label_encoder.vocab_size, train_data, test_data
    else:
        return train_data, test_data
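# Hypothetical usage of load_data() above (dataset name and flags are illustrative). The standard
# branches return the sentence and label vocabulary sizes plus the encoded train/test splits; the
# 'custom' branch returns only an encoded test set, and encode=False returns the raw splits.
num_tokens, num_labels, train_data, test_data = load_data('agnews', preprocessing=True, encode=True)
print(num_tokens, num_labels, len(train_data), len(test_data))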
def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
             append_sos=False, append_eos=False, clean_txt=False, max_seq_len_prior=None):
    super().__init__(root)

    self.n_classes = 2  # 0: normal, 1: outlier
    classes = ['pos', 'neg']

    if normal_class == -1:
        self.normal_classes = classes
        self.outlier_classes = []
    else:
        self.normal_classes = [classes[normal_class]]
        del classes[normal_class]
        self.outlier_classes = classes

    if root not in nltk.data.path:
        nltk.data.path.append(root)

    # Load the IMDB dataset
    self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

    # Pre-process
    self.train_set.columns.add('index')
    self.test_set.columns.add('index')
    self.train_set.columns.remove('sentiment')
    self.test_set.columns.remove('sentiment')
    self.train_set.columns.add('label')
    self.test_set.columns.add('label')
    self.train_set.columns.add('weight')
    self.test_set.columns.add('weight')

    train_idx_normal = []  # for subsetting train_set to normal class
    for i, row in enumerate(self.train_set):
        row['label'] = row.pop('sentiment')
        if row['label'] in self.normal_classes:
            train_idx_normal.append(i)
            row['label'] = torch.tensor(0)
        else:
            row['label'] = torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    test_n_idx = []  # for subsetting test_set to selected normal classes
    test_a_idx = []  # for subsetting test_set to selected anomalous classes
    for i, row in enumerate(self.test_set):
        row['label'] = row.pop('sentiment')
        if row['label'] in self.normal_classes:
            test_n_idx.append(i)
        else:
            test_a_idx.append(i)
        row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    # Subset train_set to normal class
    self.train_set = Subset(self.train_set, train_idx_normal)
    # Subset test_set to selected normal classes
    self.test_n_set = Subset(self.test_set, test_n_idx)
    # Subset test_set to selected anomalous classes
    self.test_a_set = Subset(self.test_set, test_a_idx)

    # Make corpus and set encoder
    text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
    if tokenizer == 'spacy':
        self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
    if tokenizer == 'bert':
        self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

    # Encode
    self.max_seq_len = 0
    for row in datasets_iterator(self.train_set, self.test_set):
        if append_sos:
            sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
            row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
        else:
            row['text'] = self.encoder.encode(row['text'])
        if len(row['text']) > self.max_seq_len:
            self.max_seq_len = len(row['text'])

    # Compute tf-idf weights
    if use_tfidf_weights:
        compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
    else:
        for row in datasets_iterator(self.train_set, self.test_set):
            row['weight'] = torch.empty(0)

    # Get indices after pre-processing
    for i, row in enumerate(self.train_set):
        row['index'] = i
    for i, row in enumerate(self.test_set):
        row['index'] = i

    # Length prior: log-probability of each sentence length, estimated from the training set
    sent_lengths = [len(row['text']) for row in self.train_set]
    sent_lengths_freq = np.bincount(np.array(sent_lengths))
    sent_lengths_freq = np.concatenate(
        (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
    sent_lengths_freq = sent_lengths_freq + 1
    self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
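# The dataset built above stores each row's 'text' as a variable-length tensor of token ids, so a
# DataLoader over it needs a padding collate function. A minimal illustrative sketch (not the
# project's own collate_fn), assuming padding index 0:
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    texts = [row['text'] for row in batch]
    lengths = torch.tensor([len(t) for t in texts])
    labels = torch.stack([row['label'] for row in batch])
    padded = pad_sequence(texts, batch_first=True, padding_value=0)  # (batch, max_len)
    return padded, lengths, labels

# usage sketch: loader = torch.utils.data.DataLoader(dataset.train_set, batch_size=64, collate_fn=pad_collate)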
def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
             append_sos=False, append_eos=False, clean_txt=False):
    super().__init__(root)

    self.n_classes = 2  # 0: normal, 1: outlier
    classes = ['pos', 'neg']

    if normal_class == -1:
        self.normal_classes = classes
        self.outlier_classes = []
    else:
        self.normal_classes = [classes[normal_class]]
        del classes[normal_class]
        self.outlier_classes = classes

    # Load the IMDB dataset
    self.train_set, self.test_set = imdb_dataset(directory=root, train=True, test=True)

    # Pre-process
    self.train_set.columns.add('index')
    self.test_set.columns.add('index')
    self.train_set.columns.remove('sentiment')
    self.test_set.columns.remove('sentiment')
    self.train_set.columns.add('label')
    self.test_set.columns.add('label')
    self.train_set.columns.add('weight')
    self.test_set.columns.add('weight')

    train_idx_normal = []  # for subsetting train_set to normal class
    for i, row in enumerate(self.train_set):
        row['label'] = row.pop('sentiment')
        if row['label'] in self.normal_classes:
            train_idx_normal.append(i)
            row['label'] = torch.tensor(0)
        else:
            row['label'] = torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    for i, row in enumerate(self.test_set):
        row['label'] = row.pop('sentiment')
        row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
        if clean_txt:
            row['text'] = clean_text(row['text'].lower())
        else:
            row['text'] = row['text'].lower()

    # Subset train_set to normal class
    self.train_set = Subset(self.train_set, train_idx_normal)

    # Make corpus and set encoder
    text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
    if tokenizer == 'spacy':
        self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
    if tokenizer == 'bert':
        self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

    # Encode
    for row in datasets_iterator(self.train_set, self.test_set):
        if append_sos:
            sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
            row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
        else:
            row['text'] = self.encoder.encode(row['text'])

    # Compute tf-idf weights
    if use_tfidf_weights:
        compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
    else:
        for row in datasets_iterator(self.train_set, self.test_set):
            row['weight'] = torch.empty(0)

    # Get indices after pre-processing
    for i, row in enumerate(self.train_set):
        row['index'] = i
    for i, row in enumerate(self.test_set):
        row['index'] = i
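# compute_tfidf_weights() used above is a project helper that is not shown here. A rough sketch of
# one way such per-token tf-idf weights could be computed (the function name and approach are
# illustrative, not the project's actual implementation), assuming scikit-learn and scipy:
import torch
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_weights_sketch(rows, vocab_size):
    """Give every token position in row['text'] the tf-idf weight of that token in its document."""
    # Build a (num_docs x vocab_size) count matrix; each row['text'] is a 1-D LongTensor of token ids.
    indptr, indices, data = [0], [], []
    for row in rows:
        ids, counts = torch.unique(row['text'], return_counts=True)
        indices.extend(ids.tolist())
        data.extend(counts.tolist())
        indptr.append(len(indices))
    doc_term = csr_matrix((data, indices, indptr), shape=(len(rows), vocab_size))
    tfidf = TfidfTransformer().fit_transform(doc_term)
    for i, row in enumerate(rows):
        weights = tfidf[i, row['text'].tolist()].toarray().squeeze(0)
        row['weight'] = torch.tensor(weights, dtype=torch.float)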
def __init__(self, root: str, normal_class=0, tokenizer='spacy', use_tfidf_weights=False,
             append_sos=False, append_eos=False, clean_txt=False, max_seq_len_prior=None):
    super().__init__(root)

    self.n_classes = 2  # 0: normal, 1: outlier
    classes = list(range(6))

    groups = [
        ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
         'comp.windows.x'],
        ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'],
        ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
        ['misc.forsale'],
        ['talk.politics.misc', 'talk.politics.guns', 'talk.politics.mideast'],
        ['talk.religion.misc', 'alt.atheism', 'soc.religion.christian'],
    ]
    short_group_names = ['comp', 'rec', 'sci', 'misc', 'pol', 'rel']
    self.subset = short_group_names[normal_class]

    self.normal_classes = groups[normal_class]
    self.outlier_classes = []
    del classes[normal_class]
    for i in classes:
        self.outlier_classes += groups[i]

    # Load the 20 Newsgroups dataset
    self.train_set, self.test_set = newsgroups20_dataset(directory=root, train=True, test=True,
                                                         clean_txt=clean_txt, groups=groups,
                                                         short_group_names=short_group_names)

    # Pre-process
    self.train_set.columns.add('index')
    self.test_set.columns.add('index')
    self.train_set.columns.add('weight')
    self.test_set.columns.add('weight')

    train_idx_normal = []  # for subsetting train_set to normal class
    for i, row in enumerate(self.train_set):
        if row['label'] in self.normal_classes:
            train_idx_normal.append(i)
            row['label'] = torch.tensor(0)
        else:
            row['label'] = torch.tensor(1)
        row['text'] = row['text'].lower()

    test_n_idx = []  # for subsetting test_set to selected normal classes
    test_a_idx = []  # for subsetting test_set to selected anomalous classes
    for i, row in enumerate(self.test_set):
        if row['label'] in self.normal_classes:
            test_n_idx.append(i)
        else:
            test_a_idx.append(i)
        row['label'] = torch.tensor(0) if row['label'] in self.normal_classes else torch.tensor(1)
        row['text'] = row['text'].lower()

    # Subset train_set to normal class
    self.train_set = Subset(self.train_set, train_idx_normal)
    # Subset test_set to selected normal classes
    self.test_n_set = Subset(self.test_set, test_n_idx)
    # Subset test_set to selected anomalous classes
    self.test_a_set = Subset(self.test_set, test_a_idx)

    # Make corpus and set encoder
    text_corpus = [row['text'] for row in datasets_iterator(self.train_set, self.test_set)]
    if tokenizer == 'spacy':
        self.encoder = SpacyEncoder(text_corpus, min_occurrences=3, append_eos=append_eos)
    if tokenizer == 'bert':
        self.encoder = MyBertTokenizer.from_pretrained('bert-base-uncased', cache_dir=root)

    # Encode
    self.max_seq_len = 0
    for row in datasets_iterator(self.train_set, self.test_set):
        if append_sos:
            sos_id = self.encoder.stoi[DEFAULT_SOS_TOKEN]
            row['text'] = torch.cat((torch.tensor(sos_id).unsqueeze(0), self.encoder.encode(row['text'])))
        else:
            row['text'] = self.encoder.encode(row['text'])
        if len(row['text']) > self.max_seq_len:
            self.max_seq_len = len(row['text'])

    # Compute tf-idf weights
    if use_tfidf_weights:
        compute_tfidf_weights(self.train_set, self.test_set, vocab_size=self.encoder.vocab_size)
    else:
        for row in datasets_iterator(self.train_set, self.test_set):
            row['weight'] = torch.empty(0)

    # Get indices after pre-processing
    for i, row in enumerate(self.train_set):
        row['index'] = i
    for i, row in enumerate(self.test_set):
        row['index'] = i

    # Length prior: log-probability of each sentence length, estimated from the training set
    sent_lengths = [len(row['text']) for row in self.train_set]
    sent_lengths_freq = np.bincount(np.array(sent_lengths))
    sent_lengths_freq = np.concatenate(
        (sent_lengths_freq, np.array((max_seq_len_prior - max(sent_lengths)) * [0])), axis=0)
    sent_lengths_freq = sent_lengths_freq + 1
    self.length_prior = np.log(sent_lengths_freq) - np.log(sent_lengths_freq.sum())
def load_data(data_type, preprocessing=False, fine_grained=False, verbose=False, text_length=5000, encode=True):
    if data_type == 'imdb':
        train_data, test_data = imdb_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'newsgroups':
        train_data, test_data = newsgroups_dataset(preprocessing=preprocessing, verbose=verbose,
                                                   text_length=text_length)
    elif data_type == 'reuters':
        train_data, test_data = reuters_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                                verbose=verbose, text_length=text_length)
    elif data_type == 'webkb':
        train_data, test_data = webkb_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'cade':
        train_data, test_data = cade_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'dbpedia':
        train_data, test_data = dbpedia_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'agnews':
        train_data, test_data = agnews_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'yahoo':
        train_data, test_data = yahoo_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'sogou':
        train_data, test_data = sogou_dataset(preprocessing=preprocessing, verbose=verbose, text_length=text_length)
    elif data_type == 'yelp':
        train_data, test_data = yelp_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                             verbose=verbose, text_length=text_length)
    elif data_type == 'amazon':
        train_data, test_data = amazon_dataset(preprocessing=preprocessing, fine_grained=fine_grained,
                                               verbose=verbose, text_length=text_length)
    else:
        raise ValueError('{} data type not supported.'.format(data_type))

    if encode:
        # Build encoders from the training corpus
        sentence_corpus = [row['text'] for row in datasets_iterator(train_data)]
        sentence_encoder = WhitespaceEncoder(sentence_corpus,
                                             reserved_tokens=[DEFAULT_PADDING_TOKEN, DEFAULT_UNKNOWN_TOKEN])
        label_corpus = [row['label'] for row in datasets_iterator(train_data)]
        label_encoder = LabelEncoder(label_corpus, reserved_labels=[])

        # Encode
        for row in datasets_iterator(train_data, test_data):
            row['text'] = sentence_encoder.encode(row['text'])
            row['label'] = label_encoder.encode(row['label'])
        return sentence_encoder, label_encoder, train_data, test_data
    else:
        return train_data, test_data
def main():
    ROOT_DIR = os.path.join(str(Path.home()), '.torchtext')

    # Define parameters and hyperparameters
    args = {
        'data_dir': ROOT_DIR,
        'use_cuda': True,
        'test_batch_size': 128,
        'dev_size': 0.1,
        'checkpoint': True,
        'early_stopping': False,
        'epochs': 5,
        'd_embedding': 300,
        'word_vectors': 'glove.840B.300d',
        'word_vectors_freeze': True,
        'vector_cache_dir': os.path.join(ROOT_DIR, 'vector_cache'),
        'momentum': .9,
        'seed': 42,
        'visdom_env': 'main',
    }
    args = Args(**args)

    vis = visdom.Visdom()
    if not vis.check_connection():
        raise RuntimeError("Visdom server not running. Please run python -m visdom.server")

    torch.manual_seed(args.seed)
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    # Load dataset splits
    train, test = trec_dataset(train=True, test=True, directory=args.data_dir)

    # Create encoders (TODO: best way to persist those?)
    text_corpus = [row['text'] for row in datasets_iterator(train, test)]
    text_encoder = WhitespaceEncoder(text_corpus)
    label_corpus = [row['label'] for row in datasets_iterator(train, test)]
    label_encoder = LabelEncoder(label_corpus)

    # Encode dataset splits
    for row in datasets_iterator(train, test):
        row['text'] = text_encoder.encode(row['text'])
        row['label'] = label_encoder.encode(row['label'])

    # Create samplers for the train / dev split used in the dataloaders
    train_sampler, dev_sampler = train_test_split_sampler(train, test_size=args.dev_size, random_state=args.seed)

    def delete_checkpoint(path):
        checkpoint_files = list(path.glob('checkpoint_model*.pth'))
        if checkpoint_files:
            os.remove(checkpoint_files[0])

    visdom_logger = VisdomRunSummaryLogger(env=args.visdom_env, clear_batch_summary=True)

    # TODO: abstract this part
    run_config = {'run': 0}

    # Train function
    def train_f(config):
        run_name = 'run_%d' % run_config['run']
        run_config['run'] = run_config['run'] + 1
        visdom_logger.new_run(run_name)

        model_path = Path('/tmp/models/')
        delete_checkpoint(model_path)

        train_batch_sampler = FlexibleBucketBatchSampler(train, config.batch_size, sampler=train_sampler,
                                                         drop_last=True, sort_key=lambda r: len(r['text']))
        train_loader = DataLoader(train, batch_sampler=train_batch_sampler, collate_fn=collate_fn,
                                  pin_memory=config.use_cuda, num_workers=0)

        dev_batch_sampler = FlexibleBucketBatchSampler(train, config.test_batch_size, drop_last=True,
                                                       sampler=dev_sampler, sort_key=lambda r: len(r['text']))
        dev_loader = DataLoader(train, batch_sampler=dev_batch_sampler, collate_fn=collate_fn,
                                pin_memory=config.use_cuda, num_workers=0)

        test_sampler = BucketBatchSampler(test, config.test_batch_size, drop_last=True,
                                          sort_key=lambda r: len(r['text']))
        test_loader = DataLoader(test, batch_sampler=test_sampler, collate_fn=collate_fn,
                                 pin_memory=config.use_cuda, num_workers=0)

        embedding = nn.Embedding(text_encoder.vocab_size, config.d_embedding)
        if config.word_vectors_freeze:
            embedding.weight.requires_grad = False
        if config.word_vectors:
            # Load word vectors
            word_vectors = word_to_vector.aliases[config.word_vectors](cache=config.vector_cache_dir)
            for i, token in enumerate(text_encoder.vocab):
                embedding.weight.data[i] = word_vectors[token]
            print('Found vectors for %d tokens in vocabulary'
                  % len([t for t in text_encoder.vocab if t in word_vectors.stoi]))

        model = LSTMClassifier(d_in=embedding.embedding_dim, d_out=label_encoder.vocab_size,
                               d_hidden=config.d_hidden, dropout=config.dropout, embedding=embedding)
        model.to(device)

        optimizer_params = list(filter(lambda p: p.requires_grad, model.parameters()))
        optimizer = torch.optim.SGD(optimizer_params, lr=config.lr, momentum=config.momentum)

        trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
        evaluator_train = create_supervised_evaluator(
            model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device)
        evaluator_dev = create_supervised_evaluator(
            model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device)

        visdom_logger.attach_trainer(trainer)
        visdom_logger.attach_evaluator(evaluator_train, trainer, phase='train')
        visdom_logger.attach_evaluator(evaluator_dev, trainer, phase='dev')

        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch_: 1. / (1 + config.lr_decay * (epoch_ - 1)))

        # Scoring functions for early stopping and checkpointing
        def score_function(engine):
            dev_loss = engine.state.metrics['nll']
            return -dev_loss

        early_stopping = EarlyStopping(patience=15, score_function=score_function, trainer=trainer)

        def checkpoint_score_function(engine):
            dev_accuracy = engine.state.metrics['accuracy']
            return dev_accuracy

        checkpoint = ModelCheckpoint('/tmp/models', 'checkpoint', score_function=checkpoint_score_function,
                                     n_saved=1, create_dir=True, score_name="dev_accuracy")

        # Let's train!
        train_model(model=model, trainer=trainer, epochs=config.epochs,
                    evaluator_train=evaluator_train, evaluator_dev=evaluator_dev,
                    train_loader=train_loader, dev_loader=dev_loader, lr_scheduler=lr_scheduler,
                    early_stopping=early_stopping if config.early_stopping else None,
                    checkpoint=checkpoint if config.checkpoint else None)

        # Load checkpointed (best) model and evaluate on the test loader
        model = torch.load(list(model_path.glob('checkpoint_model*.pth'))[0])
        test_evaluator = create_supervised_evaluator(
            model, metrics={'accuracy': CategoricalAccuracy(), 'nll': Loss(F.nll_loss)}, device=device)
        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        print("Test Results: Avg accuracy: {:.2f} Avg loss: {:.2f}".format(metrics['accuracy'], metrics['nll']))

        # Return the dev loss as the objective for hyperparameter tuning
        test_evaluator.run(dev_loader)
        metrics = test_evaluator.state.metrics
        return metrics['nll']

    # Hyperparameter tuning!
    hp_opt = HPOptimizer(args=args, strategy='gp', space=[
        Real(0.1, 0.5, name='dropout'),
        Categorical([50, 100, 150, 200], name='d_hidden'),
        Real(1e-4, 1, prior='log-uniform', name='lr'),
        Real(1e-3, 1, prior='log-uniform', name='lr_decay'),
        Categorical([4, 8, 16, 32, 64, 128], name='batch_size')
    ])
    hp_opt.add_callback(visdom_logger.run_summary)
    result = hp_opt.minimize(train_f, n_calls=10)
    print(result)
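# HPOptimizer above is project-specific. A minimal sketch of the scikit-optimize loop it presumably
# wraps when strategy='gp' (the objective below is a stand-in for train_f; assumes scikit-optimize
# is installed):
from skopt import gp_minimize
from skopt.space import Real, Categorical
from skopt.utils import use_named_args

space = [
    Real(0.1, 0.5, name='dropout'),
    Categorical([50, 100, 150, 200], name='d_hidden'),
    Real(1e-4, 1, prior='log-uniform', name='lr'),
]

@use_named_args(space)
def objective(dropout, d_hidden, lr):
    # Stand-in for train_f(config): return the dev loss obtained with this configuration
    return (dropout - 0.3) ** 2 + lr / d_hidden

result = gp_minimize(objective, space, n_calls=10, random_state=42)
print(result.x, result.fun)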