        train='multi_label_train.tsv', validation='multi_label_valid.tsv',
        test='multi_label_test.tsv', format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])
elif args.dataset == 'multi_no_des':
    train, val, test = data.TabularDataset.splits(
        path='../../data/csu/',
        train='multi_label_no_des_train.tsv',
        validation='multi_label_no_des_valid.tsv',
        test='multi_label_no_des_test.tsv', format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])

if args.emb_dim == 100:
    TEXT.build_vocab(train, vectors="glove.6B.100d")
elif args.emb_dim == 200:
    TEXT.build_vocab(train, vectors="glove.6B.200d")
elif args.emb_dim == 300:
    TEXT.build_vocab(train, vectors="glove.6B.300d")
else:
    TEXT.build_vocab(train, vectors="glove.6B.100d")

# do repeat=False
train_iter, val_iter, test_iter = data.Iterator.splits(
    (train, val, test),
    sort_key=lambda x: len(x.Text),  # no global sort, but within-batch-sort
    batch_sizes=(32, 256, 256), device=args.gpu,
    sort_within_batch=True,
        train='snomed_multi_label_no_des_train.tsv',
        validation='snomed_multi_label_no_des_valid.tsv',
        test='snomed_multi_label_no_des_test.tsv', format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])
elif args.dataset == 'multi_top_snomed_adjusted_no_des':
    train, val, test = data.TabularDataset.splits(
        path='../../data/csu/',
        train='snomed_adjusted_multi_label_no_des_train.tsv',
        validation='snomed_adjusted_multi_label_no_des_valid.tsv',
        test='snomed_adjusted_multi_label_no_des_test.tsv', format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])

# TODO: this is the first point of improvement: load a clinical embedding here instead
TEXT.build_vocab(train, vectors="glove.6B.{}d".format(args.emb_dim))

train_iter, val_iter, test_iter = data.Iterator.splits(
    (train, val, test),
    sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
    batch_sizes=(32, 128, 128), device=args.gpu,
    sort_within_batch=True,
    repeat=False)  # stop infinite runs

# without sort=False, the valid and test examples would be globally sorted as well
adobe_test_iter = data.Iterator(adobe_test, 128, sort_key=lambda x: len(x.Text),
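
# ----------------------------------------------------------------------------
# Sketch for the "clinical embedding" TODO above. This is an illustration, not code from the
# original script: legacy torchtext can load any word2vec/GloVe-format text file through
# torchtext.vocab.Vectors, so only the `vectors` argument of TEXT.build_vocab changes.
# The file name 'clinical_embeddings.txt' is a hypothetical placeholder.
# ----------------------------------------------------------------------------
from torchtext.vocab import Vectors

clinical_vectors = Vectors(name='clinical_embeddings.txt', cache='.vector_cache')
TEXT.build_vocab(train, vectors=clinical_vectors)  # would replace the GloVe-based call above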
# (imports assumed by this module; ReversibleField ships with legacy torchtext,
#  MultiLabelField is the project's own multi-label field defined elsewhere)
import math
import torch
from os.path import join as pjoin
from torchtext import data
from torchtext.data import ReversibleField


class Dataset(object):
    def __init__(self, path='./data/',
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='',
                 full_meta_data_name='explanations_5panels.csv',
                 label_size=5, fix_length=None, meta_data=None):
        """
        :param meta_data: MetaData class instance. Will be used for vocab building.
        """
        # we will add the meta label here and build iterators
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        # it's actually this step that takes around 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.full_meta_data = data.TabularDataset(
            path=pjoin(path, full_meta_data_name), format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.meta_data = meta_data
        self.is_vocab_bulit = False
        self.iterators = []

        if test_data_name != '':
            self.external_test = data.TabularDataset(
                path=path + test_data_name, format='tsv',
                fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return the cached iterators

        # only build them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test),
            sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size), device=device,
            sort_within_batch=True, repeat=False)
        self.iterators = [train_iter, val_iter, test_iter]  # cache for subsequent calls

        return train_iter, val_iter, test_iter

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # calculate uniform bounds from standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # we can try randn or glorot
        # mode="unk"|"all"; "all" means initialize everything
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0
        fan_in, fan_out = emb_vectors.size()

        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # std = 0.5 is based on the norm of average GloVe word vectors
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1

        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}".format(
                running_norm / num_non_zero, num_non_zero, total_words))  # prints directly into Jupyter Notebook

    def build_vocab(self, config, silent=False):
        if config.emb_corpus == 'common_crawl':
            self.TEXT.build_vocab(self.train, self.full_meta_data, vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            # add all datasets
            self.TEXT.build_vocab(self.train, self.full_meta_data,
                                  vectors="glove.6B.{}d".format(config.emb_dim))
        self.is_vocab_bulit = True

        self.vocab = self.TEXT.vocab

        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

        # synchronize vocab by making them the same object
        self.meta_data.TEXT_FIELD.vocab = self.TEXT.vocab
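
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original repo): how this Dataset
# class is expected to be driven from a training entry point. The config fields mirror what
# build_vocab() reads; _MetaStub stands in for the repo's MetaData class (only a TEXT_FIELD
# attribute is needed here) and is a hypothetical placeholder.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    from argparse import Namespace

    class _MetaStub(object):
        TEXT_FIELD = data.Field()  # stand-in; the real MetaData object carries its own field

    config = Namespace(emb_corpus='common_crawl', emb_dim=100, rand_unk=True)
    dataset = Dataset(path='./data/', label_size=5, meta_data=_MetaStub())
    dataset.build_vocab(config)  # must be called before requesting iterators

    train_iter, val_iter, test_iter = dataset.get_iterators(device=torch.device('cpu'))
    for batch in train_iter:
        (text, lengths), labels = batch.Text, batch.Description
        # text: token-id tensor, lengths: per-example lengths (include_lengths=True);
        # labels: presumably a (batch, label_size) float tensor from MultiLabelField
        break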
# (same imports as the class above: math, torch, torchtext's data and ReversibleField,
#  plus the project's MultiLabelField)


class Dataset(object):
    def __init__(self, path='./data/', weak_train_dataset="", acmg_weak_data_path="",
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
                 multi_task_train_dataset="", label_size=5, fix_length=None):
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        if weak_train_dataset != "":
            self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                                  fields=[('Text', self.TEXT), ('Description', self.LABEL)])
            if acmg_weak_data_path != "":
                acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
                # extending the examples list should be enough to merge the two weak sets
                self.weak_train.examples.extend(acmg_weak_data.examples)
        else:
            self.weak_train = None

        if multi_task_train_dataset != "":
            self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                        fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.multi_task_train = None

        # it's actually this step that takes around 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        if test_data_name != '':
            self.external_test = data.TabularDataset(path=path + test_data_name, format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

        self.is_vocab_bulit = False
        self.iterators = []
        self.test_iterator = None
        self.weak_train_iterator = None
        self.multi_task_train_iterator = None

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # calculate uniform bounds from standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # we can try randn or glorot
        # mode="unk"|"all"; "all" means initialize everything
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0
        fan_in, fan_out = emb_vectors.size()  # e.g. 16870, 300
        # std = 0.01
        # a = 1.73 * 0.01
        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # std = 0.5 is based on the norm of average GloVe word vectors
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}".format(
                running_norm / num_non_zero, num_non_zero, total_words))  # prints directly into Jupyter Notebook

    def build_vocab(self, config, silent=False):
        datasets = [self.train]
        if self.weak_train is not None and args.weak_vocab:  # args: module-level argparse namespace
            datasets.append(self.weak_train)
        if self.multi_task_train is not None:
            datasets.append(self.multi_task_train)  # we always build vocab for multitask

        if config.emb_corpus == 'common_crawl':
            # self.TEXT.build_vocab(self.train, vectors="glove.840B.300d")
            self.TEXT.build_vocab(*datasets, vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            self.TEXT.build_vocab(*datasets, vectors="glove.6B.{}d".format(config.emb_dim))
        self.is_vocab_bulit = True

        self.vocab = self.TEXT.vocab

        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return the cached iterators

        # only build them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test),
            sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size), device=device,
            sort_within_batch=True, repeat=False)
        self.iterators = [train_iter, val_iter, test_iter]  # cache for subsequent calls

        return train_iter, val_iter, test_iter

    def get_test_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.test_iterator is not None:
            return self.test_iterator

        self.test_iterator = data.Iterator(self.external_test, 128, sort_key=lambda x: len(x.Text),
                                           device=device, train=False, repeat=False,
                                           sort_within_batch=True)
        return self.test_iterator

    def get_weak_train_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.weak_train_iterator is not None:
            return self.weak_train_iterator

        self.weak_train_iterator = data.Iterator(self.weak_train, 128, sort_key=lambda x: len(x.Text),
                                                 device=device, train=True, repeat=False,
                                                 sort_within_batch=True)
        return self.weak_train_iterator

    def get_multi_task_train_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.multi_task_train_iterator is not None:
            return self.multi_task_train_iterator

        self.multi_task_train_iterator = data.Iterator(self.multi_task_train, 128,
                                                       sort_key=lambda x: len(x.Text),
                                                       device=device, train=True, repeat=False,
                                                       sort_within_batch=True)
        return self.multi_task_train_iterator
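
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not from the original repo): pulling the supervised,
# weak, and multi-task iterators from the same Dataset. The file paths are hypothetical; `args`
# is defined here because build_vocab() reads a module-level args.weak_vocab flag.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    from argparse import Namespace

    args = Namespace(weak_vocab=True)  # include the weak data when building the vocabulary
    config = Namespace(emb_corpus='6B', emb_dim=100, rand_unk=True)

    dataset = Dataset(path='./data/',
                      weak_train_dataset='./data/weak_train.tsv',        # hypothetical path
                      multi_task_train_dataset='./data/multi_task.tsv')  # hypothetical path
    dataset.build_vocab(config)

    device = torch.device('cpu')
    train_iter, val_iter, test_iter = dataset.get_iterators(device)
    weak_iter = dataset.get_weak_train_iterator(device)
    multi_task_iter = dataset.get_multi_task_train_iterator(device)

    # a training loop would typically interleave the three sources (e.g. a supervised pass
    # followed by a down-weighted weak/multi-task pass); only the iteration pattern is shown
    for batch in train_iter:
        (text, lengths), labels = batch.Text, batch.Description
        break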