Example #1
            train='multi_label_train.tsv',
            validation='multi_label_valid.tsv',
            test='multi_label_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
    elif args.dataset == 'multi_no_des':
        train, val, test = data.TabularDataset.splits(
            path='../../data/csu/',
            train='multi_label_no_des_train.tsv',
            validation='multi_label_no_des_valid.tsv',
            test='multi_label_no_des_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])

    if args.emb_dim == 100:
        TEXT.build_vocab(train, vectors="glove.6B.100d")
    elif args.emb_dim == 200:
        TEXT.build_vocab(train, vectors="glove.6B.200d")
    elif args.emb_dim == 300:
        TEXT.build_vocab(train, vectors="glove.6B.300d")
    else:
        TEXT.build_vocab(train, vectors="glove.6B.100d")

    # do repeat=False
    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
        batch_sizes=(32, 256, 256),
        device=args.gpu,
        sort_within_batch=True,
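The viewer cuts this snippet off mid-call; judging from the near-identical block in Example #2, the call presumably closes with `repeat=False)`. The `TEXT` and `LABEL` field objects it relies on are also outside the excerpt. A minimal sketch of what they might look like, reusing the project's own field classes as they appear in Examples #3 and #4 (the exact arguments, and `label_size` in particular, are placeholders, not the original definitions):

import torch

# hypothetical stand-ins for the fields the snippet above assumes
TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False)
LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=42,  # label_size is a placeholder
                        tensor_type=torch.FloatTensor)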
Example #2
            train='snomed_multi_label_no_des_train.tsv',
            validation='snomed_multi_label_no_des_valid.tsv',
            test='snomed_multi_label_no_des_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
    elif args.dataset == 'multi_top_snomed_adjusted_no_des':
        train, val, test = data.TabularDataset.splits(
            path='../../data/csu/',
            train='snomed_adjusted_multi_label_no_des_train.tsv',
            validation='snomed_adjusted_multi_label_no_des_valid.tsv',
            test='snomed_adjusted_multi_label_no_des_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])

    # actually, this is the first point of improvement: load in clinical embedding instead!!!
    TEXT.build_vocab(train, vectors="glove.6B.{}d".format(args.emb_dim))

    # do repeat=False
    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
        batch_sizes=(32, 128, 128),
        device=args.gpu,
        sort_within_batch=True,
        repeat=False)  # stop infinite runs
    # note: without sort=False, the valid and test splits would be globally sorted as well

    adobe_test_iter = data.Iterator(adobe_test,
                                    128,
                                    sort_key=lambda x: len(x.Text),
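                                    # NOTE: the excerpt is cut off here; the remaining keyword
                                    # arguments below are an assumption, mirroring
                                    # get_test_iterator in Example #4 (an evaluation-only iterator)
                                    device=args.gpu,
                                    train=False,
                                    repeat=False,
                                    sort_within_batch=True)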
Example #3
class Dataset(object):
    def __init__(self,
                 path='./data/',
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='',
                 full_meta_data_name='explanations_5panels.csv',
                 label_size=5,
                 fix_length=None,
                 meta_data=None):
        """
        :param meta_data: MetaData class instance. Will be used for vocab building.
        """
        # we will add metalabel here and make iterators
        self.TEXT = ReversibleField(sequential=True,
                                    include_lengths=True,
                                    lower=False,
                                    fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True,
                                     use_vocab=False,
                                     label_size=label_size,
                                     tensor_type=torch.FloatTensor,
                                     fix_length=fix_length)

        # it's actually this step that will take 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path,
            train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv',
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.full_meta_data = data.TabularDataset(
            path=pjoin(path, full_meta_data_name),
            format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        self.meta_data = meta_data

        self.is_vocab_bulit = False
        self.iterators = []

        if test_data_name != '':
            self.external_test = data.TabularDataset(
                path=path + test_data_name,
                format='tsv',
                fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return stored iterator

        # only get them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test),
            sort_key=lambda x: len(x.Text),  # no global sort, but within-batch sort
            batch_sizes=(32, val_batch_size, val_batch_size),
            device=device,
            sort_within_batch=True,
            repeat=False)

        self.iterators = (train_iter, val_iter, test_iter)  # cache so later calls reuse the same iterators
        return self.iterators

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # calculate uniform bounds from the standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # we can try randn or glorot
        # mode="unk"|"all", all means initialize everything
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0

        fan_in, fan_out = emb_vectors.size()

        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # std = 0.5 is based on the norm of average GloVe word vectors
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print(
                "average GloVe norm is {}, number of known words is {}, total number of words is {}"
                .format(running_norm / num_non_zero, num_non_zero, total_words))  # prints directly in the Jupyter notebook

    def build_vocab(self, config, silent=False):
        if config.emb_corpus == 'common_crawl':
            self.TEXT.build_vocab(self.train,
                                  self.full_meta_data,
                                  vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            # add all datasets
            self.TEXT.build_vocab(self.train,
                                  self.full_meta_data,
                                  vectors="glove.6B.{}d".format(
                                      config.emb_dim))
        self.is_vocab_bulit = True
        self.vocab = self.TEXT.vocab
        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

        # synchronize vocab by making them the same object
        self.meta_data.TEXT_FIELD.vocab = self.TEXT.vocab
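A rough usage sketch for the class above. The `config` attributes (`emb_corpus`, `emb_dim`, `rand_unk`) are exactly the ones `build_vocab` reads; the `MetaData` stand-in, the path, and the device value are assumptions, since those pieces are defined outside this excerpt:

import torch

class FakeMeta:
    # stand-in for the MetaData class mentioned in __init__; build_vocab only needs it
    # to expose a TEXT_FIELD whose vocab can be shared
    TEXT_FIELD = ReversibleField(sequential=True, include_lengths=True, lower=False)

class Config:
    emb_corpus = 'common_crawl'   # anything else falls back to glove.6B.{emb_dim}d
    emb_dim = 100                 # overwritten to 300 when emb_corpus == 'common_crawl'
    rand_unk = True               # Xavier-initialize vectors for words GloVe does not cover

dataset = Dataset(path='./data/', meta_data=FakeMeta())
dataset.build_vocab(Config())
train_iter, val_iter, test_iter = dataset.get_iterators(device=torch.device('cuda:0'))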
Example #4
class Dataset(object):
    def __init__(self, path='./data/',
                 weak_train_dataset="",
                 acmg_weak_data_path="",
                 dataset_prefix='vci_1543_abs_tit_key_apr_1_2019_',
                 test_data_name='vci_358_abs_tit_key_may_7_2019_true_test.csv',
                 multi_task_train_dataset="",
                 label_size=5, fix_length=None):
        self.TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False, fix_length=fix_length)
        self.LABEL = MultiLabelField(sequential=True, use_vocab=False, label_size=label_size,
                                     tensor_type=torch.FloatTensor, fix_length=fix_length)

        if weak_train_dataset != "":
            self.weak_train = data.TabularDataset(weak_train_dataset, format='tsv',
                                                  fields=[('Text', self.TEXT), ('Description', self.LABEL)])
            if acmg_weak_data_path != "":
                acmg_weak_data = data.TabularDataset(acmg_weak_data_path, format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
                # this should be enough!
                self.weak_train.examples.extend(acmg_weak_data.examples)
        else:
            self.weak_train = None

        if multi_task_train_dataset != "":
            self.multi_task_train = data.TabularDataset(multi_task_train_dataset, format='tsv',
                                                        fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.multi_task_train = None

        # it's actually this step that will take 5 minutes
        self.train, self.val, self.test = data.TabularDataset.splits(
            path=path, train=dataset_prefix + 'train.csv',
            validation=dataset_prefix + 'valid.csv',
            test=dataset_prefix + 'test.csv', format='tsv',
            fields=[('Text', self.TEXT), ('Description', self.LABEL)])

        if test_data_name != '':
            self.external_test = data.TabularDataset(path=path + test_data_name,
                                                     format='tsv',
                                                     fields=[('Text', self.TEXT), ('Description', self.LABEL)])
        else:
            self.external_test = None

        self.is_vocab_bulit = False
        self.iterators = []
        self.test_iterator = None
        self.weak_train_iterator = None
        self.multi_task_train_iterator = None

    def xavier_uniform(self, tensor, fan_in, fan_out, gain=1):
        # fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        std = gain * math.sqrt(2.0 / (fan_in + fan_out))
        a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
        with torch.no_grad():
            return tensor.uniform_(-a, a)

    def init_emb(self, vocab, init="glorot", num_special_toks=2, silent=False):
        # we can try randn or glorot
        # mode="unk"|"all", all means initialize everything
        emb_vectors = vocab.vectors
        sweep_range = len(vocab)
        running_norm = 0.
        num_non_zero = 0
        total_words = 0

        fan_in, fan_out = emb_vectors.size()  # 16870, 300 # std = 0.01 # a = 1.73 * 0.01

        for i in range(num_special_toks, sweep_range):
            if len(emb_vectors[i, :].nonzero()) == 0:
                # std = 0.5 is based on the norm of average GloVe word vectors
                self.xavier_uniform(emb_vectors[i], fan_in, fan_out)
            else:
                num_non_zero += 1
                running_norm += torch.norm(emb_vectors[i])
            total_words += 1
        if not silent:
            print("average GloVe norm is {}, number of known words is {}, total number of words is {}".format(
                running_norm / num_non_zero, num_non_zero, total_words))  # prints directly in the Jupyter notebook

    def build_vocab(self, config, silent=False):
        datasets = [self.train]
        if self.weak_train is not None and args.weak_vocab:  # args is presumably the module-level command-line namespace
            datasets.append(self.weak_train)

        if self.multi_task_train is not None:
            datasets.append(self.multi_task_train)  # we always build vocab for multitask

        if config.emb_corpus == 'common_crawl':
            # self.TEXT.build_vocab(self.train, vectors="glove.840B.300d")
            self.TEXT.build_vocab(*datasets, vectors="glove.840B.300d")
            config.emb_dim = 300  # change the config emb dimension
        else:
            self.TEXT.build_vocab(*datasets, vectors="glove.6B.{}d".format(config.emb_dim))

        self.is_vocab_bulit = True
        self.vocab = self.TEXT.vocab
        if config.rand_unk:
            if not silent:
                print("initializing random vocabulary")
            self.init_emb(self.vocab, silent=silent)

    def get_iterators(self, device, val_batch_size=128):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if len(self.iterators) > 0:
            return self.iterators  # return stored iterator

        # only get them after knowing the device (inside trainer or evaluator)
        train_iter, val_iter, test_iter = data.Iterator.splits(
            (self.train, self.val, self.test), sort_key=lambda x: len(x.Text),  # no global sort, but within-batch-sort
            batch_sizes=(32, val_batch_size, val_batch_size), device=device,
            sort_within_batch=True, repeat=False)

        self.iterators = (train_iter, val_iter, test_iter)  # cache so later calls reuse the same iterators
        return self.iterators

    def get_test_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.test_iterator is not None:
            return self.test_iterator

        self.test_iterator = data.Iterator(self.external_test, 128, sort_key=lambda x: len(x.Text),
                                           device=device, train=False, repeat=False, sort_within_batch=True)
        return self.test_iterator

    def get_weak_train_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.weak_train_iterator is not None:
            return self.weak_train_iterator

        self.weak_train_iterator = data.Iterator(self.weak_train, 128, sort_key=lambda x: len(x.Text),
                                                 device=device, train=True, repeat=False, sort_within_batch=True)

        return self.weak_train_iterator

    def get_multi_task_train_iterator(self, device):
        if not self.is_vocab_bulit:
            raise Exception("Vocabulary is not built yet; call build_vocab() first")

        if self.multi_task_train_iterator is not None:
            return self.multi_task_train_iterator

        self.multi_task_train_iterator = data.Iterator(self.multi_task_train, 128, sort_key=lambda x: len(x.Text),
                                                       device=device, train=True, repeat=False, sort_within_batch=True)

        return self.multi_task_train_iterator
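After `build_vocab` has run, the pretrained vectors (with the `rand_unk` patching applied) sit in `dataset.vocab.vectors`. A typical next step, sketched here with standard PyTorch calls, is to copy them into a model's embedding layer; the model itself is not part of this excerpt, so the snippet below is illustrative only:

import torch.nn as nn

# vocab size and embedding dimension come straight from the built vocabulary
embed = nn.Embedding(len(dataset.vocab), dataset.vocab.vectors.size(1))
embed.weight.data.copy_(dataset.vocab.vectors)   # load the GloVe (or clinical) vectors
# embed.weight.requires_grad = False             # optionally freeze the pretrained embeddings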