Example 1
    def __init__(self, train_validation_split=None, test_split=None,
                 use_defaults=True, subset='sts_small'):
        if train_validation_split is not None or test_split is not None or \
                use_defaults is False:
            raise NotImplementedError('This Dataset does not implement '
                  'train_validation_split, test_split or use_defaults as the '
                  'dataset is big enough and uses dedicated splits from '
                  'the original datasets')
        self.dataset_name = 'Semantic Text Similarity - All'
        self.dataset_description = 'This dataset has been generated by ' \
               'merging MPD, SICK, Quora, StackExchange and SemEval ' \
               'datasets. \n It has 258537 Training sentence pairs, 133102 ' \
               'Test sentence pairs and 59058 validation sentence pairs.'
        self.test_split = 'large'
        self.dataset = subset
        self.dataset_path = os.path.join(datasets.data_root_directory,
                                         self.dataset)
        self.train_path = os.path.join(self.dataset_path, 'train', 'train.txt')
        self.validation_path = os.path.join(self.dataset_path, 'validation',
                                            'validation.txt')
        self.test_path = os.path.join(self.dataset_path, 'test', 'test.txt')
        self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
        self.metadata_path = os.path.abspath(os.path.join(self.dataset_path,
                                               'metadata.txt'))
        self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.load_w2v(self.w2v_path)

        self.vocab_size = len(self.w2i)
        self.train = DataSet(self.train_path, (self.w2i, self.i2w))
        self.validation = DataSet(self.validation_path, (self.w2i, self.i2w))
        self.test = DataSet(self.test_path, (self.w2i, self.i2w))
        self.__refresh(load_w2v=False)
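The constructor above only wires up paths and loads prebuilt artifacts, so it silently assumes a fixed directory layout under datasets.data_root_directory. A minimal sketch of that assumption (the helper name and the data_root argument are hypothetical, not part of the library):

import os

def check_sts_layout(data_root, subset='sts_small'):
    # Mirror the os.path.join calls from the constructor above and report
    # which of the expected files are missing on disk.
    base = os.path.join(data_root, subset)
    expected = {
        'train': os.path.join(base, 'train', 'train.txt'),
        'validation': os.path.join(base, 'validation', 'validation.txt'),
        'test': os.path.join(base, 'test', 'test.txt'),
        'vocab': os.path.join(base, 'vocab.txt'),
        'metadata': os.path.join(base, 'metadata.txt'),
        'w2v': os.path.join(base, 'w2v.npy'),
    }
    missing = {name: path for name, path in expected.items()
               if not os.path.isfile(path)}
    return expected, missing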
Example 2
    def __refresh(self, load_w2v):
        # Reload the vocabulary from disk and, if requested, rebuild and save
        # the embedding matrix, then push the new mappings into every split.
        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        if load_w2v:
            self.w2v = datasets.preload_w2v(self.w2i)
            datasets.save_w2v(self.w2v_path, self.w2v)
        self.train.set_vocab((self.w2i, self.i2w))
        self.validation.set_vocab((self.w2i, self.i2w))
        self.test.set_vocab((self.w2i, self.i2w))
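__refresh re-reads the vocabulary file and hands the new (w2i, i2w) pair to every split. A hedged sketch of what such a loader typically returns, assuming vocab.txt keeps one token per line with the line number as the id (the real datasets.load_vocabulary may handle counts or reserved symbols differently):

def load_vocabulary_sketch(vocab_path):
    # Build word->index and index->word maps from a one-token-per-line file.
    w2i, i2w = {}, {}
    with open(vocab_path, encoding='utf-8') as f:
        for idx, line in enumerate(f):
            token = line.strip().split('\t')[0]
            w2i[token] = idx
            i2w[idx] = token
    return w2i, i2w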
Example 3
    def __init__(self,
                 train_validation_split=None,
                 test_split=None,
                 use_defaults=True):
        if train_validation_split is not None or test_split is not None or \
                use_defaults is False:
            raise NotImplementedError(
                'This Dataset does not implement '
                'train_validation_split, test_split or use_defaults as the '
                'dataset is big enough and uses dedicated splits from '
                'the original datasets')
        self.dataset_name = 'Twitter Emotions Dataset'
        self.dataset_description = 'In a variation on the popular task of ' \
           'sentiment analysis, this dataset contains labels for the emotional' \
           ' content (such as happiness, sadness, and anger) of texts. Hundreds' \
           ' to thousands of examples across 13 labels. A subset of this data ' \
           'is used in an experiment that is uploaded to Microsoft’s Cortana ' \
           'Intelligence Gallery.'
        self.test_split = 'small'
        self.dataset = "twitter_emotion"
        self.dataset_path = os.path.join(datasets.data_root_directory,
                                         self.dataset)
        self.data_path = os.path.join(self.dataset_path, 'emotion_text.txt')
        self.train_paths = {
            i: os.path.join(self.dataset_path, 'train',
                            'fold_{}_train'.format(i))
            for i in range(5)
        }
        self.validation_paths = {
            i: os.path.join(self.dataset_path, 'validation',
                            'fold_{}_val'.format(i))
            for i in range(5)
        }
        self.test_paths = {
            i: os.path.join(self.dataset_path, 'test',
                            'fold_{}_test'.format(i))
            for i in range(5)
        }
        self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
        self.metadata_path = os.path.abspath(
            os.path.join(self.dataset_path, 'metadata.txt'))
        self.classes_path = os.path.join(self.dataset_path, 'classes.txt')
        self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.load_w2v(self.w2v_path)
        self.c2i, self.i2c = datasets.load_classes(self.classes_path)
        self.n_classes = len(self.c2i)

        self.vocab_size = len(self.w2i)
        self.train = DataSet(self.train_paths, (self.w2i, self.i2w),
                             (self.c2i, self.i2c), self.n_classes)
        self.validation = DataSet(self.validation_paths, (self.w2i, self.i2w),
                                  (self.c2i, self.i2c), self.n_classes)
        self.test = DataSet(self.test_paths, (self.w2i, self.i2w),
                            (self.c2i, self.i2c), self.n_classes)
        self.__refresh(load_w2v=False)
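The three dictionaries above key the five cross-validation folds by index, so selecting a fold amounts to one lookup per split. An illustrative helper (the function name and the example path are made up) that reproduces the same file names:

import os

def fold_files(dataset_path, fold):
    # Return the train/validation/test files for one fold, using the same
    # fold_{i}_train / fold_{i}_val / fold_{i}_test naming as above.
    return (
        os.path.join(dataset_path, 'train', 'fold_{}_train'.format(fold)),
        os.path.join(dataset_path, 'validation', 'fold_{}_val'.format(fold)),
        os.path.join(dataset_path, 'test', 'fold_{}_test'.format(fold)),
    )

for fold in range(5):
    print(fold_files('/data/twitter_emotion', fold))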
Example 4
    def initialize_vocabulary(self):
        line_processor = lambda line: " ".join(line.split('\t')[:1])

        self.vocab_path, self.w2v_path, self.metadata_path = \
            datasets.new_vocabulary(
                files=[self.train_path], dataset_path=self.dataset_path,
                min_frequency=5, tokenizer='spacy',
                downcase=True, max_vocab_size=None,
                name='new', line_processor=line_processor)

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.preload_w2v(self.w2i)
        datasets.save_w2v(self.w2v_path, self.w2v)
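The line_processor passed to datasets.new_vocabulary above keeps only the first tab-separated field of each line, so only that column feeds the new vocabulary and the remaining columns are ignored. A quick illustration with a made-up line:

line_processor = lambda line: " ".join(line.split('\t')[:1])

sample = "a man is playing a guitar\t0.8"
print(line_processor(sample))  # -> "a man is playing a guitar"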
Example 5
    def __refresh(self, load_w2v):
        # (Again)
        # It doesn't seem to make sense to want to create a new vocabulary for
        # the other two types of data (NER data or POS tags). So I'll only allow
        # for new vocabularies on the text
        self.w2i[0], self.i2w[0] = datasets.load_vocabulary(
            self.vocab_paths[0])
        if load_w2v:
            self.w2v[0] = datasets.preload_w2v(self.w2i[0], lang='de')
            datasets.save_w2v(self.w2v_paths[0], self.w2v[0])
        self.train.set_vocab(self.w2i, self.i2w, 0)
        self.validation.set_vocab(self.w2i, self.i2w, 0)
        self.test.set_vocab(self.w2i, self.i2w, 0)
Example 6
    def initialize_defaults(self, shuffle):
        # For now, we are happy that this works =)
        #self.load_anew(train_validate_split=datasets.train_validate_split,
        #               test_split=datasets.test_split_small, shuffle=shuffle)
        train_data = self.load_data(self.train_path)
        validate_data = self.load_data(self.validate_path)
        test_data = self.load_data(self.test_path)

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.load_w2v(self.w2v_path)

        self.train = DataSet(train_data, (self.w2i, self.i2w), shuffle)
        self.validation = DataSet(validate_data, (self.w2i, self.i2w), shuffle)
        self.test = DataSet(test_data, (self.w2i, self.i2w), shuffle)
Example 7
    def initialize_vocabulary_ll(self, names, min_frequencies, downcases,
                                 tokenizer):
        for i in range(len(self.vocab_paths)):
            self.vocab_paths[i], self.w2v_paths[i], self.metadata_paths[i] = \
                datasets.new_vocabulary(
                    files=[self.train_path], dataset_path=self.dataset_path,
                    min_frequency=min_frequencies[i], tokenizer=tokenizer[i],
                    downcase=downcases[i], max_vocab_size=None,
                    name=names[i],
                    line_processor=lambda line: line.split('\t')[i], lang='de')

            self.w2i[i], self.i2w[i] = datasets.load_vocabulary(
                self.vocab_paths[i])
            self.w2v[i] = datasets.preload_w2v(self.w2i[i], lang='de')
            datasets.save_w2v(self.w2v_paths[i], self.w2v[i])
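Each iteration above builds a separate vocabulary (and embedding matrix) from one tab-separated column of the training file via line_processor=lambda line: line.split('\t')[i]. Judging by the comment in the __refresh shown earlier, the first column holds the text and the remaining columns hold tag annotations (POS/NER). A toy illustration of which column each pass sees (the example line is made up):

line = "Berlin\tNE\tB-LOC"  # made-up line: text column plus two tag columns
for i in range(3):
    print(i, line.split('\t')[i])
# 0 Berlin
# 1 NE
# 2 B-LOC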
Example 8
    def __init__(self,
                 train_validation_split=None,
                 test_split=None,
                 use_defaults=True,
                 data_balancing=True):
        if train_validation_split is not None or test_split is not None or \
                use_defaults is False:
            raise NotImplementedError(
                'This Dataset does not implement '
                'train_validation_split, test_split or use_defaults as the '
                'dataset is big enough and uses dedicated splits from '
                'the original datasets')
        self.dataset_name = 'CMU Hotel Reviews'
        self.dataset_description = 'This dataset is from CMU. Here is the ' \
                                   'link to the dataset http://www.cs.cmu.edu/~jiweil/html/' \
                                   'hotel-review.html \nIt has 553494 Training Instances ' \
                                   '263568 Test Instances and 61499 Validation Instances'
        self.test_split = 'large'
        self.dataset = "hotel_reviews"
        self.dataset_path = os.path.join(datasets.data_root_directory,
                                         self.dataset)
        self.data_balancing = data_balancing
        self.train_path = os.path.join(self.dataset_path, 'train', 'train.txt')
        self.train_path_list = glob(
            os.path.join(self.dataset_path, 'train', 'output_file_*.txt'))
        self.validation_path = os.path.join(self.dataset_path, 'validation',
                                            'validation.txt')
        self.test_path = os.path.join(self.dataset_path, 'test', 'test.txt')
        self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
        self.metadata_path = os.path.abspath(
            os.path.join(self.dataset_path, 'metadata.txt'))
        self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.load_w2v(self.w2v_path)

        self.vocab_size = len(self.w2i)
        if not self.data_balancing:
            self.train = DataSet(self.train_path, (self.w2i, self.i2w))
        else:
            self.train = DataSetBalanced(self.train_path_list,
                                         (self.w2i, self.i2w))

        self.validation = DataSet(self.validation_path, (self.w2i, self.i2w))
        self.test = DataSet(self.test_path, (self.w2i, self.i2w))
        self.__refresh(load_w2v=False)
Example 9
    def __init__(self, train_validation_split=None, test_split=None,
                 use_defaults=True, data_balancing=True):
        if train_validation_split is not None or test_split is not None or \
                use_defaults is False:
            raise NotImplementedError('This Dataset does not implement '
                  'train_validation_split, test_split or use_defaults as the '
                  'dataset is big enough and uses dedicated splits from '
                  'the original datasets')
        self.dataset_name = 'Amazon Reviews Dataset'
        self.dataset_description = 'This dataset has been generated by ' \
                                   'scraping Amazon Reviews'
        self.test_split = 'large'
        self.dataset = "amazon_reviews_de"
        self.dataset_path = os.path.join(datasets.data_root_directory,
                                         self.dataset)
        self.data_balancing = data_balancing
        self.data_path = os.path.join(self.dataset_path, 'reviews.txt')
        self.train_path = os.path.join(self.dataset_path, 'train', 'train.txt')
        self.train_path_list = glob(
            os.path.join(self.dataset_path, 'train', 'output_file_*.txt'))
        self.validation_path = os.path.join(self.dataset_path, 'validation',
                                            'validation.txt')
        self.test_path = os.path.join(self.dataset_path, 'test', 'test.txt')
        self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
        self.metadata_path = os.path.abspath(os.path.join(self.dataset_path,
                                               'metadata.txt'))
        self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.load_w2v(self.w2v_path)

        self.vocab_size = len(self.w2i)
        if not self.data_balancing:
            self.train = DataSet(self.train_path, (self.w2i, self.i2w))
        else:
            self.train = DataSetBalanced(self.train_path_list,
                                         (self.w2i, self.i2w))
        self.validation = DataSet(self.validation_path, (self.w2i, self.i2w))
        self.test = DataSet(self.test_path, (self.w2i, self.i2w))
        self.__refresh(load_w2v=False)
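Both review datasets above switch the training split on data_balancing: with it disabled a single train.txt is wrapped in DataSet, while with it enabled the shards matched by output_file_*.txt go through DataSetBalanced. A small sketch of the same glob pattern (the root directory is hypothetical):

import os
from glob import glob

train_dir = os.path.join('/data/amazon_reviews_de', 'train')  # hypothetical root
shards = sorted(glob(os.path.join(train_dir, 'output_file_*.txt')))
print(shards)  # lists output_file_0.txt, output_file_1.txt, ... if present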