def __init__(self, train_validation_split=None, test_split=None,
             use_defaults=True, subset='sts_small'):
    if train_validation_split is not None or test_split is not None or \
            use_defaults is False:
        raise NotImplementedError(
            'This Dataset does not implement '
            'train_validation_split, test_split or use_defaults as the '
            'dataset is big enough and uses dedicated splits from '
            'the original datasets')
    self.dataset_name = 'Semantic Text Similarity - All'
    self.dataset_description = 'This dataset has been generated by ' \
        'merging the MPD, SICK, Quora, StackExchange and SemEval ' \
        'datasets.\nIt has 258537 training sentence pairs, 133102 ' \
        'test sentence pairs and 59058 validation sentence pairs.'
    self.test_split = 'large'
    self.dataset = subset
    self.dataset_path = os.path.join(datasets.data_root_directory,
                                     self.dataset)
    self.train_path = os.path.join(self.dataset_path, 'train', 'train.txt')
    self.validation_path = os.path.join(self.dataset_path, 'validation',
                                        'validation.txt')
    self.test_path = os.path.join(self.dataset_path, 'test', 'test.txt')
    self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
    self.metadata_path = os.path.abspath(
        os.path.join(self.dataset_path, 'metadata.txt'))
    self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.load_w2v(self.w2v_path)
    self.vocab_size = len(self.w2i)

    self.train = DataSet(self.train_path, (self.w2i, self.i2w))
    self.validation = DataSet(self.validation_path, (self.w2i, self.i2w))
    self.test = DataSet(self.test_path, (self.w2i, self.i2w))
    self.__refresh(load_w2v=False)
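# A minimal usage sketch, assuming this __init__ belongs to an STSAll-style
# dataset wrapper (the class name below is illustrative, not defined in this
# file); the constructor loads a shared vocabulary and one DataSet per split:
#
#     sts = STSAll(subset='sts_small')
#     print(sts.vocab_size)        # size of the loaded vocabulary
#     print(sts.train)             # DataSet over train/train.txt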
def __refresh(self, load_w2v):
    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    if load_w2v:
        self.w2v = datasets.preload_w2v(self.w2i)
        datasets.save_w2v(self.w2v_path, self.w2v)
    self.train.set_vocab((self.w2i, self.i2w))
    self.validation.set_vocab((self.w2i, self.i2w))
    self.test.set_vocab((self.w2i, self.i2w))
def __init__(self, train_validation_split=None, test_split=None,
             use_defaults=True):
    if train_validation_split is not None or test_split is not None or \
            use_defaults is False:
        raise NotImplementedError(
            'This Dataset does not implement '
            'train_validation_split, test_split or use_defaults as the '
            'dataset is big enough and uses dedicated splits from '
            'the original datasets')
    self.dataset_name = 'Twitter Emotions Dataset'
    self.dataset_description = 'In a variation on the popular task of ' \
        'sentiment analysis, this dataset contains labels for the ' \
        'emotional content (such as happiness, sadness, and anger) of ' \
        'texts. Hundreds to thousands of examples across 13 labels. A ' \
        'subset of this data is used in an experiment that is uploaded ' \
        'to Microsoft\'s Cortana Intelligence Gallery.'
    self.test_split = 'small'
    self.dataset = 'twitter_emotion'
    self.dataset_path = os.path.join(datasets.data_root_directory,
                                     self.dataset)
    self.data_path = os.path.join(self.dataset_path, 'emotion_text.txt')
    self.train_paths = {
        i: os.path.join(self.dataset_path, 'train',
                        'fold_{}_train'.format(i)) for i in range(5)
    }
    self.validation_paths = {
        i: os.path.join(self.dataset_path, 'validation',
                        'fold_{}_val'.format(i)) for i in range(5)
    }
    self.test_paths = {
        i: os.path.join(self.dataset_path, 'test',
                        'fold_{}_test'.format(i)) for i in range(5)
    }
    self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
    self.metadata_path = os.path.abspath(
        os.path.join(self.dataset_path, 'metadata.txt'))
    self.classes_path = os.path.join(self.dataset_path, 'classes.txt')
    self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.load_w2v(self.w2v_path)
    self.c2i, self.i2c = datasets.load_classes(self.classes_path)
    self.n_classes = len(self.c2i)
    self.vocab_size = len(self.w2i)

    self.train = DataSet(self.train_paths, (self.w2i, self.i2w),
                         (self.c2i, self.i2c), self.n_classes)
    self.validation = DataSet(self.validation_paths, (self.w2i, self.i2w),
                              (self.c2i, self.i2c), self.n_classes)
    self.test = DataSet(self.test_paths, (self.w2i, self.i2w),
                        (self.c2i, self.i2c), self.n_classes)
    self.__refresh(load_w2v=False)
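# For reference, the fold-path comprehensions above build dicts keyed by
# fold index; a stripped-down sketch of the same pattern in plain Python:
#
#     >>> {i: 'fold_{}_train'.format(i) for i in range(3)}
#     {0: 'fold_0_train', 1: 'fold_1_train', 2: 'fold_2_train'}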
def initialize_vocabulary(self):
    line_processor = lambda line: " ".join(line.split('\t')[:1])
    self.vocab_path, self.w2v_path, self.metadata_path = \
        datasets.new_vocabulary(
            files=[self.train_path],
            dataset_path=self.dataset_path,
            min_frequency=5,
            tokenizer='spacy',
            downcase=True,
            max_vocab_size=None,
            name='new',
            line_processor=line_processor)
    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.preload_w2v(self.w2i)
    datasets.save_w2v(self.w2v_path, self.w2v)
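# A quick sketch of what the line_processor above does: it keeps only the
# first tab-separated field of each line (the sentence text), so scores and
# any extra columns never enter the vocabulary:
#
#     >>> processor = lambda line: " ".join(line.split('\t')[:1])
#     >>> processor('the quick brown fox\t4.5\textra_column')
#     'the quick brown fox'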
def __refresh(self, load_w2v):
    # As above: a new vocabulary only makes sense for the text column, not
    # for the label columns (NER tags or POS tags), so only the vocabulary
    # at index 0 is reloaded here.
    self.w2i[0], self.i2w[0] = datasets.load_vocabulary(
        self.vocab_paths[0])
    if load_w2v:
        self.w2v[0] = datasets.preload_w2v(self.w2i[0], lang='de')
        datasets.save_w2v(self.w2v_paths[0], self.w2v[0])
    self.train.set_vocab(self.w2i, self.i2w, 0)
    self.validation.set_vocab(self.w2i, self.i2w, 0)
    self.test.set_vocab(self.w2i, self.i2w, 0)
def initialize_defaults(self, shuffle):
    # The dedicated splits shipped with the dataset are loaded directly;
    # re-splitting is intentionally skipped:
    # self.load_anew(train_validate_split=datasets.train_validate_split,
    #                test_split=datasets.test_split_small, shuffle=shuffle)
    train_data = self.load_data(self.train_path)
    validate_data = self.load_data(self.validate_path)
    test_data = self.load_data(self.test_path)

    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.load_w2v(self.w2v_path)

    self.train = DataSet(train_data, (self.w2i, self.i2w), shuffle)
    self.validation = DataSet(validate_data, (self.w2i, self.i2w), shuffle)
    self.test = DataSet(test_data, (self.w2i, self.i2w), shuffle)
def initialize_vocabulary_ll(self, names, min_frequencies, downcases,
                             tokenizer):
    for i in range(len(self.vocab_paths)):
        self.vocab_paths[i], self.w2v_paths[i], self.metadata_paths[i] = \
            datasets.new_vocabulary(
                files=[self.train_path],
                dataset_path=self.dataset_path,
                min_frequency=min_frequencies[i],
                tokenizer=tokenizer[i],
                downcase=downcases[i],
                max_vocab_size=None,
                name=names[i],
                # Bind i now: a bare lambda would capture the loop
                # variable by reference and always split out the last
                # column.
                line_processor=lambda line, i=i: line.split('\t')[i],
                lang='de')
        self.w2i[i], self.i2w[i] = datasets.load_vocabulary(
            self.vocab_paths[i])
        self.w2v[i] = datasets.preload_w2v(self.w2i[i], lang='de')
        datasets.save_w2v(self.w2v_paths[i], self.w2v[i])
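# The i=i default binding above is what makes each line_processor pick its
# own column; without it, Python's late-binding closures would all read the
# final loop value. A self-contained demonstration:
#
#     >>> late = [lambda: i for i in range(3)]
#     >>> [f() for f in late]
#     [2, 2, 2]
#     >>> bound = [lambda i=i: i for i in range(3)]
#     >>> [f() for f in bound]
#     [0, 1, 2]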
def __init__(self, train_validation_split=None, test_split=None,
             use_defaults=True, data_balancing=True):
    if train_validation_split is not None or test_split is not None or \
            use_defaults is False:
        raise NotImplementedError(
            'This Dataset does not implement '
            'train_validation_split, test_split or use_defaults as the '
            'dataset is big enough and uses dedicated splits from '
            'the original datasets')
    self.dataset_name = 'CMU Hotel Reviews'
    self.dataset_description = 'This dataset is from CMU: ' \
        'http://www.cs.cmu.edu/~jiweil/html/hotel-review.html\n' \
        'It has 553494 training instances, 263568 test instances ' \
        'and 61499 validation instances.'
    self.test_split = 'large'
    self.dataset = 'hotel_reviews'
    self.dataset_path = os.path.join(datasets.data_root_directory,
                                     self.dataset)
    self.data_balancing = data_balancing
    self.train_path = os.path.join(self.dataset_path, 'train', 'train.txt')
    self.train_path_list = glob(
        os.path.join(self.dataset_path, 'train', 'output_file_*.txt'))
    self.validation_path = os.path.join(self.dataset_path, 'validation',
                                        'validation.txt')
    self.test_path = os.path.join(self.dataset_path, 'test', 'test.txt')
    self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
    self.metadata_path = os.path.abspath(
        os.path.join(self.dataset_path, 'metadata.txt'))
    self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.load_w2v(self.w2v_path)
    self.vocab_size = len(self.w2i)

    if not self.data_balancing:
        self.train = DataSet(self.train_path, (self.w2i, self.i2w))
    else:
        self.train = DataSetBalanced(self.train_path_list,
                                     (self.w2i, self.i2w))
    self.validation = DataSet(self.validation_path, (self.w2i, self.i2w))
    self.test = DataSet(self.test_path, (self.w2i, self.i2w))
    self.__refresh(load_w2v=False)
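# A minimal sketch of the data_balancing switch, assuming this __init__
# belongs to a HotelReviews-style class (the name is illustrative): with
# balancing on (the default) training reads the pre-split output_file_*.txt
# shards through DataSetBalanced; with it off, the single train.txt is read
# through the plain DataSet:
#
#     reviews = HotelReviews(data_balancing=False)   # plain DataSet
#     balanced = HotelReviews()                      # DataSetBalanced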
def __init__(self, train_validation_split=None, test_split=None,
             use_defaults=True, data_balancing=True):
    if train_validation_split is not None or test_split is not None or \
            use_defaults is False:
        raise NotImplementedError(
            'This Dataset does not implement '
            'train_validation_split, test_split or use_defaults as the '
            'dataset is big enough and uses dedicated splits from '
            'the original datasets')
    self.dataset_name = 'Amazon Reviews Dataset'
    self.dataset_description = 'This dataset has been generated by ' \
        'scraping Amazon Reviews'
    self.test_split = 'large'
    self.dataset = 'amazon_reviews_de'
    self.dataset_path = os.path.join(datasets.data_root_directory,
                                     self.dataset)
    self.data_balancing = data_balancing
    self.data_path = os.path.join(self.dataset_path, 'reviews.txt')
    self.train_path = os.path.join(self.dataset_path, 'train', 'train.txt')
    self.train_path_list = glob(
        os.path.join(self.dataset_path, 'train', 'output_file_*.txt'))
    self.validation_path = os.path.join(self.dataset_path, 'validation',
                                        'validation.txt')
    self.test_path = os.path.join(self.dataset_path, 'test', 'test.txt')
    self.vocab_path = os.path.join(self.dataset_path, 'vocab.txt')
    self.metadata_path = os.path.abspath(
        os.path.join(self.dataset_path, 'metadata.txt'))
    self.w2v_path = os.path.join(self.dataset_path, 'w2v.npy')

    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.load_w2v(self.w2v_path)
    self.vocab_size = len(self.w2i)

    if not self.data_balancing:
        self.train = DataSet(self.train_path, (self.w2i, self.i2w))
    else:
        self.train = DataSetBalanced(self.train_path_list,
                                     (self.w2i, self.i2w))
    self.validation = DataSet(self.validation_path, (self.w2i, self.i2w))
    self.test = DataSet(self.test_path, (self.w2i, self.i2w))
    self.__refresh(load_w2v=False)