def initialize(self, embeddings):
    """Download the dataset, build src/tgt vocabularies, and create embeddings.

    Splits the configured features into source features and the single
    target ('tgt') feature, creates embeddings for each side, persists the
    vocabularies, and unwraps the target-side dict down to its 'tgt' entry.

    :param embeddings: embeddings config — a file path or an already-parsed dict
    """
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    # Vocab is always built from train+valid; the test split is included
    # only when present.  TODO: make this optional
    vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
    if 'test_file' in self.dataset:
        vocab_sources.append(self.dataset['test_file'])
    vocab1, vocab2 = self.reader.build_vocabs(
        vocab_sources,
        min_f=Task._get_min_f(self.config_params),
        vocab_file=self.dataset.get('vocab_file'),
    )
    # The config shares one feature list between source and target; split it
    # here.  If several features are named 'tgt' the last one wins.
    src_features = []
    tgt_feature = None
    for feat in self.config_params['features']:
        if feat['name'] == 'tgt':
            tgt_feature = feat
        else:
            src_features.append(feat)
    self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, src_features)
    baseline.save_vocabs(self.get_basedir(), self.feat2src)
    # For now, dont allow multiple vocabs of output
    self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [tgt_feature])
    baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
    self.tgt_embeddings = self.tgt_embeddings['tgt']
    self.feat2tgt = self.feat2tgt['tgt']
def initialize(self, embeddings):
    """Download the dataset, build src/tgt vocabularies, and create embeddings.

    Splits the configured features into source features and the single
    target ('tgt') feature, creates embeddings for each side, persists the
    vocabularies, and unwraps the target-side dict down to its 'tgt' entry.

    :param embeddings: embeddings config — a file path or an already-parsed dict
    """
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    # FIX: previously the test split was indexed unconditionally, raising
    # KeyError for datasets without one.  Guard it, matching the sibling
    # seq2seq initialize().  TODO: make this optional
    vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
    if 'test_file' in self.dataset:
        vocab_sources.append(self.dataset['test_file'])
    vocab1, vocab2 = self.reader.build_vocabs(
        vocab_sources,
        min_f=Task._get_min_f(self.config_params),
        vocab_file=self.dataset.get('vocab_file'),
    )
    # To keep the config file simple, share a list between source and destination (tgt)
    features_src = []
    features_tgt = None
    for feature in self.config_params['features']:
        if feature['name'] == 'tgt':
            features_tgt = feature
        else:
            features_src += [feature]
    self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, features_src)
    baseline.save_vocabs(self.get_basedir(), self.feat2src)
    # For now, dont allow multiple vocabs of output
    self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [features_tgt])
    baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
    self.tgt_embeddings = self.tgt_embeddings['tgt']
    self.feat2tgt = self.feat2tgt['tgt']
def initialize(self, embeddings):
    """Download the dataset, build the vocabularies, and create embeddings.

    Creates one embedding per configured feature over the shared vocab dict
    and persists the feature-to-index vocabularies.

    :param embeddings: embeddings config — a file path or an already-parsed dict
    """
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    # FIX: previously the test split was indexed unconditionally, raising
    # KeyError for datasets without one.  Guard it, matching the sibling
    # initialize() variants.
    # NOTE(review): this reader's build_vocab is not passed vocab_file,
    # unlike the sibling variant — presumably this reader does not take it;
    # confirm against the reader's signature before unifying.
    vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
    if 'test_file' in self.dataset:
        vocab_sources.append(self.dataset['test_file'])
    vocabs = self.reader.build_vocab(
        vocab_sources,
        min_f=Task._get_min_f(self.config_params),
    )
    self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
    baseline.save_vocabs(self.get_basedir(), self.feat2index)
def initialize(self, embeddings):
    """Download the dataset, build the vocabularies, and create embeddings.

    Creates one embedding per configured feature over the shared vocab dict
    and persists the feature-to-index vocabularies.

    :param embeddings: embeddings config — a file path or an already-parsed dict
    """
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    # Vocab is always built from train+valid; the test split is included
    # only when present.  TODO: make this optional
    sources = [self.dataset['train_file'], self.dataset['valid_file']]
    if 'test_file' in self.dataset:
        sources.append(self.dataset['test_file'])
    min_freq = Task._get_min_f(self.config_params)
    vocabs = self.reader.build_vocab(sources, min_f=min_freq, vocab_file=self.dataset.get('vocab_file'))
    created = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
    self.embeddings, self.feat2index = created
    baseline.save_vocabs(self.get_basedir(), self.feat2index)