def __init__(self, vocab_path, data_raw_src=None, data_raw_tgt=None, lower=True):
    """
    Set up the vocabulary, reusing a saved one when it is available.

    When ``vocab_path`` refers to an existing file the vocabulary is loaded
    from it. Otherwise both sides of the training data must be supplied and
    the vocabulary is created from them.

    :param vocab_path: path to a saved vocabulary
    :param data_raw_src: training data, source side
    :param data_raw_tgt: training data, target side
    :param lower: stored on the instance as ``self.lower``
    """
    self.id2tok = {}
    self.tok2id = {}
    self.lower = lower

    if check_file_exists(vocab_path):
        # A saved vocabulary exists: load it and stop here.
        self.load_vocabulary(vocab_path)
        return

    # No saved vocabulary: both source and target training data are required.
    has_both_sides = not (data_raw_src is None or data_raw_tgt is None)
    assert has_both_sides, \
        "You need to process train data ** before ** creating a vocabulary!"

    self.create_vocabulary(
        raw_data_src=data_raw_src,
        raw_data_tgt=data_raw_tgt,
        vocab_path=vocab_path,
    )
def setup(self, vocab_path, data, lower, source):
    """
    Populate the vocabulary from a saved file or from provided data.

    The list of tokens is obtained according to ``source``:
    - 'file': ``self.vocablist_from_file(vocab_path)``
    - 'depgraphs': ``self.vocablist_from_depgraphs(data)``
    - 'tokens': ``self.vocablist_from_tokens(data)``

    For the two data-driven sources the resulting list is additionally
    handed to ``self.save_to_file`` together with ``vocab_path``.

    :param vocab_path: path of the vocabulary file
    :param data: data to build the vocabulary from; must not be None for
        the 'depgraphs' and 'tokens' sources
    :param lower: stored on the instance as ``self.lower``
    :param source: one of 'file', 'depgraphs', 'tokens'
    :raises AssertionError: if the vocabulary file is missing
        (source 'file'), if no data is given (data-driven sources), or if
        the obtained vocabulary is not a list
    :raises NotImplementedError: for any other value of ``source``
    """
    self.lower = lower
    self.id2tok = {}
    self.tok2id = {}
    save_vocab = False

    # The checks below raise AssertionError explicitly (the type the former
    # ``assert`` statements produced) so that they still run under
    # ``python -O`` and carry a real message instead of ``None``.
    if source == 'file':
        if not check_file_exists(vocab_path):
            logger.error('Vocabulary file does not exits: %s', vocab_path)
            raise AssertionError(
                'Vocabulary file does not exits: %s' % vocab_path)
        vocablist = self.vocablist_from_file(vocab_path)

    elif source in ('depgraphs', 'tokens'):
        if data is None:
            logger.error("Cannot create vocabulary: no data provided!")
            raise AssertionError(
                "Cannot create vocabulary: no data provided!")
        if source == 'depgraphs':
            vocablist = self.vocablist_from_depgraphs(data)
        else:
            vocablist = self.vocablist_from_tokens(data)
        # A freshly built vocabulary is written out below.
        save_vocab = True

    else:
        raise NotImplementedError(
            'Unsupported vocabulary source: %r' % (source,))

    assert isinstance(vocablist, list), \
        'Expected the vocabulary to be a list, got %s' % type(vocablist).__name__

    if save_vocab:
        self.save_to_file(vocablist, vocab_path)

    # Register every token in both mappings.
    for idx, tok in enumerate(vocablist):
        self.add_tok(idx, tok)
        self.add_id(idx, tok)

    self.UNK_ID = self.tok2id[constants.UNK]
    logger.debug('Vocabulary size: %d', self.size)
def setup(self, vocab_path, data, lower, source):
    """
    Populate the vocabulary from a saved JSON file or induce it from data.

    - source 'file': the content of ``vocab_path`` is read with
      ``load_json`` and passed to ``self.init_from_vocab_data``.
    - source 'lemma_form_feat': a token list is obtained from
      ``self.induce_from_data(data)``, registered token by token, and the
      vocabulary is then saved via ``self.save_vocab(vocab_path)``.

    Afterwards the ids of the UNK/BOS/EOS/PAD symbols and of the output
    classes are cached on the instance.

    :param vocab_path: path of the vocabulary file
    :param data: data to induce the vocabulary from; must not be None for
        the 'lemma_form_feat' source
    :param lower: stored on the instance as ``self.lower``
    :param source: either 'file' or 'lemma_form_feat'
    :raises AssertionError: if the vocabulary file is missing
        (source 'file'), if no data is given (source 'lemma_form_feat'),
        or if the induced vocabulary is not a list
    :raises NotImplementedError: for any other value of ``source``
    """
    self.lower = lower
    self.id2tok = {}
    self.tok2id = {}
    save_vocab = False

    # The checks below raise AssertionError explicitly (the type the former
    # ``assert`` statements produced) so that they still run under
    # ``python -O`` and carry a real message instead of ``None``.
    if source == 'file':
        if not check_file_exists(vocab_path):
            logger.error('Vocabulary file does not exits: %s', vocab_path)
            raise AssertionError(
                'Vocabulary file does not exits: %s' % vocab_path)

        logger.info('Loading vocabulary from <-- %s', vocab_path)
        vocab_data = load_json(vocab_path)
        self.init_from_vocab_data(vocab_data)

    elif source == 'lemma_form_feat':
        if data is None:
            logger.error("Cannot create vocabulary: no data provided!")
            raise AssertionError(
                "Cannot create vocabulary: no data provided!")

        logger.info('Inducing vocabulary from forms and lemmas ...')
        vocablist = self.induce_from_data(data)
        assert isinstance(vocablist, list), \
            'Expected the vocabulary to be a list, got %s' % type(vocablist).__name__

        # Register every token in both mappings.
        for idx, tok in enumerate(vocablist):
            self.add_tok(idx, tok)
            self.add_id(idx, tok)

        # A freshly induced vocabulary is written out below.
        save_vocab = True

    else:
        raise NotImplementedError(
            'Unsupported vocabulary source: %r' % (source,))

    if save_vocab:
        self.save_vocab(vocab_path)

    # Cache the ids of the special symbols and of the output classes.
    self.UNK_ID = self.tok2id[UNK]
    self.BOS_ID = self.tok2id[BOS]
    self.EOS_ID = self.tok2id[EOS]
    self.PAD_ID = self.tok2id[PAD]
    self.output_classes_ids = [self.tok2id[t] for t in self.output_classes]

    logger.debug('Vocabulary size: %d', self.size)
    logger.debug('Feature types: %s', self.featkey2idx.keys())
    logger.debug('Output classes and ids:')
    logger.debug(list(zip(self.output_classes, self.output_classes_ids)))
def __init__(self, vocab_path, data_raw=None, lower=True):
    """
    Set up the vocabulary from a saved file or from training data.

    An existing file at ``vocab_path`` takes precedence and is loaded.
    If there is no such file, ``data_raw`` is required and is used to
    create the vocabulary.

    :param vocab_path: path to a saved vocabulary
    :param data_raw: training data
    :param lower: stored on the instance as ``self.lower``
    """
    self.tok2id = {}
    self.id2tok = {}
    self.lower = lower

    vocab_is_saved = check_file_exists(vocab_path)
    if vocab_is_saved:
        # Reuse the vocabulary stored on disk.
        self.load_vocabulary(vocab_path)
        return

    # Nothing saved yet, so the training data is mandatory.
    assert data_raw is not None, "You need to process train data ** before ** creating a vocabulary!"
    self.create_vocabulary(raw_data=data_raw, vocab_path=vocab_path)
def load_vocabulary(self, vocabulary_path):
    """
    Read a vocabulary from a text file, one token per line.

    Each line is stripped of surrounding whitespace; its zero-based line
    index becomes the token id in ``self.id2tok`` / ``self.tok2id``.

    :param vocabulary_path: path of the vocabulary file
    :raises ValueError: if the existence check for the file fails
    """
    # NOTE(review): the path is wrapped in a list here, while other callers
    # in this file pass a plain string to check_file_exists -- confirm that
    # the helper accepts both forms.
    if not check_file_exists([vocabulary_path]):
        raise ValueError('Vocabulary file not found: %s' % vocabulary_path)

    logger.debug('Loading vocabulary from %s' % vocabulary_path)

    with open(vocabulary_path, 'r') as vocab_file:
        tokens = [raw_line.strip() for raw_line in vocab_file]

    for token_id, token in enumerate(tokens):
        self.id2tok[token_id] = token
        self.tok2id[token] = token_id
def vocablist_from_file(self, vocabulary_path):
    """
    Read the list of vocabulary tokens from a text file.

    Every line of the file, stripped of surrounding whitespace, is one
    token. If the first entry of ``constants.SYN_START_VOCAB`` (lowercased
    when ``self.lower`` is set) does not occur among the tokens, a copy of
    ``constants.SYN_START_VOCAB`` is put in front of them.

    :param vocabulary_path: path of the vocabulary file
    :return: list of tokens
    :raises AssertionError: if the vocabulary file does not exist
    """
    assert check_file_exists(vocabulary_path), ('Vocabulary file not found: %s' % vocabulary_path)
    logger.debug('Loading vocabulary from file <-- %s' % vocabulary_path)

    with open(vocabulary_path, 'r') as vocab_file:
        tokens = [raw_line.strip() for raw_line in vocab_file]

    # Probe for the first start symbol to decide whether the start
    # vocabulary is already part of the file.
    if self.lower:
        probe = constants.SYN_START_VOCAB[0].lower()
    else:
        probe = constants.SYN_START_VOCAB[0]

    if probe in tokens:
        return tokens

    start_symbols = copy.deepcopy(constants.SYN_START_VOCAB)
    return start_symbols + tokens
def setup(self, vocab_path, data, lower, source):
    """
    Set up the source-side and target-side vocabularies.

    - source 'file': ``vocab_path`` is read with ``load_json``; its 'src'
      and 'tgt' entries initialise the two vocabularies.
    - source 'lemma_form_feat': ``data`` is unpacked into lemmas, forms and
      a feature dictionary. Lemmas and features feed the source
      vocabulary, forms feed the target vocabulary, and the result is
      saved via ``self.save_vocab(vocab_path)``.

    :param vocab_path: path of the vocabulary file
    :param data: 3-item sequence ``(lemmas, forms, feature_dict)``; must
        not be None for the 'lemma_form_feat' source
    :param lower: forwarded to both vocabularies' ``setup``
    :param source: either 'file' or 'lemma_form_feat'
    :raises AssertionError: if the vocabulary file is missing
        (source 'file') or no data is given (source 'lemma_form_feat')
    :raises NotImplementedError: for any other value of ``source``
    """
    self.src_vocab = SrcSideVocab()
    self.tgt_vocab = TgtSideVocab()
    save_vocab = False

    # The checks below raise AssertionError explicitly (the type the former
    # ``assert`` statements produced) so that they still run under
    # ``python -O`` and carry a real message instead of ``None``.
    if source == 'file':
        if not check_file_exists(vocab_path):
            logger.error('Vocabulary file does not exits: %s', vocab_path)
            raise AssertionError(
                'Vocabulary file does not exits: %s' % vocab_path)

        # Routine progress message: logged at INFO level, not ERROR.
        logger.info('Loading vocabulary from <-- %s', vocab_path)
        vocab_dicts = load_json(vocab_path)
        src_dict = vocab_dicts['src']
        tgt_dict = vocab_dicts['tgt']

        self.src_vocab.setup(src_dict, data=None, lower=lower)
        self.tgt_vocab.setup(tgt_dict, data=None, lower=lower)

    elif source == 'lemma_form_feat':
        if data is None:
            logger.error("Cannot create vocabulary: no data provided!")
            raise AssertionError(
                "Cannot create vocabulary: no data provided!")

        logger.info('Inducing src and tgt vocabularies from train data')

        # Retrieve lemmas, forms and the feature dictionary.
        train_lemmas_l, train_forms_l, train_feat_d = data

        self.src_vocab.setup(vocab_dict=None,
                             data=(train_lemmas_l, train_feat_d),
                             lower=lower)
        self.tgt_vocab.setup(vocab_dict=None,
                             data=train_forms_l,
                             lower=lower)

        # Freshly induced vocabularies are written out below.
        save_vocab = True

    else:
        raise NotImplementedError(
            'Unsupported vocabulary source: %r' % (source,))

    if save_vocab:
        self.save_vocab(vocab_path)

    self.PAD_ID = self.src_vocab.PAD_ID