Exemple #1
0
    def __init__(self,
                 vocab_path,
                 data_raw_src=None,
                 data_raw_tgt=None,
                 lower=True):
        """
        Set up the vocabulary, loading it from disk when a saved file exists.

        If ``vocab_path`` exists, ``load_vocabulary`` is called on it.
        Otherwise both sides of the training data are required and
        ``create_vocabulary`` is called with them and ``vocab_path``.

        :param vocab_path: path to a saved vocabulary
        :param data_raw_src: training data, source side
        :param data_raw_tgt: training data, target side
        :param lower: stored as ``self.lower``
        """
        self.lower = lower
        self.id2tok, self.tok2id = {}, {}

        if check_file_exists(vocab_path):
            # A saved vocabulary is available: load it and stop here.
            self.load_vocabulary(vocab_path)
            return

        # No saved vocabulary: both data sides must be present to build one.
        missing_side = data_raw_src is None or data_raw_tgt is None
        assert not missing_side, \
            "You need to process train data ** before ** creating a vocabulary!"
        self.create_vocabulary(raw_data_src=data_raw_src,
                               raw_data_tgt=data_raw_tgt,
                               vocab_path=vocab_path)
    def setup(self, vocab_path, data, lower, source):
        """
        Initialize the vocabulary from a file or from data.

        The token list is obtained according to ``source``:
            - 'file': read from ``vocab_path`` via ``vocablist_from_file``
            - 'depgraphs': built from ``data`` via ``vocablist_from_depgraphs``
            - 'tokens': built from ``data`` via ``vocablist_from_tokens``

        A list built from data is written to ``vocab_path`` with
        ``save_to_file`` before the tokens are registered.

        :param vocab_path: path to read the vocabulary from ('file') or to
            save it to (other sources)
        :param data: data to build the vocabulary from; not used for 'file'
        :param lower: stored as ``self.lower``
        :param source: one of 'file', 'depgraphs', 'tokens'
        :raises AssertionError: if the vocabulary file is missing, no data is
            provided, or the token list is not a list
        :raises NotImplementedError: for any other ``source``
        :raises KeyError: if ``constants.UNK`` is not in ``self.tok2id`` after
            the tokens have been registered
        """

        self.lower = lower
        self.id2tok = {}
        self.tok2id = {}

        save_vocab = False

        # The checks below raise explicitly instead of using
        # ``assert cond, logger.error(...)``: that form produced an
        # AssertionError with a ``None`` message and disappeared under
        # ``python -O``. AssertionError is kept so callers see the same type.
        if source == 'file':
            if not check_file_exists(vocab_path):
                logger.error('Vocabulary file does not exist: %s', vocab_path)
                raise AssertionError(
                    'Vocabulary file does not exist: %s' % vocab_path)

            vocablist = self.vocablist_from_file(vocab_path)

        elif source in ('depgraphs', 'tokens'):
            if data is None:
                logger.error("Cannot create vocabulary: no data provided!")
                raise AssertionError(
                    "Cannot create vocabulary: no data provided!")

            if source == 'depgraphs':
                vocablist = self.vocablist_from_depgraphs(data)
            else:
                vocablist = self.vocablist_from_tokens(data)

            # Vocabularies induced from data are persisted for later runs.
            save_vocab = True

        else:
            raise NotImplementedError(
                'Unknown vocabulary source: %r' % (source,))

        # Internal sanity check on the helper's return value.
        assert isinstance(vocablist, list), \
            'Expected a list of tokens, got %s' % type(vocablist).__name__
        if save_vocab:
            self.save_to_file(vocablist, vocab_path)

        # NOTE(review): add_tok / add_id presumably fill id2tok / tok2id;
        # they are defined outside this block -- confirm.
        for idx, tok in enumerate(vocablist):
            self.add_tok(idx, tok)
            self.add_id(idx, tok)

        self.UNK_ID = self.tok2id[constants.UNK]

        logger.debug('Vocabulary size: %d', self.size)
    def setup(self, vocab_path, data, lower, source):
        """
        Initialize the vocabulary from a saved JSON file or from data.

        Depending on ``source``:
            - 'file': ``vocab_path`` is read with ``load_json`` and the result
              is passed to ``init_from_vocab_data``
            - 'lemma_form_feat': a token list is obtained from
              ``induce_from_data(data)``, every token is registered with
              ``add_tok`` / ``add_id``, and the vocabulary is saved to
              ``vocab_path`` with ``save_vocab``

        Afterwards the ids of the special symbols (UNK, BOS, EOS, PAD) and of
        the output classes are looked up in ``self.tok2id``.

        :param vocab_path: path to read the vocabulary from ('file') or to
            save it to ('lemma_form_feat')
        :param data: data to induce the vocabulary from; not used for 'file'
        :param lower: stored as ``self.lower``
        :param source: 'file' or 'lemma_form_feat'
        :raises AssertionError: if the vocabulary file is missing, no data is
            provided, or the induced vocabulary is not a list
        :raises NotImplementedError: for any other ``source``
        :raises KeyError: if a special symbol or an output class is missing
            from ``self.tok2id``
        """

        self.lower = lower
        self.id2tok = {}
        self.tok2id = {}
        save_vocab = False

        # The checks below raise explicitly instead of using
        # ``assert cond, logger.error(...)``: that form produced an
        # AssertionError with a ``None`` message and disappeared under
        # ``python -O``. AssertionError is kept so callers see the same type.
        if source == 'file':
            if not check_file_exists(vocab_path):
                logger.error('Vocabulary file does not exist: %s', vocab_path)
                raise AssertionError(
                    'Vocabulary file does not exist: %s' % vocab_path)

            logger.info('Loading vocabulary from <-- %s', vocab_path)
            vocab_data = load_json(vocab_path)
            self.init_from_vocab_data(vocab_data)

        elif source == 'lemma_form_feat':
            if data is None:
                logger.error("Cannot create vocabulary: no data provided!")
                raise AssertionError(
                    "Cannot create vocabulary: no data provided!")

            logger.info('Inducing vocabulary from forms and lemmas ...')
            vocablist = self.induce_from_data(data)
            # Internal sanity check on the helper's return value.
            assert isinstance(vocablist, list), \
                'Expected a list of tokens, got %s' % type(vocablist).__name__

            for idx, tok in enumerate(vocablist):
                self.add_tok(idx, tok)
                self.add_id(idx, tok)

            save_vocab = True

        else:
            raise NotImplementedError(
                'Unknown vocabulary source: %r' % (source,))

        if save_vocab:
            self.save_vocab(vocab_path)

        self.UNK_ID = self.tok2id[UNK]
        self.BOS_ID = self.tok2id[BOS]
        self.EOS_ID = self.tok2id[EOS]
        self.PAD_ID = self.tok2id[PAD]
        # NOTE(review): output_classes and featkey2idx are not assigned in
        # this method; presumably init_from_vocab_data / induce_from_data
        # populate them -- confirm.
        self.output_classes_ids = [self.tok2id[t] for t in self.output_classes]

        logger.debug('Vocabulary size: %d', self.size)
        logger.debug('Feature types: %s', self.featkey2idx.keys())
        logger.debug('Output classes and ids:')
        logger.debug(list(zip(self.output_classes, self.output_classes_ids)))
Exemple #4
0
    def __init__(self, vocab_path, data_raw=None, lower=True):
        """
        Set up the vocabulary, loading it from disk when a saved file exists.

        If ``vocab_path`` exists, ``load_vocabulary`` is called on it.
        Otherwise training data is required and ``create_vocabulary`` is
        called with it and ``vocab_path``.

        :param vocab_path: path to a saved vocabulary
        :param data_raw: training data
        :param lower: stored as ``self.lower``
        """
        self.lower = lower
        self.id2tok, self.tok2id = {}, {}

        if check_file_exists(vocab_path):
            # A saved vocabulary is available: load it and stop here.
            self.load_vocabulary(vocab_path)
            return

        # No saved vocabulary: training data must be present to build one.
        has_data = data_raw is not None
        assert has_data, "You need to process train data ** before ** creating a vocabulary!"
        self.create_vocabulary(raw_data=data_raw, vocab_path=vocab_path)
Exemple #5
0
    def load_vocabulary(self, vocabulary_path):
        """
        Fill ``id2tok`` / ``tok2id`` from a vocabulary text file.

        The file is read line by line; each stripped line is one token and
        its zero-based line position is its id.

        :param vocabulary_path: path to the vocabulary file
        :raises ValueError: if ``check_file_exists`` reports the file missing
        """

        # NOTE(review): the path is passed wrapped in a list here; confirm
        # that check_file_exists accepts a list of paths.
        if not check_file_exists([vocabulary_path]):
            raise ValueError('Vocabulary file not found: %s' % vocabulary_path)

        logger.debug('Loading vocabulary from %s' % vocabulary_path)

        with open(vocabulary_path, 'r') as vocab_file:
            tokens = [raw_line.strip() for raw_line in vocab_file]

        # Mappings are filled only after the whole file has been read.
        for position, token in enumerate(tokens):
            self.id2tok[position] = token
            self.tok2id[token] = position
    def vocablist_from_file(self, vocabulary_path):
        """
        Read a token list from a text file, one stripped line per token.

        If the first entry of ``constants.SYN_START_VOCAB`` (lowercased when
        ``self.lower`` is set) is not among the tokens read, a deep copy of
        ``constants.SYN_START_VOCAB`` is prepended to the list. The prepended
        entries themselves are not lowercased here.

        :param vocabulary_path: path to the vocabulary text file
        :return: list of tokens
        """

        assert check_file_exists(vocabulary_path), ('Vocabulary file not found: %s' % vocabulary_path)
        logger.debug('Loading vocabulary from file <-- %s' % vocabulary_path)

        with open(vocabulary_path, 'r') as vocab_file:
            tokens = [raw_line.strip() for raw_line in vocab_file]

        # Probe for the first start symbol to decide whether the start
        # symbols are already part of the file.
        if self.lower:
            probe = constants.SYN_START_VOCAB[0].lower()
        else:
            probe = constants.SYN_START_VOCAB[0]

        if probe in tokens:
            return tokens

        start_symbols = copy.deepcopy(constants.SYN_START_VOCAB)
        return start_symbols + tokens
Exemple #7
0
    def setup(self, vocab_path, data, lower, source):
        """
        Initialize the source-side and target-side vocabularies.

        Fresh ``SrcSideVocab`` / ``TgtSideVocab`` objects are created, then
        depending on ``source``:
            - 'file': ``vocab_path`` is read with ``load_json``; its 'src' and
              'tgt' entries are passed to the respective ``setup`` methods
            - 'lemma_form_feat': ``data`` is unpacked into lemmas, forms and
              a feature dictionary; the source vocabulary is set up from
              (lemmas, features), the target vocabulary from the forms, and
              the result is saved to ``vocab_path`` with ``save_vocab``

        :param vocab_path: path to read the vocabularies from ('file') or to
            save them to ('lemma_form_feat')
        :param data: 3-item sequence (lemmas, forms, features) for
            'lemma_form_feat'; not used for 'file'
        :param lower: forwarded to both side vocabularies' ``setup``
        :param source: 'file' or 'lemma_form_feat'
        :raises AssertionError: if the vocabulary file is missing or no data
            is provided
        :raises NotImplementedError: for any other ``source``
        """

        self.src_vocab = SrcSideVocab()
        self.tgt_vocab = TgtSideVocab()
        save_vocab = False

        # The checks below raise explicitly instead of using
        # ``assert cond, logger.error(...)``: that form produced an
        # AssertionError with a ``None`` message and disappeared under
        # ``python -O``. AssertionError is kept so callers see the same type.
        if source == 'file':
            if not check_file_exists(vocab_path):
                logger.error('Vocabulary file does not exist: %s', vocab_path)
                raise AssertionError(
                    'Vocabulary file does not exist: %s' % vocab_path)

            # Informational message: logged at info level, not error.
            logger.info('Loading vocabulary from <-- %s', vocab_path)
            vocab_dicts = load_json(vocab_path)

            src_dict = vocab_dicts['src']
            tgt_dict = vocab_dicts['tgt']

            self.src_vocab.setup(src_dict, data=None, lower=lower)
            self.tgt_vocab.setup(tgt_dict, data=None, lower=lower)

        elif source == 'lemma_form_feat':
            if data is None:
                logger.error("Cannot create vocabulary: no data provided!")
                raise AssertionError(
                    "Cannot create vocabulary: no data provided!")

            logger.info('Inducing src and tgt vocabularies from train data')

            # retrieve lemmas, forms and feature dictionaries
            train_lemmas_l, train_forms_l, train_feat_d = data
            self.src_vocab.setup(vocab_dict=None, data=(train_lemmas_l, train_feat_d), lower=lower)
            self.tgt_vocab.setup(vocab_dict=None, data=train_forms_l, lower=lower)
            save_vocab = True

        else:
            raise NotImplementedError(
                'Unknown vocabulary source: %r' % (source,))

        if save_vocab:
            self.save_vocab(vocab_path)

        # Expose the source-side padding id on the composite vocabulary.
        self.PAD_ID = self.src_vocab.PAD_ID