    def __init__(self):
        ''' load data '''
        self.domains_train = pickle.load(
            open(OUTPUT_DIR + 'training_domains_%s.list' % DATASET, 'rb'))
        self.domains_train = [
            d for cat_domains in self.domains_train for d in cat_domains
        ]
        self.domains_val = pickle.load(
            open(OUTPUT_DIR + 'validation_domains_%s.list' % DATASET, 'rb'))
        self.domains_val = [
            d for cat_domains in self.domains_val for d in cat_domains
        ]
        self.domains_test = pickle.load(
            open(OUTPUT_DIR + 'test_domains_%s.list' % DATASET, 'rb'))
        self.domains_test = [
            d for cat_domains in self.domains_test for d in cat_domains
        ]

        self.charngram2index = defaultdict(int)  # index starts from 1. 0 is for padding
        for domains in (self.domains_train, self.domains_val,
                        self.domains_test):
            for domain in domains:
                for word in domain['segmented_domain']:

                    for ngram in compute_ngrams(word, *char_ngram_sizes):
                        if ngram in self.charngram2index:
                            continue
                        self.charngram2index[ngram] = len(
                            self.charngram2index) + 1
                    '''
                    word = ''.join(['<', word, '>'])
                    for size in char_ngram_sizes:
                        for i in range(max(1, len(word) - size + 1)):  # some segments' lengths are less than char_ngram
                            if word[i : i + size] in self.charngram2index:
                                continue
                            self.charngram2index[word[i : i + size]] = len(self.charngram2index) + 1
                    # the word itself is also added
                    if word not in self.charngram2index:
                        self.charngram2index[word] = len(self.charngram2index) + 1
                    '''
        ''' load params '''
        self.params = json.load(open(OUTPUT_DIR + 'params_%s.json' % DATASET))
        self.params['max_segment_char_len'] += 2  # because '<' and '>' are appended to each word
        # the word itself also counts as one n-gram, hence the "+ 1" in the commented-out formula below
        self.max_num_charngrams = len(
            compute_ngrams(
                ''.join(['a'] * self.params['max_segment_char_len']),
                *char_ngram_sizes))
        '''
        self.max_num_charngrams = sum(self.params['max_segment_char_len'] - size + 1 for size in char_ngram_sizes) + 1
        '''
        self.compute_class_weights()
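
For reference, a minimal sketch of the behaviour assumed of compute_ngrams here (gensim-style character n-grams, with '<' and '>' added as word boundaries, which is also why max_segment_char_len is increased by 2 above); the actual implementation is whatever the script imports:

def compute_ngrams_sketch(word, min_n, max_n):
    # Wrap the word in boundary markers and emit every substring of length min_n..max_n.
    extended = '<' + word + '>'
    ngrams = []
    for n in range(min_n, min(len(extended), max_n) + 1):
        for i in range(len(extended) - n + 1):
            ngrams.append(extended[i:i + n])
    return ngrams

# e.g. compute_ngrams_sketch('web', 3, 4) -> ['<we', 'web', 'eb>', '<web', 'web>']
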
Code Example #2
    def next_batch(self, domains, batch_size=batch_size):
        X_batch_embed = []
        X_batch_mask = []
        X_batch_suf = []
        domain_actual_lens = []
        sample_weights = []
        y_batch = []
        shuffle(domains)
        start_index = 0
        while start_index < len(domains):
            for i in range(start_index, min(len(domains), start_index + batch_size)):
                ''' get char n-gram indices '''
                embeds = []  # [[1,2,5,0,0], [35,3,7,8,4], ...]
                for word in domains[i]['segmented_domain']:
                    embeds.append([self.charngram2index[ngram] for ngram in compute_ngrams(word, *char_ngram_sizes)])

                    '''    
                    word = ''.join(['<', word, '>'])
                    for size in char_ngram_sizes:
                        # the word itself is also added
                        embeds.append([self.charngram2index[word]] + [self.charngram2index[word[start : start + size]] for start in range(max(1, len(word) - size + 1))])
                    '''
                domain_actual_lens.append(len(embeds))

                ''' padding '''
                # pad char-ngram level
                embeds = [indices + [0] * (self.max_num_charngrams - len(indices)) for indices in embeds]
                embeds += [[0] * self.max_num_charngrams for _ in range(self.params['max_domain_segments_len'] - len(embeds))]
                # X_batch_embed.append(tf.pad(embeds, paddings=[[0, n_extra_padding],[0,0]], mode="CONSTANT"))
                X_batch_embed.append(embeds)
                ''' mask '''
                X_batch_mask.append((np.array(embeds) != 0).astype(float))

                ''' top-level domain (suffix) '''
                one_hot_suf = np.zeros(self.params['num_suffix'])
                one_hot_suf[domains[i]['suffix_indices']] = 1.0 / len(domains[i]['suffix_indices'])
                X_batch_suf.append(one_hot_suf)

                ''' target category '''
                sample_weights.append(self.class_weights[categories[domains[i]['target']]])
                y_batch.append(domains[i]['target'])

            yield np.array(X_batch_embed), np.array(X_batch_mask), np.array(domain_actual_lens), np.array(X_batch_suf), \
                  np.array(sample_weights), np.array(y_batch)

            # print(sample_weights)

            X_batch_embed.clear()
            X_batch_mask.clear()
            domain_actual_lens.clear()
            X_batch_suf.clear()
            sample_weights.clear()
            y_batch.clear()
            start_index += batch_size
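
A hypothetical consumption pattern for the generator above ('loader' stands for an instance of the surrounding, unnamed class; the batch size is arbitrary):

# Hypothetical usage sketch; 'loader' is an instance of the class that owns next_batch.
for X_embed, X_mask, lens, X_suf, weights, y in loader.next_batch(loader.domains_train, batch_size=32):
    # X_embed: (batch, max_domain_segments_len, max_num_charngrams) int indices, 0 = padding
    # X_mask:  same shape, 1.0 wherever an index is non-zero
    # lens:    number of segments per domain before padding
    # weights: per-sample class weights; y: integer category targets
    pass  # feed the arrays into the model's training step here
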
Code Example #3
def ft_embed(word):
    # In-vocabulary words: return the stored word vector directly.
    if word in en_model.wv.vocab:
        return super(FastTextKeyedVectors, en_model.wv).word_vec(word)

    # Out-of-vocabulary words: average the vectors of the word's known char n-grams.
    word_vec = np.zeros(en_model.wv.syn0_ngrams.shape[1], dtype=np.float32)
    ngrams = compute_ngrams(word, 3, 6)
    ngrams = [ng for ng in ngrams if ng in en_model.wv.ngrams]
    ngram_weights = en_model.wv.syn0_ngrams
    for ngram in ngrams:
        word_vec += ngram_weights[en_model.wv.ngrams[ngram]]
    if word_vec.any():
        return word_vec / len(ngrams)
    # No known n-grams for this word: fail loudly instead of silently returning None.
    raise KeyError("all ngrams for word '%s' absent from model" % word)
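
A brief usage note, assuming en_model is a FastText model loaded elsewhere in the script (e.g. via gensim's load_fasttext_format); the example words are arbitrary:

# Assuming en_model has already been loaded, e.g. en_model = FastText.load_fasttext_format('wiki.en'):
vec_oov = ft_embed('cloudserver')   # OOV word: mean of its known char n-gram vectors
vec_iv = ft_embed('server')         # in-vocabulary word: the stored word vector
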
Code Example #4
    def __init__(self):
        ''' load data '''
        self.domains_train = pickle.load(open(OUTPUT_DIR + 'training_domains_%s.list' % DATASET, 'rb'))
        self.domains_train = [d for cat_domains in self.domains_train for d in cat_domains]
        self.domains_val = pickle.load(open(OUTPUT_DIR + 'validation_domains_%s.list' % DATASET, 'rb'))
        self.domains_val = [d for cat_domains in self.domains_val for d in cat_domains]
        self.domains_test = pickle.load(open(OUTPUT_DIR + 'test_domains_%s.list' % DATASET, 'rb'))
        self.domains_test = [d for cat_domains in self.domains_test for d in cat_domains]


        self.charngram2index = defaultdict(int)  # index starts from 1. 0 is for padding
        max_domain_ngram = 0
        max_segment_ngram = 0
        for domains in (self.domains_train, self.domains_val, self.domains_test):
            for domain in domains:
                n_ngram_d = 0
                for word in domain['segmented_domain']:
                    n_ngram_s = 0
                    for ngram in compute_ngrams(word, *char_ngram_sizes):
                        n_ngram_d += 1
                        n_ngram_s += 1
                        if ngram in self.charngram2index:
                            continue
                        self.charngram2index[ngram] = len(self.charngram2index) + 1
                    max_segment_ngram = max(max_segment_ngram, n_ngram_s)
                max_domain_ngram = max(max_domain_ngram, n_ngram_d)

        self.inital_ngram_embed = np.random.uniform(low=-1.0, high=1.0, size=(max(self.charngram2index.values()) + 1, embed_dimen)).astype('float32')
        if FT_INITIAL:
            for ngram, index in self.charngram2index.items():
                if ngram in en_model.wv.vocab:
                    self.inital_ngram_embed[index, :] = super(FastTextKeyedVectors, en_model.wv).word_vec(ngram, False)
                elif ngram in en_model.wv.ngrams:
                    self.inital_ngram_embed[index, :] = en_model.wv.syn0_ngrams[en_model.wv.ngrams[ngram]]

        print('self.inital_ngram_embed.shape =', self.inital_ngram_embed.shape)

        ''' load params '''
        self.params = json.load(open(OUTPUT_DIR + 'params_%s.json' % DATASET))
        # max_num_charngrams is now measured from the data: the longest n-gram list seen per segment or per whole domain
        if REDUCE_TO_WORD_LEVEL:
            self.max_num_charngrams = max_segment_ngram
        else:
            self.max_num_charngrams = max_domain_ngram
        print('self.max_num_charngrams =', self.max_num_charngrams)

        self.compute_class_weights()
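
A minimal sketch of how the pre-initialized table could be consumed downstream, assuming a TensorFlow 1.x graph (suggested only by the commented-out tf.pad calls elsewhere; all names and shapes below are placeholders):

import numpy as np
import tensorflow as tf

# Stand-in for self.inital_ngram_embed: a (vocab_size, embed_dimen) float32 table.
inital_ngram_embed = np.random.uniform(-1.0, 1.0, size=(50000, 100)).astype('float32')
# Trainable embedding matrix, looked up with the char n-gram indices built by next_batch.
ngram_embeddings = tf.Variable(inital_ngram_embed, name='ngram_embeddings')
x_charidx = tf.placeholder(tf.int32, shape=[None, None])           # (batch, max_num_charngrams)
looked_up = tf.nn.embedding_lookup(ngram_embeddings, x_charidx)    # (batch, max_num_charngrams, 100)
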
Code Example #5
    def init_ngrams(self, update=False):
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size),
                                       dtype=REAL)
            self.syn0_vocab_lockf = ones(
                (len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size),
                                        dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size),
                                          dtype=REAL)

            all_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(
                    w, self.min_n, self.max_n)
                all_ngrams += self.wv.ngrams_word[w]

            all_ngrams = list(set(all_ngrams))
            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for i, ngram in enumerate(all_ngrams):
                ngram_hash = ft_hash(ngram)
                if ngram_hash in self.wv.hash2index:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                else:
                    ngram_indices.append(ngram_hash % self.bucket)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1

            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices,
                                                           axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices,
                                                                 axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(
                    w, self.min_n, self.max_n)
                new_ngrams += [
                    ng for ng in self.wv.ngrams_word[w]
                    if ng not in self.wv.ngrams
                ]

            new_ngrams = list(set(new_ngrams))
            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for i, ngram in enumerate(new_ngrams):
                ngram_hash = ft_hash(ngram)
                if ngram_hash not in self.wv.hash2index:
                    self.wv.hash2index[
                        ngram_hash] = new_hash_count + self.old_hash2index_len
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
                else:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))
            new_vocab_lockf_rows = ones(
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size),
                dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.hash2index) - self.old_hash2index_len,
                 self.vector_size))
            new_ngram_lockf_rows = ones(
                (len(self.wv.hash2index) - self.old_hash2index_len,
                 self.vector_size),
                dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack(
                [self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack(
                [self.syn0_ngrams_lockf, new_ngram_lockf_rows])
Code Example #6
    def init_ngrams(self, update=False):
        """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText.

        Parameters
        ----------
        update : bool
            If True, vectors for the new vocabulary words and their new ngrams are initialized
            with a random uniform distribution and added to the existing vocabulary and ngram vectors.

        """
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size),
                                       dtype=REAL)
            self.syn0_vocab_lockf = ones(
                (len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size),
                                        dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size),
                                          dtype=REAL)

            all_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(
                    w, self.min_n, self.max_n)
                all_ngrams += self.wv.ngrams_word[w]

            all_ngrams = list(set(all_ngrams))
            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for i, ngram in enumerate(all_ngrams):
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash in self.wv.hash2index:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                else:
                    ngram_indices.append(ngram_hash % self.bucket)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1

            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices,
                                                           axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices,
                                                                 axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(
                    w, self.min_n, self.max_n)
                new_ngrams += [
                    ng for ng in self.wv.ngrams_word[w]
                    if ng not in self.wv.ngrams
                ]

            new_ngrams = list(set(new_ngrams))
            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for i, ngram in enumerate(new_ngrams):
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    self.wv.hash2index[
                        ngram_hash] = new_hash_count + self.old_hash2index_len
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
                else:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.vocab) - self.old_vocab_len,
                 self.vector_size)).astype(REAL)
            new_vocab_lockf_rows = ones(
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size),
                dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.hash2index) - self.old_hash2index_len,
                 self.vector_size)).astype(REAL)
            new_ngram_lockf_rows = ones(
                (len(self.wv.hash2index) - self.old_hash2index_len,
                 self.vector_size),
                dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack(
                [self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack(
                [self.syn0_ngrams_lockf, new_ngram_lockf_rows])
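
To make the bucket/hash resolution above concrete, a small sketch of how a single n-gram maps to a row of syn0_ngrams after init ('model' stands for a trained instance of this class; ft_hash is the same helper used above):

# Hypothetical lookup mirroring the mapping built above: hash the n-gram, fold it into a
# bucket slot, remap it through hash2index, and take that row of syn0_ngrams as its vector.
ngram = '<exa'
row = model.wv.hash2index[ft_hash(ngram) % model.bucket]
vector = model.wv.syn0_ngrams[row]
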
Code Example #7
    def next_batch(self, domains, batch_size=batch_size):
        X_batch_charidx = []
        X_batch_mask = []
        X_batch_wordembed = []
        X_batch_suf = []
        domain_actual_lens = []
        sample_weights = []
        y_batch = []
        shuffle(domains)
        start_index = 0
        while start_index < len(domains):
            for i in range(start_index, min(len(domains), start_index + batch_size)):

                ''' get char n-gram indices '''
                charidx = []  # [[1,2,5,0,0], [35,3,7,8,4], ...] or [1,2,5,35,3,7,8,4, ...]
                if REDUCE_TO_WORD_LEVEL:
                    for word in domains[i]['segmented_domain']:
                        if FT_INITIAL:
                            charidx.append([self.charngram2index[ngram]
                                 for ngram in compute_ngrams(word, *char_ngram_sizes)
                                 if ngram in self.charngram2index])
                        else:
                            charidx.append([self.charngram2index[ngram]
                                for ngram in compute_ngrams(word, *char_ngram_sizes)])
                else:
                    for word in domains[i]['segmented_domain']:
                        if FT_INITIAL:
                            charidx.extend([self.charngram2index[ngram]
                                           for ngram in compute_ngrams(word, *char_ngram_sizes)
                                           if ngram in self.charngram2index])
                        else:
                            charidx.extend([self.charngram2index[ngram]
                                           for ngram in compute_ngrams(word, *char_ngram_sizes)])

                if not charidx or not any(charidx):
                    domains[i]['skipped'] = True
                    continue
                domains[i]['skipped'] = False
                domain_actual_lens.append(len(charidx))


                ''' get and pad word embedding indices '''
                word_embeds = [en_model[w].tolist() for w in domains[i]['segmented_domain'] if w in en_model]
                # if not word_embeds:  # skip if none of this domain's segments are recognized by FastText
                #     continue
                n_extra_padding = self.params['max_domain_segments_len'] - len(word_embeds)
                word_embeds += [[0] * embed_dimen for _ in range(n_extra_padding)]
                # X_batch_embed.append(tf.pad(embeds, paddings=[[0, n_extra_padding],[0,0]], mode="CONSTANT"))
                X_batch_wordembed.append(word_embeds)



                ''' padding '''
                # pad char-ngram level
                if REDUCE_TO_WORD_LEVEL:
                    charidx = [indices + [0] * (self.max_num_charngrams - len(indices)) for indices in charidx]  # pad char-ngram level
                    charidx += [[0] * self.max_num_charngrams for _ in
                               range(self.params['max_domain_segments_len'] - len(charidx))]  # pad segment level
                else:
                    charidx += [0] * (self.max_num_charngrams - len(charidx))

                X_batch_charidx.append(charidx)


                ''' mask '''
                X_batch_mask.append((np.array(charidx) != 0).astype(float))

                ''' top-level domain (suffix) '''
                one_hot_suf = np.zeros(self.params['num_suffix'])
                one_hot_suf[domains[i]['suffix_indices']] = 1.0 / len(domains[i]['suffix_indices'])
                X_batch_suf.append(one_hot_suf)

                ''' target category '''
                sample_weights.append(self.class_weights[categories[domains[i]['target']]])
                y_batch.append(domains[i]['target'])

            yield np.array(X_batch_charidx), np.array(X_batch_mask), np.array(X_batch_wordembed), np.array(domain_actual_lens), np.array(X_batch_suf), \
                  np.array(sample_weights), np.array(y_batch)

            # print(sample_weights)

            X_batch_charidx.clear()
            X_batch_mask.clear()
            X_batch_wordembed.clear()
            domain_actual_lens.clear()
            X_batch_suf.clear()
            sample_weights.clear()
            y_batch.clear()
            start_index += batch_size
Code Example #8
File: fasttext.py, Project: jMonteroMunoz/gensim
    def init_ngrams(self, update=False):
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
            self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

            all_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                all_ngrams += self.wv.ngrams_word[w]

            all_ngrams = list(set(all_ngrams))
            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for i, ngram in enumerate(all_ngrams):
                ngram_hash = ft_hash(ngram)
                if ngram_hash in self.wv.hash2index:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                else:
                    ngram_indices.append(ngram_hash % self.bucket)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1

            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

            new_ngrams = list(set(new_ngrams))
            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for i, ngram in enumerate(new_ngrams):
                ngram_hash = ft_hash(ngram)
                if ngram_hash not in self.wv.hash2index:
                    self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
                else:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))
            new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size))
            new_ngram_lockf_rows = ones((len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
Code Example #9
    def next_batch(self, domains, batch_size=batch_size):
        X_batch_embed = []
        X_batch_mask = []
        X_batch_suf = []
        domain_actual_lens = []
        sample_weights = []
        y_batch = []
        shuffle(domains)
        start_index = 0
        while start_index < len(domains):
            for i in range(start_index, min(len(domains), start_index + batch_size)):
                ''' get char n-gram indices '''
                embeds = []  # [[1,2,5,0,0], [35,3,7,8,4], ...] or [1,2,5,35,3,7,8,4, ...]
                if REDUCE_TO_WORD_LEVEL:
                    for word in domains[i]['segmented_domain']:
                        if FT_INITIAL:
                            embeds.append([self.charngram2index[ngram]
                                 for ngram in compute_ngrams(word, *char_ngram_sizes)
                                 if ngram in self.charngram2index])
                        else:
                            embeds.append([self.charngram2index[ngram]
                                for ngram in compute_ngrams(word, *char_ngram_sizes)])
                else:
                    for word in domains[i]['segmented_domain']:
                        if FT_INITIAL:
                            embeds.extend([self.charngram2index[ngram]
                                           for ngram in compute_ngrams(word, *char_ngram_sizes)
                                           if ngram in self.charngram2index])
                        else:
                            embeds.extend([self.charngram2index[ngram]
                                           for ngram in compute_ngrams(word, *char_ngram_sizes)])

                if not embeds or not any(embeds):
                    domains[i]['skipped'] = True
                    continue
                domains[i]['skipped'] = False
                domain_actual_lens.append(len(embeds))

                ''' padding '''
                # pad char-ngram level
                if REDUCE_TO_WORD_LEVEL:
                    embeds = [indices + [0] * (self.max_num_charngrams - len(indices)) for indices in embeds]  # pad char-ngram level
                    embeds += [[0] * self.max_num_charngrams for _ in
                               range(self.params['max_domain_segments_len'] - len(embeds))]  # pad segment level
                else:
                    embeds += [0] * (self.max_num_charngrams - len(embeds))

                X_batch_embed.append(embeds)
                ''' mask '''
                X_batch_mask.append((np.array(embeds) != 0).astype(float))

                ''' top-level domain (suffix) '''
                one_hot_suf = np.zeros(self.params['num_suffix'])
                one_hot_suf[domains[i]['suffix_indices']] = 1.0 / len(domains[i]['suffix_indices'])
                X_batch_suf.append(one_hot_suf)

                ''' target category '''
                sample_weights.append(self.class_weights[categories[domains[i]['target']]])
                y_batch.append(domains[i]['target'])

            yield np.array(X_batch_embed), np.array(X_batch_mask), np.array(domain_actual_lens), np.array(X_batch_suf), \
                  np.array(sample_weights), np.array(y_batch)

            # print(sample_weights)

            X_batch_embed.clear()
            X_batch_mask.clear()
            domain_actual_lens.clear()
            X_batch_suf.clear()
            sample_weights.clear()
            y_batch.clear()
            start_index += batch_size