def __init__(self):
    ''' load data '''
    self.domains_train = pickle.load(
        open(OUTPUT_DIR + 'training_domains_%s.list' % DATASET, 'rb'))
    self.domains_train = [d for cat_domains in self.domains_train for d in cat_domains]
    self.domains_val = pickle.load(
        open(OUTPUT_DIR + 'validation_domains_%s.list' % DATASET, 'rb'))
    self.domains_val = [d for cat_domains in self.domains_val for d in cat_domains]
    self.domains_test = pickle.load(
        open(OUTPUT_DIR + 'test_domains_%s.list' % DATASET, 'rb'))
    self.domains_test = [d for cat_domains in self.domains_test for d in cat_domains]

    self.charngram2index = defaultdict(int)  # indices start from 1; 0 is reserved for padding
    for domains in (self.domains_train, self.domains_val, self.domains_test):
        for domain in domains:
            for word in domain['segmented_domain']:
                for ngram in compute_ngrams(word, *char_ngram_sizes):
                    if ngram in self.charngram2index:
                        continue
                    self.charngram2index[ngram] = len(self.charngram2index) + 1
    '''
    word = ''.join(['<', word, '>'])
    for size in char_ngram_sizes:
        for i in range(max(1, len(word) - size + 1)):  # some segments' lengths are less than char_ngram
            if word[i : i + size] in self.charngram2index:
                continue
            self.charngram2index[word[i : i + size]] = len(self.charngram2index) + 1
    # the word itself is also added
    if word not in self.charngram2index:
        self.charngram2index[word] = len(self.charngram2index) + 1
    '''

    ''' load params '''
    self.params = json.load(open(OUTPUT_DIR + 'params_%s.json' % DATASET))
    self.params['max_segment_char_len'] += 2  # because '<' and '>' are appended to each word
    # the word itself is also added, thus: sum(...) + 1
    self.max_num_charngrams = len(
        compute_ngrams(''.join(['a'] * self.params['max_segment_char_len']),
                       *char_ngram_sizes))
    '''
    self.max_num_charngrams = sum(self.params['max_segment_char_len'] - size + 1
                                  for size in char_ngram_sizes) + 1
    '''
    self.compute_class_weights()
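# For reference, compute_ngrams is gensim's helper that wraps a word in '<'/'>' markers before
# extracting character n-grams, so max_num_charngrams above is simply the n-gram count of the
# longest possible (marker-padded) segment. A small illustrative reimplementation, assuming
# char_ngram_sizes is something like (3, 6) (an assumption, not necessarily this repo's config):
def _compute_ngrams_demo(word, min_n, max_n):
    # Illustrative stand-in for gensim's compute_ngrams, used only for this example.
    extended = '<' + word + '>'
    ngrams = []
    for n in range(min_n, min(len(extended), max_n) + 1):
        for i in range(len(extended) - n + 1):
            ngrams.append(extended[i:i + n])
    return ngrams

print(_compute_ngrams_demo('news', 3, 6))
# ['<ne', 'new', 'ews', 'ws>', '<new', 'news', 'ews>', '<news', 'news>', '<news>']
print(len(_compute_ngrams_demo('a' * 6, 3, 6)))  # 18 n-grams for a 6-character segment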
def next_batch(self, domains, batch_size=batch_size):
    X_batch_embed = []
    X_batch_mask = []
    X_batch_suf = []
    domain_actual_lens = []
    sample_weights = []
    y_batch = []
    shuffle(domains)
    start_index = 0
    while start_index < len(domains):
        for i in range(start_index, min(len(domains), start_index + batch_size)):
            ''' get char n-gram indices '''
            embeds = []  # e.g. [[1, 2, 5, 0, 0], [35, 3, 7, 8, 4], ...]
            for word in domains[i]['segmented_domain']:
                embeds.append([self.charngram2index[ngram]
                               for ngram in compute_ngrams(word, *char_ngram_sizes)])
            '''
            word = ''.join(['<', word, '>'])
            for size in char_ngram_sizes:
                # the word itself is also added
                embeds.append([self.charngram2index[word]] +
                              [self.charngram2index[word[start : start + size]]
                               for start in range(max(1, len(word) - size + 1))])
            '''
            domain_actual_lens.append(len(embeds))

            ''' padding '''
            # pad char-ngram level
            embeds = [indices + [0] * (self.max_num_charngrams - len(indices))
                      for indices in embeds]
            embeds += [[0] * self.max_num_charngrams
                       for _ in range(self.params['max_domain_segments_len'] - len(embeds))]
            # X_batch_embed.append(tf.pad(embeds, paddings=[[0, n_extra_padding], [0, 0]], mode="CONSTANT"))
            X_batch_embed.append(embeds)

            ''' mask '''
            X_batch_mask.append((np.array(embeds) != 0).astype(float))

            ''' top-level domain (suffix) '''
            one_hot_suf = np.zeros(self.params['num_suffix'])
            one_hot_suf[domains[i]['suffix_indices']] = 1.0 / len(domains[i]['suffix_indices'])
            X_batch_suf.append(one_hot_suf)

            ''' target category '''
            sample_weights.append(self.class_weights[categories[domains[i]['target']]])
            y_batch.append(domains[i]['target'])

        yield np.array(X_batch_embed), np.array(X_batch_mask), np.array(domain_actual_lens), \
              np.array(X_batch_suf), np.array(sample_weights), np.array(y_batch)
        # print(sample_weights)
        X_batch_embed.clear()
        X_batch_mask.clear()
        domain_actual_lens.clear()
        X_batch_suf.clear()
        sample_weights.clear()
        y_batch.clear()
        start_index += batch_size
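# next_batch yields one padded batch at a time and reuses the same Python lists between yields,
# so a caller should consume each tuple before advancing the generator. A hypothetical smoke
# test (the class name DataSet is assumed here; substitute the real class name):
dataset = DataSet()
X_embed, X_mask, lens, X_suf, weights, y = next(dataset.next_batch(dataset.domains_val, batch_size=4))
assert X_embed.shape == X_mask.shape            # (batch, max_domain_segments_len, max_num_charngrams)
assert X_embed.shape[0] == len(lens) == len(y)  # one length/target per domain in the batch
print(X_embed.shape, X_suf.shape)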
def ft_embed(word):
    # In-vocabulary words: return the word vector FastText learned directly.
    if word in en_model.wv.vocab:
        return super(FastTextKeyedVectors, en_model.wv).word_vec(word)
    # Out-of-vocabulary words: average the vectors of the word's known char n-grams,
    # mirroring how FastText composes OOV vectors.
    word_vec = np.zeros(en_model.wv.syn0_ngrams.shape[1], dtype=np.float32)
    ngrams = compute_ngrams(word, 3, 6)
    ngrams = [ng for ng in ngrams if ng in en_model.wv.ngrams]
    ngram_weights = en_model.wv.syn0_ngrams
    for ngram in ngrams:
        word_vec += ngram_weights[en_model.wv.ngrams[ngram]]
    if word_vec.any():
        return word_vec / len(ngrams)
    # Implicitly returns None when none of the word's char n-grams is known to the model.
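# Hypothetical usage of ft_embed (the words below are arbitrary examples, not from this repo):
# in-vocabulary words come back as the stored FastText vector, OOV words as the mean of their
# known char n-gram vectors, and words with no known n-grams come back as None.
for w in ('google', 'zzqxv'):
    vec = ft_embed(w)
    if vec is None:
        print(w, '-> none of its char n-grams are known to the model')
    else:
        print(w, '->', vec.shape, vec.dtype)  # (vector_size,), float32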
def __init__(self):
    ''' load data '''
    self.domains_train = pickle.load(open(OUTPUT_DIR + 'training_domains_%s.list' % DATASET, 'rb'))
    self.domains_train = [d for cat_domains in self.domains_train for d in cat_domains]
    self.domains_val = pickle.load(open(OUTPUT_DIR + 'validation_domains_%s.list' % DATASET, 'rb'))
    self.domains_val = [d for cat_domains in self.domains_val for d in cat_domains]
    self.domains_test = pickle.load(open(OUTPUT_DIR + 'test_domains_%s.list' % DATASET, 'rb'))
    self.domains_test = [d for cat_domains in self.domains_test for d in cat_domains]

    self.charngram2index = defaultdict(int)  # indices start from 1; 0 is reserved for padding
    max_domain_ngram = 0
    max_segment_ngram = 0
    for domains in (self.domains_train, self.domains_val, self.domains_test):
        for domain in domains:
            n_ngram_d = 0
            for word in domain['segmented_domain']:
                n_ngram_s = 0
                for ngram in compute_ngrams(word, *char_ngram_sizes):
                    n_ngram_d += 1
                    n_ngram_s += 1
                    if ngram in self.charngram2index:
                        continue
                    self.charngram2index[ngram] = len(self.charngram2index) + 1
                max_segment_ngram = max(max_segment_ngram, n_ngram_s)
            max_domain_ngram = max(max_domain_ngram, n_ngram_d)

    self.inital_ngram_embed = np.random.uniform(
        low=-1.0, high=1.0,
        size=(max(self.charngram2index.values()) + 1, embed_dimen)).astype('float32')
    if FT_INITIAL:
        # Overwrite the random rows with FastText vectors wherever the n-gram is known to the model.
        for ngram, index in self.charngram2index.items():
            if ngram in en_model.wv.vocab:
                self.inital_ngram_embed[index, :] = super(FastTextKeyedVectors, en_model.wv).word_vec(ngram, False)
            elif ngram in en_model.wv.ngrams:
                self.inital_ngram_embed[index, :] = en_model.wv.syn0_ngrams[en_model.wv.ngrams[ngram]]
    print('self.inital_ngram_embed.shape =', self.inital_ngram_embed.shape)

    ''' load params '''
    self.params = json.load(open(OUTPUT_DIR + 'params_%s.json' % DATASET))
    if REDUCE_TO_WORD_LEVEL:
        self.max_num_charngrams = max_segment_ngram
    else:
        self.max_num_charngrams = max_domain_ngram
    print('self.max_num_charngrams =', self.max_num_charngrams)
    self.compute_class_weights()
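# The FastText-seeded matrix above is presumably handed to the TensorFlow graph as the initial
# value of the n-gram embedding table, with index 0 acting as the padding/unknown id. A minimal
# sketch under that assumption (TensorFlow 1.x API; 'dataset' stands for an instance of the class
# above, and the variable/placeholder names are hypothetical):
import tensorflow as tf

ngram_embed_table = tf.get_variable(
    'ngram_embeddings',
    initializer=dataset.inital_ngram_embed,  # (num_ngram_ids, embed_dimen)
    trainable=True)
x_charidx = tf.placeholder(tf.int32, shape=[None, None, None])        # padded char n-gram index ids
ngram_vectors = tf.nn.embedding_lookup(ngram_embed_table, x_charidx)  # (..., embed_dimen)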
def init_ngrams(self, update=False):
    if not update:
        self.wv.ngrams = {}
        self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
        self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)
        self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
        self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

        all_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            all_ngrams += self.wv.ngrams_word[w]

        all_ngrams = list(set(all_ngrams))
        self.num_ngram_vectors = len(all_ngrams)
        logger.info("Total number of ngrams is %d", len(all_ngrams))

        self.wv.hash2index = {}
        ngram_indices = []
        new_hash_count = 0
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = ft_hash(ngram)
            if ngram_hash in self.wv.hash2index:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
            else:
                ngram_indices.append(ngram_hash % self.bucket)
                self.wv.hash2index[ngram_hash] = new_hash_count
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1

        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
        self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
        self.reset_ngram_weights()
    else:
        new_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

        new_ngrams = list(set(new_ngrams))
        logger.info("Number of new ngrams is %d", len(new_ngrams))
        new_hash_count = 0
        for i, ngram in enumerate(new_ngrams):
            ngram_hash = ft_hash(ngram)
            if ngram_hash not in self.wv.hash2index:
                self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1
            else:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

        rand_obj = np.random
        rand_obj.seed(self.seed)
        new_vocab_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))
        new_vocab_lockf_rows = ones(
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
        new_ngram_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size))
        new_ngram_lockf_rows = ones(
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

        self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
        self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
        self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
        self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
def init_ngrams(self, update=False): """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams. Vectors for other ngrams are initialized with a random uniform distribution in FastText. Parameters ---------- update : bool If True, the new vocab words and their new ngrams word vectors are initialized with random uniform distribution and updated/added to the existing vocab word and ngram vectors. """ if not update: self.wv.ngrams = {} self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL) self.syn0_vocab_lockf = ones( (len(self.wv.vocab), self.vector_size), dtype=REAL) self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL) self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL) all_ngrams = [] for w, v in self.wv.vocab.items(): self.wv.ngrams_word[w] = compute_ngrams( w, self.min_n, self.max_n) all_ngrams += self.wv.ngrams_word[w] all_ngrams = list(set(all_ngrams)) self.num_ngram_vectors = len(all_ngrams) logger.info("Total number of ngrams is %d", len(all_ngrams)) self.wv.hash2index = {} ngram_indices = [] new_hash_count = 0 for i, ngram in enumerate(all_ngrams): ngram_hash = ft_hash(ngram) % self.bucket if ngram_hash in self.wv.hash2index: self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] else: ngram_indices.append(ngram_hash % self.bucket) self.wv.hash2index[ngram_hash] = new_hash_count self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] new_hash_count = new_hash_count + 1 self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0) self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0) self.reset_ngram_weights() else: new_ngrams = [] for w, v in self.wv.vocab.items(): self.wv.ngrams_word[w] = compute_ngrams( w, self.min_n, self.max_n) new_ngrams += [ ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams ] new_ngrams = list(set(new_ngrams)) logger.info("Number of new ngrams is %d", len(new_ngrams)) new_hash_count = 0 for i, ngram in enumerate(new_ngrams): ngram_hash = ft_hash(ngram) % self.bucket if ngram_hash not in self.wv.hash2index: self.wv.hash2index[ ngram_hash] = new_hash_count + self.old_hash2index_len self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] new_hash_count = new_hash_count + 1 else: self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] rand_obj = np.random rand_obj.seed(self.seed) new_vocab_rows = rand_obj.uniform( -1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)).astype(REAL) new_vocab_lockf_rows = ones( (len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL) new_ngram_rows = rand_obj.uniform( -1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)).astype(REAL) new_ngram_lockf_rows = ones( (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL) self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows]) self.syn0_vocab_lockf = vstack( [self.syn0_vocab_lockf, new_vocab_lockf_rows]) self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows]) self.syn0_ngrams_lockf = vstack( [self.syn0_ngrams_lockf, new_ngram_lockf_rows])
def next_batch(self, domains, batch_size=batch_size):
    X_batch_charidx = []
    X_batch_mask = []
    X_batch_wordembed = []
    X_batch_suf = []
    domain_actual_lens = []
    sample_weights = []
    y_batch = []
    shuffle(domains)
    start_index = 0
    while start_index < len(domains):
        for i in range(start_index, min(len(domains), start_index + batch_size)):
            ''' get char n-gram indices '''
            charidx = []  # e.g. [[1, 2, 5, 0, 0], [35, 3, 7, 8, 4], ...] or [1, 2, 5, 35, 3, 7, 8, 4, ...]
            if REDUCE_TO_WORD_LEVEL:
                for word in domains[i]['segmented_domain']:
                    if FT_INITIAL:
                        charidx.append([self.charngram2index[ngram]
                                        for ngram in compute_ngrams(word, *char_ngram_sizes)
                                        if ngram in self.charngram2index])
                    else:
                        charidx.append([self.charngram2index[ngram]
                                        for ngram in compute_ngrams(word, *char_ngram_sizes)])
            else:
                for word in domains[i]['segmented_domain']:
                    if FT_INITIAL:
                        charidx.extend([self.charngram2index[ngram]
                                        for ngram in compute_ngrams(word, *char_ngram_sizes)
                                        if ngram in self.charngram2index])
                    else:
                        charidx.extend([self.charngram2index[ngram]
                                        for ngram in compute_ngrams(word, *char_ngram_sizes)])
            if not charidx or not any(charidx):
                domains[i]['skipped'] = True
                continue
            domains[i]['skipped'] = False
            domain_actual_lens.append(len(charidx))

            ''' get and pad word embeddings '''
            word_embeds = [en_model[w].tolist() for w in domains[i]['segmented_domain'] if w in en_model]
            # if not word_embeds:  # skip this domain if FastText recognizes none of its segments
            #     continue
            n_extra_padding = self.params['max_domain_segments_len'] - len(word_embeds)
            word_embeds += [[0] * embed_dimen for _ in range(n_extra_padding)]
            # X_batch_embed.append(tf.pad(embeds, paddings=[[0, n_extra_padding], [0, 0]], mode="CONSTANT"))
            X_batch_wordembed.append(word_embeds)

            ''' padding '''
            if REDUCE_TO_WORD_LEVEL:
                # pad at the char-ngram level
                charidx = [indices + [0] * (self.max_num_charngrams - len(indices)) for indices in charidx]
                # pad at the segment level
                charidx += [[0] * self.max_num_charngrams
                            for _ in range(self.params['max_domain_segments_len'] - len(charidx))]
            else:
                charidx += [0] * (self.max_num_charngrams - len(charidx))
            X_batch_charidx.append(charidx)

            ''' mask '''
            X_batch_mask.append((np.array(charidx) != 0).astype(float))

            ''' top-level domain (suffix) '''
            one_hot_suf = np.zeros(self.params['num_suffix'])
            one_hot_suf[domains[i]['suffix_indices']] = 1.0 / len(domains[i]['suffix_indices'])
            X_batch_suf.append(one_hot_suf)

            ''' target category '''
            sample_weights.append(self.class_weights[categories[domains[i]['target']]])
            y_batch.append(domains[i]['target'])

        yield np.array(X_batch_charidx), np.array(X_batch_mask), np.array(X_batch_wordembed), \
              np.array(domain_actual_lens), np.array(X_batch_suf), \
              np.array(sample_weights), np.array(y_batch)
        # print(sample_weights)
        X_batch_charidx.clear()
        X_batch_mask.clear()
        X_batch_wordembed.clear()
        domain_actual_lens.clear()
        X_batch_suf.clear()
        sample_weights.clear()
        y_batch.clear()
        start_index += batch_size
def init_ngrams(self, update=False):
    if not update:
        self.wv.ngrams = {}
        self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
        self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)
        self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
        self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

        all_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            all_ngrams += self.wv.ngrams_word[w]

        all_ngrams = list(set(all_ngrams))
        self.num_ngram_vectors = len(all_ngrams)
        logger.info("Total number of ngrams is %d", len(all_ngrams))

        self.wv.hash2index = {}
        ngram_indices = []
        new_hash_count = 0
        for i, ngram in enumerate(all_ngrams):
            ngram_hash = ft_hash(ngram)
            if ngram_hash in self.wv.hash2index:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
            else:
                ngram_indices.append(ngram_hash % self.bucket)
                self.wv.hash2index[ngram_hash] = new_hash_count
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1

        self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
        self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
        self.reset_ngram_weights()
    else:
        new_ngrams = []
        for w, v in self.wv.vocab.items():
            self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
            new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

        new_ngrams = list(set(new_ngrams))
        logger.info("Number of new ngrams is %d", len(new_ngrams))
        new_hash_count = 0
        for i, ngram in enumerate(new_ngrams):
            ngram_hash = ft_hash(ngram)
            if ngram_hash not in self.wv.hash2index:
                self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                new_hash_count = new_hash_count + 1
            else:
                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

        rand_obj = np.random
        rand_obj.seed(self.seed)
        new_vocab_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size))
        new_vocab_lockf_rows = ones(
            (len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
        new_ngram_rows = rand_obj.uniform(
            -1.0 / self.vector_size, 1.0 / self.vector_size,
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size))
        new_ngram_lockf_rows = ones(
            (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

        self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
        self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
        self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
        self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
def next_batch(self, domains, batch_size=batch_size):
    X_batch_embed = []
    X_batch_mask = []
    X_batch_suf = []
    domain_actual_lens = []
    sample_weights = []
    y_batch = []
    shuffle(domains)
    start_index = 0
    while start_index < len(domains):
        for i in range(start_index, min(len(domains), start_index + batch_size)):
            ''' get char n-gram indices '''
            embeds = []  # e.g. [[1, 2, 5, 0, 0], [35, 3, 7, 8, 4], ...] or [1, 2, 5, 35, 3, 7, 8, 4, ...]
            if REDUCE_TO_WORD_LEVEL:
                for word in domains[i]['segmented_domain']:
                    if FT_INITIAL:
                        embeds.append([self.charngram2index[ngram]
                                       for ngram in compute_ngrams(word, *char_ngram_sizes)
                                       if ngram in self.charngram2index])
                    else:
                        embeds.append([self.charngram2index[ngram]
                                       for ngram in compute_ngrams(word, *char_ngram_sizes)])
            else:
                for word in domains[i]['segmented_domain']:
                    if FT_INITIAL:
                        embeds.extend([self.charngram2index[ngram]
                                       for ngram in compute_ngrams(word, *char_ngram_sizes)
                                       if ngram in self.charngram2index])
                    else:
                        embeds.extend([self.charngram2index[ngram]
                                       for ngram in compute_ngrams(word, *char_ngram_sizes)])
            if not embeds or not any(embeds):
                domains[i]['skipped'] = True
                continue
            domains[i]['skipped'] = False
            domain_actual_lens.append(len(embeds))

            ''' padding '''
            if REDUCE_TO_WORD_LEVEL:
                # pad at the char-ngram level
                embeds = [indices + [0] * (self.max_num_charngrams - len(indices)) for indices in embeds]
                # pad at the segment level
                embeds += [[0] * self.max_num_charngrams
                           for _ in range(self.params['max_domain_segments_len'] - len(embeds))]
            else:
                embeds += [0] * (self.max_num_charngrams - len(embeds))
            X_batch_embed.append(embeds)

            ''' mask '''
            X_batch_mask.append((np.array(embeds) != 0).astype(float))

            ''' top-level domain (suffix) '''
            one_hot_suf = np.zeros(self.params['num_suffix'])
            one_hot_suf[domains[i]['suffix_indices']] = 1.0 / len(domains[i]['suffix_indices'])
            X_batch_suf.append(one_hot_suf)

            ''' target category '''
            sample_weights.append(self.class_weights[categories[domains[i]['target']]])
            y_batch.append(domains[i]['target'])

        yield np.array(X_batch_embed), np.array(X_batch_mask), np.array(domain_actual_lens), \
              np.array(X_batch_suf), np.array(sample_weights), np.array(y_batch)
        # print(sample_weights)
        X_batch_embed.clear()
        X_batch_mask.clear()
        domain_actual_lens.clear()
        X_batch_suf.clear()
        sample_weights.clear()
        y_batch.clear()
        start_index += batch_size
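# Downstream, X_batch_mask is presumably what lets the model average the looked-up n-gram vectors
# without the zero padding dragging the mean down (for the REDUCE_TO_WORD_LEVEL layout). A NumPy
# sketch of that masked averaging with made-up shapes; the real model likely does the equivalent
# in TensorFlow:
import numpy as np

batch, segments, ngrams, dim = 2, 3, 5, 4
ids = np.random.randint(1, 10, size=(batch, segments, ngrams))
ids[:, :, 3:] = 0                                   # pretend the last two positions are padding
mask = (ids != 0).astype(float)                     # same construction as X_batch_mask above
table = np.random.rand(10, dim).astype('float32')   # stand-in embedding table; row 0 = padding id
vecs = table[ids]                                   # (batch, segments, ngrams, dim)
summed = (vecs * mask[..., None]).sum(axis=2)       # zero out padded rows before summing
counts = np.maximum(mask.sum(axis=2, keepdims=True), 1.0)
segment_vecs = summed / counts                      # (batch, segments, dim) masked mean
print(segment_vecs.shape)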