def send(self, _) -> Tuple[List, np.ndarray]:
    """Collect skip-gram samples until a batch is ready, then return it.

    Sequences from ``self._sequences`` are visited cyclically. The position
    reached is remembered on the instance so that consecutive calls continue
    with the next sequence; previously the cursor was reset to 0 on every
    call, so each batch was built from the same leading sequences.

    Args:
        _: Ignored.

    Returns:
        The value produced by ``self._process_batch()``.

    Raises:
        IndexError: If ``self._sequences`` is empty.
    """
    # Resume from the position stored by the previous call (0 on first use).
    iteration = getattr(self, "_next_sequence_index", 0)
    if iteration >= len(self._sequences):
        iteration = 0
    while True:
        sentence = self._sequences[iteration]
        iteration = (iteration + 1) % len(self._sequences)
        self._next_sequence_index = iteration
        pairs, labels = sequence.skipgrams(
            sentence,
            vocabulary_size=self._num_words,
            window_size=self._window_size,
            sampling_table=self._sampling_table,
        )
        if pairs:
            target_words, context_words = [
                list(words) for words in zip(*pairs)
            ]
            # Batch size is at least 32. Higher batch size is not
            # problematic.
            self._add_to_batch(context_words, target_words, labels)
            if self._is_batch_ready():
                return self._process_batch()
def generate_data(corpus, window_size, V):
    """Yield one-hot encoded (target, context) arrays for each document.

    Args:
        corpus: Iterable of word-index sequences, one per document.
        window_size: Skip-gram window size passed to ``skipgrams``.
        V: Vocabulary size; also the width of the one-hot encoding.

    Yields:
        ``(X, y)`` where ``X`` holds the one-hot target words and ``y`` the
        one-hot context words of the document's skip-gram pairs. Documents
        that produce no pairs are skipped.
    """
    # The table depends only on V, so build it once instead of per document.
    sampling_table = make_sampling_table(V, sampling_factor=1e-05)
    for words in corpus:
        # Only positive pairs are requested: the labels are not part of the
        # output, so randomly drawn negative pairs would be indistinguishable
        # from true (target, context) pairs for the consumer.
        couples, _ = skipgrams(words, V, window_size,
                               negative_samples=0.0, shuffle=True,
                               sampling_table=sampling_table)
        if couples:
            X, y = zip(*couples)
            X = np_utils.to_categorical(X, V)
            y = np_utils.to_categorical(y, V)
            yield X, y
def create_dataset(text, vocab, num_words, window_size, negative_samples):
    """Turn a text into skip-gram training inputs and labels.

    Args:
        text: Raw text to encode.
        vocab: Object exposing ``texts_to_sequences``.
        num_words: Vocabulary size used for the sampling table and skip-grams.
        window_size: Skip-gram window size.
        negative_samples: Negative-to-positive sampling ratio for ``skipgrams``.

    Returns:
        ``([word_target, word_context], labels)`` where both word arrays have
        shape ``(N, 1)`` and ``labels`` is a float array of shape ``(N,)``.
        ``N`` is 0 when the text yields no pairs.
    """
    data = vocab.texts_to_sequences([text]).pop()
    sampling_table = make_sampling_table(num_words)
    couples, labels = skipgrams(data, num_words,
                                window_size=window_size,
                                negative_samples=negative_samples,
                                sampling_table=sampling_table)
    # reshape(-1, 2) keeps the result well-formed even when no pair was
    # generated, where unpacking zip(*couples) would raise.
    pairs = np.asarray(couples, dtype=int).reshape(-1, 2)
    word_target = pairs[:, :1]
    word_context = pairs[:, 1:]
    # np.asfarray was removed in NumPy 2.0; this is its float64 equivalent.
    labels = np.asarray(labels, dtype=np.float64)
    return [word_target, word_context], labels
def get_feature_word_embedding(path):
    """Compute positive skip-gram pairs for the code file at ``path``.

    Args:
        path: Location handed to ``code2text`` to obtain the text to encode.

    Returns:
        None. NOTE(review): the skip-grams are computed and then discarded;
        this function looks unfinished - confirm what it should return.
    """
    # Use the tokenizer that was fitted for the bag-of-words features.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # tokenizer files from a trusted source.
    with open('tokenizer_bow.pkl', 'rb') as f1:
        tokenizer = pickle.load(f1)
    code = code2text(path)
    seq = tokenizer.texts_to_sequences([code])[0]
    vocab_size = len(tokenizer.word_index)
    window_size = 2
    positive_skip_grams, _ = skipgrams(seq,
                                       vocabulary_size=vocab_size,
                                       window_size=window_size,
                                       negative_samples=0)
    return
def training_data_generator(text_encoded, window_size=4, negative_samples=1.0,
                            batch_docs=50):
    """Yield skip-gram training samples for a random subset of documents.

    Pairs are built per document, so a word is never paired with a context
    word from a different document.

    Args:
        text_encoded: List of documents, each a list of integer word ids.
        window_size: Skip-gram window size.
        negative_samples: Negative-to-positive ratio passed to ``skipgrams``.
        batch_docs: Number of randomly chosen documents to draw samples from.

    Yields:
        ``(words, contexts, labels)`` for one document at a time: ``words``
        and ``contexts`` are lists of int, ``labels`` is the 0/1 label list
        returned by ``skipgrams``. Documents without any pair are skipped.

    NOTE(review): an earlier description asked for one output per
    ``batch_docs`` documents with ``(N, 1)`` arrays; the code yields plain
    lists once per document - confirm which contract callers expect.
    """
    table = make_sampling_table(VOCAB_SIZE)
    doc_order = list(range(len(text_encoded)))
    random.shuffle(doc_order)
    for doc_idx in doc_order[:batch_docs]:
        pairs, pair_labels = skipgrams(text_encoded[doc_idx], VOCAB_SIZE,
                                       window_size=window_size,
                                       sampling_table=table,
                                       negative_samples=negative_samples,
                                       shuffle=True)
        if not pairs:
            continue
        words, contexts = (
            np.array(column, dtype="int32").tolist() for column in zip(*pairs)
        )
        yield words, contexts, pair_labels
def main():
    """Preprocess the ratings corpus and pickle vocabulary, sequences and skip-grams.

    Steps: download the data, keep only Hangul characters in the second
    column, split it into morphemes with ``mecab``, drop stop words and
    sentences of at most one token, fit a ``Tokenizer`` and write
    ``word2idx.pickle``, ``encoded.pickle``, ``df.pickle`` and
    ``skip_grams.pickle`` into ``dataset_dir``.
    """
    _download()
    file_path = dataset_dir + '/' + "ratings.txt"
    data = pd.read_csv(file_path, sep='\t', engine='python')

    def _clean(raw):
        # Replace everything that is not a Hangul syllable with a space,
        # collapse whitespace, then re-join the morphemes with single spaces.
        hangul_only = ' '.join(re.sub(r'[^가-힣]', ' ', str(raw).strip()).split())
        return " ".join(mecab.morphs(hangul_only))

    text_column = data.columns[1]
    data[text_column] = [_clean(raw) for raw in tqdm(data[text_column])]

    sentences = [document.split() for document in data["document"]]

    stopwords = pd.read_csv(dataset_dir + '/' + "stopwords.csv", encoding="utf-8")
    # A set gives O(1) membership tests instead of repeated list.remove scans.
    stopword_set = set(stopwords["stopwords"])
    sentences = [
        [token for token in sentence if token not in stopword_set]
        for sentence in tqdm(sentences)
    ]

    # Plain filtering instead of np.delete: converting a ragged list of lists
    # to an array raises on recent NumPy versions.
    sentences = [sentence for sentence in sentences if len(sentence) > 1]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    word2idx = tokenizer.word_index
    print(len(word2idx))
    with open(dataset_dir + '/' + "word2idx.pickle", 'wb') as f:
        pickle.dump(word2idx, f)

    encoded = tokenizer.texts_to_sequences(sentences)
    with open(dataset_dir + '/' + "encoded.pickle", 'wb') as f:
        pickle.dump(encoded, f)
    # df.pickle has always held the same encoded sequences; keep writing it
    # for consumers that read that file name.
    with open(dataset_dir + '/' + "df.pickle", 'wb') as f:
        pickle.dump(encoded, f)

    # Word indices start at 1, so the vocabulary size is the index count + 1;
    # deriving it avoids a stale hard-coded value when the corpus changes.
    vocab_size = len(word2idx) + 1
    skip_grams = [
        skipgrams(sample, vocabulary_size=vocab_size, window_size=2)
        for sample in encoded
    ]
    with open(dataset_dir + '/' + "skip_grams.pickle", 'wb') as f:
        pickle.dump(skip_grams, f)
def downsample_skipgrams(ids, vocab_size, subsample=1e-3, window=2, neg=2):
    """Generate skip-gram pairs and keep those whose target is a window centre.

    Args:
        ids: Sequence of integer word ids.
        vocab_size: Vocabulary size for the sampling table and ``skipgrams``.
        subsample: Sampling factor passed to ``make_sampling_table``.
        window: Maximum window size; the window actually used is drawn
            uniformly from ``1..window``.
        neg: Negative-to-positive sampling ratio.

    Returns:
        ``(w, y)``: ``w`` is a list of ``[target, context]`` pairs whose
        target value occurs in ``ids[window::2 * window + 1]`` and ``y`` the
        matching labels.
    """
    sampling_table = make_sampling_table(vocab_size, sampling_factor=subsample)
    span = 2 * window + 1
    # A set makes the per-pair membership test O(1).
    targets = set(ids[window::span])
    # randint's upper bound is exclusive: (1, window + 1) covers 1..window.
    # The former randint(window - 1) + 1 never reached `window` and raised
    # for window == 1.
    window_size = np.random.randint(1, window + 1)
    pairs, labels = skipgrams(ids,
                              vocabulary_size=vocab_size,
                              window_size=window_size,
                              negative_samples=neg,
                              sampling_table=sampling_table,
                              shuffle=True)
    w = []
    y = []
    for (t, c), l in zip(pairs, labels):
        if t in targets:
            w.append([t, c])
            y.append(l)
    return w, y
def train(self, sequences, epochs=10):
    """Train ``self.model`` on skip-grams generated from ``sequences``.

    For every epoch each sequence is turned into skip-gram pairs and fed to
    ``self.model.train_on_batch`` as one batch; the summed loss is printed
    per epoch.

    Args:
        sequences: Iterable of integer word-id sequences.
        epochs: Number of passes over ``sequences``.
    """
    for epoch in range(epochs):
        loss = 0.0
        for seq in sequences:
            pairs, pair_labels = skipgrams(
                seq,
                vocabulary_size=self.vocab_size,
                window_size=self.window_size
            )
            if not pairs:
                # Sequences too short to form a pair would otherwise raise
                # IndexError when the pairs are split into columns.
                continue
            center, context = (np.array(column) for column in zip(*pairs))
            labels = np.array(pair_labels)
            loss += self.model.train_on_batch([center, context], labels)
        print(f'Epoch {epoch}, Loss {loss:.4f}')
def _make_skipgrams(s):
    """Numpy function to make skipgrams.

    Args:
        s: 2-D array; each row is one sequence of word ids.

    Returns:
        Array with three columns (target, context, label) holding the
        skip-gram samples of all rows stacked along axis 0. Rows that yield
        no pairs contribute nothing.
    """
    samples_out = []
    for i in range(s.shape[0]):
        pairs, labels = skipgrams(
            s[i, :],
            vocabulary_size=vocabulary_size,
            window_size=window_size,
            negative_samples=negative_samples,
            seed=seed,
        )
        if not pairs:
            # An empty pair list becomes shape (1, 0) under atleast_2d and
            # cannot be concatenated with the (0, 1) label column.
            continue
        samples = np.concatenate([
            np.atleast_2d(np.asarray(pairs)),
            np.asarray(labels)[:, None]
        ], axis=1)
        samples_out.append(samples)
    if not samples_out:
        # np.concatenate rejects an empty list.
        # NOTE(review): int64 is assumed for the empty result - confirm it
        # matches the dtype the caller expects.
        return np.empty((0, 3), dtype=np.int64)
    return np.concatenate(samples_out, axis=0)
def find_word_context(self):
    """Fill ``self.targets``, ``self.contexts`` and ``self.labels``.

    For every sequence in ``self.vectorized_logs`` the positive skip-gram
    pairs are generated. For each pair, ``num_neg_sampling`` negative
    candidates are drawn with ``negative_skipgrams``, and one entry is
    appended to each list: the target word, the context tensor (true context
    first, then the negatives) and the label tensor (``1`` followed by zeros).
    """
    vocab_len = len(self.vocabulary)
    # Build the sampling table for vocab_size tokens.
    table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_len)

    for log_sequence in tqdm.tqdm(self.vectorized_logs):
        positives, _ = skipgrams(log_sequence,
                                 vocabulary_size=vocab_len,
                                 sampling_table=table,
                                 window_size=self.window_size,
                                 negative_samples=0)

        for target_word, context_word in positives:
            true_class = tf.constant([context_word], dtype='int64')
            context_class = tf.expand_dims(true_class, 1)

            sampled, _, _ = negative_skipgrams(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_neg_sampling,
                unique=True,
                range_max=vocab_len,
                seed=42,
                name="negative_sampling")
            negative_column = tf.expand_dims(sampled, 1)

            # True context first, negatives after; labels follow that order.
            context = tf.concat([context_class, negative_column], 0)
            label = tf.constant([1] + [0] * num_neg_sampling, dtype='int64')

            self.targets.append(target_word)
            self.contexts.append(context)
            self.labels.append(label)
class Word2VecDataGenerator(DataGenerator):
    """Endless generator of fixed-size skip-gram batches for word2vec training."""

    def __init__(self, config, language):
        # `language` is set before the base initializer runs, as before.
        self.language = language
        super(Word2VecDataGenerator, self).__init__(config)

    def _generate(self, case=None):
        """Yield ``((targets, contexts), labels)`` batches forever.

        Sentences in ``self.sentences`` are visited in a freshly shuffled
        order on every pass. Their skip-gram pairs are buffered and emitted
        in chunks of exactly ``batch_size``; pairs left over at the end of a
        pass are dropped.

        Args:
            case: Unused.

        Yields:
            ``(output_targets, output_contexts)`` as int32 arrays and
            ``output_labels`` as a float32 array, each of length
            ``batch_size``.
        """
        vocabulary_size = self.config["params"]["vocab_size"]
        window_size = self.config["params"]["word2vec"]["window_size"]
        batch_size = self.config["hyper_params"]["word2vec"]["batch_size"]
        sampling_table = sequence.make_sampling_table(vocabulary_size, 0.1)
        while True:
            pairs = []
            labels = []
            indexes = np.arange(len(self.sentences))
            np.random.shuffle(indexes)
            for i in indexes:
                p, l = sequence.skipgrams(self.sentences[i], vocabulary_size,
                                          window_size=window_size,
                                          sampling_table=sampling_table)
                pairs.extend(p)
                labels.extend(l)
                # Drain every full batch. With a plain `if`, a sentence that
                # adds more than batch_size pairs would let the buffer grow
                # without bound.
                while len(pairs) >= batch_size:
                    output_pairs = pairs[:batch_size]
                    output_labels = labels[:batch_size]
                    pairs = pairs[batch_size:]
                    labels = labels[batch_size:]
                    output_targets, output_contexts = zip(*output_pairs)
                    output_targets = np.array(output_targets, dtype=np.int32)
                    output_contexts = np.array(output_contexts, dtype=np.int32)
                    output_labels = np.array(output_labels, dtype=np.float32)
                    yield (output_targets, output_contexts), output_labels
def create_skipgrams(cls, normalized_doc, vocabulary_size=10000, ratio=3.0):
    '''
    Create the skipgrams to be trained on the model.

    normalized_doc: Normalized document, a nested array of mapped sentences.
    vocabulary_size: Size of the vocabulary the data has been compiled against.
    ratio: Negative to positive sampling ratio.

    Returns (word_target, word_context, labels): two rank-1 int32 arrays of
    equal length and the label list from skipgrams. All three are empty when
    the document yields no pairs.
    '''
    # sampling_table[i] is the probability of sampling the i-th most common
    # word in a dataset (more common words should be sampled less frequently,
    # for balance).
    sampling_table = sequence.make_sampling_table(vocabulary_size)

    # Flatten the sentences into a single sequence of word indexes.
    data = list(itertools.chain.from_iterable(normalized_doc))

    # skipgrams turns the sequence into
    #   * (word, word in the same window) with label 1 (positive samples),
    #   * (word, random word from the vocabulary) with label 0 (negative samples).
    couples, labels = skipgrams(data, vocabulary_size,
                                negative_samples=ratio,
                                sampling_table=sampling_table)

    # Split couples into target and context. reshape(-1, 2) also handles an
    # empty couple list, where unpacking zip(*couples) would raise.
    pairs = np.array(couples, dtype="int32").reshape(-1, 2)
    # Rank-1 arrays, copied so they do not share memory with `pairs`.
    word_target = np.ascontiguousarray(pairs[:, 0])
    word_context = np.ascontiguousarray(pairs[:, 1])
    return word_target, word_context, labels
oov_token="<OOV>") tokenizer.fit_on_texts(training_sentences) words = tokenizer.word_index sequences = tokenizer.texts_to_sequences(training_sentences) padded = tensorflow.keras.preprocessing.sequence.pad_sequences( sequences, maxlen=120, truncating="post") word_target_final = [] word_context_final = [] couples_final = [] labels_final = [] for i in range(1, int(len(padded) / 100)): sampling_table = sequence.make_sampling_table(vocab_size) couples, labels = sequence.skipgrams(padded[i], vocab_size, window_size=2) word_target, word_context = zip(*couples) word_target = np.array(word_target, dtype="int32") word_context = np.array(word_context, dtype="int32") labels_final.append(labels) word_target_final.append(word_target) word_context_final.append(word_context) input_target = tensorflow.keras.layers.Input((1, )) input_context = tensorflow.keras.layers.Input((1, )) embedding = tensorflow.keras.layers.Embedding(vocab_size, vector_dim, input_length=1, name='embedding') target = embedding(input_target)
# Per-word sampling probability sqrt(t / f), capped at 1, where f is each
# entry of `unigrams` divided by their sum.
t = 1e-5
sampling_prob = np.sqrt(t / (unigrams / np.sum(unigrams)))
sampling_prob = np.minimum(1, sampling_prob)
# skipgrams() assumes 0 is not a word, so some shifting is done
sampling_table = np.concatenate(([0], sampling_prob))
sg = SkipGram(vocab_length, emb_length=128)
n_epochs = 10
for epoch in range(1, n_epochs + 1):
    # load_prev is False on the first epoch and True on every later one.
    load_prev = False if epoch == 1 else True
    # skipgrams() assumes 0 is not a word, so some shifting is done
    # (indices are shifted up by one for the call and back down afterwards).
    # negative_samples=0. requests positive pairs only; [0] takes the pairs
    # and drops the labels.
    idx_couples = np.array(
        skipgrams(text_indices + 1,
                  vocab_length + 1,
                  window_size=4,
                  sampling_table=sampling_table,
                  negative_samples=0.)[0]) - 1
    word_indices = idx_couples[:, 0]
    context_indices = idx_couples[:, 1].reshape(-1, 1)
    # One training epoch per freshly sampled set of pairs.
    sg.train(word_indices,
             context_indices,
             l2_penalty=1.0,
             neg_sample_rate=20,
             sampling='unigram',
             unigrams=unigrams,
             learning_rate=2.5,
             batch_size=512,
             n_epochs=1,
             load_prev=load_prev,
#['WAT', 'SWKS', 'PAYX', 'CSCO', 'RF', 'NKE', 'INTC', 'ALL', 'GS', 'AVGO', 'HUM', 'NEE', 'T', 'CL', 'DVA', 'AMGN']
custom_examples = ['CSCO', 'NKE', 'INTC', 'GS', 'T', 'TSLA', 'AAPL', 'PAYX']
valid_examples = [tick_to_idx[tick] for tick in custom_examples] #picks {16} of the first {100} words for validation
#may need to replace this with some pure play companies like TSLA compared to Ford

#sampling table for negative examples
# NOTE(review): the table is sized vocab_size + 1 while skipgrams() below
# receives vocab_size - confirm which value matches the range of tick_to_idx.
sampling_table = sequence.make_sampling_table(vocab_size + 1) #+1 due to index zero being skipped

#create skipgrams per ETF: each ETF's ticker list is treated as one sequence
targets, contexts, labels = [], [], []
for etf in etf_names:
    tokens = np.array([tick_to_idx[tick] for tick in df.loc[df['ETF'] == etf, 'Ticker'].values])
    etf_couples, etf_labels = skipgrams(
        tokens,
        vocab_size,
        window_size = window_size,
        negative_samples = negative_samples,
        sampling_table = sampling_table)
    if not etf_couples:
        # An ETF with too few tickers, or whose tickers were all dropped by
        # the sampling table, yields no pairs; unpacking zip(*...) would raise.
        continue
    etf_targets, etf_contexts = zip(*etf_couples) #separate into target and contexts by etf
    targets.append(np.array(etf_targets))
    contexts.append(np.array(etf_contexts))
    labels.append(np.array(etf_labels))
#may need to add criteria for context negative selection to random XX% in weight away from target word

#flatten to a single numpy
targets = np.concatenate(targets).ravel()
contexts = np.concatenate(contexts).ravel()
labels = np.concatenate(labels).ravel()

#**MODEL**
# create some input variables
하는데 전체 단어가 많은 경우 엄청난 계산량 발생 네거티브 샘플링은 소프트맥스 확률을 구할 때 전체 단어를 대상으로 구하지 않고, 일부 단어만 뽑아서 계산을 하는 방식 네거티브 샘플링 동작은 사용자가 지정한 윈도우 사이즈 내에 등장하지 않는 단어(negative samples)를 5~20개 정도 뽑고 이를 정답 단어와 합쳐 전체 단어처럼 소프트맥스 활률을 계산하여 파라미터 업데이트 '''
from tensorflow.keras.preprocessing.sequence import skipgrams

## Try with 10 samples first
# skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences[:10]]
# pairs, labels = skip_grams[0][0], skip_grams[0][1]
# for i in range(5):
#     print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
#         idx2word[pairs[i][0]], pairs[i][0],
#         idx2word[pairs[i][1]], pairs[i][1],
#         labels[i]))

## Try with the full data set
# One (pairs, labels) tuple per sequence.
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences]

## Build the model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model

embed_size = 50

### Stopped here because the rest was too difficult
data, count, dictionary, reverse_dictionary = build_dataset( filename, vocabulary_size) #step 2: generate trainset window_size = 1 vector_dim = 300 epochs = 100000 batch_size = 1000 vocab_size = len(dictionary) valid_size = 16 # Random set of words to evaluate similarity on. valid_window = 100 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) couples, labels = skipgrams(data, vocab_size, window_size=window_size, negative_samples=0.1) word_target, word_context = zip(*couples) word_target = np.array(word_target, dtype="int32") word_context = np.array(word_context, dtype="int32") print(couples[:10], labels[:10]) #step 3: build model input_target = Input((1, )) input_context = Input((1, )) embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding') target = embedding(input_target) target = Reshape((vector_dim, 1))(target)
#%% tokenizer = Tokenizer() tokenizer.fit_on_texts(tokenized_doc) word2idx = tokenizer.word_index idx2word = {v: k for k, v in word2idx.items()} encoded = tokenizer.texts_to_sequences(tokenized_doc) #%% print(encoded[:2]) #%% vocab_size = len(word2idx) + 1 print('단어 집합의 크기:', vocab_size) #%% from tensorflow.keras.preprocessing.sequence import skipgrams skip_gramms = [ skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded[:10] ] #%% # check skipgram data pairs, labels = skip_gramms[0][0], skip_gramms[0][1] for i in range(5): print("({:s} ({:d}),{:s},({:d})) -> {:d}".format(idx2word[pairs[i][0]], pairs[i][0], idx2word[pairs[i][1]], pairs[i][1], labels[i])) #%% skip_grams = [ skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded ]
def word2vec(self, WINDOW_SZ, EMBEDDING_DIM, W2V_EPOCHS):
    """Train a skip-gram word2vec model with negative sampling on ``self.x_train``.

    Skip-gram pairs are generated for every tweet in ``self.x_train`` and
    pooled. The model embeds the target and context word with one shared
    embedding, takes their dot product and feeds it through a sigmoid unit.
    Each training iteration fits one randomly chosen pair; the loss is
    printed every 100 iterations and nearest neighbours of a random set of
    validation words every 500.

    Args:
        WINDOW_SZ: Skip-gram window size.
        EMBEDDING_DIM: Dimensionality of the word embedding.
        W2V_EPOCHS: Number of single-sample training iterations.

    Raises:
        ValueError: If no skip-gram pair can be generated from ``self.x_train``.
    """
    print("WORD2VEC...")

    valid_size = 20  # random word set to evaluate similarity
    valid_window = 500  # pick samples in 500 most common words
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    vocab_size = self.vocab_size

    ## skipgram set up
    sampling_table = sequence.make_sampling_table(vocab_size,
                                                  sampling_factor=0.01)
    # Pool the pairs of every tweet. Previously skip-grams were computed for
    # all tweets but only those of the first tweet were used for training.
    couples, labels = [], []
    for tweet in self.x_train:
        tweet_couples, tweet_labels = sequence.skipgrams(
            tweet,
            vocab_size,
            window_size=WINDOW_SZ,
            sampling_table=sampling_table)
        couples.extend(tweet_couples)
        labels.extend(tweet_labels)
    if not couples:
        raise ValueError("no skip-gram pairs could be generated from x_train")

    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    ## Functional API model
    # input layers take in target and context word as ints
    input_target = layers.Input((1, ))
    input_context = layers.Input((1, ))

    # embedding layer then transpose vectors to take dot prod
    embedding = Embedding(vocab_size,
                          EMBEDDING_DIM,
                          input_length=1,
                          name='embedding')
    target = embedding(input_target)
    target = Reshape((EMBEDDING_DIM, 1))(target)
    context = embedding(input_context)
    context = Reshape((EMBEDDING_DIM, 1))(context)

    # dot product layers to measure similarity
    dot_product = Dot(axes=1)([target, context])
    dot_product = Reshape((1, ))(dot_product)

    # sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)

    model = Model(inputs=[input_target, input_context], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # cosine similarity to be used in validation model
    similarity = Dot(axes=1, normalize=True)([target, context])
    validation_model = Model(inputs=[input_target, input_context],
                             outputs=similarity)

    reversed_word_index = self.reversed_word_index

    ## Helper class for validating Word2Vec while training
    class SimilarityCallback:
        def run_sim(self):
            """Print the nearest neighbours of every validation word."""
            for i in range(valid_size):
                valid_word = reversed_word_index[valid_examples[i]]
                top_k = 8  # num of nearest neighbors
                sim = self._get_sim(valid_examples[i])
                # Position 0 of the sorted order is skipped; it is expected
                # to be the word itself.
                nearest = (-sim).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reversed_word_index[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)

        @staticmethod
        def _get_sim(valid_word_idx):
            """Return the similarity of ``valid_word_idx`` to every word id."""
            # Score the whole vocabulary in one batch instead of issuing
            # vocab_size separate single-sample predictions.
            query = np.full((vocab_size, 1), valid_word_idx, dtype=np.float64)
            candidates = np.arange(vocab_size, dtype=np.float64).reshape(-1, 1)
            out = validation_model.predict_on_batch([query, candidates])
            return np.asarray(out).reshape(-1)

    sim_cb = SimilarityCallback()

    arr_1 = np.zeros((1, ))
    arr_2 = np.zeros((1, ))
    arr_3 = np.zeros((1, ))

    ## Train network
    for cnt in range(W2V_EPOCHS):
        # randint's upper bound is exclusive, so len(labels) covers every
        # sample; the former len(labels) - 1 never selected the last one.
        idx = np.random.randint(len(labels))
        arr_1[0, ] = word_target[idx]
        arr_2[0, ] = word_context[idx]
        arr_3[0, ] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)

        # Every 100 epochs print loss
        if cnt % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))

        # Every 500 run similarity test on validation data
        if cnt % 500 == 0:
            sim_cb.run_sim()
epochs = 5

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# valid_size distinct ids drawn from range(valid_window).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

vocab_size = 10000
embedding_dim = 300

data, count, dictionary, reverse_dictionary = collect_data(
    vocabulary_size=vocab_size)

# The sampling table is passed to skipgrams() so pairs are subsampled with it.
# NOTE(review): window_size is not defined in this section - presumably set
# earlier in the file.
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = sequence.skipgrams(data,
                                     vocab_size,
                                     window_size=window_size,
                                     sampling_table=sampling_table)
# word_target, word_context = zip(*couples)
# word_target = np.array(word_target, dtype="int32")
# word_context = np.array(word_context, dtype="int32")
print(couples[:10], labels[:10])

# Each dataset element is one ((target, context) pair, label); elements are
# shuffled with a buffer of 10000 and grouped into batches of 32.
train_ds = tf.data.Dataset.from_tensor_slices(
    (couples, labels)).shuffle(10000).batch(32)

# Create the model
model = NegativeSamplingWord2VecEmbedding(vocab_size, embedding_dim)

# Training
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.RMSprop()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index
id2word = {v:k for k,v in word2id.items()}

# One more than the number of distinct words in word2id.
vocab_size = len(word2id)+1
embed_size = 100

# Every document as a list of word ids.
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

# build and view sample skip grams
from tensorflow.keras.preprocessing.sequence import skipgrams

# One (pairs, labels) tuple per document.
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=5) for wid in wids]

pairs, labels = skip_grams[0][0], skip_grams[0][1]
# Show at most five pairs; a short first document may have fewer.
for i in range(min(5, len(pairs))):
    # The label gets its own placeholder; it used to be passed to format()
    # without one and was silently dropped from the output.
    print('{:s}({:d}), {:s}({:d}) -> {:d}'.format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]
    ))

from tensorflow.keras.layers import Input, Dot, Concatenate, Dense, Reshape, Embedding
from tensorflow.keras.models import Sequential, Model

def build_model():
    input_target = Input((1,))
    input_context = Input((1,))