Example #1
    def send(self, _) -> Tuple[List, np.ndarray]:
        iteration = 0

        while True:
            sentence = self._sequences[iteration]
            iteration = (iteration + 1) % len(self._sequences)

            pairs, labels = sequence.skipgrams(
                sentence,
                vocabulary_size=self._num_words,
                window_size=self._window_size,
                sampling_table=self._sampling_table,
            )

            if pairs:
                target_words, context_words = [
                    list(words) for words in zip(*pairs)
                ]

                # Batch size is at least 32. Higher batch size is not
                # problematic.
                self._add_to_batch(context_words, target_words, labels)

                if self._is_batch_ready():
                    return self._process_batch()
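The batching helpers used above are not shown in this example. A minimal sketch of what they might look like (the method names come from the snippet; the bodies and the batch-size threshold of 32 are assumptions):

    def _add_to_batch(self, context_words, target_words, labels):
        # Accumulate skip-gram samples until a full batch is available.
        self._contexts.extend(context_words)
        self._targets.extend(target_words)
        self._labels.extend(labels)

    def _is_batch_ready(self) -> bool:
        return len(self._labels) >= 32

    def _process_batch(self) -> Tuple[List, np.ndarray]:
        # Emit the accumulated batch and reset the buffers.
        batch = [self._targets, self._contexts], np.asarray(self._labels)
        self._targets, self._contexts, self._labels = [], [], []
        return batch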
Example #2
def generate_data(corpus, window_size, V):
    for words in corpus:
        couples, labels = skipgrams(words, V, window_size,
                                    negative_samples=1, shuffle=True,
                                    sampling_table=make_sampling_table(V, sampling_factor=1e-05))
        if couples:
            X, y = zip(*couples)
            X = np_utils.to_categorical(X, V)
            y = np_utils.to_categorical(y, V)
            yield X, y
Example #3
def create_dataset(text, vocab, num_words, window_size, negative_samples):
    data = vocab.texts_to_sequences([text]).pop()
    sampling_table = make_sampling_table(num_words)
    couples, labels = skipgrams(data, num_words,
                                window_size=window_size,
                                negative_samples=negative_samples,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.reshape(word_target, (-1, 1))
    word_context = np.reshape(word_context, (-1, 1))
    labels = np.asarray(labels, dtype=np.float64)
    return [word_target, word_context], labels
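The return value already matches Keras's (inputs, targets) convention for a two-input model. A hypothetical usage sketch (the compiled skip-gram model and the text/vocab objects are assumptions, not part of the example):

x, y = create_dataset(text, vocab, num_words=10000, window_size=4, negative_samples=1.0)
model.fit(x, y, batch_size=256, epochs=5)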
Example #4
def get_feature_word_embedding(path):
    # Load the bag-of-words tokenizer
    with open('tokenizer_bow.pkl', 'rb') as f1:
        tokenizer = pickle.load(f1)
    code = code2text(path)
    seq = tokenizer.texts_to_sequences([code])[0]
    vocab_size = len(tokenizer.word_index)
    window_size = 2
    positive_skip_grams, _ = skipgrams(seq,
                                       vocabulary_size=vocab_size,
                                       window_size=window_size,
                                       negative_samples=0)
    return
Example #5
def training_data_generator(text_encoded,
                            window_size=4,
                            negative_samples=1.0,
                            batch_docs=50):
    """
    For given encoded text, return 3 np.array:
    words, contexts, labels
    Do not pair the w and its context cross different documents.

    input:
        text_encoded: list of list of int, each list of int is the numerical encoding of the doc
        window_size: int, define the context
        negative_samples: float, how much negative sampling you need, normally 1.0
        batch_docs: int, number of docs for which it generates one return

    return:
        words: list of int, the numerical encoding of the central words
        contexts: list of int, the numerical encoding of the context words
        labels: list of int, 1 or 0

    hint:
    1. You can use skipgrams method from keras
    2. For training purpose, words and contexts needs to be 2D array, with shape (N, 1),
       but labels is 1D array, with shape (N, )
    3. The output can be very big, you SHOULD using generator
    """
    """
    Write your code here
    """
    sampling_table = make_sampling_table(VOCAB_SIZE)
    loc = list(range(len(text_encoded)))
    random.shuffle(loc)

    for j in loc[:batch_docs]:
        couples, label = skipgrams(text_encoded[j],
                                   VOCAB_SIZE,
                                   window_size=window_size,
                                   sampling_table=sampling_table,
                                   negative_samples=negative_samples,
                                   shuffle=True)

        if len(couples) > 0:
            target, context_ = zip(*couples)
            target = np.array(target, dtype="int32")
            context_ = np.array(context_, dtype="int32")

            yield target.tolist(), context_.tolist(), label

        else:
            continue
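Per hint 2 in the docstring, a consumer reshapes the yielded lists before training. A minimal sketch, assuming a hypothetical two-input Keras model `model`:

for words, contexts, labels in training_data_generator(text_encoded):
    words = np.array(words).reshape(-1, 1)        # shape (N, 1)
    contexts = np.array(contexts).reshape(-1, 1)  # shape (N, 1)
    labels = np.array(labels)                     # shape (N,)
    model.train_on_batch([words, contexts], labels)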
Example #6
def main():
    _download()
    file_path = dataset_dir + '/' + "ratings.txt"

    data = pd.read_csv(file_path, sep='\t', engine='python')

    for i in tqdm(range(len(data))):
        data.iloc[i, 1] = ' '.join(re.sub(r'[^가-힣]', ' ', str(data.iloc[i, 1]).strip()).split())
        data.iloc[i, 1] = " ".join(mecab.morphs(data.iloc[i, 1]))

    df = data["document"].apply(lambda x: x.split())
    df = df.to_list()

    stopwords = pd.read_csv(dataset_dir + '/' + "stopwords.csv", encoding="utf-8")
    stopwords = list(stopwords["stopwords"])

    for i in tqdm(range(len(df))):
        for j in range(len(stopwords)):
            while stopwords[j] in df[i]:
                df[i].remove(stopwords[j])

    drop_train = [index for index, sentence in enumerate(df) if len(sentence) <= 1]
    df = np.delete(df, drop_train, axis=0)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df)

    word2idx = tokenizer.word_index
    print(len(word2idx))
    with open(dataset_dir + '/' + "word2idx.pickle", 'wb') as f:
        pickle.dump(word2idx, f)

    encoded = tokenizer.texts_to_sequences(df)
    with open(dataset_dir + '/' + "encoded.pickle", 'wb') as f:
        pickle.dump(encoded, f)

    df = tokenizer.texts_to_sequences(df)
    with open(dataset_dir + '/' + "df.pickle", 'wb') as f:
        pickle.dump(df, f)

    with open(dataset_dir + '/' + "encoded.pickle", 'rb') as f:
        encoded = pickle.load(f)

    skip_grams = [skipgrams(sample, vocabulary_size=52203, window_size=2) for sample in encoded]
    with open(dataset_dir + '/' + "skip_grams.pickle", 'wb') as f:
        pickle.dump(skip_grams, f)
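A quick sanity check of the pickled output, not part of the original script but following the inspection pattern used in later examples in this collection:

with open(dataset_dir + '/' + "skip_grams.pickle", 'rb') as f:
    skip_grams = pickle.load(f)
pairs, labels = skip_grams[0]
print(pairs[:3], labels[:3])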
Example #7
def downsample_skipgrams(ids, vocab_size, subsample=1e-3, window=2, neg=2):
    w = []
    y = []
    sampling_table = make_sampling_table(vocab_size, sampling_factor=subsample)
    span = 2 * window + 1
    targets = ids[window::span]
    pairs, labels = skipgrams(ids,
                              vocabulary_size=vocab_size,
                              window_size=np.random.randint(window - 1) + 1,
                              negative_samples=neg,
                              sampling_table=sampling_table,
                              shuffle=True)
    for (t, c), l in zip(pairs, labels):
        if t in targets:
            w.append([t, c])
            y.append(l)
    return w, y
Example #8
    def train(self, sequences, epochs=10):
        for epoch in range(epochs):
            loss = 0.0
            for sequence in sequences:
                sg = skipgrams(
                    sequence,
                    vocabulary_size=self.vocab_size,
                    window_size=self.window_size
                )

                center = np.array(list(zip(*sg[0]))[0])
                context = np.array(list(zip(*sg[0]))[1])
                labels = np.array(sg[1])

                X = [center, context]
                Y = labels

                loss += self.model.train_on_batch(X, Y)

            print(f'Epoch {epoch}, Loss {loss:.4f}')
Example #9
    def _make_skipgrams(s):
        """Numpy function to make skipgrams."""
        samples_out = []

        for i in range(s.shape[0]):
            pairs, labels = skipgrams(
                s[i, :],
                vocabulary_size=vocabulary_size,
                window_size=window_size,
                negative_samples=negative_samples,
                seed=seed,
            )
            samples = np.concatenate(
                [np.atleast_2d(np.asarray(pairs)),
                 np.asarray(labels)[:, None]],
                axis=1)
            samples_out.append(samples)

        samples_out = np.concatenate(samples_out, axis=0)
        return samples_out
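The docstring calls this a NumPy function, which suggests it is meant to be wrapped for use inside a tf.data pipeline. One plausible wiring, purely as a sketch (the dataset name and the output dtype are assumptions):

    samples_ds = batched_ids_ds.map(
        lambda s: tf.numpy_function(_make_skipgrams, [s], tf.int64),
        num_parallel_calls=tf.data.AUTOTUNE)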
Example #10
    def find_word_context(self):

        # Build the sampling table for vocab_size tokens.
        sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(
            len(self.vocabulary))

        for sequence in tqdm.tqdm(self.vectorized_logs):

            positive_skip_grams, _ = skipgrams(sequence,
                                               vocabulary_size=len(
                                                   self.vocabulary),
                                               sampling_table=sampling_table,
                                               window_size=self.window_size,
                                               negative_samples=0)

            for target_word, context_word in positive_skip_grams:
                context_class = tf.expand_dims(
                    tf.constant([context_word], dtype='int64'), 1)
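                # `negative_skipgrams` and `num_neg_sampling` used below are
                # defined elsewhere in the original source; the call signature
                # and the three return values suggest `negative_skipgrams`
                # wraps a candidate sampler such as
                # tf.random.log_uniform_candidate_sampler.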

                negative_sampling_candidates, _, _ = negative_skipgrams(
                    true_classes=context_class,
                    num_true=1,
                    num_sampled=num_neg_sampling,
                    unique=True,
                    range_max=len(self.vocabulary),
                    seed=42,
                    name="negative_sampling")

                negative_sampling_candidates = tf.expand_dims(
                    negative_sampling_candidates, 1)

                context = tf.concat(
                    [context_class, negative_sampling_candidates], 0)
                label = tf.constant([1] + [0] * num_neg_sampling,
                                    dtype='int64')

                self.targets.append(target_word)
                self.contexts.append(context)
                self.labels.append(label)
Example #11
class Word2VecDataGenerator(DataGenerator):
    def __init__(self, config, language):
        self.language = language
        super(Word2VecDataGenerator, self).__init__(config)

    def _generate(self, case=None):

        vocabulary_size = self.config["params"]["vocab_size"]
        window_size = self.config["params"]["word2vec"]["window_size"]
        batch_size = self.config["hyper_params"]["word2vec"]["batch_size"]
        sampling_table = sequence.make_sampling_table(vocabulary_size, 0.1)
        while True:
            pairs = []
            labels = []
            indexes = np.arange(len(self.sentences))
            np.random.shuffle(indexes)
            for i in indexes:
                p, l = sequence.skipgrams(self.sentences[i],
                                          vocabulary_size,
                                          window_size=window_size,
                                          sampling_table=sampling_table)
                pairs.extend(p)
                labels.extend(l)
                if len(pairs) >= batch_size:

                    output_pairs = pairs[:batch_size]
                    output_labels = labels[:batch_size]

                    pairs = pairs[batch_size:]
                    labels = labels[batch_size:]
                    output_targets, output_contexts = zip(*output_pairs)

                    output_targets = np.array(output_targets, dtype=np.int32)
                    output_contexts = np.array(output_contexts, dtype=np.int32)

                    output_labels = np.array(output_labels, dtype=np.float32)

                    yield (output_targets, output_contexts), output_labels
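Since each yield is an ((targets, contexts), labels) tuple, the generator can be passed straight to Model.fit. A hypothetical usage sketch (the model, config values, and step count are assumptions; _generate is called directly only for illustration):

gen = Word2VecDataGenerator(config, language="en")
model.fit(gen._generate(), steps_per_epoch=1000, epochs=5)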
Example #12
    def create_skipgrams(cls,
                         normalized_doc,
                         vocabulary_size=10000,
                         ratio=3.0):
        '''
        Create the skipgrams to be trained on the model

        normalized_doc: Normalized document nested array of mapped sentences.
        vocabulary_size: Size of the given vocabulary data has been compiled against.
        ratio: Negative to Positive sampling ratio.
        '''
        # Generates the sampling_table argument for skipgrams. sampling_table[i] is the
        # probability of sampling the i-th most common word in the dataset (more common
        # words should be sampled less frequently, for balance).
        sampling_table = sequence.make_sampling_table(vocabulary_size)

        # Create Skipgrams with Keras
        # This function transforms a sequence of word indexes (list of integers) into tuples of
        # words of the form:
        #   * (word, word in the same window), with label 1 (positive samples).
        #   * (word, random word from the vocabulary), with label 0 (negative samples).
        # Flatten normalized document
        data = list(itertools.chain.from_iterable(normalized_doc))
        couples, labels = skipgrams(data,
                                    vocabulary_size,
                                    negative_samples=ratio,
                                    sampling_table=sampling_table)

        # Split couples into target and context
        word_target, word_context = zip(*couples)

        # Convert to NumPy arrays (note: these are rank-1 arrays).
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        return word_target, word_context, labels
                                                          oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)
words = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = tensorflow.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=120, truncating="post")

word_target_final = []
word_context_final = []
couples_final = []
labels_final = []

for i in range(1, int(len(padded) / 100)):
    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = sequence.skipgrams(padded[i], vocab_size, window_size=2)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    labels_final.append(labels)
    word_target_final.append(word_target)
    word_context_final.append(word_context)

input_target = tensorflow.keras.layers.Input((1, ))
input_context = tensorflow.keras.layers.Input((1, ))

embedding = tensorflow.keras.layers.Embedding(vocab_size,
                                              vector_dim,
                                              input_length=1,
                                              name='embedding')
target = embedding(input_target)
Example #14
t = 1e-5
sampling_prob = np.sqrt(t / (unigrams / np.sum(unigrams)))
sampling_prob = np.minimum(1, sampling_prob)
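# e.g. a word that makes up 1% of the corpus is kept with probability
# sqrt(1e-5 / 0.01) ~= 0.032, while sufficiently rare words are always kept.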
# skipgrams() assumes 0 is not a word, so some shifting is done
sampling_table = np.concatenate(([0], sampling_prob))

sg = SkipGram(vocab_length, emb_length=128)
n_epochs = 10
for epoch in range(1, n_epochs + 1):
    load_prev = False if epoch == 1 else True

    # skipgrams() assumes 0 is not a word, so some shifting is done
    idx_couples = np.array(
        skipgrams(text_indices + 1,
                  vocab_length + 1,
                  window_size=4,
                  sampling_table=sampling_table,
                  negative_samples=0.)[0]) - 1
    word_indices = idx_couples[:, 0]
    context_indices = idx_couples[:, 1].reshape(-1, 1)

    sg.train(word_indices,
             context_indices,
             l2_penalty=1.0,
             neg_sample_rate=20,
             sampling='unigram',
             unigrams=unigrams,
             learning_rate=2.5,
             batch_size=512,
             n_epochs=1,
             load_prev=load_prev)
Example #15
#['WAT', 'SWKS', 'PAYX', 'CSCO', 'RF', 'NKE', 'INTC', 'ALL', 'GS', 'AVGO', 'HUM', 'NEE', 'T', 'CL', 'DVA', 'AMGN']
custom_examples = ['CSCO', 'NKE', 'INTC', 'GS', 'T', 'TSLA', 'AAPL', 'PAYX']
valid_examples = [tick_to_idx[tick] for tick in custom_examples]
#picks 16 of the first 100 words for validation
#may need to replace this with some pure play companies like TSLA compared to Ford

#sampling table for negative examples
sampling_table = sequence.make_sampling_table(vocab_size + 1)  #+1 due to index zero being skipped

#function to create skipgrams per ETF
targets, contexts, labels = [], [], []
for etf in etf_names:
    tokens = np.array([tick_to_idx[tick] for tick in df.loc[df['ETF'] == etf, 'Ticker'].values]) 
    etf_couples, etf_labels = skipgrams(
                        tokens,
                        vocab_size,
                        window_size = window_size,
                        negative_samples = negative_samples,
                        sampling_table = sampling_table)
    etf_targets, etf_contexts = zip(*etf_couples)   #separate into target and contexts by etf
    targets.append(np.array(etf_targets))
    contexts.append(np.array(etf_contexts))
    labels.append(np.array(etf_labels))
#may need to add criteria for context negative selection to random XX% in weight away from target word

#flatten to a single numpy
targets = np.concatenate(targets).ravel()
contexts = np.concatenate(contexts).ravel()
labels = np.concatenate(labels).ravel()

#**MODEL**
# create some input variables
Example #16
     ... but when the total number of words is large, this incurs an enormous amount of computation.
     Negative sampling computes the softmax probability over only a small sample of words instead of the entire vocabulary.
     It works by drawing roughly 5-20 words that do not appear inside the user-specified window (negative samples),
     combining them with the true word, computing the softmax probability over this small set as if it were the full vocabulary, and updating the parameters.
'''
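The idea described above can be sketched with TensorFlow's sampled-softmax loss, which scores the true context word against only a handful of sampled negatives instead of the whole vocabulary. This is an illustrative aside, not part of the example; all names and sizes below are assumptions:

import tensorflow as tf

n_classes, embed_dim, n_sampled = 10000, 128, 5
softmax_weights = tf.Variable(tf.random.normal([n_classes, embed_dim]))
softmax_biases = tf.Variable(tf.zeros([n_classes]))
center_vectors = tf.random.normal([32, embed_dim])  # a batch of center-word embeddings
true_context = tf.random.uniform([32, 1], maxval=n_classes, dtype=tf.int64)

# The loss is computed over the true class plus n_sampled sampled negatives only.
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=softmax_weights, biases=softmax_biases,
    labels=true_context, inputs=center_vectors,
    num_sampled=n_sampled, num_classes=n_classes))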
from tensorflow.keras.preprocessing.sequence import skipgrams

## Try with 10 samples first

# skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences[:10]]
# pairs, labels = skip_grams[0][0], skip_grams[0][1]
# for i in range(5):
#     print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
#         idx2word[pairs[i][0]], pairs[i][0],
#         idx2word[pairs[i][1]], pairs[i][1],
#         labels[i]))

## Try with the full data
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences]

## Build the model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model

embed_size = 50

### Stopped here because it was too difficult from this point on


Example #17
data, count, dictionary, reverse_dictionary = build_dataset(
    filename, vocabulary_size)

#step 2: generate trainset
window_size = 1
vector_dim = 300
epochs = 100000
batch_size = 1000
vocab_size = len(dictionary)

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

couples, labels = skipgrams(data,
                            vocab_size,
                            window_size=window_size,
                            negative_samples=0.1)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

#step 3: build model
input_target = Input((1, ))
input_context = Input((1, ))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
Example #18
#%%
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)
#%%
print(encoded[:2])
#%%
vocab_size = len(word2idx) + 1
print('Vocabulary size:', vocab_size)
#%%
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_gramms = [
    skipgrams(sample, vocabulary_size=vocab_size, window_size=10)
    for sample in encoded[:10]
]
#%%
# check skipgram data
pairs, labels = skip_gramms[0][0], skip_gramms[0][1]
for i in range(5):
    print("({:s} ({:d}),{:s},({:d})) -> {:d}".format(idx2word[pairs[i][0]],
                                                     pairs[i][0],
                                                     idx2word[pairs[i][1]],
                                                     pairs[i][1], labels[i]))
#%%
skip_grams = [
    skipgrams(sample, vocabulary_size=vocab_size, window_size=10)
    for sample in encoded
]
Example #19
    def word2vec(self, WINDOW_SZ, EMBEDDING_DIM, W2V_EPOCHS):
        print("WORD2VEC...")

        valid_size = 20  #random word set to evaluate similarity
        valid_window = 500  #pick samples in 500 most common words
        valid_examples = np.random.choice(valid_window,
                                          valid_size,
                                          replace=False)

        vocab_size = self.vocab_size

        ## skipgram set up
        sampling_table = sequence.make_sampling_table(vocab_size,
                                                      sampling_factor=0.01)

        skipgrams = [
            sequence.skipgrams(tweet,
                               vocab_size,
                               window_size=WINDOW_SZ,
                               sampling_table=sampling_table)
            for tweet in self.x_train
        ]

        couples, labels = skipgrams[0][0], skipgrams[0][1]
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        ## Functional API model

        #input layers take in target and context word as ints
        input_target = layers.Input((1, ))
        input_context = layers.Input((1, ))

        #embedding layer then transpose vectors to take dot prod
        embedding = Embedding(vocab_size,
                              EMBEDDING_DIM,
                              input_length=1,
                              name='embedding')

        target = embedding(input_target)
        target = Reshape((EMBEDDING_DIM, 1))(target)
        context = embedding(input_context)
        context = Reshape((EMBEDDING_DIM, 1))(context)

        #cosine similarity to be used in validation model
        similarity = Dot(axes=0, normalize=True)

        #dot product layers to measure similarity
        dot_product = Dot(axes=1)([target, context])
        dot_product = Reshape((1, ))(dot_product)
        #sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)

        model = Model(inputs=[input_target, input_context], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')

        #cosine similarity to be used in validation model
        similarity = Dot(axes=1, normalize=True)([target, context])
        validation_model = Model(inputs=[input_target, input_context],
                                 outputs=similarity)

        reversed_word_index = self.reversed_word_index

        ## Helper class for validating Word2Vec while training
        class SimilarityCallback:
            def run_sim(self):
                for i in range(valid_size):
                    valid_word = reversed_word_index[valid_examples[i]]
                    top_k = 8  #num of nearest neighbors
                    sim = self._get_sim(valid_examples[i])
                    nearest = (-sim).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reversed_word_index[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

            @staticmethod
            def _get_sim(valid_word_idx):
                sim = np.zeros((vocab_size, ))
                in_arr1 = np.zeros((1, ))
                in_arr2 = np.zeros((1, ))
                for i in range(vocab_size):
                    in_arr1[0, ] = valid_word_idx
                    in_arr2[0, ] = i
                    out = validation_model.predict_on_batch([in_arr1, in_arr2])
                    sim[i] = out
                return sim

        sim_cb = SimilarityCallback()

        arr_1 = np.zeros((1, ))
        arr_2 = np.zeros((1, ))
        arr_3 = np.zeros((1, ))
        ## Train network
        for cnt in range(W2V_EPOCHS):
            idx = np.random.randint(0, len(labels) - 1)
            arr_1[0, ] = word_target[idx]
            arr_2[0, ] = word_context[idx]
            arr_3[0, ] = labels[idx]
            loss = model.train_on_batch([arr_1, arr_2], arr_3)
            # Print the loss every 100 iterations
            if cnt % 100 == 0:
                print("Iteration {}, loss={}".format(cnt, loss))
            # Run the similarity test on the validation words every 500 iterations
            if cnt % 500 == 0:
                sim_cb.run_sim()
Example #20
epochs = 5

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
vocab_size = 10000

embedding_dim = 300

data, count, dictionary, reverse_dictionary = collect_data(
    vocabulary_size=vocab_size)

sampling_table = sequence.make_sampling_table(vocab_size)

couples, labels = sequence.skipgrams(data,
                                     vocab_size,
                                     window_size=window_size,
                                     sampling_table=sampling_table)
# word_target, word_context = zip(*couples)
# word_target = np.array(word_target, dtype="int32")
# word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])
train_ds = tf.data.Dataset.from_tensor_slices(
    (couples, labels)).shuffle(10000).batch(32)

# Create the model
model = NegativeSamplingWord2VecEmbedding(vocab_size, embedding_dim)

# Training
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.RMSprop()
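The training loop itself is not shown in this example. A hypothetical sketch of one step, assuming the model maps a batch of (target, context) index pairs to probabilities in [0, 1]:

@tf.function
def train_step(pair_batch, label_batch):
    with tf.GradientTape() as tape:
        predictions = model(pair_batch, training=True)
        step_loss = loss_object(tf.cast(label_batch, tf.float32), predictions)
    gradients = tape.gradient(step_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return step_loss

for epoch in range(epochs):
    for pair_batch, label_batch in train_ds:
        train_step(pair_batch, label_batch)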
Example #21
tokenizer.fit_on_texts(norm_bible)

word2id = tokenizer.word_index
id2word = {v:k for k,v in word2id.items()}

vocab_size = len(word2id)+1
embed_size = 100

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

# build and view sample skip grams
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=5)
              for wid in wids]
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print('{:s}({:d}), {:s}({:d}) -> {:d}'.format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]
    ))


from tensorflow.keras.layers import Input, Dot, Concatenate, Dense, Reshape, Embedding
from tensorflow.keras.models import Sequential, Model
def build_model():
    input_target = Input((1,))
    input_context = Input((1,))