Example #1
 def __generate_skipgrams(self, documents):
     #generate skipgrams
     print('creating sents ({} rows)'.format(len(documents)))
     #sents = newsgroups_train.data
     
         
     sents = filter_sentences(documents)
     self.filtered_sents = sents
     print('tokenizing sents ({} sentences)'.format(len(sents)))
     self.tokenizer = Tokenizer(num_words= self.vocabulary_size, lower=True, filters=self.filters)
     self.tokenizer.fit_on_texts(sents)
     self.word_index_inv = {v: k for k, v in self.tokenizer.word_index.items()}
     sequences = self.tokenizer.texts_to_sequences(sents)    
     sampling_table = make_sampling_table(self.vocabulary_size, sampling_factor=0.001)
     print('generating couples')
     couples = []
     labels = []
     for seq in sequences:
         c,l = skipgrams(seq, vocabulary_size=self.vocabulary_size, 
                 window_size=self.window_size, shuffle=True, sampling_table=sampling_table, 
                 negative_samples=self.neg_samples)
         couples.extend(c)
         labels.extend(l)
     
     word_target, word_context = zip(*couples)
     word_target = np.array(word_target, dtype="int32")
     word_context = np.array(word_context, dtype="int32")
     return word_target, word_context, labels
    def build(self, training_file, min_count=5, estimate=0):
        # training_file: tab separated list (focus, target, adversarial_label) of skipgram pairs, gzipped
        # min_count: discard words that appear less than X times
        # estimate: estimate vocabulary using X words, 0 for read all
        c_focus, c_target, adversarials, word_counter = self.read_text(
            training_file, estimate)
        focus_words = {"<MASK>": 0, "<UNK>": 1}
        target_words = {"<MASK>": 0, "<UNK>": 1}
        filtered_word_count = 0
        for w, count in c_focus.most_common():
            if count < min_count:
                break
            focus_words[w] = len(focus_words)
            filtered_word_count += count
        for w, count in c_target.most_common():
            if count < min_count:
                break
            target_words[w] = len(target_words)
        adv_labels = {}
        for label in adversarials:
            adv_labels[label] = len(adv_labels)
        self.focus_words = focus_words
        self.target_words = target_words
        self.vocab_size = len(self.focus_words)
        self.inverted_words = self.invert(self.focus_words)
        self.adversarial_labels = adv_labels
        print("Vocabulary created with {w} words.".format(w=self.vocab_size),
              file=sys.stderr)
        if estimate == 0 or estimate > word_counter:
            self.total_word_count = filtered_word_count
        else:
            self.total_word_count = None  # unknown

        self.sampling_table = make_sampling_table(len(self.focus_words))
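
The cutoff loop above works because `Counter.most_common()` yields words in descending frequency order, so breaking at the first count below `min_count` drops the entire low-frequency tail at once. A minimal standalone sketch with made-up counts (the words and numbers here are purely illustrative):

from collections import Counter

# Hypothetical counts, just to illustrate the min_count cutoff used in build().
c_focus = Counter({"the": 9, "cat": 6, "sat": 5, "mat": 2, "hat": 1})
min_count = 5

focus_words = {"<MASK>": 0, "<UNK>": 1}
for w, count in c_focus.most_common():
    if count < min_count:
        break  # every remaining word is rarer still
    focus_words[w] = len(focus_words)

print(focus_words)  # {'<MASK>': 0, '<UNK>': 1, 'the': 2, 'cat': 3, 'sat': 4}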
Example #3
    def train_corpus(self, negative_samples=20, window_size=4):
        """ Train the model on the given corpus

        Parameters:
        negative_samples (int): the number of `false contexts' for each word
        window_size (int): the size of each context
        """
        logging.info('Initialising sampling table')
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        ans = []
        for i, seq in enumerate(
                self.tokenizer.texts_to_sequences_generator(self.corpus)):
            logging.info(i)
            couples, labels = sequence.skipgrams(
                seq,
                self.vocab_size,
                window_size=window_size,
                negative_samples=negative_samples,
                sampling_table=sampling_table)
            if couples:
                word_target, word_context = zip(*couples)
                word_target = np.array(word_target, dtype="int32")
                word_context = np.array(word_context, dtype="int32")
                loss = self.model.train_on_batch([word_target, word_context],
                                                 labels)
                ans.append(loss)
        return ans
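
For reference, `sequence.skipgrams` returns a list of `[target, context]` index pairs plus parallel 0/1 labels (1 for genuine context pairs, 0 for negative samples). A minimal standalone sketch on a toy index sequence, assuming the TensorFlow-bundled Keras import path:

import numpy as np
from tensorflow.keras.preprocessing.sequence import skipgrams

vocab_size = 10
seq = [1, 4, 2, 7, 3]  # toy, already-tokenized word indices

# The sampling table is omitted here on purpose: for a tiny toy vocabulary the
# Zipf-based probabilities from make_sampling_table() would drop nearly every word.
couples, labels = skipgrams(seq, vocab_size, window_size=2, negative_samples=1.0)

word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")
print(len(couples), sum(labels), len(labels) - sum(labels))  # total, positives, negatives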
Example #4
    def get_skips(self, docs):
        """
        Formats the data and generates negative samples.

        :param docs: list; a list of documents; each document is a list of sentences;
        a sentence is a list of tokens (strings)
        :return: tuple; contains the center and context words, and the corresponding labels
        """
        sampling_table = make_sampling_table(self.vocab_size)
        center_words, context_words, labels = [], [], []
        for doc in docs:
            tokens = [token for sent in doc for token in sent]
            pairs, labels_ = skipgrams(tokens,
                                       self.vocab_size,
                                       window_size=self.window_size,
                                       sampling_table=sampling_table)
            try:
                center, context = zip(*pairs)
            except ValueError:
                continue
            center_words += center
            context_words += context
            labels += labels_

        return center_words, context_words, labels
Example #5
    def _fit_embeddings(self, text):
        sampling_table = sequence.make_sampling_table(max_words)

        for e in range(self.n_epochs):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)

            progbar = generic_utils.Progbar(self.tokenizer.document_count)
            samples_seen = 0
            losses = []

            for i, seq in enumerate(self.tokenizer.texts_to_sequences_generator(text)):

                #MAKE SURE TOKENIZER AND FITTING ARE WORKING
                #if i < 5:
                #    print(map(lambda x: reverse_word_index[x], seq))

                # get skipgram couples for one text in the dataset
                couples, labels = sequence.skipgrams(seq, max_words,
                                                     window_size=self.window_size,
                                                     negative_samples=1.,
                                                     sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    loss = self.embedding_model.train_on_batch(X, labels)
                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)
            print('Samples seen:', samples_seen)
        print("Training completed!")
        return self
Example #7
def format_data(data):
    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = sequence.skipgrams(data, vocab_size, window_size=context_size, sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    labels = np.array(labels, dtype="int32")
    return word_target, word_context, labels
Example #8
    def build_dataset(self, words):
        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(self.L - 1))
        self.logger.debug("original count " + str(len(count)))
        dictionary = dict()
        self.counter = dict()
        for word, _ in count:
            if self.emb_type:
                if word == 'UNK' or (word in self.word2vec_embedings.vocab\
                        and word in self.glove_embedings and word in self.fasttext_embedings):
                    dictionary[word] = len(dictionary)
                    self.counter[word] = _
                else:
                    self.logger.debug(word + " not in embeds")
            else:
                dictionary[word] = len(dictionary)
        del (self.word2vec_embedings)
        del (self.glove_embedings)
        del (self.fasttext_embedings)
        del (self.custom_embedings)
        self.L = len(dictionary)
        self.logger.debug("dictionary size" + str(len(dictionary)))
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count

        data = np.array(data)
        self.count = count
        self.dictionary = dictionary
        self.words = [reverse_dictionary[x] for x in range(len(reverse_dictionary))]
        self.logger.debug('....building samples')
        self.sampling_table = sequence.make_sampling_table(len(dictionary))
        couples, labels = skipgrams(data,
                                    len(dictionary),
                                    window_size=self.cs,
                                    sampling_table=self.sampling_table,
                                    negative_samples=self.ns)
        del data
        self.labels = np.array(labels)
        # labels[labels == 0] = -1
        word_target, word_context = zip(*couples)
        del couples
        self.word_target = np.array(word_target, dtype="int32")
        self.word_context = np.array(word_context, dtype="int32")
        self.logger.debug('....corpus generated')
        with open(self.dir_name+'/vocab.tsv', 'w') as txt:
            for word in self.words:
                txt.write(word + '\n')
        self.logger.debug('....vocab written')
 def build(self):
     sampling_table = make_sampling_table(self.dictionary_size)
     word_pairs, labels = skipgrams(self.text_data, self.dictionary_size, window_size=self.window_size,
                                    sampling_table=sampling_table)
     #print(word_pairs)
     word_targets, word_contexts = zip(*word_pairs)
     word_contexts = np.array(word_contexts, dtype="int32")
     word_targets = np.array(word_targets, dtype="int32")
     labels = np.array(labels, dtype="int32")
     return word_contexts, word_targets, labels
Example #10
 def get_data(self, data, vocab_size, window_size):
     sampling_table = sequence.make_sampling_table(vocab_size)
     couples, labels = skipgrams(data,
                                 vocab_size,
                                 window_size=window_size,
                                 sampling_table=sampling_table)
     word_target, word_context = zip(*couples)
     word_target = np.array(word_target, dtype="int32")
     word_context = np.array(word_context, dtype="int32")
     return self.DataSet(word_target, word_context, labels)
Example #11
def main(dname, encode_dir, raw_dir, odir='./resources/skipgrams/'):
    # load tokenizer
    tok = pickle.load(open(encode_dir+dname+'.tkn', 'rb'))
    params = {
        'window': 5,
        'vocab_size': tok.num_words,
        'emb_dim': 300,
        'word_emb_path': './resources/word_emb.npy',
        'epochs': 5,
        'optimizer': 'adam',
        'lr': 1e-5,
    }
    word_sampling_table = make_sampling_table(size=params['vocab_size'])

    # load the data
    raw_corpus = pd.read_csv(raw_dir+dname+'.tsv', sep='\t')

    # build and train model
    print(params)
    print()
    model = build_model(params)
    print(model.summary())
    for epoch in range(params['epochs']):
        loss = 0
        # shuffle the data
        raw_corpus = raw_corpus.sample(frac=1).reset_index(drop=True)
        for step, doc in enumerate(raw_corpus.text):
            encode_doc = tok.texts_to_sequences([doc])
            word_pairs, word_labels = skipgrams(
                sequence=encode_doc[0], vocabulary_size=params['vocab_size'],
                window_size=params['window'])

            x = [np.array(x) for x in zip(*word_pairs)]
            y = np.array(word_labels, dtype=np.int32)

            if word_pairs:
                loss += model.train_on_batch(x, y)
            if step % 100 == 0:
                loss_avg = loss / (step + 1)
                print('Epoch: {}, Step: {}'.format(epoch, step))
                print('\tLoss: {}.'.format(loss_avg))
                print('-------------------------------------------------')

        # save the model
        model.save(odir+'ww_model_{}.h5'.format(epoch))
        # save the word embedding
        np.save(odir+'word_{}.npy'.format(epoch), model.get_layer(name='word_emb').get_weights()[0])

    # save the model
    model.save(odir+'ww_model.h5')
    # save the word embedding
    np.save(odir+'word.npy', model.get_layer(name='word_emb').get_weights()[0])
Example #12
def get_sim_model(vocab_size,
                  X,
                  vocab_map,
                  vector_dim=64,
                  save_path='sim_model.ckpt'):
    emb = Embedding(input_dim=vocab_size,
                    output_dim=vector_dim,
                    input_length=1)
    word_input = Input(shape=(1, ))
    context_input = Input(shape=(1, ))
    word = emb(word_input)
    context = emb(context_input)
    vectorizer = Model(inputs=word_input, outputs=word)
    similarity = dot([word, context], axes=2, normalize=True)
    print(similarity.shape)
    sim_model = Model(inputs=[word_input, context_input], outputs=similarity)
    merged = dot([word, context], axes=0, normalize=False)
    merged = Flatten()(merged)
    output = Dense(
        1,
        activation='sigmoid',
    )(merged)
    model = Model(inputs=[word_input, context_input], outputs=output)
    print(model.summary())
    model.compile(loss='binary_crossentropy', optimizer='adam')
    print('Trainning embedding...')
    sim_cb = SimilarityCallback(vocab_map, sim_model, vocab_size)
    for e in range(25):
        sampling_table = make_sampling_table(vocab_size)
        couples, labels = skipgrams(X.flatten(),
                                    vocab_size,
                                    window_size=3,
                                    sampling_table=sampling_table)
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        for x in range(3):
            model.fit([word_target, word_context],
                      labels,
                      epochs=1,
                      verbose=1,
                      batch_size=2048,
                      shuffle=True,
                      validation_split=0.1)
        if e % 10 == 0:
            sim_cb.run_sim()
            sim_model.save(save_path)
            vectorizer.save('word2{}vec.ckpt'.format(vector_dim))

    #sim_cb.run_sim()
    return sim_model, vectorizer
Example #13
 def skipgrams(self, tokens, window_size=3):
     # Inputs and labels
     sampling_table = seq.make_sampling_table(self.vocab_size)
     skipgrams, labels = seq.skipgrams(tokens,
                                       self.vocab_size,
                                       window_size=window_size,
                                       shuffle=True,
                                       sampling_table=sampling_table)
     # convert the target and context vectors to int32 arrays
     word_target, word_context = zip(*skipgrams)
     word_target = np.array(word_target, dtype='int32')
     word_context = np.array(word_context, dtype='int32')
     return (word_target, word_context, labels)
    def _get_word_pairs(self, tokens_indices):
        """This method takes the token indices and creates positive a negative samples for training.

        :param list tokens_indices: A list of the tokens represented by their ranks.
        :return: word_pairs which is a list containing elements of the form (word1, word2) and a list
                 of labels i.e. 0 or 1.
        """
        sampling_table = make_sampling_table(size=self.vocabulary_size + 1)
        word_pairs, labels = skipgrams(sequence=tokens_indices,
                                       vocabulary_size=self.vocabulary_size,
                                       window_size=self.window_size,
                                       sampling_table=sampling_table)
        return word_pairs, labels
    def generate_samples(self, data):

        data = data[:17000000]
        sampling_table = sequence.make_sampling_table(self.vocabulary_size)
        couples, labels = skipgrams(data,
                                    self.vocabulary_size,
                                    window_size=self.skip_window,
                                    sampling_table=sampling_table)
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        return [word_target, word_context, labels]
def generate_data(corpus, window_size, V):
    for words in corpus:
        couples, labels = skipgrams(words,
                                    V,
                                    window_size,
                                    negative_samples=1,
                                    shuffle=True,
                                    sampling_table=make_sampling_table(
                                        V, sampling_factor=1e-05))
        if couples:
            X, y = zip(*couples)
            X = np_utils.to_categorical(X, V)
            y = np_utils.to_categorical(y, V)
            yield X, y
Example #17
def word_embedding(all_token, vocab_size):
	data, count, dic, rev_dic = dataset(all_token, vocab_size)

	win_size = 3
	vec_dim = 100
	epoch = 20

	valid_size = 16
	valid_win = 100
	valid_examples = np.random.choice(valid_win, valid_size, replace = False)

	sampling_table = sequence.make_sampling_table(vocab_size)
	couples, labels = skipgrams(data, vocab_size, window_size = win_size, sampling_table = sampling_table)
	word_target, word_context = zip(*couples)
	word_target = np.array(word_target, dtype="int32")
	word_context = np.array(word_context, dtype="int32")

	input_target = Input((1,))
	input_context = Input((1,))
	embedding = Embedding(vocab_size, vec_dim, input_length=1, name='embedding')

	target = embedding(input_target)
	target = Reshape((vec_dim, 1))(target)
	context = embedding(input_context)
	context = Reshape((vec_dim, 1))(context)

	similarity = merge([target, context], mode='cos', dot_axes=0)

	dot_product = merge([target, context], mode='dot', dot_axes=1)
	dot_product = Reshape((1,))(dot_product)
	# add the sigmoid output layer
	output = Dense(1, activation='sigmoid')(dot_product)

	model = Model(input=[input_target, input_context], output=output)
	model.compile(loss='binary_crossentropy', optimizer='rmsprop')

	arr_1 = np.zeros((1,))
	arr_2 = np.zeros((1,))
	arr_3 = np.zeros((1,))
	for cnt in range(epoch):
	    idx = np.random.randint(0, len(labels)-1)
	    arr_1[0,] = word_target[idx]
	    arr_2[0,] = word_context[idx]
	    arr_3[0,] = labels[idx]
	    loss = model.train_on_batch([arr_1, arr_2], arr_3)

	print '-------finish embedding----------'
	embedding_vector = model.get_weights()[0]
	return dic, embedding_vector
    def add_to_data(self):
        """Adds a randomly chosen file to the batch data
        """
        file_num = randint(0, len(self.relevant_file_names) - 1)
        year, file_name = self.relevant_file_names[file_num]
        # convert year to number between 0 and 1
        dec = self.year2dec(year)
        with open(file_name, "r") as coha_file:
            # first line is file number
            coha_file.readline()
            for line in coha_file:
                if line.strip() == "":
                    continue

                # converts words in training data to pseudowords for synthetic task
                # See paper for details
                if self.synth_task is not None:
                    words = line.rstrip().split()
                    adjusted_line = []
                    for word in words:
                        if word in self.synth_task.synth_task_words:
                            if word in self.synth_task.synth_task_w1_map:
                                datum = self.synth_task.synth_task_data[self.synth_task.synth_task_w1_map[word]]
                                if datum.w1_probs[year] > random():
                                    adjusted_line.append(word)
                            else:
                                datum = self.synth_task.synth_task_data[self.synth_task.synth_task_w2_map[word]]
                                if datum.w2_probs[year] > random():
                                    adjusted_line.append(datum.w1)

                        else:
                            adjusted_line.append(word)
                    line = " ".join(adjusted_line)

                wids = self.tokenizer.texts_to_sequences([line])[0]
                # Note that make_sampling_table estimates the sample probabilities using Zipf's law and does not
                # use the word counts in determining probabilities.
                sampling_table = make_sampling_table(self.vocab_size)
                # Note: skipgrams does not weigh sampling probabilities by unigram probability.
                pairs, labels = skipgrams(wids, self.vocab_size, window_size=self.args.window_size,
                                          negative_samples=self.args.num_negative_samples,
                                          sampling_table=sampling_table)
                # Add pair data to batch data
                self._curr_targets += [pair[0] for pair in pairs]
                self._curr_contexts += [pair[1] for pair in pairs]
                self._curr_labels += labels
                self._curr_times += len(pairs) * [dec]
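
The notes in `add_to_data` are worth keeping in mind: `make_sampling_table` derives its keep-probabilities purely from word rank under a Zipf assumption, so the actual corpus counts never influence the table. A small standalone check, assuming the TensorFlow-bundled Keras:

import numpy as np
from tensorflow.keras.preprocessing.sequence import make_sampling_table

table = make_sampling_table(10000)  # one keep-probability per word rank
print(table[:3])   # the most frequent ranks get the smallest keep-probabilities
print(table[-3:])  # rare ranks are kept with probability close to 1.0
print(np.all(table[1:] >= table[:-1]))  # non-decreasing with rank -> True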
def train_model(model):
    sampling_table = make_sampling_table(V, sampling_factor=args.sampling_factor)
    for epoch in range(args.epochs_to_train):
        loss = 0.
        for i, sent in enumerate(sentences):
            print('{}/{}'.format(i, len(sentences)))
            couples, labels = skipgrams(sequence=sent, vocabulary_size=V, window_size=args.window_size,
                                        negative_samples=args.num_neg_samples, sampling_table=sampling_table)
            if couples:
                words, contexts = zip(*couples)
                words = np.array(words, dtype=np.int32)
                contexts = np.array(contexts, dtype=np.int32)
                y = np.array(labels, dtype=np.int32)
                loss += model.train_on_batch([words, contexts], y)
        print('num epoch: {} loss: {}'.format(epoch, loss))

    return model
    def fit(self, X, y=None):
        self.build_vocab(X)
        self.build_graph()
        indexed_texts = self.texts_to_index(X)

        sampling_table = None
        if self.sort_vocab and self.use_sampling_table:
            sampling_table = make_sampling_table(self.vocab_size_)

        for epoch in trange(self.epochs):
            (batch_center,
             batch_context,
             batch_label) = generate_batch_data(
                indexed_texts, self.batch_size, self.vocab_size_, self.window_size,
                self.negative_samples, sampling_table)
            self.model_.train_on_batch([batch_center, batch_context], batch_label)

        return self
Example #21
def main():
    vocab_size = 10000
    data, count, dictionary, reversed_dict = collect_data(vocab_size)
    window_size = 3
    vector_dim = 300
    epochs = 2000000
    valid_size = 16
    valid_window = 100
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = sequence.skipgrams(
            data,
            vocab_size,
            window_size=window_size,
            sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype='int32')
    word_context = np.array(word_context, dtype='int32')

    print(couples[:10], labels[:10])
Example #22
 def build_dataset(self, words):
     count = [['UNK', -1]]
     count.extend(collections.Counter(words).most_common(self.L - 1))
     dictionary = dict()
     for word, _ in count:
         dictionary[word] = len(dictionary)
     data = list()
     unk_count = 0
     for word in words:
         if word in dictionary:
           index = dictionary[word]
         else:
           index = 0 
           unk_count += 1
         data.append(index)
     count[0][1] = unk_count
     reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
     self.data = np.array(data)
     self.count = count
     self.dictionary = dictionary
     self.words = [reverse_dictionary[x] for x in range(len(reverse_dictionary))]
     sampling_table = sequence.make_sampling_table(len(dictionary))
     couples, labels = skipgrams(data,
                                 len(dictionary),
                                 window_size=self.cs,
                                 sampling_table=sampling_table,
                                 negative_samples=self.ns)
     del data
     self.labels = np.array(labels)
     # labels[labels == 0] = -1
     word_target, word_context = zip(*couples)
     del couples
     self.word_target = np.array(word_target, dtype="int32")
     self.word_context = np.array(word_context, dtype="int32")
     with open('fits/vocab.tsv', 'w') as txt:
         for word in self.words:
             txt.write(word+'\n')
Example #23
def make_training_data(all_msgs, hot_encoder):

    def skipgram_input(word, hot_encoder):
        index = hot_encoder.get_index(word)
        return 0 if index is None else index + 1 #the skipgram function uses 0 to mean invalid word

    all_msgs = sort_messages(all_msgs)
    all_content = (msg['content'] for msg in all_msgs)
    all_words = [
        skipgram_input(word, hot_encoder) 
        for message in all_content
        for word in message
    ]

    sampling_table = sequence.make_sampling_table(NUM_INDEXED_WORDS + 1)  # make a table that estimates the frequency of each word occurring according to Zipf's law
    skip_grams = sequence.skipgrams( #using the keras skipgrams function because having to figure out the word frequencies and how many samples to do and stuff sounds really hard
        all_words, NUM_INDEXED_WORDS + 1, window_size = WINDOW_SIZE, sampling_table = sampling_table
    )
    input_pairs, output = skip_grams
    target_input, context_input = map(np.array, zip(*input_pairs)) #reshape input to be in proper form and convert to numpy arrays
    target_input -= 1 #have to convert back from the format that skipgrams wanted
    context_input -= 1

    return [target_input, context_input], output
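
The `+ 1` / `- 1` shifting in `make_training_data` exists because `skipgrams` treats index 0 as padding ("not a word") and silently skips it. A tiny sketch showing that index 0 never appears in the output (negative sampling disabled so only the positive window pairs are produced):

from tensorflow.keras.preprocessing.sequence import skipgrams

pairs, labels = skipgrams([0, 1, 2, 3], vocabulary_size=5, window_size=2,
                          negative_samples=0.0)
print(pairs)  # pairs are built only from indices 1..3; index 0 never appears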
Example #24
        continue
      valid_sequences += 1
    loss = train_batch(model, X_couples, y_labels)
    losses += loss
    if epoch % print_every == 0:
      logging.info("Mean loss in Epoch [%s] with %s valid sequences = %s" % (epoch, valid_sequences, losses / valid_sequences))
      losses, valid_sequences = 0.0, 0


if __name__ == "__main__":
  #g = Graph.Read_Edgelist("deepwalk/p2p-Gnutella08.edgelist")
  g = load_adjlist("deepwalk/karate.adjlist", directed=False)
  vocab_size = len(g.vs)
  max_len = 5
  save = True
  sampling_table = make_sampling_table(vocab_size)
  degrees = np.array(g.vs.degree())
  inv_sqrt_degree = 1/np.sqrt(degrees)
  sampling_table = inv_sqrt_degree/np.sum(inv_sqrt_degree)
  logging.info("Graph Summary: \n", summary(g))
  logging.info("Building Model")
  if save:
    model = cPickle.load(open("out/Karate.Model.3100.pkl"))
  else:
    model = cPickle.load("out/Karate.Model.3100.pkl")
    model = Sequential()
    model.add(WordContextProduct(vocab_size, proj_dim=300, init='uniform'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    #couples, labels = skipgrams(sequences[np.random.randint(vocab_size)], vocab_size, window_size=4, negative_samples=1.0, sampling_table=sampling_table)
    #train_on_model(model, g, vocab_size, print_every=1)
    #cPickle.dump(model, open("out/Karate.Model.3100.pkl", "wb"))
Example #25
 def test_make_sampling_table(self):
     a = preprocessing_sequence.make_sampling_table(3)
     self.assertAllClose(a,
                         np.asarray([0.00315225, 0.00315225, 0.00547597]),
                         rtol=.1)
def test_make_sampling_table():
    a = make_sampling_table(3)
    assert_allclose(a, np.asarray([0.00315225, 0.00315225, 0.00547597]),
                    rtol=.1)
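
The expected values in these two tests follow from the closed-form Zipf approximation Keras uses for the sampling table. Below is my own sketch of that formula (a re-derivation for illustration, not the library code itself):

import numpy as np

def zipf_sampling_table(size, sampling_factor=1e-5):
    # Approximate the inverse frequency of rank r via Zipf's law
    # (gamma is the Euler-Mascheroni constant), then keep each word with
    # probability min(1, sqrt(sampling_factor * inv_fq)).
    gamma = 0.577
    rank = np.arange(size)
    rank[0] = 1  # index 0 is "not a word"; treat it like rank 1
    inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1.0 / (12.0 * rank)
    return np.minimum(1.0, np.sqrt(sampling_factor * inv_fq))

print(zipf_sampling_table(3))  # approx. [0.00315225, 0.00315225, 0.00547597]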
if train_model:
	if load_model:
		print('Load model...')
		model = cPickle.load(open(os.path.join(save_dir, model_load_fname), 'rb'))
	else:
		print('Build model...')
		word = Sequential()
		word.add(Embedding(vocab_size,vector_dim, init='uniform'))
		context = Sequential()
		context.add(Embedding(vocab_size,vector_dim, init='uniform'))
		model = Sequential()
		model.add(Merge([word, context], mode='dot'))
		model.compile(loss='mse', optimizer='rmsprop')

	sampling_table = sequence.make_sampling_table(vocab_size)


	for e in range(nb_epoch):
		print('-'*40)
		print('Epoch',e)
		print('-'*40)

		progbar = generic_utils.Progbar(tokenizer.document_count)
		samples_seen = 0
		losses = []


		for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
			# get skipgram couples for one text in the dataset
			couples, labels = sequence.skipgrams(seq, vocab_size, window_size=4, negative_samples=1., sampling_table=sampling_table)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary
################################################################
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

window_size = 3
vector_dim = 300
epochs = 2000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
Example #29
def process(args):

  print "Loading graph..."
  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    #print("Walking...")
    #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
    #                                    path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    max_features = len(G.nodes())  # vocabulary size
    dim_proj = args.representation_size  # embedding space dimension
    nb_epoch = 1   # number of training epochs

    # Neural network ( in Keras )
    model = Sequential()
    model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
    model.compile(loss='mse', optimizer='rmsprop')
    sampling_table = sequence.make_sampling_table(max_features)

    print("Fitting tokenizer on walks...")
    tokenizer = text.Tokenizer(nb_words=max_features)

    print "Epochs: %d" % nb_epoch
    #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        #progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

#        for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )):

        for i, seq in enumerate( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) ):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_features, window_size=5, negative_samples=1., sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                print "Started fitting..."
                loss = model.fit(X, labels)

                print "Dumping..."

                # Dump weights to a temp file
                weights = model.layers[0].get_weights()[0]

                norm_weights = np_utils.normalize(weights)

                # TODO: save weights with indices
                np.savetxt( args.output, norm_weights )

                losses.append(loss)
                if len(losses) % 100 == 0:
    #                progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")

  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    #TODO: IMPLEMENT THAT
    print "Not implemented yet..."
    sys.exit(1)

  print "Optimization done. Saving..."
  # recover the embedding weights trained with skipgram:
  weights = model.layers[0].get_weights()[0]

  # we no longer need this
  del model

  norm_weights = np_utils.normalize(weights)

  # TODO: save weights with indices
  np.savetxt( args.output, norm_weights )
  print "Saved!"
Example #30
def sequence_make_sampling_table():
    print(sequence.make_sampling_table(5))
Example #31
def main(dname,
         encode_dir,
         raw_dir,
         odir='./resources/skipgrams/',
         mode='local'):
    # load corpus data
    raw_corpus = pd.read_csv(raw_dir + dname + '.tsv', sep='\t')

    # load user data
    user_idx = json.load(open(raw_dir + 'user_idx.json'))
    user_info = dict()
    user_control = set()  # control if renew user_info sample method
    with open(encode_dir + 'users.json') as dfile:
        for line in dfile:
            line = json.loads(line)
            user_info[line['uid']] = line
            user_info[line['uid']]['count'] = 0

    # load tokenizer
    tok = pickle.load(open(encode_dir + dname + '.tkn', 'rb'))
    params = {
        'window': 5,
        'vocab_size': tok.num_words,
        'user_size': len(user_info) + 1,  # +1 for unknown
        'emb_dim': 300,
        'word_emb_path': './resources/word_emb.npy',
        'user_emb_path': './resources/user_emb.npy',
        'word_emb_train': True,
        'user_emb_train': True,
        'user_task_weight': 1,
        'word_task_weight': 1,
        'epochs': 5,
        'optimizer': 'adam',
        'lr': 1e-5,
    }
    word_sampling_table = make_sampling_table(size=params['vocab_size'])

    ww_model, uw_model = build_model(params)
    print()
    print(params)

    for epoch in range(params['epochs']):
        loss = 0
        # shuffle the data
        raw_corpus = raw_corpus.sample(frac=1).reset_index(drop=True)
        for step, entry in raw_corpus.iterrows():
            '''word info, ww: word-word'''
            encode_doc = tok.texts_to_sequences([entry.text])
            ww_pairs, ww_labels = skipgrams(
                sequence=encode_doc[0],
                vocabulary_size=params['vocab_size'],
                window_size=params['window'])

            word_pairs = [np.array(x) for x in zip(*ww_pairs)]
            ww_labels = np.array(ww_labels, dtype=np.int32)
            '''user info, uw: user-word'''
            cur_user = user_info[entry.uid]

            if mode == 'local':
                uw_pairs, uw_labels = utils.user_word_sampler(
                    uid=cur_user['uid_encode'],
                    sequence=encode_doc[0],
                    vocab_size=params['vocab_size'],
                    filter_words=set(cur_user['words']),
                    negative_samples=1)
                uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                uw_labels = np.array(uw_labels, dtype=np.int32)
            elif mode == 'decay':
                decay_num = utils.sample_decay(cur_user['count'])
                if decay_num > np.random.random():
                    uw_pairs, uw_labels = utils.user_word_sampler(
                        uid=cur_user['uid_encode'],
                        sequence=set(cur_user['words']),
                        vocab_size=params['vocab_size'],
                        negative_samples=1)
                    uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                    uw_labels = np.array(uw_labels, dtype=np.int32)

                    user_info[entry.uid]['count'] += 1
                    user_control.add(entry.uid)
                else:
                    uw_pairs = None
                    uw_labels = None

                if len(user_control) >= len(user_info) - 10:
                    # restart the control for sampling
                    for uid in user_info:
                        user_info[uid]['count'] = 0
                    user_control.clear()
            elif mode == 'global':
                uw_pairs, uw_labels = utils.user_word_sampler(
                    uid=cur_user['uid_encode'],
                    sequence=set(cur_user['words']),
                    vocab_size=params['vocab_size'],
                    negative_samples=1)
                uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                uw_labels = np.array(uw_labels, dtype=np.int32)
            else:
                raise ValueError('Mode {} does not exist!'.format(mode))
            '''Train'''
            if word_pairs:
                loss += ww_model.train_on_batch(word_pairs, ww_labels)
            if uw_pairs:
                loss += uw_model.train_on_batch(uw_pairs, uw_labels)

            loss_avg = loss / (step + 1)
            if step % 100 == 0:
                print('Epoch: {}, Step: {}'.format(epoch, step))
                print('\tLoss: {}.'.format(loss_avg))
                print('-------------------------------------------------')

        # save the model
        ww_model.save(odir + 'ww_model_{}.h5'.format(epoch))
        uw_model.save(odir + 'uw_model_{}.h5'.format(epoch))
        # save the word embedding
        np.save(odir + 'word_{}.npy'.format(epoch),
                ww_model.get_layer(name='word_emb').get_weights()[0])
        # save the user embedding
        np.save(odir + 'user_{}.npy'.format(epoch),
                uw_model.get_layer(name='user_emb').get_weights()[0])

    # save the model
    ww_model.save(odir + 'ww_model.h5')
    uw_model.save(odir + 'uw_model.h5')
    # save the word embedding
    np.save(odir + 'word.npy',
            ww_model.get_layer(name='word_emb').get_weights()[0])
    # save the user embedding
    np.save(odir + 'user.npy',
            uw_model.get_layer(name='user_emb').get_weights()[0])
Example #32
#model.add(LSTM(layers[2], return_sequences=False))
#model.add(Dropout(0.2))
model.add(Dense(output_dim=layers[3]))  # for skipgram
model.add(Activation('sigmoid'))  # for skipgram
#model.add(Activation('softmax'))         # for sequences
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop')  # buildSequences
print('Compile model...')
model.compile(loss='mse', optimizer='rmsprop')  # buildSkipgram

# training process
if train_model:
    if load_model:
        print('Load model...')
        model.load_weights(os.path.join(save_dir, model_load_fname))

    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(
                tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq,
                                                 max_features,
Example #33
def main():
	scoreList = [0.0,0.0]
	with open('data/info.json') as j:
		info = ujson.load(j)
	for problem in os.listdir('data'):
		greek=False
		if problem.startswith('problem'):
			truthPath = 'data/truth/'+problem+'/clustering.json'
			with open(truthPath) as t:
				truth = ujson.load(t)
			print(problem)
			probTokList = []
			docList = []
			docDict = {}
			X=[]
			Y=[]

			path = 'data/' + problem
			for entry in info:
				if entry["folder"] == problem:
					lang=entry["language"]
					if entry["language"] == "gr":
						greek=True

			CV = CountVectorizer(input='filename', strip_accents='unicode', analyzer='word', ngram_range=(1,4))
			docs = [path+'/'+x for x in os.listdir(path)]
			cMatrix = CV.fit_transform(docs)
			for doc in os.listdir(path):
				docTokList = []
				with open(path + '/' + doc) as d:
						article = d.readlines()
						for sent in article:
							sentTokList = []
							for word in sent.split():
								for token in word:
									procToken = preprop(token,greek)
									sentTokList.append(procToken) #Every item of the list is a normalized character
							docTokList.append(' '.join(sentTokList))#Every item of the list is a sentence
				probTokList.append(' '.join(docTokList))#Every item of the list is a document
				docList.append(doc)
			tokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ")
			tokenizer.fit_on_texts(probTokList)
			seqList = tokenizer.texts_to_sequences(probTokList)
			
			uniqueTokens = max([max(x) for x in seqList])

			print(uniqueTokens,lang)
			sampling_table = sequence.make_sampling_table(uniqueTokens+1)
			for i,seq in enumerate(seqList):
				x, y = sequence.skipgrams(seq, uniqueTokens, window_size=4, negative_samples=1.0, categorical=False, sampling_table=sampling_table)
				x = zip(x, y)
				X.append(x)
				#Y.extend(y)
				docDict[docList[i]] = seq
			strX=[str(x) for x in X]
			xTokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ")
			xTokenizer.fit_on_texts(strX)
			#docMatrix = tokenizer.sequences_to_matrix(seqList,mode="tfidf")
			docMatrix = xTokenizer.sequences_to_matrix(strX,mode="tfidf")
			#scores = embedNN(X,Y)
			pairs = combinations(docDict.keys(),2)
			cList = []
			nnDict = {}
			for cluster in truth:
				cPairs = []
				if len(cluster) > 1:
					for item in cluster:
						cPairs.append(str(item["document"]))
					cList.extend(list(permutations(cPairs,2)))
			for pair in pairs:
				match = False
				if pair in cList:
					match = True
				nnDict[pair] = match
			for i, doc in enumerate(docMatrix):
				docDict[docList[i]] = doc

			
			truthCounter =  Counter(nnDict.values())
			baseline = 1-float(truthCounter[True])/float(len(nnDict))
			print("Baseline for {} is {}".format(problem, baseline))
			clusterCount = Counter()
			kmclusters = False # Change to False for meanshift
			if kmclusters:
				pbar = ProgressBar()
				for nclusters in pbar(reversed(range(len(docMatrix)-1))):
					#print("{} Clusters".format(nclusters+1))
					clusters = KMclusterer(nclusters+1,cMatrix)
					for c in range(nclusters+1):
						#print(c,"has:",[i for i,x in enumerate(clusters) if x == c])
						for clusterpair in list(combinations([i for i,x in enumerate(clusters) if x == c],2)):
							combo = (docList[clusterpair[0]],docList[clusterpair[1]])
							clusterCount[combo] +=1
			else:
				clusters = KMclusterer(int(len(docMatrix)*0.67),docMatrix)
				#clusters = MSclusterer(cMatrix)#cMatrixdocMatrix
				for clusterpair in list(combinations([i for i,x in enumerate(clusters)],2)):
					combo = (docList[clusterpair[0]],docList[clusterpair[1]])
					clusterCount[combo] +=1

			x = 0.0 
			scoreList[0] += truthCounter[True]
			deleteList = []
			#print("Most common cluster is in {}%".format((float(clusterCount.most_common(20)[19][1])/len(docMatrix))*100))
			for combo in nnDict.keys():
				if combo not in clusterCount.keys():
					deleteList.append(combo)
			y = 0.0
			for item in deleteList:
				if item in cList:
					y+=1
				del nnDict[item]
			scores = sharedNN(docDict, nnDict)
			print("Deleted pairs are {}% of total correct pairs, {}% of deleted pairs was wrongly deleted".format(round(y/len(cList)*100.0,2), round(y/len(deleteList)*100.0,2)))

			for combo in clusterCount.most_common(20):
				if combo[0] in cList:
					x += 1
					scoreList[1] += 1
			print("prec: {}".format(x/20))
			#print("Document score is {} clusters correct out of {} (accuracy {})".format(x, truthCounter[True], x/truthCounter[True]))
			#print("prec: {} \nrec: {}".format(x/20, x/len(nnDict.values())))

	#print("Total precision  is {}, {} clusters correct".format(scoreList[1]/scoreList[0], scoreList[1]))


			if not os.path.exists('answers/'+problem):
				os.mkdir('answers/'+problem)
			clusDict = defaultdict(list)
			rankDict = defaultdict(list)
			for i, cluster in enumerate(list(clusters)):
				clusDict[cluster] .append({"document": docList[i]})
				rankDict[cluster] .append(docList[i])
			with open('answers/'+problem+'/clustering.json', "w") as jsonFile:
				ujson.dump(list(clusDict.values()), jsonFile, indent=4)
			rankList = []
			for value in rankDict.values():
				if len(value) > 1 :
					pairs = combinations(value,2)
				for pair in pairs:
					rankList.append({"document1": pair[0], "document2": pair[1], "score":  scores[pair][0]})
			with open('answers/'+problem+'/ranking.json', "w") as jsonFile:
				ujson.dump(rankList, jsonFile, indent=4)
nb_epoch = 5
skip_top = 10
dim_proj = 256
max_features = 1000

tokenizer = text.Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(text_generator())

# ----- Training -----

model = Sequential()
model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
model.compile(loss='mse', optimizer='rmsprop')

sampling_table = sequence.make_sampling_table(max_features)

for e in range(nb_epoch):
    print 'Epoch:', e
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen, losses = 0, []
    for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
        couples, labels = sequence.skipgrams(seq, max_features, window_size=4, negative_samples=1., sampling_table=sampling_table)
        if couples:
            X = np.array(couples, dtype="int32")
            loss = model.train_on_batch(X, labels)
            losses.append(loss)
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)
Example #35
    def train(self, model_config, wids, word2id):
        vocab_size = len(word2id.values())
        sampling_table = sequence.make_sampling_table(vocab_size)
        wids_flat = [word for sentence in wids for word in sentence]
        couples, labels = skipgrams(wids_flat, vocab_size, window_size=model_config['window_size'], sampling_table=sampling_table)
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        input_target = Input((1,))
        input_context = Input((1,))
        vector_dim = model_config['number_of_dimensions_in_hidden_layer']

        embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
        target = embedding(input_target)
        target = Reshape((vector_dim, 1))(target)
        context = embedding(input_context)
        context = Reshape((vector_dim, 1))(context)

        # setup a cosine similarity operation which will be output in a secondary model
        similarity = merge([target, context], mode='cos', dot_axes=0)

        # now perform the dot product operation to get a similarity measure
        dot_product = merge([target, context], mode='dot', dot_axes=1)
        dot_product = Reshape((1,))(dot_product)
        # add the sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)
        # create the primary training model
        model = Model(input=[input_target, input_context], output=output)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')

        validation_model = Model(input=[input_target, input_context], output=similarity)

        class SimilarityCallback:
            def run_sim(self):
                valid_size = 16  # Random set of words to evaluate similarity on.
                valid_window = 100  # Only pick dev samples in the head of the distribution.
                valid_examples = np.random.choice(valid_window, valid_size, replace=False)
                reverse_dictionary = dict(zip(word2id.values(), word2id.keys()))
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    sim = self._get_sim(valid_examples[i])
                    nearest = (-sim).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

            @staticmethod
            def _get_sim(valid_word_idx):
                sim = np.zeros((vocab_size,))
                in_arr1 = np.zeros((1,))
                in_arr2 = np.zeros((1,))
                in_arr1[0,] = valid_word_idx
                for i in range(vocab_size):
                    in_arr2[0,] = i
                    out = validation_model.predict_on_batch([in_arr1, in_arr2])
                    sim[i] = out
                return sim

        sim_cb = SimilarityCallback()

        arr_1 = np.zeros((1,))
        arr_2 = np.zeros((1,))
        arr_3 = np.zeros((1,))
        for cnt in range(model_config['epochs']):
            idx = np.random.randint(0, len(labels) - 1)
            arr_1[0,] = word_target[idx]
            arr_2[0,] = word_context[idx]
            arr_3[0,] = labels[idx]
            loss = model.train_on_batch([arr_1, arr_2], arr_3)
            if cnt % 100 == 0:
                print("Iteration {}, loss={}".format(cnt, loss))
            # if cnt % 10000 == 0:
            #     sim_cb.run_sim()

        return model, model.get_weights()[0]