def __generate_skipgrams(self, documents):
    # generate skipgrams
    print('creating sents ({} rows)'.format(len(documents)))
    # sents = newsgroups_train.data
    sents = filter_sentences(documents)
    self.filtered_sents = sents
    print('tokenizing sents ({} sentences)'.format(len(sents)))
    self.tokenizer = Tokenizer(num_words=self.vocabulary_size, lower=True, filters=self.filters)
    self.tokenizer.fit_on_texts(sents)
    self.word_index_inv = {v: k for k, v in self.tokenizer.word_index.items()}
    sequences = self.tokenizer.texts_to_sequences(sents)
    sampling_table = make_sampling_table(self.vocabulary_size, sampling_factor=0.001)
    print('generating couples')
    couples = []
    labels = []
    for seq in sequences:
        c, l = skipgrams(seq, vocabulary_size=self.vocabulary_size,
                         window_size=self.window_size, shuffle=True,
                         sampling_table=sampling_table,
                         negative_samples=self.neg_samples)
        couples.extend(c)
        labels.extend(l)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    return word_target, word_context, labels
def build(self, training_file, min_count=5, estimate=0):
    # training_file: tab-separated list (focus, target, adversarial_label) of skipgram pairs, gzipped
    # min_count: discard words that appear less than X times
    # estimate: estimate vocabulary using X words, 0 to read all
    c_focus, c_target, adversarials, word_counter = self.read_text(training_file, estimate)
    focus_words = {"<MASK>": 0, "<UNK>": 1}
    target_words = {"<MASK>": 0, "<UNK>": 1}
    filtered_word_count = 0
    for w, count in c_focus.most_common():
        if count < min_count:
            break
        focus_words[w] = len(focus_words)
        filtered_word_count += count
    for w, count in c_target.most_common():
        if count < min_count:
            break
        target_words[w] = len(target_words)
    adv_labels = {}
    for label in adversarials:
        adv_labels[label] = len(adv_labels)
    self.focus_words = focus_words
    self.target_words = target_words
    self.vocab_size = len(self.focus_words)
    self.inverted_words = self.invert(self.focus_words)
    self.adversarial_labels = adv_labels
    print("Vocabulary created with {w} words.".format(w=self.vocab_size), file=sys.stderr)
    if estimate == 0 or estimate > word_counter:
        self.total_word_count = filtered_word_count
    else:
        self.total_word_count = None  # unknown
    self.sampling_table = make_sampling_table(len(self.focus_words))
def train_corpus(self, negative_samples=20, window_size=4):
    """
    Train the model on the given corpus.

    Parameters:
        negative_samples (int): the number of `false contexts' for each word
        window_size (int): the size of each context
    """
    logging.info('Initialising sampling table')
    sampling_table = sequence.make_sampling_table(self.vocab_size)
    ans = []
    for i, seq in enumerate(self.tokenizer.texts_to_sequences_generator(self.corpus)):
        logging.info(i)
        couples, labels = sequence.skipgrams(
            seq, self.vocab_size, window_size=window_size,
            negative_samples=negative_samples, sampling_table=sampling_table)
        if couples:
            word_target, word_context = zip(*couples)
            word_target = np.array(word_target, dtype="int32")
            word_context = np.array(word_context, dtype="int32")
            loss = self.model.train_on_batch([word_target, word_context], labels)
            ans.append(loss)
    return ans
def get_skips(self, docs):
    """
    Formats the data and generates negative samples.

    :param docs: list; a list of documents; each document is a list of sentences;
        a sentence is a list of tokens (strings)
    :return: tuple; contains the center and context words, and the corresponding labels
    """
    sampling_table = make_sampling_table(self.vocab_size)
    center_words, context_words, labels = [], [], []
    for doc in docs:
        tokens = [token for sent in doc for token in sent]
        pairs, labels_ = skipgrams(tokens, self.vocab_size,
                                   window_size=self.window_size,
                                   sampling_table=sampling_table)
        try:
            center, context = zip(*pairs)
        except ValueError:
            continue
        center_words += center
        context_words += context
        labels += labels_
    return center_words, context_words, labels
def _fit_embeddings(self, text):
    sampling_table = sequence.make_sampling_table(max_words)
    for e in range(self.n_epochs):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)
        progbar = generic_utils.Progbar(self.tokenizer.document_count)
        samples_seen = 0
        losses = []
        for i, seq in enumerate(self.tokenizer.texts_to_sequences_generator(text)):
            # MAKE SURE TOKENIZER AND FITTING ARE WORKING
            # if i < 5:
            #     print(map(lambda x: reverse_word_index[x], seq))
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_words,
                                                 window_size=self.window_size,
                                                 negative_samples=1.,
                                                 sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                loss = self.embedding_model.train_on_batch(X, labels)
                losses.append(loss)
                if len(losses) % 100 == 0:
                    progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")
    return self
def format_data(data):
    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = sequence.skipgrams(data, vocab_size,
                                         window_size=context_size,
                                         sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    labels = np.array(labels, dtype="int32")
    return word_target, word_context, labels
def build_dataset(self, words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(self.L - 1))
    self.logger.debug("original count " + str(len(count)))
    dictionary = dict()
    self.counter = dict()
    for word, _ in count:
        if self.emb_type:
            if word == 'UNK' or (word in self.word2vec_embedings.vocab
                                 and word in self.glove_embedings
                                 and word in self.fasttext_embedings):
                dictionary[word] = len(dictionary)
                self.counter[word] = _
            else:
                self.logger.debug(word + " not in embeds")
        else:
            dictionary[word] = len(dictionary)
    del self.word2vec_embedings
    del self.glove_embedings
    del self.fasttext_embedings
    del self.custom_embedings
    self.L = len(dictionary)
    self.logger.debug("dictionary size " + str(len(dictionary)))
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    data = np.array(data)
    self.count = count
    self.dictionary = dictionary
    self.words = [reverse_dictionary[x] for x in range(len(reverse_dictionary))]
    self.logger.debug('....building samples')
    self.sampling_table = sequence.make_sampling_table(len(dictionary))
    couples, labels = skipgrams(data, len(dictionary), window_size=self.cs,
                                sampling_table=self.sampling_table,
                                negative_samples=self.ns)
    del data
    self.labels = np.array(labels)
    # labels[labels == 0] = -1
    word_target, word_context = zip(*couples)
    del couples
    self.word_target = np.array(word_target, dtype="int32")
    self.word_context = np.array(word_context, dtype="int32")
    self.logger.debug('....corpus generated')
    with open(self.dir_name + '/vocab.tsv', 'w') as txt:
        for word in self.words:
            txt.write(word + '\n')
    self.logger.debug('....vocab written')
def build(self):
    sampling_table = make_sampling_table(self.dictionary_size)
    word_pairs, labels = skipgrams(self.text_data, self.dictionary_size,
                                   window_size=self.window_size,
                                   sampling_table=sampling_table)
    # print(word_pairs)
    word_targets, word_contexts = zip(*word_pairs)
    word_contexts = np.array(word_contexts, dtype="int32")
    word_targets = np.array(word_targets, dtype="int32")
    labels = np.array(labels, dtype="int32")
    return word_contexts, word_targets, labels
def get_data(self, data, vocab_size, window_size):
    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    return self.DataSet(word_target, word_context, labels)
def main(dname, encode_dir, raw_dir, odir='./resources/skipgrams/'):
    # load tokenizer
    tok = pickle.load(open(encode_dir + dname + '.tkn', 'rb'))
    params = {
        'window': 5,
        'vocab_size': tok.num_words,
        'emb_dim': 300,
        'word_emb_path': './resources/word_emb.npy',
        'epochs': 5,
        'optimizer': 'adam',
        'lr': 1e-5,
    }
    word_sampling_table = make_sampling_table(size=params['vocab_size'])

    # load the data
    raw_corpus = pd.read_csv(raw_dir + dname + '.tsv', sep='\t')

    # build and train the model
    print(params)
    print()
    model = build_model(params)
    print(model.summary())

    for epoch in range(params['epochs']):
        loss = 0
        # shuffle the data
        raw_corpus = raw_corpus.sample(frac=1).reset_index(drop=True)

        for step, doc in enumerate(raw_corpus.text):
            encode_doc = tok.texts_to_sequences([doc])
            word_pairs, word_labels = skipgrams(
                sequence=encode_doc[0],
                vocabulary_size=params['vocab_size'],
                window_size=params['window'])
            x = [np.array(x) for x in zip(*word_pairs)]
            y = np.array(word_labels, dtype=np.int32)

            if word_pairs:
                loss += model.train_on_batch(x, y)

            if step % 100 == 0 and step > 0:  # skip step 0 to avoid division by zero
                loss_avg = loss / step
                print('Epoch: {}, Step: {}'.format(epoch, step))
                print('\tLoss: {}.'.format(loss_avg))
                print('-------------------------------------------------')

        # save the model
        model.save(odir + 'ww_model_{}.h5'.format(epoch))
        # save the word embedding
        np.save(odir + 'word_{}.npy'.format(epoch),
                model.get_layer(name='word_emb').get_weights()[0])

    # save the model
    model.save(odir + 'ww_model.h5')
    # save the word embedding
    np.save(odir + 'word.npy', model.get_layer(name='word_emb').get_weights()[0])
def get_sim_model(vocab_size, X, vocab_map, vector_dim=64, save_path='sim_model.ckpt'):
    emb = Embedding(input_dim=vocab_size, output_dim=vector_dim, input_length=1)
    word_input = Input(shape=(1,))
    context_input = Input(shape=(1,))
    word = emb(word_input)
    context = emb(context_input)

    vectorizer = Model(inputs=word_input, outputs=word)

    similarity = dot([word, context], axes=2, normalize=True)
    print(similarity.shape)
    sim_model = Model(inputs=[word_input, context_input], outputs=similarity)

    merged = dot([word, context], axes=0, normalize=False)
    merged = Flatten()(merged)
    output = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=[word_input, context_input], outputs=output)
    print(model.summary())
    model.compile(loss='binary_crossentropy', optimizer='adam')

    print('Training embedding...')
    sim_cb = SimilarityCallback(vocab_map, sim_model, vocab_size)
    for e in range(25):
        sampling_table = make_sampling_table(vocab_size)
        couples, labels = skipgrams(X.flatten(), vocab_size, window_size=3,
                                    sampling_table=sampling_table)
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")
        for x in range(3):
            model.fit([word_target, word_context], labels, epochs=1, verbose=1,
                      batch_size=2048, shuffle=True, validation_split=0.1)
        if e % 10 == 0:
            sim_cb.run_sim()
    sim_model.save(save_path)
    vectorizer.save('word2{}vec.ckpt'.format(vector_dim))
    # sim_cb.run_sim()
    return sim_model, vectorizer
def skipgrams(self, tokens, window_size=3):
    # Inputs and labels
    sampling_table = seq.make_sampling_table(self.vocab_size)
    skipgrams, labels = seq.skipgrams(tokens, self.vocab_size,
                                      window_size=window_size, shuffle=True,
                                      sampling_table=sampling_table)
    # convert the target and context vectors to int32 arrays
    word_target, word_context = zip(*skipgrams)
    word_target = np.array(word_target, dtype='int32')
    word_context = np.array(word_context, dtype='int32')
    return (word_target, word_context, labels)
def _get_word_pairs(self, tokens_indices):
    """This method takes the token indices and creates positive and negative samples
    for training.

    :param list tokens_indices: A list of the tokens represented by their ranks.
    :return: word_pairs, a list containing elements of the form (word1, word2),
        and a list of labels, i.e. 0 or 1.
    """
    sampling_table = make_sampling_table(size=self.vocabulary_size + 1)
    word_pairs, labels = skipgrams(sequence=tokens_indices,
                                   vocabulary_size=self.vocabulary_size,
                                   window_size=self.window_size,
                                   sampling_table=sampling_table)
    return word_pairs, labels
def generate_samples(self, data):
    data = data[:17000000]
    sampling_table = sequence.make_sampling_table(self.vocabulary_size)
    couples, labels = skipgrams(data, self.vocabulary_size,
                                window_size=self.skip_window,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    return [word_target, word_context, labels]
def generate_data(corpus, window_size, V):
    for words in corpus:
        couples, labels = skipgrams(words, V, window_size,
                                    negative_samples=1, shuffle=True,
                                    sampling_table=make_sampling_table(V, sampling_factor=1e-05))
        if couples:
            X, y = zip(*couples)
            X = np_utils.to_categorical(X, V)
            y = np_utils.to_categorical(y, V)
            yield X, y
def word_embedding(all_token, vocab_size):
    data, count, dic, rev_dic = dataset(all_token, vocab_size)
    win_size = 3
    vec_dim = 100
    epoch = 20
    valid_size = 16
    valid_win = 100
    valid_examples = np.random.choice(valid_win, valid_size, replace=False)

    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = skipgrams(data, vocab_size, window_size=win_size,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    input_target = Input((1,))
    input_context = Input((1,))
    embedding = Embedding(vocab_size, vec_dim, input_length=1, name='embedding')
    target = embedding(input_target)
    target = Reshape((vec_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((vec_dim, 1))(context)

    similarity = merge([target, context], mode='cos', dot_axes=0)
    dot_product = merge([target, context], mode='dot', dot_axes=1)
    dot_product = Reshape((1,))(dot_product)
    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)

    model = Model(input=[input_target, input_context], output=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    arr_1 = np.zeros((1,))
    arr_2 = np.zeros((1,))
    arr_3 = np.zeros((1,))
    for cnt in range(epoch):
        idx = np.random.randint(0, len(labels) - 1)
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
    print('-------finish embedding----------')
    embedding_vector = model.get_weights()[0]
    return dic, embedding_vector
def add_to_data(self):
    """Adds a randomly chosen file to the batch data."""
    file_num = randint(0, len(self.relevant_file_names) - 1)
    year, file_name = self.relevant_file_names[file_num]
    # convert year to a number between 0 and 1
    dec = self.year2dec(year)
    with open(file_name, "r") as coha_file:
        # first line is the file number
        coha_file.readline()
        for line in coha_file:
            if line.strip() == "":
                continue
            # converts words in the training data to pseudowords for the synthetic task
            # See paper for details
            if self.synth_task is not None:
                words = line.rstrip().split()
                adjusted_line = []
                for word in words:
                    if word in self.synth_task.synth_task_words:
                        if word in self.synth_task.synth_task_w1_map:
                            datum = self.synth_task.synth_task_data[self.synth_task.synth_task_w1_map[word]]
                            if datum.w1_probs[year] > random():
                                adjusted_line.append(word)
                        else:
                            datum = self.synth_task.synth_task_data[self.synth_task.synth_task_w2_map[word]]
                            if datum.w2_probs[year] > random():
                                adjusted_line.append(datum.w1)
                    else:
                        adjusted_line.append(word)
                line = " ".join(adjusted_line)
            wids = self.tokenizer.texts_to_sequences([line])[0]
            # Note that make_sampling_table estimates the sample probabilities using Zipf's law
            # and does not use the word counts in determining probabilities.
            sampling_table = make_sampling_table(self.vocab_size)
            # Note: skipgrams does not weigh sampling probabilities by unigram probability.
            pairs, labels = skipgrams(wids, self.vocab_size,
                                      window_size=self.args.window_size,
                                      negative_samples=self.args.num_negative_samples,
                                      sampling_table=sampling_table)
            # Add pair data to batch data
            self._curr_targets += [pair[0] for pair in pairs]
            self._curr_contexts += [pair[1] for pair in pairs]
            self._curr_labels += labels
            self._curr_times += len(pairs) * [dec]
def train_model(model):
    sampling_table = make_sampling_table(V, sampling_factor=args.sampling_factor)
    for epoch in range(args.epochs_to_train):
        loss = 0.
        for i, sent in enumerate(sentences):
            print('{}/{}'.format(i, len(sentences)))
            couples, labels = skipgrams(sequence=sent, vocabulary_size=V,
                                        window_size=args.window_size,
                                        negative_samples=args.num_neg_samples,
                                        sampling_table=sampling_table)
            if couples:
                words, contexts = zip(*couples)
                words = np.array(words, dtype=np.int32)
                contexts = np.array(contexts, dtype=np.int32)
                y = np.array(labels, dtype=np.int32)
                loss += model.train_on_batch([words, contexts], y)
        print('num epoch: {} loss: {}'.format(epoch, loss))
    return model
def fit(self, X, y=None):
    self.build_vocab(X)
    self.build_graph()
    indexed_texts = self.texts_to_index(X)

    sampling_table = None
    if self.sort_vocab and self.use_sampling_table:
        sampling_table = make_sampling_table(self.vocab_size_)

    for epoch in trange(self.epochs):
        (batch_center,
         batch_context,
         batch_label) = generate_batch_data(indexed_texts, self.batch_size,
                                            self.vocab_size_, self.window_size,
                                            self.negative_samples, sampling_table)
        self.model_.train_on_batch([batch_center, batch_context], batch_label)
    return self
def main():
    vocab_size = 10000
    data, count, dictionary, reversed_dict = collect_data(vocab_size)
    window_size = 3
    vector_dim = 300
    epochs = 2000000

    valid_size = 16
    valid_window = 100
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    sampling_table = sequence.make_sampling_table(vocab_size)
    couples, labels = sequence.skipgrams(data, vocab_size,
                                         window_size=window_size,
                                         sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype='int32')
    word_context = np.array(word_context, dtype='int32')
    print(couples[:10], labels[:10])
def build_dataset(self, words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(self.L - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    self.data = np.array(data)
    self.count = count
    self.dictionary = dictionary
    self.words = [reverse_dictionary[x] for x in range(len(reverse_dictionary))]

    sampling_table = sequence.make_sampling_table(len(dictionary))
    couples, labels = skipgrams(data, len(dictionary), window_size=self.cs,
                                sampling_table=sampling_table,
                                negative_samples=self.ns)
    del data
    self.labels = np.array(labels)
    # labels[labels == 0] = -1
    word_target, word_context = zip(*couples)
    del couples
    self.word_target = np.array(word_target, dtype="int32")
    self.word_context = np.array(word_context, dtype="int32")

    with open('fits/vocab.tsv', 'w') as txt:
        for word in self.words:
            txt.write(word + '\n')
def make_training_data(all_msgs, hot_encoder):
    def skipgram_input(word, hot_encoder):
        index = hot_encoder.get_index(word)
        # the skipgrams function uses 0 to mean an invalid word
        return 0 if index is None else index + 1

    all_msgs = sort_messages(all_msgs)
    all_content = (msg['content'] for msg in all_msgs)
    all_words = [skipgram_input(word, hot_encoder)
                 for message in all_content
                 for word in message]

    # make a table that estimates the frequency of each word occurring according to Zipf's law
    sampling_table = sequence.make_sampling_table(NUM_INDEXED_WORDS + 1)
    # use the keras skipgrams function, which handles word frequencies and negative sampling for us
    skip_grams = sequence.skipgrams(all_words, NUM_INDEXED_WORDS + 1,
                                    window_size=WINDOW_SIZE,
                                    sampling_table=sampling_table)
    input_pairs, output = skip_grams
    # reshape the input into the proper form and convert to numpy arrays
    target_input, context_input = map(np.array, zip(*input_pairs))
    # convert back from the shifted indexing that skipgrams wanted
    target_input -= 1
    context_input -= 1
    return [target_input, context_input], output
            continue
        valid_sequences += 1
        loss = train_batch(model, X_couples, y_labels)
        losses += loss
        if epoch % print_every == 0:
            logging.info("Mean loss in Epoch [%s] with %s valid sequences = %s"
                         % (epoch, valid_sequences, losses / valid_sequences))
            losses, valid_sequences = 0.0, 0


if __name__ == "__main__":
    # g = Graph.Read_Edgelist("deepwalk/p2p-Gnutella08.edgelist")
    g = load_adjlist("deepwalk/karate.adjlist", directed=False)
    vocab_size = len(g.vs)
    max_len = 5
    save = True

    # rank-based sampling table, then overridden with a degree-based table
    sampling_table = make_sampling_table(vocab_size)
    degrees = np.array(g.vs.degree())
    inv_sqrt_degree = 1 / np.sqrt(degrees)
    sampling_table = inv_sqrt_degree / np.sum(inv_sqrt_degree)

    logging.info("Graph Summary: \n%s", summary(g))
    logging.info("Building Model")
    if save:
        model = cPickle.load(open("out/Karate.Model.3100.pkl"))
    else:
        model = Sequential()
        model.add(WordContextProduct(vocab_size, proj_dim=300, init='uniform'))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    # couples, labels = skipgrams(sequences[np.random.randint(vocab_size)], vocab_size,
    #                             window_size=4, negative_samples=1.0, sampling_table=sampling_table)
    # train_on_model(model, g, vocab_size, print_every=1)
    # cPickle.dump(model, open("out/Karate.Model.3100.pkl", "wb"))
def test_make_sampling_table(self):
    a = preprocessing_sequence.make_sampling_table(3)
    self.assertAllClose(a,
                        np.asarray([0.00315225, 0.00315225, 0.00547597]),
                        rtol=.1)
def test_make_sampling_table():
    a = make_sampling_table(3)
    assert_allclose(a, np.asarray([0.00315225, 0.00315225, 0.00547597]),
                    rtol=.1)
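# The two unit tests above pin down what make_sampling_table returns: a 1-D array whose
# i-th entry is the keep-probability for the word of rank i, estimated from a Zipf
# approximation rather than from observed counts (see the notes in add_to_data above).
# Below is a minimal, self-contained sketch of how that table feeds into skipgrams;
# the toy sequence and vocab_size are made up for illustration, and token ids are
# assumed to be ordered by frequency (low id = frequent word), as the table expects.
from keras.preprocessing.sequence import make_sampling_table, skipgrams

sequence = [1, 2, 3, 1, 4, 1, 2, 5, 1, 3]  # toy corpus of word ids
vocab_size = 6                             # ids 0..5; id 0 is reserved as "not a word"

# keep-probabilities by rank: frequent (low-rank) words are subsampled aggressively
table = make_sampling_table(vocab_size)

pairs, labels = skipgrams(sequence, vocab_size,
                          window_size=2,
                          negative_samples=1.0,
                          sampling_table=table)
for (target, context), label in zip(pairs, labels):
    print(target, context, label)  # label 1 = real context pair, 0 = negative sample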
if train_model:
    if load_model:
        print('Load model...')
        model = cPickle.load(open(os.path.join(save_dir, model_load_fname), 'rb'))
    else:
        print('Build model...')
        word = Sequential()
        word.add(Embedding(vocab_size, vector_dim, init='uniform'))
        context = Sequential()
        context.add(Embedding(vocab_size, vector_dim, init='uniform'))
        model = Sequential()
        model.add(Merge([word, context], mode='dot'))
        model.compile(loss='mse', optimizer='rmsprop')

    sampling_table = sequence.make_sampling_table(vocab_size)

    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)
        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []
        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, vocab_size, window_size=4,
                                                 negative_samples=1.,
                                                 sampling_table=sampling_table)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary


################################################################
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

window_size = 3
vector_dim = 300
epochs = 2000

valid_size = 16      # Random set of words to evaluate similarity on.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size,
                            sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")
print(couples[:10], labels[:10])

# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
def process(args):
    print("Loading graph...")
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # print("Walking...")
        # walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
        #                                     path_length=args.walk_length, alpha=0,
        #                                     rand=random.Random(args.seed))
        print("Training...")
        max_features = len(G.nodes())         # vocabulary size
        dim_proj = args.representation_size   # embedding space dimension
        nb_epoch = 1                          # number of training epochs

        # Neural network (in Keras)
        model = Sequential()
        model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')
        sampling_table = sequence.make_sampling_table(max_features)

        print("Fitting tokenizer on walks...")
        tokenizer = text.Tokenizer(nb_words=max_features)
        print("Epochs: %d" % nb_epoch)
        # tokenizer.fit_on_texts(build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

        for e in range(nb_epoch):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)
            # progbar = generic_utils.Progbar(tokenizer.document_count)
            samples_seen = 0
            losses = []

            # for i, seq in enumerate(tokenizer.texts_to_sequences_generator(
            #         build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))):
            for i, seq in enumerate(build_deepwalk_corpus_minibatch_iter(G, args.number_walks,
                                                                         args.walk_length)):
                # get skipgram couples for one text in the dataset
                couples, labels = sequence.skipgrams(seq, max_features, window_size=5,
                                                     negative_samples=1.,
                                                     sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    print("Started fitting...")
                    loss = model.fit(X, labels)

                    print("Dumping...")
                    # Dump weights to a temp file
                    weights = model.layers[0].get_weights()[0]
                    norm_weights = np_utils.normalize(weights)
                    # TODO: save weights with indices
                    np.savetxt(args.output, norm_weights)

                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        # progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)
            print('Samples seen:', samples_seen)
        print("Training completed!")
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        # TODO: implement this path
        print("Not implemented yet...")
        sys.exit(1)

    print("Optimization done. Saving...")
    # recover the embedding weights trained with skipgram
    weights = model.layers[0].get_weights()[0]
    # we no longer need the model itself
    del model
    norm_weights = np_utils.normalize(weights)
    # TODO: save weights with indices
    np.savetxt(args.output, norm_weights)
    print("Saved!")
def sequence_make_sampling_table():
    print(sequence.make_sampling_table(5))
def main(dname, encode_dir, raw_dir, odir='./resources/skipgrams/', mode='local'):
    # load corpus data
    raw_corpus = pd.read_csv(raw_dir + dname + '.tsv', sep='\t')

    # load user data
    user_idx = json.load(open(raw_dir + 'user_idx.json'))
    user_info = dict()
    user_control = set()  # controls when to renew the user_info sampling method
    with open(encode_dir + 'users.json') as dfile:
        for line in dfile:
            line = json.loads(line)
            user_info[line['uid']] = line
            user_info[line['uid']]['count'] = 0

    # load tokenizer
    tok = pickle.load(open(encode_dir + dname + '.tkn', 'rb'))

    params = {
        'window': 5,
        'vocab_size': tok.num_words,
        'user_size': len(user_info) + 1,  # +1 for unknown
        'emb_dim': 300,
        'word_emb_path': './resources/word_emb.npy',
        'user_emb_path': './resources/user_emb.npy',
        'word_emb_train': True,
        'user_emb_train': True,
        'user_task_weight': 1,
        'word_task_weight': 1,
        'epochs': 5,
        'optimizer': 'adam',
        'lr': 1e-5,
    }

    word_sampling_table = make_sampling_table(size=params['vocab_size'])
    ww_model, uw_model = build_model(params)
    print()
    print(params)

    for epoch in range(params['epochs']):
        loss = 0
        # shuffle the data
        raw_corpus = raw_corpus.sample(frac=1).reset_index(drop=True)

        for step, entry in raw_corpus.iterrows():
            '''word info, ww: word-word'''
            encode_doc = tok.texts_to_sequences([entry.text])
            ww_pairs, ww_labels = skipgrams(
                sequence=encode_doc[0],
                vocabulary_size=params['vocab_size'],
                window_size=params['window'])
            word_pairs = [np.array(x) for x in zip(*ww_pairs)]
            ww_labels = np.array(ww_labels, dtype=np.int32)

            '''user info, uw: user-word'''
            cur_user = user_info[entry.uid]
            if mode == 'local':
                uw_pairs, uw_labels = utils.user_word_sampler(
                    uid=cur_user['uid_encode'],
                    sequence=encode_doc[0],
                    vocab_size=params['vocab_size'],
                    filter_words=set(cur_user['words']),
                    negative_samples=1)
                uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                uw_labels = np.array(uw_labels, dtype=np.int32)
            elif mode == 'decay':
                decay_num = utils.sample_decay(cur_user['count'])
                if decay_num > np.random.random():
                    uw_pairs, uw_labels = utils.user_word_sampler(
                        uid=cur_user['uid_encode'],
                        sequence=set(cur_user['words']),
                        vocab_size=params['vocab_size'],
                        negative_samples=1)
                    uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                    uw_labels = np.array(uw_labels, dtype=np.int32)
                    user_info[entry.uid]['count'] += 1
                    user_control.add(entry.uid)
                else:
                    uw_pairs = None
                    uw_labels = None
                if len(user_control) >= len(user_info) - 10:
                    # restart the control for sampling
                    for uid in user_info:
                        user_info[uid]['count'] = 0
                    user_control.clear()
            elif mode == 'global':
                uw_pairs, uw_labels = utils.user_word_sampler(
                    uid=cur_user['uid_encode'],
                    sequence=set(cur_user['words']),
                    vocab_size=params['vocab_size'],
                    negative_samples=1)
                uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                uw_labels = np.array(uw_labels, dtype=np.int32)
            else:
                raise ValueError('Mode {} does not exist!'.format(mode))

            '''Train'''
            if word_pairs:
                loss += ww_model.train_on_batch(word_pairs, ww_labels)
            if uw_pairs:
                loss += uw_model.train_on_batch(uw_pairs, uw_labels)

            if step % 100 == 0 and step > 0:  # skip step 0 to avoid division by zero
                loss_avg = loss / step
                print('Epoch: {}, Step: {}'.format(epoch, step))
                print('\tLoss: {}.'.format(loss_avg))
                print('-------------------------------------------------')

        # save the models
        ww_model.save(odir + 'ww_model_{}.h5'.format(epoch))
        uw_model.save(odir + 'uw_model_{}.h5'.format(epoch))
        # save the word embedding
        np.save(odir + 'word_{}.npy'.format(epoch),
                ww_model.get_layer(name='word_emb').get_weights()[0])
        # save the user embedding
        np.save(odir + 'user_{}.npy'.format(epoch),
                uw_model.get_layer(name='user_emb').get_weights()[0])

    # save the models
    ww_model.save(odir + 'ww_model.h5')
    uw_model.save(odir + 'uw_model.h5')
    # save the word embedding
    np.save(odir + 'word.npy', ww_model.get_layer(name='word_emb').get_weights()[0])
    # save the user embedding
    np.save(odir + 'user.npy', uw_model.get_layer(name='user_emb').get_weights()[0])
# model.add(LSTM(layers[2], return_sequences=False))
# model.add(Dropout(0.2))
model.add(Dense(output_dim=layers[3]))  # for skipgram
model.add(Activation('sigmoid'))        # for skipgram
# model.add(Activation('softmax'))      # for sequences
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')  # buildSequences

print('Compile model...')
model.compile(loss='mse', optimizer='rmsprop')  # buildSkipgram

# training process
if train_model:
    if load_model:
        print('Load model...')
        model.load_weights(os.path.join(save_dir, model_load_fname))

    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)
        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []
        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_features,
def main():
    scoreList = [0.0, 0.0]
    with open('data/info.json') as j:
        info = ujson.load(j)
    for problem in os.listdir('data'):
        greek = False
        if problem.startswith('problem'):
            truthPath = 'data/truth/' + problem + '/clustering.json'
            with open(truthPath) as t:
                truth = ujson.load(t)
            print(problem)
            probTokList = []
            docList = []
            docDict = {}
            X = []
            Y = []
            path = 'data/' + problem
            for entry in info:
                if entry["folder"] == problem:
                    lang = entry["language"]
                    if entry["language"] == "gr":
                        greek = True
            CV = CountVectorizer(input='filename', strip_accents='unicode',
                                 analyzer='word', ngram_range=(1, 4))
            docs = [path + '/' + x for x in os.listdir(path)]
            cMatrix = CV.fit_transform(docs)
            for doc in os.listdir(path):
                docTokList = []
                with open(path + '/' + doc) as d:
                    article = d.readlines()
                for sent in article:
                    sentTokList = []
                    for word in sent.split():
                        for token in word:
                            procToken = preprop(token, greek)
                            sentTokList.append(procToken)  # every item of the list is a normalized character
                    docTokList.append(' '.join(sentTokList))  # every item of the list is a sentence
                probTokList.append(' '.join(docTokList))  # every item of the list is a document
                docList.append(doc)
            tokenizer = text.Tokenizer(nb_words=None, filters=text.base_filter(),
                                       lower=True, split=" ")
            tokenizer.fit_on_texts(probTokList)
            seqList = tokenizer.texts_to_sequences(probTokList)
            uniqueTokens = max([max(x) for x in seqList])
            print(uniqueTokens, lang)
            sampling_table = sequence.make_sampling_table(uniqueTokens + 1)
            for i, seq in enumerate(seqList):
                x, y = sequence.skipgrams(seq, uniqueTokens, window_size=4,
                                          negative_samples=1.0, categorical=False,
                                          sampling_table=sampling_table)
                x = zip(x, y)
                X.append(x)
                # Y.extend(y)
                docDict[docList[i]] = seq
            strX = [str(x) for x in X]
            xTokenizer = text.Tokenizer(nb_words=None, filters=text.base_filter(),
                                        lower=True, split=" ")
            xTokenizer.fit_on_texts(strX)
            # docMatrix = tokenizer.sequences_to_matrix(seqList, mode="tfidf")
            docMatrix = xTokenizer.sequences_to_matrix(strX, mode="tfidf")
            # scores = embedNN(X, Y)
            pairs = combinations(docDict.keys(), 2)
            cList = []
            nnDict = {}
            for cluster in truth:
                cPairs = []
                if len(cluster) > 1:
                    for item in cluster:
                        cPairs.append(str(item["document"]))
                    cList.extend(list(permutations(cPairs, 2)))
            for pair in pairs:
                match = False
                if pair in cList:
                    match = True
                nnDict[pair] = match
            for i, doc in enumerate(docMatrix):
                docDict[docList[i]] = doc
            truthCounter = Counter(nnDict.values())
            baseline = 1 - float(truthCounter[True]) / float(len(nnDict))
            print("Baseline for {} is {}".format(problem, baseline))
            clusterCount = Counter()
            kmclusters = False  # Change to False for meanshift
            if kmclusters:
                pbar = ProgressBar()
                for nclusters in pbar(reversed(range(len(docMatrix) - 1))):
                    # print("{} Clusters".format(nclusters + 1))
                    clusters = KMclusterer(nclusters + 1, cMatrix)
                    for c in range(nclusters + 1):
                        # print(c, "has:", [i for i, x in enumerate(clusters) if x == c])
                        for clusterpair in list(combinations([i for i, x in enumerate(clusters) if x == c], 2)):
                            combo = (docList[clusterpair[0]], docList[clusterpair[1]])
                            clusterCount[combo] += 1
            else:
                clusters = KMclusterer(int(len(docMatrix) * 0.67), docMatrix)
                # clusters = MSclusterer(cMatrix)  # cMatrix or docMatrix
                for clusterpair in list(combinations([i for i, x in enumerate(clusters)], 2)):
                    combo = (docList[clusterpair[0]], docList[clusterpair[1]])
                    clusterCount[combo] += 1
            x = 0.0
            scoreList[0] += truthCounter[True]
            deleteList = []
            # print("Most common cluster is in {}%".format((float(clusterCount.most_common(20)[19][1]) / len(docMatrix)) * 100))
            for combo in nnDict.keys():
                if combo not in clusterCount.keys():
                    deleteList.append(combo)
            y = 0.0
            for item in deleteList:
                if item in cList:
                    y += 1
                del nnDict[item]
            scores = sharedNN(docDict, nnDict)
            print("Deleted pairs are {}% of total correct pairs, {}% of deleted pairs was wrongly deleted".format(
                round(y / len(cList) * 100.0, 2), round(y / len(deleteList) * 100.0, 2)))
            for combo in clusterCount.most_common(20):
                if combo[0] in cList:
                    x += 1
                    scoreList[1] += 1
            print("prec: {}".format(x / 20))
            # print("Document score is {} clusters correct out of {} (accuracy {})".format(x, truthCounter[True], x / truthCounter[True]))
            # print("prec: {} \nrec: {}".format(x / 20, x / len(nnDict.values())))
            # print("Total precision is {}, {} clusters correct".format(scoreList[1] / scoreList[0], scoreList[1]))
            if not os.path.exists('answers/' + problem):
                os.mkdir('answers/' + problem)
            clusDict = defaultdict(list)
            rankDict = defaultdict(list)
            for i, cluster in enumerate(list(clusters)):
                clusDict[cluster].append({"document": docList[i]})
                rankDict[cluster].append(docList[i])
            with open('answers/' + problem + '/clustering.json', "w") as jsonFile:
                ujson.dump(list(clusDict.values()), jsonFile, indent=4)
            rankList = []
            for value in rankDict.values():
                if len(value) > 1:
                    pairs = combinations(value, 2)
                    for pair in pairs:
                        rankList.append({"document1": pair[0],
                                         "document2": pair[1],
                                         "score": scores[pair][0]})
            with open('answers/' + problem + '/ranking.json', "w") as jsonFile:
                ujson.dump(rankList, jsonFile, indent=4)
nb_epoch = 5
skip_top = 10
dim_proj = 256
max_features = 1000

tokenizer = text.Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(text_generator())

# ----- Training -----
model = Sequential()
model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
model.compile(loss='mse', optimizer='rmsprop')

sampling_table = sequence.make_sampling_table(max_features)
for e in range(nb_epoch):
    print('Epoch:', e)
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen, losses = 0, []
    for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
        couples, labels = sequence.skipgrams(seq, max_features, window_size=4,
                                             negative_samples=1.,
                                             sampling_table=sampling_table)
        if couples:
            X = np.array(couples, dtype="int32")
            loss = model.train_on_batch(X, labels)
            losses.append(loss)
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)
def train(self, model_config, wids, word2id):
    vocab_size = len(word2id.values())
    sampling_table = sequence.make_sampling_table(vocab_size)
    wids_flat = [word for sentence in wids for word in sentence]
    couples, labels = skipgrams(wids_flat, vocab_size,
                                window_size=model_config['window_size'],
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    input_target = Input((1,))
    input_context = Input((1,))
    vector_dim = model_config['number_of_dimensions_in_hidden_layer']
    embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
    target = embedding(input_target)
    target = Reshape((vector_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((vector_dim, 1))(context)

    # setup a cosine similarity operation which will be output in a secondary model
    similarity = merge([target, context], mode='cos', dot_axes=0)
    # now perform the dot product operation to get a similarity measure
    dot_product = merge([target, context], mode='dot', dot_axes=1)
    dot_product = Reshape((1,))(dot_product)
    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)

    # create the primary training model
    model = Model(input=[input_target, input_context], output=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    validation_model = Model(input=[input_target, input_context], output=similarity)

    class SimilarityCallback:
        def run_sim(self):
            valid_size = 16     # Random set of words to evaluate similarity on.
            valid_window = 100  # Only pick dev samples in the head of the distribution.
            valid_examples = np.random.choice(valid_window, valid_size, replace=False)
            reverse_dictionary = dict(zip(word2id.values(), word2id.keys()))
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                sim = self._get_sim(valid_examples[i])
                nearest = (-sim).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)

        @staticmethod
        def _get_sim(valid_word_idx):
            sim = np.zeros((vocab_size,))
            in_arr1 = np.zeros((1,))
            in_arr2 = np.zeros((1,))
            in_arr1[0,] = valid_word_idx
            for i in range(vocab_size):
                in_arr2[0,] = i
                out = validation_model.predict_on_batch([in_arr1, in_arr2])
                sim[i] = out
            return sim

    sim_cb = SimilarityCallback()

    arr_1 = np.zeros((1,))
    arr_2 = np.zeros((1,))
    arr_3 = np.zeros((1,))
    for cnt in range(model_config['epochs']):
        idx = np.random.randint(0, len(labels) - 1)
        arr_1[0,] = word_target[idx]
        arr_2[0,] = word_context[idx]
        arr_3[0,] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if cnt % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))
        # if cnt % 10000 == 0:
        #     sim_cb.run_sim()

    return model, model.get_weights()[0]
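# Several of the snippets above (word_embedding, the train method here) use the Keras 1
# merge(..., mode='dot' / mode='cos') helper, which was removed in Keras 2. The sketch
# below shows one way the same wiring can be expressed with the Dot layer; it is an
# assumption-laden rewrite for illustration, not the original authors' code, and the
# vocab_size / vector_dim values are placeholders.
from keras.layers import Input, Embedding, Reshape, Dense, Dot
from keras.models import Model

vocab_size, vector_dim = 10000, 300

input_target = Input((1,))
input_context = Input((1,))
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = Reshape((vector_dim, 1))(embedding(input_target))
context = Reshape((vector_dim, 1))(embedding(input_context))

# merge([target, context], mode='cos', ...)  ->  normalized dot product
similarity = Dot(axes=1, normalize=True)([target, context])
# merge([target, context], mode='dot', ...)  ->  plain dot product
dot_product = Reshape((1,))(Dot(axes=1)([target, context]))
output = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)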