def buildModel(self, embedding_dimension):
    # Skip-gram model: a single WordContextProduct embedding layer trained
    # with a binary cross-entropy objective on (word, context) couples.
    self.model = Sequential()
    self.model.add(
        WordContextProduct(self.vocabulary_size,
                           proj_dim=embedding_dimension,
                           init="uniform"))
    self.model.compile(loss='binary_crossentropy', optimizer='rmsprop')
def __init__(self, max_words=50000, skip_top_words=0, n_epochs=1, n_dims=100,
             window_size=4, loss='mse', optimizer='rmsprop'):
    """
    :param max_words: use only the n most common words in the data
    :param skip_top_words: ignore the m most common words
    :param n_epochs: number of training epochs
    :param n_dims: embedding space dimension
    :param window_size: skip-gram context window size
    :param loss: loss function used to train the embedding model
    :param optimizer: optimizer used to train the embedding model
    """
    self.max_words = max_words
    self.skip_top_words = skip_top_words
    self.n_epochs = n_epochs
    self.n_dims = n_dims
    self.tokenizer = text.Tokenizer(nb_words=self.max_words)
    self._is_tokenizer_fit = False
    self._word_index = None
    self._reverse_word_index = None
    self.window_size = window_size
    self.optimizer = optimizer
    self.loss = loss
    # skip-gram embedding model: a single WordContextProduct layer
    self.embedding_model = Sequential()
    self.embedding_model.add(
        WordContextProduct(self.max_words, proj_dim=self.n_dims, init="uniform"))
    self.embedding_model.compile(loss=loss, optimizer=optimizer)
    self._are_embeddings_fit = False
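# A minimal usage sketch of the embedding class defined above. The class name
# `GraphEmbedding` is an assumption (only __init__ and buildModel appear in this
# excerpt), and the toy two-sentence corpus is illustrative, not from the source.
embedder = GraphEmbedding(max_words=10000, n_dims=128, window_size=4)
embedder.tokenizer.fit_on_texts(["a b c d", "b c e f"])
embedder._is_tokenizer_fit = True
# untrained word-embedding matrix of the WordContextProduct layer: (max_words, n_dims)
print(embedder.embedding_model.layers[0].get_weights()[0].shape)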
            valid_sequences += 1
            loss = train_batch(model, X_couples, y_labels)
            losses += loss
        if epoch % print_every == 0:
            logging.info("Mean loss in Epoch [%s] with %s valid sequences = %s"
                         % (epoch, valid_sequences, losses / valid_sequences))
            losses, valid_sequences = 0.0, 0


if __name__ == "__main__":
    #g = Graph.Read_Edgelist("deepwalk/p2p-Gnutella08.edgelist")
    g = load_adjlist("deepwalk/karate.adjlist", directed=False)
    vocab_size = len(g.vs)
    max_len = 5
    save = True

    # replace the frequency-based sampling table with one proportional to 1/sqrt(degree)
    sampling_table = make_sampling_table(vocab_size)
    degrees = np.array(g.vs.degree())
    inv_sqrt_degree = 1 / np.sqrt(degrees)
    sampling_table = inv_sqrt_degree / np.sum(inv_sqrt_degree)

    logging.info("Graph Summary: \n%s", summary(g))
    logging.info("Building Model")

    if save:
        # reload a previously trained model from disk
        model = cPickle.load(open("out/Karate.Model.3100.pkl", "rb"))
    else:
        # build a fresh skip-gram model
        model = Sequential()
        model.add(WordContextProduct(vocab_size, proj_dim=300, init='uniform'))
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    #couples, labels = skipgrams(sequences[np.random.randint(vocab_size)], vocab_size, window_size=4, negative_samples=1.0, sampling_table=sampling_table)
    #train_on_model(model, g, vocab_size, print_every=1)
    #cPickle.dump(model, open("out/Karate.Model.3100.pkl", "wb"))
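# `train_batch` is called in the loop above but not defined in this excerpt.
# A minimal sketch of what it likely does, assuming the same old-style Keras API
# used throughout this code: one gradient step on the skip-gram couples of a
# single walk. The name and signature come from the call site; the body is an
# assumption. `np` and `model` are the same objects used by the surrounding script.
def train_batch(model, X_couples, y_labels):
    X = np.array(X_couples, dtype="int32")
    y = np.array(y_labels, dtype="int32")
    return model.train_on_batch(X, y)  # returns the loss on this batch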
tokenizer.fit_on_texts(text_generator())

if save:
    print("Save tokenizer...")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    six.moves.cPickle.dump(tokenizer,
                           open(os.path.join(save_dir, tokenizer_fname), "wb"))

# training process
if train_model:
    if load_model:
        print('Load model...')
        model = six.moves.cPickle.load(
            open(os.path.join(save_dir, model_load_fname), 'rb'))
    else:
        print('Build model...')
        model = Sequential()
        model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')

    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
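# `text_generator()` is used above but not defined in this excerpt. A minimal
# sketch, assuming the corpus is a plain-text file with one document per line;
# the path "corpus.txt" is a placeholder, not something from the source.
def text_generator(path="corpus.txt"):
    with open(path) as f:
        for line in f:
            yield line.strip()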
def process(args):
    print("Loading graph...")
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'"
                        % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        #print("Walking...")
        #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
        #                                    path_length=args.walk_length, alpha=0,
        #                                    rand=random.Random(args.seed))
        print("Training...")

        max_features = len(G.nodes())         # vocabulary size
        dim_proj = args.representation_size   # embedding space dimension
        nb_epoch = 1                           # number of training epochs

        # Neural network (in Keras)
        model = Sequential()
        model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')
        sampling_table = sequence.make_sampling_table(max_features)

        print("Fitting tokenizer on walks...")
        tokenizer = text.Tokenizer(nb_words=max_features)

        print("Epochs: %d" % nb_epoch)
        #tokenizer.fit_on_texts(build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

        for e in range(nb_epoch):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)

            #progbar = generic_utils.Progbar(tokenizer.document_count)
            samples_seen = 0
            losses = []

            #for i, seq in enumerate(tokenizer.texts_to_sequences_generator(build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))):
            for i, seq in enumerate(build_deepwalk_corpus_minibatch_iter(
                    G, args.number_walks, args.walk_length)):
                # get skipgram couples for one text (random walk) in the dataset
                couples, labels = sequence.skipgrams(seq, max_features, window_size=5,
                                                     negative_samples=1.,
                                                     sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    print("Started fitting...")
                    loss = model.train_on_batch(X, labels)

                    print("Dumping...")
                    # Dump weights to a temp file
                    weights = model.layers[0].get_weights()[0]
                    norm_weights = np_utils.normalize(weights)
                    # TODO: save weights with indices
                    np.savetxt(args.output, norm_weights)

                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        # progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)

            print('Samples seen:', samples_seen)
        print("Training completed!")
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        # TODO: IMPLEMENT THAT
        print("Not implemented yet...")
        sys.exit(1)

    print("Optimization done. Saving...")
    # recover the embedding weights trained with skipgram:
    weights = model.layers[0].get_weights()[0]
    # we no longer need this
    del model
    norm_weights = np_utils.normalize(weights)
    # TODO: save weights with indices
    np.savetxt(args.output, norm_weights)
    print("Saved!")
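# A small sketch of reading back the embeddings written by np.savetxt above.
# Rows follow the model's internal word/node indexing (the "save weights with
# indices" TODO is not implemented), and "embeddings.txt" is a placeholder for
# whatever path was passed as args.output.
import numpy as np

embeddings = np.loadtxt("embeddings.txt")  # shape: (max_features, representation_size)
node_vec = embeddings[7]                   # row for the node with internal index 7
print(embeddings.shape, node_vec[:5])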
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
six.moves.cPickle.dump(tokenizer,
                       open(os.path.join(save_dir, tokenizer_fname), "wb"))

# training process
if train_model:
    if load_model:
        print('Load model...')
        model = six.moves.cPickle.load(
            open(os.path.join(save_dir, model_load_fname), 'rb'))
    else:
        print('Build model...')
        model = Sequential()
        model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="normal"))
        model.compile(loss='hinge', optimizer='adam')

    sampling_table = sequence.make_sampling_table(max_features)

    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
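# Sketch of the matching save step so the `load_model` branch above has a pickle
# to read on a later run. Only the load side appears in this excerpt, so the flag
# and the `model_save_fname` name are assumptions mirroring `model_load_fname`.
if save and train_model:
    print("Saving model...")
    six.moves.cPickle.dump(model,
                           open(os.path.join(save_dir, model_save_fname), "wb"))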