def train(self, nodes, model, beta, chunksize=150, iter=1):
    """
    Run `iter` gradient passes that update `model.node_embedding` in place.

    For each batch, every one of the `model.k` components contributes
    `pi * inv_covariance_mat @ (embedding - centroid)` to the gradient. The
    accumulated gradient is scaled by `beta / model.k`, clipped to
    [-0.25, 0.25] and applied with step size `self.lr`.

    :param nodes: iterable of node keys; keys missing from `model.vocab` are skipped
    :param model: model containing all the shared data
    :param beta: trade off param
    :param chunksize: batch size handed to `chunkize_serial`
    :param iter: number of passes over `nodes`
    """
    for _ in range(iter):
        total_grad = np.zeros(model.node_embedding.shape, dtype=np.float32)

        # A node is kept when it is in the vocabulary and either its
        # sample_probability is >= 1.0 or it wins a fresh random draw.
        sampled_indices = (
            model.vocab[node].index
            for node in nodes
            if node in model.vocab
            and (model.vocab[node].sample_probability >= 1.0
                 or model.vocab[node].sample_probability >= np.random.random_sample())
        )

        for node_index in chunkize_serial(sampled_indices, chunksize):
            embeddings = model.node_embedding[node_index]
            batch_size = len(node_index)
            batch_grad = np.zeros(embeddings.shape, dtype=np.float32)

            for community in range(model.k):
                # Column vectors so matmul performs one matrix-vector product per node.
                offset = (embeddings - model.centroid[community])[..., np.newaxis]
                weight = model.pi[node_index, community].reshape(batch_size, 1, 1)
                weighted_inv_cov = weight * model.inv_covariance_mat[community]
                batch_grad += np.matmul(weighted_inv_cov, offset)[..., 0]

            total_grad[node_index] += batch_grad

        total_grad *= (beta / model.k)
        clipped = total_grad.clip(min=-0.25, max=0.25)
        model.node_embedding -= clipped * self.lr
def loss(self, nodes, model, beta, chunksize=150):
    """
    Forward function used to compute o3 loss

    The loss is the negative, `model.pi`-weighted log-density of the node
    embeddings under each of the `model.k` Gaussian components, summed over
    every node in `nodes` and scaled by `beta / model.k`.

    :param nodes: iterable of node keys; each key is looked up in `model.vocab`
    :param model: model containing all the shared data
    :param beta: trade off param
    :param chunksize: number of nodes evaluated per batch
    :return: the scaled loss accumulated over all batches (0.0 for empty `nodes`)
    """
    ret_loss = 0
    node_indices = (model.vocab[node].index for node in nodes)
    for node_index in chunkize_serial(node_indices, chunksize):
        embeddings = model.node_embedding[node_index]

        batch_loss = np.zeros(len(node_index), dtype=np.float32)
        for com in range(model.k):
            rd = multivariate_normal(model.centroid[com],
                                     model.covariance_mat[com])
            # check if can be done as matrix operation
            batch_loss += (rd.logpdf(embeddings).astype(np.float32)
                           * model.pi[node_index, com])

        # Accumulate over batches: a plain assignment here would discard every
        # batch but the last one.
        ret_loss -= batch_loss.sum()

    return ret_loss * (beta / model.k)
def loss(self, model, paths, total_paths, alpha=1.0):
    """
    Compute the context loss over `paths` in the calling thread.

    Paths are prepared with `prepare_sentences`, processed in batches of 250
    and handed one by one to `loss_o2`. The per-batch loss is read back from
    the one-element buffer passed as `py_loss` (presumably filled in place by
    `loss_o2` -- confirm against its implementation).

    :param model: model containing the shared data
    :param paths: iterable of paths
    :param total_paths: denominator used for the progress percentage
    :param alpha: trade-off parameter forwarded to `loss_o2`
    :return: the loss summed over all batches
    """
    started_at = time.time()
    next_report = 5.0
    total_loss = 0.0
    num_nodes = 0

    for job in chunkize_serial(prepare_sentences(model, paths), 250):
        batch_loss = np.zeros(1, dtype=np.float32)
        batch_work = np.zeros(model.layer1_size, dtype=np.float32)

        batch_node = 0
        for path in job:
            if path is None:
                continue
            batch_node += loss_o2(model.node_embedding,
                                  model.context_embedding,
                                  path,
                                  self.negative,
                                  self.window_size,
                                  model.table,
                                  alpha,
                                  model.layer1_size,
                                  batch_work,
                                  py_loss=batch_loss)

        num_nodes += batch_node
        total_loss += batch_loss[0]

        elapsed = time.time() - started_at
        if elapsed >= next_report:
            rate = num_nodes / elapsed if elapsed else 0.0
            log.debug("PROGRESS: at %.2f%% path, %.0f paths/s" %
                      (100.0 * num_nodes / total_paths, rate))
            # don't flood the log, wait at least a second between progress reports
            next_report = elapsed + 1.0

    log.info(num_nodes)
    log.info(total_loss)
    return total_loss
def train(self, nodes, model, beta, chunksize=150, iter=1):
    """
    Apply `iter` gradient passes to `model.node_embedding` (updated in place).

    Every node in `nodes` is used (no sub-sampling). Per batch, each of the
    `model.k` components adds `pi * inv_covariance_mat @ (embedding - centroid)`
    to the gradient; the total is scaled by `beta / model.k`, clipped to
    [-5, 5] and applied with step size `self.lr`.

    :param nodes: iterable of node keys; each key is looked up in `model.vocab`
    :param model: model containing all the shared data
    :param beta: trade off param
    :param chunksize: batch size handed to `chunkize_serial`
    :param iter: number of passes over `nodes`
    """
    scale = beta / model.k
    for _ in range(iter):
        accumulated = np.zeros(model.node_embedding.shape, dtype=np.float32)
        index_stream = (model.vocab[key].index for key in nodes)

        for node_index in chunkize_serial(index_stream, chunksize):
            vectors = model.node_embedding[node_index]
            n_rows = len(node_index)
            chunk_grad = np.zeros(vectors.shape, dtype=np.float32)

            for c in range(model.k):
                # Shape the offsets as column vectors for a batched mat-vec product.
                delta = np.expand_dims(vectors - model.centroid[c], axis=-1)
                responsibilities = model.pi[node_index, c].reshape(n_rows, 1, 1)
                product = np.matmul(responsibilities * model.inv_covariance_mat[c], delta)
                chunk_grad += np.squeeze(product, axis=-1)

            accumulated[node_index] += chunk_grad

        accumulated *= scale
        model.node_embedding -= accumulated.clip(min=-5, max=5) * self.lr
def train(self, nodes, model, beta, chunksize=150, iter=1):
    """
    Run `iter` gradient passes that update `model.node_embedding` in place.

    Per batch, each of the `model.k` components adds
    `pi * inv_covariance_mat @ (embedding - centroid)` to the gradient; the
    total is scaled by `beta / model.k`, clipped to [-0.25, 0.25] and applied
    with step size `self.lr`. Progress is logged after every pass.

    :param nodes: iterable of node keys; keys missing from `model.vocab` are skipped
    :param model: model containing all the shared data
    :param beta: trade off param
    :param chunksize: batch size handed to `chunkize_serial`
    :param iter: number of passes over `nodes`
    """
    log.info(f"O2 COMMUNITY training model with beta={beta}, chunksize={chunksize}, and iter={iter}")
    start = time.time()

    for i in range(iter):
        grad_input = np.zeros(model.node_embedding.shape, dtype=np.float32)

        # Keep a node when it is in the vocabulary and either its
        # sample_probability is >= 1.0 or it wins a fresh random draw.
        sampled_indices = (
            model.vocab[node].index
            for node in nodes
            if node in model.vocab
            and (model.vocab[node].sample_probability >= 1.0
                 or model.vocab[node].sample_probability >= np.random.random_sample())
        )

        for node_index in chunkize_serial(sampled_indices, chunksize):
            embeddings = model.node_embedding[node_index]
            batch_grad_input = np.zeros(embeddings.shape, dtype=np.float32)

            for com in range(model.k):
                diff = np.expand_dims(embeddings - model.centroid[com], axis=-1)
                m = (model.pi[node_index, com].reshape(len(node_index), 1, 1)
                     * model.inv_covariance_mat[com])
                batch_grad_input += np.squeeze(np.matmul(m, diff), axis=-1)

            grad_input[node_index] += batch_grad_input

        grad_input *= (beta / model.k)
        model.node_embedding -= (grad_input.clip(min=-0.25, max=0.25)) * self.lr

        # Pass `i` has just finished, so i + 1 passes are complete; using `i`
        # would report 0% after the first pass and never reach 100%.
        log.info(f"PROGRESS: at {(i + 1) / iter * 100:.2f}%")

    elapsed = time.time() - start
    log.info(f"training on took {elapsed:.1f}s")
def train(self, nodes, model, beta, chunksize=150, iter=1):
    """
    Run `iter` gradient passes that update `model.node_embedding` in place.

    Offsets between each batch embedding and every centroid are computed in
    one broadcasted subtraction, then combined per component with the matching
    inverse covariance. The accumulated gradient is scaled by
    `beta / model.k`, clipped to [-0.25, 0.25] and applied with step size
    `self.lr`.

    :param nodes: iterable of node keys; each key is looked up in `model.vocab`
    :param model: model containing all the shared data
    :param beta: trade off param
    :param chunksize: batch size handed to `chunkize_serial`
    :param iter: number of passes over `nodes`
    """
    for _ in range(iter):
        total_grad = np.zeros(model.node_embedding.shape, dtype=np.float32)
        index_stream = (model.vocab[node].index for node in nodes)

        for node_index in chunkize_serial(index_stream, chunksize):
            embeddings = model.node_embedding[node_index]
            batch_grad = np.zeros(embeddings.shape, dtype=np.float32)

            # (batch, 1, dim) - (1, k, dim) -> (batch, k, dim), then put the
            # component axis first so it can be iterated alongside the
            # inverse covariances.
            offsets = embeddings[:, np.newaxis] - np.expand_dims(model.centroid, axis=0)
            per_component = np.transpose(offsets, (1, 0, 2))

            for k, (delta, inv_cov) in enumerate(
                    zip(per_component, model.inv_covariance_mat)):
                # NOTE(review): the reduction runs over axis 1 of the broadcasted
                # product, which matches `inv_cov @ delta` only when inv_cov is
                # diagonal or symmetric -- confirm.
                scaled = (inv_cov * delta[:, np.newaxis]).sum(axis=1)
                # NOTE(review): model.pi is indexed by component only here;
                # presumably a per-component weight -- confirm its shape.
                batch_grad += model.pi[k] * scaled

            total_grad[node_index] += batch_grad

        total_grad *= (beta / model.k)
        step = total_grad.clip(min=-0.25, max=0.25) * self.lr
        model.node_embedding -= step
def train(self, model, edges, chunksize=150, iter=1):
    """
    Update the model's neural weights from a sequence of paths (can be a once-only generator stream).

    Edges are repeated `iter` times, prepared with `prepare_sentences`, split
    into batches of `chunksize` and consumed by `self.workers` threads, each
    calling `train_o1` per edge (which presumably updates
    `model.node_embedding` in place -- confirm against its implementation).

    :param model: model containing the shared data; its vocabulary must be built
    :param edges: corpus of edges, wrapped in `RepeatCorpusNTimes`
    :param chunksize: number of edges per job
    :param iter: number of times the corpus is repeated
    :raises RuntimeError: if `model.vocab` is empty
    """
    assert model.node_embedding.dtype == np.float32

    log.info("O1 training model with %i workers on %i vocabulary and %i features and 'negative sampling'=%s" %
             (self.workers, len(model.vocab), model.layer1_size, self.negative))

    if not model.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    edges = RepeatCorpusNTimes(edges, iter)
    total_node = edges.corpus.shape[0] * edges.corpus.shape[1] * edges.n
    log.debug('total edges: %d' % total_node)
    # One-element lists so the worker closures can mutate the shared counters.
    start, next_report, word_count = time.time(), [5.0], [0]

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    jobs = Queue(maxsize=2 * self.workers)
    lock = threading.Lock()  # guards word_count and next_report

    def worker_train():
        """Train the model, lifting lists of paths from the jobs queue."""
        while True:
            job = jobs.get(block=True)
            if job is None:  # data finished, exit
                jobs.task_done()
                break

            py_work = np.zeros(model.layer1_size, dtype=np.float32)
            job_words = sum(train_o1(model.node_embedding, edge, self.lr, self.negative, model.table,
                                     py_size=model.layer1_size, py_work=py_work)
                            for edge in job if edge is not None)
            jobs.task_done()

            # Block until the lock is held. An acquire with a timeout whose
            # result is ignored would, on timeout, update the counters
            # unprotected and then release a lock this thread does not own.
            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    log.info("PROGRESS: at %.2f%% words\tword_computed %d\talpha %.05f\t %.0f words/s" %
                             (100.0 * word_count[0] / total_node, word_count[0], self.lr,
                              word_count[0] / elapsed if elapsed else 0.0))
                    # don't flood the log, wait at least five seconds between progress reports
                    next_report[0] = elapsed + 5.0

    workers = [threading.Thread(target=worker_train, name='thread_' + str(i))
               for i in range(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job in chunkize_serial(prepare_sentences(model, edges), chunksize):
        jobs.put(job)

    for _ in range(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    log.info("training on %i words took %.1fs, %.0f words/s" %
             (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))
def train(self, model, paths, total_nodes, alpha=1.0, node_count=0, chunksize=150):
    """
    Update the model's neural weights from a sequence of paths (can be a once-only generator stream).

    Paths are prepared with `prepare_sentences`, split into batches of
    `chunksize` and consumed by `self.workers` threads, each calling
    `train_o2` per path. The learning rate decays linearly with the number of
    nodes processed, from `self.lr` down to no less than `self.min_lr`.

    :param model: model containing the shared data
    :param paths: generator of the paths
    :param total_nodes: total number of nodes in the path
    :param alpha: trade-off parameter; when <= 0 the method returns without training
    :param node_count: init of the number of nodes (seeds the learning-rate
        decay and the progress counter)
    :param chunksize: size of the batch
    :return: None
    :raises RuntimeError: if `model.vocab` is empty
    :raises AttributeError: if `total_nodes` is None
    """
    assert model.node_embedding.dtype == np.float32
    assert model.context_embedding.dtype == np.float32
    log.info(
        "O3 CONTEXT training model with %i workers on %i vocabulary and %i features, using \t'negative sampling'=%s\t'windows'=%s"
        % (self.workers, len(model.vocab), model.layer1_size,
           self.negative, self.window_size))

    if alpha <= 0.:
        return

    if not model.vocab:
        raise RuntimeError(
            "you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    if total_nodes is None:
        raise AttributeError('need the number of node')

    # Honour the caller-supplied starting count instead of discarding it; the
    # one-element list lets the worker closures mutate the shared counter.
    node_count = [node_count]

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    jobs = Queue(maxsize=2 * self.workers)
    # for shared state (=number of nodes trained so far, log reports...)
    lock = threading.Lock()

    def worker_train():
        """Train the model, lifting lists of paths from the jobs queue."""
        py_work = np.zeros(model.layer1_size, dtype=np.float32)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break

            # Linear decay based on the nodes processed so far, floored at min_lr.
            lr = max(self.min_lr,
                     self.lr * (1 - 1.0 * node_count[0] / total_nodes))
            # execute the sgd
            job_nodes = sum(
                train_o2(model.node_embedding,
                         model.context_embedding,
                         path,
                         lr,
                         self.negative,
                         self.window_size,
                         model.table,
                         py_alpha=alpha,
                         py_size=model.layer1_size,
                         py_work=py_work) for path in job)

            with lock:
                node_count[0] += job_nodes

                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    log.info(
                        "PROGRESS: at %.2f%% nodes, lr %.05f, %.0f nodes/s"
                        % (100.0 * node_count[0] / total_nodes, lr,
                           node_count[0] / elapsed if elapsed else 0.0))
                    # don't flood the log, wait at least a second between progress reports
                    next_report[0] = elapsed + 1.0

    workers = [
        threading.Thread(target=worker_train) for _ in range(self.workers)
    ]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (eliding OOV/downsampled nodes), and start filling the jobs queue
    for job in chunkize_serial(prepare_sentences(model, paths), chunksize):
        jobs.put(job)

    log.debug(
        "reached the end of input; waiting to finish %i outstanding jobs" %
        jobs.qsize())
    for _ in range(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    log.info("training on %i nodes took %.1fs, %.0f nodes/s" %
             (node_count[0], elapsed,
              node_count[0] / elapsed if elapsed else 0.0))