def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue."""
    paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
    error = zeros(1, dtype=REAL)
    if self.concatenate:
        # word work here is for each individual word, so it has length logistic regression - para size
        word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype=REAL)
        neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
    else:
        # here word work is aggregated:
        word_work = zeros(self.layer1_size, dtype=REAL)
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

    while True:
        job = jobs.get()
        if job is None:  # data finished, exit
            break
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha,
                                           paragraph_work, word_work, neu1, error, len(job))
        with lock:
            # here we can store the scores for later plotting and viewing...
            word_count[0] += job_words
            elapsed = time.time() - start
            total_error[0] += error[0]
            if elapsed >= next_report[0]:
                logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                             (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
    random.seed(self.seed)
    self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
    self.syn1 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
    self.syn0 += (random.rand(len(self.vocab), self.layer1_size) - 0.5) / self.layer1_size
    self.syn0norm = None
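# A minimal standalone sketch (not part of the model code; the sizes below are
# invented) illustrating the effect of the initialization above: every
# component of syn0 ends up uniform in [-0.5/layer1_size, 0.5/layer1_size],
# while the output weights (syn1) start at exactly zero.
import numpy as np

layer1_size, vocab_size = 100, 5000
np.random.seed(1)
syn0 = (np.random.rand(vocab_size, layer1_size) - 0.5) / layer1_size
assert np.abs(syn0).max() <= 0.5 / layer1_size  # small, centered at zero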
def _get_thread_working_mem(self):
    work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=self.vector_dtype)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=self.vector_dtype)
    return work, neu1
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue."""
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
    while True:
        job = jobs.get()
        if job is None:  # data finished, exit
            break
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * self.alpha_decay * word_count[0] / total_words))
        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        job_words = sum(train_sentence(self, sentence, alpha, work, neu1) for sentence in job)
        with lock:
            word_count[0] += job_words
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
                logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                            (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
def train_sentence(model, sentence, alpha, work=None, neu1=None):  # mod
    """
    Update CBOW negative sampling model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        if model.reduce > 0:
            reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        else:
            reduced_window = 0
        # Combine all surrounding words into an appropriate input
        start = max(0, pos - model.window + reduced_window)
        l1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)  # initialize input
        count = 0
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            l1 = l1 + model.syn0[word2.index]
            count += 1
        if count > 0:
            l1 = l1 / count
        neu1e = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
        for d in xrange(model.neg_samples + 1):
            if d == 0:
                target_index = word.index
                label = 1
            else:
                random_integer = random.randint(model.table_size)
                target_index = model.table[random_integer]
                if target_index == word.index:
                    continue
                label = 0
            l2a = model.syn1neg[target_index]
            fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  # propagate hidden -> output
            ga = (label - fa) * alpha  # error gradient multiplied by the learning rate
            neu1e += dot(ga, l2a)
            model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            model.syn0[word2.index] += neu1e
    return len([word for word in sentence if word is not None])
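# A self-contained numerical sketch (values invented, illustrative only) of the
# single (context, target) negative-sampling step performed above: l1 is the
# mean of the context vectors, fa the predicted probability, and
# (label - fa) * alpha scales both the output-weight update and the error
# accumulated for the context words.
import numpy as np

alpha = 0.025
l1 = np.array([0.1, -0.2, 0.05])             # averaged context vectors (hidden layer)
l2 = np.array([0.3, 0.1, -0.4])              # output weights of the target word
label = 1                                    # 1 = true target, 0 = negative sample
fa = 1.0 / (1.0 + np.exp(-np.dot(l1, l2)))   # sigmoid activation
ga = (label - fa) * alpha                    # scaled error gradient
neu1e = ga * l2                              # error to propagate back to context words
l2 += ga * l1                                # hidden -> output update
print(fa, neu1e)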
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
    random.seed(self.seed)
    self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
    self.syn1neg = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
    self.syn0 += (random.rand(len(self.vocab), self.layer1_size) - 0.5) / self.layer1_size
    self.syn0norm = None
def __init__(self, words_model, seq_len=5):
    """
    :param words_model: a word2vec model for words.
    """
    self.__words_model = words_model
    self.__seq_len = seq_len
    self.work = matutils.zeros_aligned(self.__words_model.layer1_size, dtype=np.float32)  # per-thread private work memory
    self.neu1 = matutils.zeros_aligned(self.__words_model.layer1_size, dtype=np.float32)
    self.alpha = np.array([0.01])
def _get_thread_working_mem(self) -> Tuple[ndarray, ndarray]:
    """Compute the working memory used per worker thread.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Each worker thread's private work memory and OOV-ngram buffer.
    """
    mem = zeros_aligned(self.sv.vector_size, dtype=REAL)
    oov_mem = zeros_aligned((self.batch_words, self.batch_ngrams), dtype=uINT)
    return (mem, oov_mem)
def worker_loop():
    """Train the model, lifting lists of sentences from the job_queue."""
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    jobs_processed = 0
    while True:
        job = job_queue.get()
        if job is None:
            progress_queue.put(None)
            break  # no more jobs => quit this worker
        sentences, alpha = job
        tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
        progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
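# A minimal runnable sketch (hypothetical toy names) of the queue protocol that
# worker_loop expects: a producer puts (sentences, alpha) jobs on job_queue,
# then one None sentinel per worker; each worker reports progress tuples and a
# final None on progress_queue, which the parent drains to detect completion.
from queue import Queue  # `Queue` module in Python 2
from threading import Thread

job_queue, progress_queue = Queue(), Queue()

def toy_worker():
    while True:
        job = job_queue.get()
        if job is None:
            progress_queue.put(None)
            break  # no more jobs => quit this worker
        sentences, alpha = job
        progress_queue.put((len(sentences), sum(len(s) for s in sentences)))

workers = [Thread(target=toy_worker) for _ in range(2)]
for w in workers:
    w.start()
job_queue.put(([["a", "b"], ["c"]], 0.025))
for _ in workers:
    job_queue.put(None)  # one sentinel per worker
finished = 0
while finished < len(workers):
    report = progress_queue.get()
    if report is None:
        finished += 1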
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue."""
    work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    while True:
        job = jobs.get()
        if job is None:  # data finished, exit
            break
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        if self.sg:
            job_words = sum(train_sentence_sg(self, sentence, alpha, work) for sentence in job)
        else:
            job_words = sum(train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job)
        with lock:
            word_count[0] += job_words
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
                print "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (
                    100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)
                next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
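# A tiny sketch (toy numbers) of the linear learning-rate schedule used above:
# alpha decays with the fraction of words already processed, floored at min_alpha.
alpha0, min_alpha, total_words = 0.025, 0.0001, 10000
for words_done in (0, 5000, 9990):
    alpha = max(min_alpha, alpha0 * (1 - 1.0 * words_done / total_words))
    print(words_done, alpha)  # 0.025, then 0.0125, then 2.5e-05 floored to 0.0001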
def train_sentence(model, sentence, alpha, work=None, neu1=None):  # mod
    """
    Update CBOW hierarchical softmax model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        if model.reduce > 0:
            reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        else:
            reduced_window = 0
        # Combine all surrounding words into an appropriate input
        start = max(0, pos - model.window + reduced_window)
        l1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)  # initialize input
        weights = 0.
        weights_2 = 0.
        regularization = 0.  # stays 0 when there are no context words, making the syn0 update below a no-op
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            weights += word2.count_power
            weights_2 += word2.count_power_2
            l1 = l1 + word2.count_power * model.syn0[word2.index]
        if weights > 1e-16:
            regularization = weights / weights_2
            l1 = l1 / weights
        l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
        fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
        ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
        model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            model.syn0[word2.index] += regularization * word2.count_power * dot(ga, l2a)  # MUST BE MODIFIED
    return len([word for word in sentence if word is not None])
def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
    """
    Infer a vector for the given document after bulk training.

    Document should be a list of (word) tokens.
    """
    doctag_vectors = empty((1, self.vector_size), dtype=REAL)
    doctag_vectors[0] = self.seeded_vector(' '.join(doc_words))
    doctag_locks = ones(1, dtype=REAL)
    doctag_indexes = [0]
    work = zeros(self.layer1_size, dtype=REAL)
    if not self.sg:
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    for i in range(steps):
        if self.sg:
            train_document_dbow(self, doc_words, doctag_indexes, alpha, work,
                                learn_words=False, learn_hidden=False,
                                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
        elif self.dm_concat:
            train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1,
                                     learn_words=False, learn_hidden=False,
                                     doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
        else:
            train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1,
                              learn_words=False, learn_hidden=False,
                              doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
        alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha
    return doctag_vectors[0]
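# A hypothetical usage sketch of infer_vector, assuming a recent gensim release
# (the corpus and sizes are invented for illustration): inference leaves the
# trained word and hidden weights untouched, because learn_words and
# learn_hidden are both False above.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(words=["human", "machine", "interface"], tags=[0]),
          TaggedDocument(words=["survey", "of", "user", "opinion"], tags=[1])]
model = Doc2Vec(corpus, vector_size=50, min_count=1, epochs=10)
vec = model.infer_vector(["human", "interface", "survey"])
print(vec.shape)  # (50,)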
def score_document_labeled_cbow(model, document, label, work=ones(1, dtype=REAL), neu1=None):
    # note: the default `work` buffer is allocated once, at function definition
    # time, and shared across all calls that do not pass their own
    if neu1 is None:
        neu1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
    return sdlc(model, document, label, work, neu1)
def worker_loop():
    """Train the model, lifting lists of train_pairs from the job_queue."""
    # per-thread private work memory - unused in the numpy implementation
    work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
    neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
    jobs_processed = 0
    while True:
        job = job_queue.get()
        if job is None:
            progress_queue.put(None)
            break  # no more jobs => quit this worker
        train_pairs, alpha = job
        tally, raw_tally = self._do_train_job(train_pairs, alpha, (work, neu1))
        progress_queue.put((len(train_pairs), tally, raw_tally))  # report back progress
        jobs_processed += 1
    logger.debug("worker exiting, processed %i jobs", jobs_processed)
def train_sentence(model, sentence, alpha, work=None):
    """
    Update skip-gram negative sampling model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        if model.reduce > 0:
            reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        else:
            reduced_window = 0
        # now go over all words from the (reduced) window, predicting each one in turn
        start = max(0, pos - model.window + reduced_window)
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue  # don't train on OOV words and on the `word` itself
            l1 = model.syn0[word2.index]
            neu1e = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
            for d in xrange(model.neg_samples + 1):
                if d == 0:
                    target_index = word.index
                    label = 1
                else:
                    random_integer = random.randint(model.table_size - 1)  # exclude the upper bound
                    target_index = model.table[random_integer]
                    if target_index == word.index:
                        continue
                    label = 0
                l2a = model.syn1neg[target_index]
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  # propagate hidden -> output
                ga = (label - fa) * alpha  # error gradient multiplied by the learning rate
                neu1e += dot(ga, l2a)
                model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output
            l1 += neu1e  # learn input -> hidden
    return len([word for word in sentence if word is not None])
def train_sentence(model, sentence, alpha, work=None):
    """
    Update skip-gram negative sampling model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        if model.reduce > 0:
            reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        else:
            reduced_window = 0
        # now go over all words from the (reduced) window, predicting each one in turn
        start = max(0, pos - model.window + reduced_window)
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue  # don't train on OOV words and on the `word` itself
            l1 = model.syn0[word2.index]
            neu1e = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
            for d in xrange(model.neg_samples + 1):
                if d == 0:
                    target_index = word.index
                    label = 1
                else:
                    random_integer = random.randint(model.table_size)
                    target_index = model.table[random_integer]
                    if target_index == word.index:
                        continue
                    label = 0
                l2a = model.syn1neg[target_index]
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  # propagate hidden -> output
                ga = (label - fa) * alpha  # error gradient multiplied by the learning rate
                neu1e += dot(ga, l2a)
                model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output
            l1 += neu1e  # learn input -> hidden
    return len([word for word in sentence if word is not None])
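# A sketch of how the `model.table` drawn from above is typically built; this
# is the standard word2vec technique, not code lifted from this codebase. Each
# vocabulary index appears in a large array proportionally to its unigram count
# raised to the 0.75 power, so random.randint(table_size) samples negatives
# from the smoothed unigram distribution.
import numpy as np

counts = np.array([100, 50, 10, 5], dtype=np.float64)  # toy word frequencies
table_size = 1000
probs = counts ** 0.75
probs /= probs.sum()
table = np.repeat(np.arange(len(counts)), np.round(probs * table_size).astype(int))
negative_sample = table[np.random.randint(len(table))]  # index of one negative word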
def score_document_labeled_cbow(model, document, labels=None, work=None, neu1=None):
    if model.bucket > 0:
        document = HashIter.hash_doc(document, model.bucket)
    if work is None:
        work = ones(len(model.lvocab) if labels is None else len(labels), dtype=REAL)
    if neu1 is None:
        neu1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
    labels = labels or model.lvocab.keys()
    scores = sdlc(model, document, labels, work, neu1)
    return zip(labels, scores)
def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
    """
    Update CBOW hierarchical softmax model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        # Combine all context words into an appropriate input
        start = max(0, pos - model.window + reduced_window)
        l1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
        count = 0
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            count += 1
            l1 += model.syn0[word2.index]
        if count > 0:
            l1 = l1 / count
        l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
        fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
        ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
        model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            model.syn0[word2.index] += dot(ga, l2a)
    return len([word for word in sentence if word is not None])
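# A toy numpy walk-through (all values invented) of the hierarchical-softmax
# step above: `word.point` indexes the inner Huffman-tree nodes on the path to
# the word, `word.code` holds the binary branch choices along that path, and
# the update pushes each node's sigmoid output toward 1 - code.
import numpy as np

layer1_size = 4
syn1 = np.zeros((8, layer1_size))        # one row per inner tree node
point = np.array([0, 3, 5])              # path of inner nodes for this word
code = np.array([1, 0, 1])               # Huffman code along that path
l1 = np.full(layer1_size, 0.25)          # averaged context vectors
alpha = 0.025
l2a = syn1[point]                        # codelen x layer1_size
fa = 1.0 / (1.0 + np.exp(-l2a.dot(l1)))  # one sigmoid per tree node
ga = (1 - code - fa) * alpha             # per-node error times learning rate
syn1[point] += np.outer(ga, l1)          # learn hidden -> output
context_update = ga.dot(l2a)             # added to each context word's vector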
def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
    """
    Infer a vector for the given document after bulk training.

    Parameters
    ----------
    doc_words : :obj:`list` of :obj:`str`
        Document should be a list of (word) tokens.
    alpha : float
        The initial learning rate.
    min_alpha : float
        Learning rate will linearly drop to `min_alpha` as training progresses.
    steps : int
        Number of times to train the new document.

    Returns
    -------
    :obj:`numpy.ndarray`
        The inferred vector for the new document.
    """
    doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size)
    doctag_indexes = [0]
    work = zeros(self.trainables.layer1_size, dtype=REAL)
    if not self.sg:
        neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)
    for i in range(steps):
        if self.sg:
            train_document_dbow(
                self, doc_words, doctag_indexes, alpha, work,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        elif self.dm_concat:
            train_document_dm_concat(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        else:
            train_document_dm(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False,
                doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
            )
        alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha
    return doctag_vectors[0]
def load_word2vec_format(cls, fname, binary=False):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information loaded is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue
    training with a model loaded this way.
    """
    logger.info("loading projection weights from %s" % fname)
    with open(fname) as fin:
        header = fin.readline()
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = matutils.zeros_aligned((vocab_size, layer1_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':  # ignore newlines in front of words (some binary files have newline, some not)
                        word.append(ch)
                result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = line.split()
                assert len(parts) == layer1_size + 1
                word, weights = parts[0], map(REAL, parts[1:])
                result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                result.index2word.append(word)
                result.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims()
    return result
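# A runnable round-trip sketch (toy file name and values invented) of the text
# variant of the C format parsed above: the header line holds
# "<vocab_size> <layer1_size>", and each following line holds a word plus
# layer1_size float components.
with open("toy_vectors.txt", "w") as fout:
    fout.write("2 3\n")
    fout.write("hello 0.1 0.2 0.3\n")
    fout.write("world 0.4 0.5 0.6\n")
# model = Word2Vec.load_word2vec_format("toy_vectors.txt")  # using the loader above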
def load_word2vec_format(cls, fname, binary=False):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information loaded is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue
    training with a model loaded this way.
    """
    logger.info("loading projection weights from %s" % fname)
    # accept either a filename or an already-open file object, so a `with open(...)` block is not used here
    fin = open(fname) if type(fname) == str else fname
    header = fin.readline()
    vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
    result = Word2Vec(size=layer1_size)
    result.syn0 = matutils.zeros_aligned((vocab_size, layer1_size), dtype=REAL)
    if binary:
        binary_len = dtype(REAL).itemsize * layer1_size
        for line_no in xrange(vocab_size):
            # mixed text and binary: read text first, then binary
            word = []
            while True:
                ch = fin.read(1)
                if ch == " ":
                    word = "".join(word)
                    break
                if ch != "\n":  # ignore newlines in front of words (some binary files have newline, some not)
                    word.append(ch)
            result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
            result.index2word.append(word)
            result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
    else:
        for line_no, line in enumerate(fin):
            parts = line.split()
            assert len(parts) == layer1_size + 1
            word, weights = parts[0], map(REAL, parts[1:])
            result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
            result.index2word.append(word)
            result.syn0[line_no] = weights
    fin.close()  # [DiN]
    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims()
    return result
def worker_train():
    """Train the model, lifting lists of instances from the jobs queue."""
    # multiple working space: each thread must have its own work memory
    work = zeros(self.layer1_size, dtype=REAL)
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    while True:
        job = jobs.get()
        if job is None:  # data finished, exit
            break
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * feat_count[0] / total_feats))
        # how many features did we train on? out-of-vocabulary (unknown) features do not count
        job_words = sum(train_instance(self, instance, alpha, work) for instance in job)
        with lock:
            feat_count[0] += job_words
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
                logger.info("PROGRESS: at %.2f%% features, alpha %.05f, %.0f features/s" %
                            (100.0 * feat_count[0] / total_feats, alpha, feat_count[0] / elapsed if elapsed else 0.0))
                next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
def _get_thread_working_mem(self):
    work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)
    return work, neu1
def train_sentence(model, sentence, alpha, work=None, neu1=None):  # This implementation has not been tested
    """
    Update skip-gram/CBOW hybrid hierarchical softmax model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        if model.reduce > 0:
            reduced_half_bags = random.randint(model.half_bags)
        else:
            reduced_half_bags = 0
        bags_before = min(model.half_bags - reduced_half_bags, (pos - 1) / model.words_per_bag + 1)  # Verify?
        bags_after = min(model.half_bags - reduced_half_bags, (len(sentence) - pos - 2) / model.words_per_bag + 1)  # Verify?
        for bag_index in xrange(-bags_before, 0):
            start = max(0, pos + bag_index * model.words_per_bag)
            end = pos + (bag_index + 1) * model.words_per_bag
            l1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)  # initialize input
            count = 0
            for pos2, word2 in enumerate(sentence[start:end], start):
                if pos2 == pos or word2 is None:
                    continue
                l1 = l1 + model.syn0[word2.index]
                count += 1
            if count > 0:
                l1 = l1 / count  # divide or not?
            l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
            for pos2, word2 in enumerate(sentence[start:end], start):
                if pos2 == pos or word2 is None:
                    continue
                model.syn0[word2.index] += dot(ga, l2a)
        for bag_index in xrange(0, bags_after):
            start = pos + bag_index * model.words_per_bag + 1
            end = min(len(sentence), pos + (bag_index + 1) * model.words_per_bag + 1)  # Verify?
            l1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)  # initialize input
            count = 0
            for pos2, word2 in enumerate(sentence[start:end], start):
                if pos2 == pos or word2 is None:
                    continue
                l1 = l1 + model.syn0[word2.index]
                count += 1
            if count > 0:
                l1 = l1 / count  # divide or not?
            l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
            for pos2, word2 in enumerate(sentence[start:end], start):
                if pos2 == pos or word2 is None:
                    continue
                model.syn0[word2.index] += dot(ga, l2a)
    return len([word for word in sentence if word is not None])
def worker_init():
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    context_vector = matutils.zeros_aligned(self.topic_size, dtype=REAL)
    return (work, neu1, context_vector)
def worker_init():
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    return (work, neu1)
def train_sentence(model, sentence, alpha, work=None, neu1=None):  # mod
    """
    Update CBOW negative sampling model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.
    """
    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        if model.reduce > 0:
            reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        else:
            reduced_window = 0
        # Combine all surrounding words into an appropriate input
        start = max(0, pos - model.window + reduced_window)
        l1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)  # initialize input
        count = 0
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            l1 = l1 + model.syn0[word2.index]
            count += 1
        if count > 0:
            l1 = l1 / count
        neu1e = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
        for d in xrange(model.neg_samples + 1):
            if d == 0:
                target_index = word.index
                label = 1
            else:
                random_integer = random.randint(model.table_size - 1)  # exclude the upper bound
                target_index = model.table[random_integer]
                if target_index == word.index:
                    continue
                label = 0
            l2a = model.syn1neg[target_index]
            fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  # propagate hidden -> output
            ga = (label - fa) * alpha  # error gradient multiplied by the learning rate
            neu1e += dot(ga, l2a)
            model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output
        for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
            if pos2 == pos or word2 is None:
                continue
            model.syn0[word2.index] += neu1e
    return len([word for word in sentence if word is not None])