def cossim(vec1, vec2): vec1, vec2 = dict(vec1), dict(vec2) if not vec1 or not vec2: return 0.0 vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1))) vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2))) assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries" if len(vec2) < len(vec1): vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1)) result /= vec1len * vec2len # rescale by vector lengths return result
def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): """ Filter out tokens that appear in 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). After the pruning, shrink resulting gaps in word ids. **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold # determine which tokens to keep good_ids = (v for v in itervalues(self.token2id) if no_below <= self.dfs[v] <= no_above_abs) good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) if keep_n is not None: good_ids = good_ids[:keep_n] logger.info("keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents" % (len(good_ids), no_below, no_above_abs, 100.0 * no_above)) # do the actual filtering, then rebuild dictionary to remove gaps in ids self.filter_tokens(good_ids=good_ids) self.compactify() logger.info("resulting dictionary: %s" % self)
def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): """ Filter out tokens that appear in 1. less than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). After the pruning, shrink resulting gaps in word ids. **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! """ no_above_abs = int( no_above * self.num_docs ) # convert fractional threshold to absolute threshold # determine which tokens to keep good_ids = (v for v in itervalues(self.token2id) if no_below <= self.dfs[v] <= no_above_abs) good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) if keep_n is not None: good_ids = good_ids[:keep_n] logger.info( "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents" % (len(good_ids), no_below, no_above_abs, 100.0 * no_above)) # do the actual filtering, then rebuild dictionary to remove gaps in ids self.filter_tokens(good_ids=good_ids) self.compactify() logger.info("resulting dictionary: %s" % self)
def create_binary_tree(self): """ Create a binary Huffman tree using stored vocabulary word counts. Frequent words will have shorter binary codes. Called internally from `build_vocab()`. """ logger.info("constructing a huffman tree from %i words" % len(self.vocab)) # build the huffman tree heap = list(itervalues(self.vocab)) heapq.heapify(heap) for i in xrange(len(self.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + len(self.vocab), left=min1, right=min2)) # recurse over the tree, assigning a binary code to each vocabulary word if heap: max_depth, stack = 0, [(heap[0], [], [])] while stack: node, codes, points = stack.pop() if node.index < len(self.vocab): # leaf node => store its path from the root node.code, node.point = codes, points max_depth = max(len(codes), max_depth) else: # inner node => continue recursion points = array(list(points) + [node.index - len(self.vocab)], dtype=uint32) stack.append((node.left, array(list(codes) + [0], dtype=uint8), points)) stack.append((node.right, array(list(codes) + [1], dtype=uint8), points)) logger.info("built huffman tree with maximum node depth %i" % max_depth)
def sparse2full(doc, length): """ Convert a document in sparse corpus format (sequence of 2-tuples) into a dense numpy array (of size `length`). """ result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value) doc = dict(doc) # overwrite some of the zeroes with explicit values result[list(doc)] = list(itervalues(doc)) return result
def sparse2full(doc, length): """ Convert a document in sparse corpus format (sequence of 2-tuples) into a dense numpy array (of size `length`). """ result = numpy.zeros( length, dtype=numpy.float32) # fill with zeroes (default value) doc = dict(doc) # overwrite some of the zeroes with explicit values result[list(doc)] = list(itervalues(doc)) return result
def create_binary_tree(self): """ Create a binary Huffman tree using stored vocabulary word counts. Frequent words will have shorter binary codes. Called internally from `build_vocab()`. """ logger.info("constructing a huffman tree from %i words" % len(self.vocab)) # build the huffman tree heap = list(itervalues(self.vocab)) heapq.heapify(heap) for i in xrange(len(self.vocab) - 1): min1, min2 = heapq.heappop(heap), heapq.heappop(heap) heapq.heappush( heap, Vocab(count=min1.count + min2.count, index=i + len(self.vocab), left=min1, right=min2)) # recurse over the tree, assigning a binary code to each vocabulary word if heap: max_depth, stack = 0, [(heap[0], [], [])] while stack: node, codes, points = stack.pop() if node.index < len(self.vocab): # leaf node => store its path from the root node.code, node.point = codes, points max_depth = max(len(codes), max_depth) else: # inner node => continue recursion points = array(list(points) + [node.index - len(self.vocab)], dtype=uint32) stack.append( (node.left, array(list(codes) + [0], dtype=uint8), points)) stack.append( (node.right, array(list(codes) + [1], dtype=uint8), points)) logger.info("built huffman tree with maximum node depth %i" % max_depth)
def compactify(self): """ Assign new word ids to all words. This is done to make the ids more compact, e.g. after some tokens have been removed via :func:`filter_tokens` and there are gaps in the id series. Calling this method will remove the gaps. """ logger.debug("rebuilding dictionary, shrinking gaps") # build mapping from old id -> new id idmap = dict( izip(itervalues(self.token2id), xrange(len(self.token2id)))) # reassign mappings to new ids self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id)) self.id2token = {} self.dfs = dict( (idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
def compactify(self): """ Assign new word ids to all words. This is done to make the ids more compact, e.g. after some tokens have been removed via :func:`filter_tokens` and there are gaps in the id series. Calling this method will remove the gaps. """ logger.debug("rebuilding dictionary, shrinking gaps") # build mapping from old id -> new id idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id)))) # reassign mappings to new ids self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id)) self.id2token = {} self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
def accuracy(self, questions, restrict_vocab=30000): """ Compute accuracy of the model. `questions` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example. The accuracy is reported (=printed to log and returned as a list) for each section separately, plus there's one aggregate summary at the end. Use `restrict_vocab` to ignore all questions containing a word whose frequency is not in the top-N most frequent words (default top 30,000). This method corresponds to the `compute-accuracy` script of the original C word2vec. """ ok_vocab = dict(sorted(iteritems(self.vocab), key=lambda item: -item[1].count)[:restrict_vocab]) ok_index = set(v.index for v in itervalues(ok_vocab)) def log_accuracy(section): correct, incorrect = section['correct'], section['incorrect'] if correct + incorrect > 0: logger.info("%s: %.1f%% (%i/%i)" % (section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect)) sections, section = [], None for line_no, line in enumerate(open(questions)): # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed if line.startswith(': '): # a new section starts => store the old section if section: sections.append(section) log_accuracy(section) section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0} else: if not section: raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) try: a, b, c, expected = [word.lower() for word in line.split()] # TODO assumes vocabulary preprocessing uses lowercase, too... except: logger.info("skipping invalid line #%i in %s" % (line_no, questions)) if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: logger.debug("skipping line #%i with OOV words: %s" % (line_no, line)) continue ignore = set(self.vocab[v].index for v in [a, b, c]) # indexes of words to ignore predicted = None # find the most likely prediction, ignoring OOV words and input words for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]: if index in ok_index and index not in ignore: predicted = self.index2word[index] if predicted != expected: logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted)) break section['correct' if predicted == expected else 'incorrect'] += 1 if section: # store the last section, too sections.append(section) log_accuracy(section) total = {'section': 'total', 'correct': sum(s['correct'] for s in sections), 'incorrect': sum(s['incorrect'] for s in sections)} log_accuracy(total) sections.append(total) return sections
def train(self, sentences, total_words=None, word_count=0, chunksize=100): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ if FAST_VERSION < 0: import warnings warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`") logger.info("training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size)) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab)) jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = zeros(self.layer1_size, dtype=REAL) # each thread must have its own work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) # how many words did we train on? out-of-vocabulary (unknown) words do not count if self.sg: job_words = sum(train_sentence_sg(self, sentence, alpha, work) for sentence in job) else: job_words = sum(train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job) with lock: word_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) for job_no, job in enumerate(utils.grouper(no_oov, chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i words took %.1fs, %.0f words/s" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)) return word_count[0]