Example 1
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
        """
        if self.vocabulary_counts is not None:
            logger.debug("building vocabulary from provided frequency map")
            vocab = self.vocabulary_counts
        else:
            logger.debug("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for word, count in vocab.items():
            v = Vocab()
            v.count = count
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v

        logger.debug("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

        if self.hs:
            # add info about each word's Huffman encoding
            self.create_binary_tree()
        if self.negative:
            # build the table for drawing random words (for negative sampling)
            self.make_table()
        # precalculate downsampling thresholds
        self.precalc_sampling()
        self.reset_weights()
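
For reference, a minimal self-contained sketch of the same min_count filtering step, using a toy frequency map and a bare-bones stand-in for gensim's Vocab record (both invented here for illustration):

    class Vocab(object):
        # stand-in for gensim's old Vocab record
        def __init__(self, count=0, index=0):
            self.count, self.index = count, index

    counts = {"node_a": 7, "node_b": 2, "node_c": 11}  # toy frequency map
    min_count = 5
    vocab, index2word = {}, []
    for word, count in counts.items():
        if count >= min_count:
            v = Vocab(count=count, index=len(vocab))
            index2word.append(word)
            vocab[word] = v
    # vocab now holds node_a and node_c; node_b fell below min_count
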
Example 2
    def build_vocab(self, corpus):
        """
        Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
        """
        if self.vocabulary_counts is not None:
            print("building vocabulary from provided frequency map")
            vocab = self.vocabulary_counts
        else:
            print("default vocabulary building")
            super(Skipgram, self).build_vocab(corpus)
            return

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for word, count in vocab.items():
            v = Vocab()
            v.count = count
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)
                self.vocab[word] = v

        self.corpus_count = len(vocab)
        self.raw_vocab = vocab

        logger.debug("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

        self.scale_vocab()
        self.finalize_vocab()
Example 3
def add_new_labels(sentences, model):
    """
    Add new labels (for new docs) to the doc2vec model's `self.vocab`.

    from: <https://gist.github.com/zseder/4201551d7f8608f0b82b>
    """
    sentence_no = -1
    total_words = 0
    vocab = model.vocab
    #model_sentence_n = len([l for l in vocab if l.startswith("SENT")])
    model_sentence_n = max(int(l.split('_')[-1]) for l in vocab if l.startswith("SENT"))
    n_sentences = 0
    for sentence_no, sentence in enumerate(sentences):
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            label_e = label.split("_")
            label_n = int(label_e[1]) + model_sentence_n
            label = "{0}_{1}".format(label_e[0], label_n)
            total_words += 1
            if label in vocab:
                vocab[label].count += sentence_length
            else:
                vocab[label] = Vocab(count=sentence_length)
                vocab[label].index = len(model.vocab) - 1
                vocab[label].code = [0]
                vocab[label].sample_probability = 1.
                model.index2word.append(label)
                n_sentences += 1
    return n_sentences
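
To make the label arithmetic above concrete, a small sketch of how a new document's label is shifted past the labels already in the model (numbers invented):

    model_sentence_n = 41          # assume the highest existing label is SENT_41
    label = "SENT_3"               # label of an incoming new sentence
    prefix, n = label.split("_")
    label = "{0}_{1}".format(prefix, int(n) + model_sentence_n)
    print(label)                   # -> SENT_44
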
Example 4
 def _vocab_from(sentences):
     sentence_no, vocab = -1, {}
     total_words = 0
     for sentence_no, sentence in enumerate(sentences):
         if sentence_no % 10000 == 0:
             logger.info(
                 "PROGRESS: at item #%i, processed %i words and %i word types"
                 % (sentence_no, total_words, len(vocab)))
         sentence_length = len(sentence.words)
         for label in sentence.labels:
             total_words += 1
             if label in vocab:
                 vocab[label].count += sentence_length
             else:
                 vocab[label] = Vocab(count=sentence_length)
         for word in sentence.words:
             total_words += 1
             if word in vocab:
                 vocab[word].count += 1
             else:
                 vocab[word] = Vocab(count=1)
     logger.info(
         "collected %i word types from a corpus of %i words and %i items" %
         (len(vocab), total_words, sentence_no + 1))
     return vocab
Example 5
def create_corpus_from_matlab(word_embedding, index2word):
    model = Word2VecExtended()
    model.syn0 = word_embedding.astype(theano.config.floatX).copy()
    model.index2word = index2word
    model.index2word[0] = UnknownWord
    vocab = {}

    for word in model.index2word:
        v = Vocab(count=1)
        v.index = len(vocab)
        vocab[word] = v

    model.vocab = vocab
    model.UnknownWordIndex = model.vocab[UnknownWord].index
    return model
Example 6
    def load_word_embeddings(self, word_embeddings, word_to_ix):
        """Loads the word embeddings.

        Parameters
        ----------
        word_embeddings : numpy.ndarray
            Matrix with word-embeddings.
        word_to_ix : dict of (str, int)
            Mapping word to index.

        """
        logger.info("Loading the vocabulary")
        self.vocab = {}
        counts = {}
        for word in word_to_ix:
            counts[word] = counts.get(word, 0) + 1
        self.vocab_size = len(counts)
        self.vector_size = word_embeddings.shape[1]
        self.syn0 = np.zeros((self.vocab_size, self.vector_size))
        self.index2word = [None] * self.vocab_size
        logger.info("Corpus has %i words", len(self.vocab))
        for word_id, word in enumerate(counts):
            self.vocab[word] = Vocab(index=word_id, count=counts[word])
            self.syn0[word_id] = word_embeddings[word_to_ix[word]]
            self.index2word[word_id] = word
        assert((len(self.vocab), self.vector_size) == self.syn0.shape)
        logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size)
Example 7
    def __init__(self,
                 pathtomapping,
                 pathtovectors,
                 pathtocounts="",
                 initkeys=()):
        """
        SPPMI model equivalent to a gensim word2vec model.

        :param pathtomapping:
        :param pathtovectors:
        :param pathtocounts:
        :param initkeys:
        :return:
        """

        super(SPPMIModel, self).__init__()
        with open(pathtomapping) as f:
            self.word2index = json.load(f)
        self.index2word = {v: k for k, v in self.word2index.items()}
        self.word_vectors = self._load_sparse(pathtovectors)

        self.vocab = {}

        self.fast_table = {k: {} for k in initkeys}

        if pathtocounts:
            with open(pathtocounts) as f:
                counts = json.load(f)
            for w, idx in self.word2index.items():
                v = Vocab(count=counts[w], index=idx)
                self.vocab[w] = v
Example 8
    def add_word(self, word, parent_word, emb, cur_index):
        fake_vocab_size = int(1e7)
        word_index = len(self.vocab)
        inner_node_index = word_index - 1
        parent_index = self.vocab[parent_word].index

        # add in the left subtree
        if word != parent_word:
            self.vocab[word] = Vocab(index=word_index,
                                     count=fake_vocab_size - word_index,
                                     sample_int=(2**32))
            if emb is not None:
                self.syn0[cur_index] = emb
            else:
                self.syn0[cur_index] = self.syn0[parent_index]
            # the node in the coarsened graph serves as an inner node now
            self.index2word.append(word)
            self.vocab[word].code = array(list(self.vocab[parent_word].code) +
                                          [0],
                                          dtype=uint8)
            self.vocab[word].point = array(
                list(self.vocab[parent_word].point) + [inner_node_index],
                dtype=uint32)
            self.inner_node_index_map[parent_word] = inner_node_index
        else:
            if emb is not None:
                self.syn0[parent_index] = emb
            self.vocab[word].code = array(list(self.vocab[word].code) + [1],
                                          dtype=uint8)
            self.vocab[word].point = array(list(self.vocab[word].point) +
                                           [self.inner_node_index_map[word]],
                                           dtype=uint32)
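
A toy illustration of the Huffman-path bookkeeping above: a child copies its parent's (code, point) arrays and appends one step (the arrays and inner-node id below are invented):

    import numpy as np

    parent_code = np.array([1, 0], dtype=np.uint8)    # parent's Huffman code so far
    parent_point = np.array([4, 2], dtype=np.uint32)  # inner nodes on the parent's path
    inner_node_index = 7                              # made-up new inner node
    child_code = np.array(list(parent_code) + [0], dtype=np.uint8)
    child_point = np.array(list(parent_point) + [inner_node_index], dtype=np.uint32)
    print(child_code, child_point)                    # [1 0 0] [4 2 7]
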
Example 9
    def finalize_vocab(self):
        """Build tables and model weights based on final vocabulary settings."""

        if not self.index2word:
            self.scale_vocab()
        if self.sorted_vocab:
            self.sort_vocab()
        if self.hs:

            class FakeSelf(LabeledWord2Vec):
                def __init__(self, vocab):
                    self.vocab = vocab

            # add info about each word's Huffman encoding
            self.__class__.create_binary_tree(FakeSelf(self.lvocab))
        if self.negative:
            # build the table for drawing random words (for negative sampling)
            self.make_cum_table()

        if self.null_word:
            # create null pseudo-word for padding when using concatenative L1 (run-of-words)
            # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter
            word, v = '\0', Vocab(count=1, sample_int=0)
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v
        # set initial input/projection and hidden weights
        self.reset_weights()
Example 10
    def extend_vocab(self, sentences, oov_word=False, report_frequency=10000):
        """
		Extend vocabulary from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
        logger.info("collecting all words and their counts")

        prev_sentence_no = -1
        sentence_no, vocab = -1, {}
        total_words = 0
        assign_to_vocab = vocab.__setitem__  # slight performance gain
        # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
        get_from_vocab = vocab.__getitem__
        for sentence_no, sentence in enumerate(sentences):
            if prev_sentence_no == sentence_no:
                break
            if sentence_no % report_frequency == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types"
                    % (sentence_no, total_words, len(vocab)))
            for word in sentence:
                if word in vocab:
                    get_from_vocab(word).count += 1
                else:
                    assign_to_vocab(word, Vocab(count=1))
            total_words += len(sentence)
            prev_sentence_no = sentence_no
        logger.info(
            "collected %i word types from a corpus of %i words and %i sentences"
            % (len(vocab), total_words, sentence_no + 1))

        # assign a unique index to each word
        append = self.index2word.append
        assign_to_vocab = self.vocab.__setitem__
        for word, v in vocab.items():
            if word not in self.vocab:
                if v.count >= self.min_count:
                    v.index = len(self.vocab)
                    append(word)
                    assign_to_vocab(word, v)
            else:
                self.vocab[word].count += v.count

        # add the special out of vocabulary word **UNKNOWN**:
        if oov_word:
            self.add_oov_word(count=len(vocab) - len(self.vocab))

        logger.info("total %i word types after removing those with count<%s" %
                    (len(self.vocab), self.min_count))

        # add info about each word's Huffman encoding
        self.create_binary_tree()
        self.extend_weights()
Example 11
    def _vocab_from_new(self, sentences):
        """
        build word dict from subgrams, bigrams
        calculate total_words
        :arg sentences list of already segmented sentence
        """

        sentence_no, vocab, vocab_pred = -1, {}, {}
        total_words = 0

        # for meta_subgram in [self.START, self.END]:
        #    vocab[meta_subgram]=Vocab(count =1)

        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 200 == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types"
                    % (sentence_no, total_words, len(vocab)))

            if sentence:

                char_seq = [self.START, self.START] + list(map(
                    full2halfwidth, u"".join(sentence))) + [self.END, self.END]

                # count \n as a word
                total_words = total_words + len(char_seq) - 3

                subgrams = list(char_seq) + [
                    self.su_prefix + varient + char for char in char_seq
                    for varient in self.state_varient
                ]
                bigrams = [
                    char_seq[index] + char_seq[index + 1]
                    for index in range(len(char_seq) - 1)
                ]
                subgrams.extend([
                    self.sb_prefix + varient + bigram for bigram in bigrams
                    for varient in self.state_varient
                ])
                subgrams.extend(bigrams)

                for sub in subgrams:
                    if sub in vocab:
                        vocab[sub].count += 1
                    else:
                        vocab[sub] = Vocab(count=1)

        logger.info(
            "collected %i word types from a corpus of %i words and %i sentences"
            % (len(vocab), total_words, sentence_no + 1))

        self.total_words = total_words

        return vocab
Example 12
 def add_word(word, weights):
     word_id = len(result.vocab)
     if word in result.vocab:
         logger.warning(
             "duplicate word '%s' in %s, ignoring all but first", word,
             fname)
         return
     if counts is None:
         # most common scenario: no vocab file given. just make up some bogus counts, in descending order
         result.vocab[word] = Vocab(index=word_id,
                                    count=vocab_size - word_id)
     elif word in counts:
         # use count from the vocab file
         result.vocab[word] = Vocab(index=word_id, count=counts[word])
     else:
         # vocab file given, but word is missing -- set count to None (TODO: or raise?)
         logger.warning(
             "vocabulary file is incomplete: '%s' is missing", word)
         result.vocab[word] = Vocab(index=word_id, count=None)
     result.syn0[word_id] = weights
     result.index2word.append(word)
Example 13
def load_gensim_from_binary_file(filename):
    from gensim.models.word2vec import Vocab, Word2Vec
    words, M = load_vectors_from_binary_file(filename)
    N, k = M.shape
    assert N == len(words)

    model = Word2Vec(size=k)
    model.syn0 = M
    model.index2word = words
    model.vocab = {
        w: Vocab(index=idx, count=N - idx) for (idx, w) in enumerate(words)}
    return model
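
The count=N - idx trick fabricates strictly decreasing counts, the same "bogus counts, in descending order" idea noted in Example 12, so the vocabulary looks as if it were already sorted by frequency. A quick demonstration with toy words:

    words = ["the", "of", "cat"]
    N = len(words)
    fake_counts = {w: N - i for i, w in enumerate(words)}
    print(fake_counts)  # {'the': 3, 'of': 2, 'cat': 1}
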
Example 14
    def build_vocab(self, sentences):

        logger.info("collecting all words and their counts")
        vocab = self._vocab_from_new(sentences)

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []

        for meta_word in [self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab]:
            v = Vocab(count=1)
            v.index = len(self.vocab)
            v.sample_probability = 1.0
            self.index2word.append(meta_word)
            self.vocab[meta_word] = v

        # drop words with count < min_count (gensim's default min_count is 5;
        # Seger changed it to 1, so in practice no words are removed here)
        # build the self.vocab word->Vocab dict and assign a unique index to each word
        for subgram, v in vocab.items():
            if v.count >= self.min_count:
                v.sample_probability = 1.0
                v.index = len(self.vocab)
                self.index2word.append(subgram)
                self.vocab[subgram] = v
        logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))
        logger.info('reset weights')

        if self.hybrid_pred:
            # use the frequency at the top-4% boundary of single-character words
            # as the threshold for hybrid prediction
            freq_list = [self.vocab[v].count for v in self.vocab if len(v) == 1]
            freq_list.sort(reverse=True)
            self.hybrid_threshold = freq_list[len(freq_list) // 25]
            print('>frequency threshold for hybrid prediction is:', self.hybrid_threshold)

        self.reset_weights()
Example 15
def update_vocab(corpus, old_model, model):
    """Like mode.build_vocab(), inserts words/vectors from old model"""
    count = model.min_count + 1
    model.scan_vocab(corpus)  # initial survey
    for word in old_model.vocab:  # insert old
        if word not in model.vocab:
            model.raw_vocab[word] += count
            model.vocab[word] = Vocab(count=count, index=len(model.index2word))
            model.index2word.append(word)
    # trim by min_count & precalculate downsampling
    model.scale_vocab()
    model.finalize_vocab()  # build tables & arrays
    for word in old_model.vocab:
        if word in model.vocab:
            model.syn0[model.vocab[word].index] = old_model.syn0[
                old_model.vocab[word].index]
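
A hedged usage sketch for update_vocab (model and corpus names are placeholders; Word2Vec refers to the old gensim API these snippets target):

    # old_model = Word2Vec.load("old.model")       # previously trained model
    # new_model = Word2Vec(size=old_model.vector_size, min_count=5)
    # update_vocab(new_corpus, old_model, new_model)
    # new_model now trains with vectors for shared words seeded from old_model
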
Example 16
 def finalize_vocab(self, update=False):
     """Build tables and model weights based on final word vocabulary settings."""
     if not self.wv.index2word:
         self.scale_vocab()
     if self.sorted_vocab and not update:
         self.sort_vocab()
     if self.null_word:
         # create null pseudo-word for padding when using concatenative L1 (run-of-words)
         # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter
         word, v = '\0', Vocab(count=1, sample_int=0)
         v.index = len(self.wv.vocab)
         self.wv.index2word.append(word)
         self.wv.vocab[word] = v
     # set initial input/projection and hidden weights
     if not update:
         self.reset_weights(outputs=False)
     else:
         self.update_weights(outputs=False)
Example 17
 def add_vector_to_model(self, category_id, vector, model):
     # The category should not already be in the space
     # (rebuild the space in that case)
      catid = '#' + str(category_id)
     if catid in model.vocab:
         self.remove_category_from_space(category_id)
     w_count = len(model.vocab)
     model.vocab[catid] = Vocab(index=w_count, count=w_count + 1)
     model.index2word.append(catid)
     if w_count == 0:
          model.syn0 = np.empty((1, model.vector_size), dtype=np.float32)
         model.syn0[0] = vector
     else:
         try:
             model.syn0 = np.vstack((model.syn0, vector))
         except ValueError as e:
             print(e)
             print("Vector length: {}".format(len(vector)))
             print("Space Length: {}".format(model.vector_size))
     return model
Example 18
    def __init__(self, token_database, document_database):
        # set the token and document databases
        self.token_database = token_database
        self.document_database = document_database

        # create the gensim model
        self.model = gensim.models.Word2Vec(size=token_database.vector_size,
                                            window=3,
                                            negative=25,
                                            sorted_vocab=0)

        # add each token from the token database to the gensim model
        for token in self.token_database:
            self.model.wv.vocab[token] = Vocab(
                count=self.token_database.get_freq(token),
                index=len(self.model.wv.index2word),
                sample_int=2**32)  # never downsample (sys.maxint is gone in Python 3)
            self.model.wv.index2word.append(token)

        # prepare the model and copy over the existing token vectors
        self.model.finalize_vocab()
        self.model.wv.syn0 = self.token_database.get_vectors()
Example 19
 def _vocab_from(self, sentences):
     """ Construct the vocabulary. """
      self.signals = set()
     sentence_no, vocab = -1, {}
     total_words = 0
     for sentence_no, sentence_signal in enumerate(sentences):
         sentence, signal = self.extract_sentence_and_signal(
             sentence_signal)
         self.signals.add(signal)
         if sentence_no % 10000 == 0:
             logger.info(
                 "PROGRESS: at sentence #%i, processed %i words and %i word types"
                 % (sentence_no, total_words, len(vocab)))
         for word in sentence:
             total_words += 1
             if word in vocab:
                 vocab[word].count += 1
             else:
                 vocab[word] = Vocab(count=1)
     logger.info(
         "collected %i word types from a corpus of %i words and %i sentences"
         % (len(vocab), total_words, sentence_no + 1))
     return vocab
Example 20
def load_word2vec_format(fname,
                         fvocab=None,
                         binary=False,
                         norm_only=True,
                         encoding='utf8'):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(
            int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                try:
                    word = utils.to_unicode(b''.join(word), encoding=encoding)
                except UnicodeDecodeError as e:
                    logger.warning(
                        "Couldn't convert whole word to unicode: trying to convert first %d bytes only ..."
                        % e.start)
                    word = utils.to_unicode(b''.join(word[:e.start]),
                                            encoding=encoding)
                    logger.warning("... first %d bytes converted to '%s'" %
                                   (e.start, word))

                if counts is None:
                    result.vocab[word] = Vocab(index=line_no,
                                               count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no,
                                               count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len),
                                                  dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line[:-1],
                                         encoding=encoding).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no,
                                               count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no,
                                               count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
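
As a complement to the loader, a minimal sketch of a writer for the same C word2vec binary layout it parses: a "vocab_size vector_size" header line, then one record per word consisting of the utf8 word, a space, and the raw float32 vector (numpy.float32 assumed for REAL, as in gensim):

    import numpy as np

    def save_word2vec_binary(fname, vectors):
        # vectors: dict mapping word -> iterable of floats, all the same length
        dim = len(next(iter(vectors.values())))
        with open(fname, "wb") as fout:
            fout.write(("%d %d\n" % (len(vectors), dim)).encode("utf8"))
            for word, vec in vectors.items():
                fout.write(word.encode("utf8") + b" ")
                fout.write(np.asarray(vec, dtype=np.float32).tobytes())

    save_word2vec_binary("toy.bin", {"cat": [0.1, 0.2], "dog": [0.3, 0.4]})
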
Example 21
 def add_word_to_vocab(self, word, count=1):
     v = Vocab(count=count)
     v.index = len(self.vocab)
     self.vocab[word] = v
     self.index2word.append(word)
     return v
Example 22
    def build_vocab(self, sentences, oov_word=False, report_frequency=10000):
        """
		Build vocabulary from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
        print("build vocab")
        path = (re.sub("/", "_", sentences.fname) + ("(mc=%d)" %
                                                     (self.min_count)) +
                ".vocab") if hasattr(sentences, "fname") else None
        if path is not None and file_exists(path):
            logger.info("loading from saved vocab list at \"%s\"" % (path))
            with gzip.open(path, 'rb') as f:
                saved_vocab = pickle.load(f)
            self.index2word = saved_vocab["index2word"]
            self.vocab = saved_vocab["vocab"]

            if oov_word:
                self.add_oov_word(count=10000)

            self.create_binary_tree()
            self.reset_weights()

        else:
            logger.info("collecting all words and their counts")

            prev_sentence_no = -1
            sentence_no, vocab = -1, {}
            total_words = 0
            assign_to_vocab = vocab.__setitem__  # slight performance gain
            # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
            get_from_vocab = vocab.__getitem__
            for sentence_no, sentence in enumerate(sentences):
                if prev_sentence_no == sentence_no:
                    break
                if sentence_no % report_frequency == 0:
                    logger.info(
                        "PROGRESS: at sentence #%i, processed %i words and %i word types"
                        % (sentence_no, total_words, len(vocab)))
                for word in sentence:
                    if word in vocab:
                        get_from_vocab(word).count += 1
                    else:
                        assign_to_vocab(word, Vocab(count=1))
                total_words += len(sentence)
                prev_sentence_no = sentence_no
            logger.info(
                "collected %i word types from a corpus of %i words and %i sentences"
                % (len(vocab), total_words, sentence_no + 1))

            # assign a unique index to each word
            self.vocab, self.index2word = {}, []
            append = self.index2word.append
            assign_to_vocab = self.vocab.__setitem__
            for word, v in vocab.items():
                if v.count >= self.min_count:
                    v.index = len(self.vocab)
                    append(word)
                    assign_to_vocab(word, v)

            # add the special out of vocabulary word **UNKNOWN**:
            if oov_word:
                self.add_oov_word(count=len(vocab) - len(self.vocab))

            logger.info(
                "total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))

            # add info about each word's Huffman encoding
            self.create_binary_tree()
            self.reset_weights()
            if path is not None:
                logger.info("saving vocab list in \"%s\"" % (path))
                with gzip.open(path, 'wb') as f:
                    pickle.dump(
                        {
                            "vocab": self.vocab,
                            "index2word": self.index2word
                        }, f, 1)
Example 23
    def scale_vocab(self,
                    sample=None,
                    dry_run=False,
                    keep_raw_vocab=False,
                    trim_rule=None,
                    update=False):
        """
        Apply vocabulary settings for `min_count` (discarding less-frequent words)
        and `sample` (controlling the downsampling of more-frequent words).

        Calling with `dry_run=True` will only simulate the provided settings and
        report the size of the retained vocabulary, effective corpus length, and
        estimated memory requirements. Results are both printed via logging and
        returned as a dict.

        Delete the raw vocabulary after the scaling is done to free up RAM,
        unless `keep_raw_vocab` is set.

        """
        sample = sample or self.sample

        logger.info("Loading a fresh vocabulary")

        # Discard words less-frequent than min_count
        if not dry_run:
            self.index2word = []
            # make stored settings match these applied settings
            self.sample = sample
            self.vocab = {}

        for word, count in self.raw_vocab.items():
            if not dry_run:
                self.vocab[word] = Vocab(count=count, index=len(self.index2word))
                self.index2word.append(word)

        retain_total = self.total_words

        # Precalculate each vocabulary item's threshold for sampling
        if not sample:
            # no words downsampled
            threshold_count = retain_total
        elif sample < 1.0:
            # traditional meaning: set parameter as proportion of total
            threshold_count = sample * retain_total
        else:
            # new shorthand: sample >= 1 means downsample all words with
            # higher count than sample
            threshold_count = int(sample * (3 + sqrt(5)) / 2)

        downsample_total, downsample_unique = 0, 0
        for w, v in self.raw_vocab.items():
            word_probability = (sqrt(v / threshold_count) +
                                1) * (threshold_count / v)
            if word_probability < 1.0:
                downsample_unique += 1
                downsample_total += word_probability * v
            else:
                word_probability = 1.0
                downsample_total += v
            if not dry_run:
                self.vocab[w].sample_int = int(round(word_probability * 2**32))

        if not dry_run and not keep_raw_vocab:
            logger.info("deleting the raw counts dictionary of %i items",
                        len(self.raw_vocab))
            self.raw_vocab = defaultdict(int)

        logger.info("sample=%g downsamples %i most-common words", sample,
                    downsample_unique)
        logger.info(
            "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
            downsample_total, downsample_total * 100.0 / max(retain_total, 1),
            retain_total)

        # print extra memory estimates
        memory = self.estimate_memory(vocab_size=len(self.vocab))

        return memory
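
A worked instance of the downsampling probability used above, with invented numbers:

    from math import sqrt

    sample, retain_total = 1e-3, 1000000
    threshold_count = sample * retain_total   # 1000.0
    v = 50000                                 # raw count of a very frequent word
    p = (sqrt(v / threshold_count) + 1) * (threshold_count / v)
    print(round(p, 3))                        # ~0.161: kept about 16% of the time
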
Example 24
def load_word2vec_format_filtered(fname,
                                  vocab,
                                  fvocab=None,
                                  binary=False,
                                  norm_only=True):
    """
    Like Word2Vec's loader, but allows you to restrict to a limited vocabulary.

    """
    vocab = set(vocab)
    counts = None
    if fvocab is not None:
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(
            int, header.split())  # throws for invalid file format
        # We know we only need to store the number of things in the vocab
        vocab_size = len(vocab)

        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
        word_num = 0
        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            while word_num < vocab_size:
                # mixed text and binary: read text first, then binary
                word = read_word(fin)
                if word is None:
                    # Reached EOF
                    break
                # Only store the vectors for words in the given vocabulary
                if word in vocab:
                    vocab.remove(word)
                    if counts is None:
                        result.vocab[word] = Vocab(index=word_num,
                                                   count=vocab_size - word_num)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=word_num,
                                                   count=counts[word])
                    else:
                        result.vocab[word] = Vocab(index=word_num, count=None)
                    result.index2word.append(word)
                    result.syn0[word_num] = fromstring(fin.read(binary_len),
                                                       dtype=REAL)
                    word_num += 1
                else:
                    # Skip this vector
                    fin.read(binary_len)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                if word in vocab:
                    vocab.remove(word)
                    if counts is None:
                        result.vocab[word] = Vocab(index=word_num,
                                                   count=vocab_size - word_num)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=word_num,
                                                   count=counts[word])
                    else:
                        result.vocab[word] = Vocab(index=word_num, count=None)
                    result.index2word.append(word)
                    result.syn0[word_num] = weights
                    word_num += 1
                    if word_num >= vocab_size:
                        # Got all we need: don't carry on reading
                        break
    # Get rid of the empty vectors at the end if not all words were found
    if word_num < vocab_size:
        result.syn0 = result.syn0[:word_num].copy()
    result.init_sims(norm_only)
    return result
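
A hypothetical call to the filtered loader (file name and word list are placeholders):

    # model = load_word2vec_format_filtered("vectors.bin", vocab=["cat", "dog"], binary=True)
    # model.syn0 then holds at most two rows, one per requested word actually found
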
Example 25
    s = corpus   \
        .flatMap(lambda s: [(w, 1) for w in s])   \
        .reduceByKey(lambda a, b: a+b)            \
        .filter(lambda x: x[1] >= 5)              \
        .map(lambda x: (x[1], x[0]))              \
        .collect()
    #.sortByKey(False)                         \
    #.collect()

    vocab = {}
    for i, (c, w) in enumerate(s):
        if i >= 1000000:
            break
        if (i + 1) % 100000 == 0:
            print(i + 1)
        vocab[w] = Vocab(count=c)

    def build_vocab(model, vocab):
        model.word_count = 0
        model.total_words = 0
        model.vocab, model.index2word = {}, []
        for word, v in vocab.items():
            if v.count >= model.min_count:
                v.index = len(model.vocab)
                model.index2word.append(word)
                model.vocab[word] = v
            model.total_words += v.count
        print "total %i word types after removing those with count<%s" % (len(
            model.vocab), model.min_count)

        if model.hs: