Example #1
    def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Infer a vector for the given document, after bulk training is complete.

        The document should be a list of (word) tokens.
        """
        doctag_vectors = empty((1, self.vector_size), dtype=REAL)
        doctag_vectors[0] = self.seeded_vector(' '.join(doc_words))
        doctag_locks = ones(1, dtype=REAL)
        doctag_indexes = [0]

        work = zeros(self.layer1_size, dtype=REAL)
        if not self.sg:
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        for i in range(steps):
            if self.sg:
                train_document_dbow(self, doc_words, doctag_indexes, alpha, work,
                                    learn_words=False, learn_hidden=False,
                                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            elif self.dm_concat:
                train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1,
                                         learn_words=False, learn_hidden=False,
                                         doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            else:
                train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1,
                                  learn_words=False, learn_hidden=False,
                                  doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
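            # decay alpha toward min_alpha: each step divides the remaining
            # (alpha - min_alpha) gap by the number of steps left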
            alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha

        return doctag_vectors[0]
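
A minimal usage sketch for infer_vector, assuming a gensim Doc2Vec model and a toy corpus (both illustrative, not taken from the example above):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    corpus = [TaggedDocument(words=["machine", "learning", "rocks"], tags=[0]),
              TaggedDocument(words=["deep", "learning", "rules"], tags=[1])]
    model = Doc2Vec(corpus, vector_size=50, min_count=1, epochs=20)

    # infer a vector for an unseen document after bulk training
    vec = model.infer_vector(["machine", "learning"], alpha=0.1, min_alpha=0.0001)
    print(vec.shape)  # (50,)
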
Example #2
        def mapPartitions(iterable):
            model = bc_model.value
            syn0copy = model.syn0.copy()
            syn1negcopy = model.syn1neg.copy()
            params, sentences, lockf, k = next(iter(iterable))
            _a, _b, docvecs = params
            lookup = docvecs["lookup"]
            doctag_syn0_part = docvecs["doctag_syn0"]
            train_passes.add(1)
            for sent in sentences:
                i = lookup[sent.tags[0]]
                # train_document_dbow modifies doctag_syn0_part in-place
                train_document_dbow(
                    model,
                    sent.words,
                    doctag_indexes=[i],
                    alpha=alpha * 1.0 / sqrt(k + 1),
                    doctag_vectors=doctag_syn0_part,
                    doctag_locks=lockf,
                    learn_words=True,
                    train_words=True,
                    learn_hidden=True,
                )
            trained_count.add(i + 1)

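            # weight deltas accumulated during this partition's training pass,
            # returned so the driver can merge them into the shared model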
            dsyn0 = model.syn0 - syn0copy
            dsyn1neg = model.syn1neg - syn1negcopy

            return [(dsyn0, dsyn1neg, docvecs)]
Example #3
        def mapPartitions(iterable):
            model = bc_model.value
            syn0copy = model.syn0.copy()
            syn1negcopy = model.syn1neg.copy()
            params, sentences, lockf, k = next(iter(iterable))
            _a, _b, docvecs = params
            lookup = docvecs['lookup']
            doctag_syn0_part = docvecs['doctag_syn0']
            train_passes.add(1)
            for sent in sentences:
                i = lookup[sent.tags[0]]
                # train_document_dbow modifies doctag_syn0_part in-place
                train_document_dbow(model,
                                    sent.words,
                                    doctag_indexes=[i],
                                    alpha=alpha * 1.0 / sqrt(k + 1),
                                    doctag_vectors=doctag_syn0_part,
                                    doctag_locks=lockf,
                                    learn_words=True,
                                    train_words=True,
                                    learn_hidden=True)
            trained_count.add(i + 1)

            dsyn0 = model.syn0 - syn0copy
            dsyn1neg = model.syn1neg - syn1negcopy

            return [(dsyn0, dsyn1neg, docvecs)]
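
The free names in these closures (bc_model, train_passes, trained_count, alpha) are expected to come from an enclosing Spark driver. A sketch of how that driver might be wired follows; the partitioned_rdd name, the pre-trained model, and the delta application are assumptions, not code from the original project:

    from pyspark import SparkContext

    sc = SparkContext(appName="distributed-doc2vec")

    bc_model = sc.broadcast(model)      # a Doc2Vec model trained elsewhere
    train_passes = sc.accumulator(0)    # counts partition-level training passes
    trained_count = sc.accumulator(0)   # counts documents seen
    alpha = 0.025                       # illustrative base learning rate

    # each partition holds one (params, sentences, lockf, k) tuple
    results = partitioned_rdd.mapPartitions(mapPartitions).collect()

    # fold the per-partition weight deltas back into the driver's model
    for dsyn0, dsyn1neg, docvecs in results:
        model.syn0 += dsyn0
        model.syn1neg += dsyn1neg
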
Example #4
    def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Infer a vector for the given document, after bulk training is complete.

        Parameters
        ----------
        doc_words : :obj:`list` of :obj:`str`
            The document should be a list of (word) tokens.
        alpha : float
            The initial learning rate.
        min_alpha : float
            The learning rate will drop to `min_alpha` over the inference steps.
        steps : int
            Number of times to train the new document.

        Returns
        -------
        :obj:`numpy.ndarray`
            The inferred vector for the new document.

        """
        doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size)
        doctag_indexes = [0]
        work = zeros(self.trainables.layer1_size, dtype=REAL)
        if not self.sg:
            neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)

        for i in range(steps):
            if self.sg:
                train_document_dbow(
                    self, doc_words, doctag_indexes, alpha, work,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            elif self.dm_concat:
                train_document_dm_concat(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            else:
                train_document_dm(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha

        return doctag_vectors[0]
Example #6
        def mapPartitions(iterable):
            model = bc_model.value
            docvecs, sentences, lockf, k = next(iter(iterable))
            lookup = docvecs['lookup']
            doctag_syn0_part = docvecs['doctag_syn0']
            train_passes.add(1)
            for sent in sentences:
                i = lookup[sent.tags[0]]
                # train_document_dbow modifies doctag_syn0_part in-place
                train_document_dbow(model,
                                    sent.words,
                                    doctag_indexes=[i],
                                    alpha=alpha * 1.0 / sqrt(k + 1),
                                    doctag_vectors=doctag_syn0_part,
                                    doctag_locks=lockf,
                                    learn_words=False,
                                    train_words=False,
                                    learn_hidden=False)
            trained_count.add(i + 1)

            return [docvecs]
Example #7
        def mapPartitions(iterable):
            model = bc_model.value
            docvecs, sentences, lockf, k = next(iter(iterable))
            lookup = docvecs["lookup"]
            doctag_syn0_part = docvecs["doctag_syn0"]
            train_passes.add(1)
            for sent in sentences:
                i = lookup[sent.tags[0]]
                # train_document_dbow modifies doctag_syn0_part in-place
                train_document_dbow(
                    model,
                    sent.words,
                    doctag_indexes=[i],
                    alpha=alpha * 1.0 / sqrt(k + 1),
                    doctag_vectors=doctag_syn0_part,
                    doctag_locks=lockf,
                    learn_words=False,
                    train_words=False,
                    learn_hidden=False,
                )
            trained_count.add(i + 1)

            return [docvecs]
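
Unlike Examples #2 and #3, these closures pass learn_words=False, train_words=False and learn_hidden=False, so the broadcast model's weights stay frozen and only the per-document vectors in doctag_syn0 are updated; each partition therefore returns just its docvecs. A sketch of gathering the results on the driver, with the numpy stacking and the partitioned_rdd name as assumptions:

    import numpy as np

    # collect() preserves partition order; stacking assumes each partition's
    # lookup indexes its own doctag_syn0 block locally
    parts = partitioned_rdd.mapPartitions(mapPartitions).collect()
    inferred = np.vstack([p["doctag_syn0"] for p in parts])
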
Example #8
    def _do_train_job(self, job, alpha, inits):
        work, neu1 = inits
        tally = 0
        for doc in job:
            indexed_doctags = self.docvecs.indexed_doctags(doc.tags)
            doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags
            if self.sg:
                tally += train_document_dbow(self, doc.words, doctag_indexes, alpha, work,
                                             train_words=self.dbow_words,
                                             doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            elif self.dm_concat:
                tally += train_document_dm_concat(self, doc.words, doctag_indexes, alpha, work, neu1,
                                                  doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            else:
                tally += train_document_dm(self, doc.words, doctag_indexes, alpha, work, neu1,
                                           doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            self.docvecs.trained_item(indexed_doctags)
        return tally, self._raw_word_count(job)
Example #9
    def _do_train_job(self, job, alpha, inits):
        work, neu1 = inits
        tally = 0
        for doc in job:
            doctag_indexes = self.vocabulary.indexed_doctags(doc.tags, self.docvecs)
            doctag_vectors = self.docvecs.vectors_docs
            doctag_locks = self.trainables.vectors_docs_lockf
            if self.sg:
                tally += train_document_dbow(
                    self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words,
                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            elif self.dm_concat:
                tally += train_document_dm_concat(
                    self, doc.words, doctag_indexes, alpha, work, neu1,
                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            else:
                tally += train_document_dm(
                    self, doc.words, doctag_indexes, alpha, work, neu1,
                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
        return tally, self._raw_word_count(job)
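
_do_train_job is internal: gensim's worker threads invoke it on batches produced by train(). A sketch of the public call path that ultimately exercises it, with the corpus and parameter values as illustrative assumptions:

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    corpus = [TaggedDocument(words=["spark", "makes", "this", "scale"], tags=[0]),
              TaggedDocument(words=["gensim", "trains", "doc", "vectors"], tags=[1])]

    # dm=0 selects PV-DBOW (the train_document_dbow branch);
    # dbow_words=1 additionally trains word vectors skip-gram style
    model = Doc2Vec(vector_size=50, dm=0, dbow_words=1, min_count=1)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
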