Code Example #1
File: pv_dm.py  Project: LiuFang816/SALSTM_py_data
		def worker_train():
			"""Train the model, lifting lists of sentences from the jobs queue."""
			paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
			error = zeros(1, dtype = REAL)
			if self.concatenate:
				# word work here is for each individual word, so it has length logistic regression - para size
				word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype = REAL)
				neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
			else:
				# here word work is aggregated:
				word_work = zeros(self.layer1_size, dtype = REAL)
				neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

			zeros(self.logistic_regression_size, dtype = REAL)
			while True:
				job = jobs.get()
				if job is None:  # data finished, exit
					break
				# update the learning rate before every job
				alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
				# how many words did we train on? out-of-vocabulary (unknown) words do not count
				job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job))

				with lock:
					# here we can store the scores for later plotting and viewing...
					word_count[0] += job_words

					elapsed = time.time() - start
					total_error[0] += error[0]
					if elapsed >= next_report[0]:
						logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," %
							(100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
						next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
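
The worker above recomputes the learning rate before every job as a linear decay from the starting alpha toward min_alpha as the processed-word count approaches total_words. A standalone sketch of that schedule, with purely illustrative parameter values (not taken from the example):

def decayed_alpha(initial_alpha, min_alpha, words_done, total_words):
    """Linearly decay the learning rate with training progress, but never below min_alpha."""
    progress = min(1.0, float(words_done) / total_words)
    return max(min_alpha, initial_alpha * (1.0 - progress))

# illustrative values only
for done in (0, 250000, 500000, 1000000):
    print(done, decayed_alpha(0.025, 0.0001, done, 1000000))
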
Code Example #2
 def reset_weights(self):
     """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
     random.seed(self.seed)
     self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     self.syn1 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL)
     self.syn0 += (random.rand(len(self.vocab), self.layer1_size) - 0.5) / self.layer1_size
     self.syn0norm = None
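
reset_weights above seeds the input vectors (syn0) with small uniform noise in roughly [-0.5/layer1_size, 0.5/layer1_size) and leaves the output weights at zero, mirroring the original word2vec C tool. A NumPy-only sketch of the same initialization, with assumed vocabulary and vector sizes:

import numpy as np

vocab_size, layer1_size = 1000, 100  # assumed sizes, for illustration
rng = np.random.default_rng(1)

syn0 = (rng.random((vocab_size, layer1_size), dtype=np.float32) - 0.5) / layer1_size
syn1 = np.zeros((vocab_size, layer1_size), dtype=np.float32)

print(syn0.min(), syn0.max())  # values stay within about +/- 0.5 / layer1_size
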
Code Example #3
 def _get_thread_working_mem(self):
     work = matutils.zeros_aligned(
         self.trainables.layer1_size,
         dtype=self.vector_dtype)  # per-thread private work memory
     neu1 = matutils.zeros_aligned(self.trainables.layer1_size,
                                   dtype=self.vector_dtype)
     return work, neu1
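
Snippets like this allocate the work/neu1 scratch arrays once per worker thread so that no two threads ever share a buffer. A minimal alternative sketch of the same idea using threading.local; the function name and sizes here are illustrative, not part of gensim's API:

import threading
import numpy as np

_local = threading.local()

def get_thread_working_mem(size, dtype=np.float32):
    """Return (work, neu1) scratch buffers private to the calling thread, allocating them on first use."""
    if not hasattr(_local, "buffers"):
        _local.buffers = (np.zeros(size, dtype=dtype), np.zeros(size, dtype=dtype))
    return _local.buffers

work, neu1 = get_thread_working_mem(100)
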
Code Example #4
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(
                self.layer1_size,
                dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(
                self.layer1_size,
                dtype=REAL)  # each thread must have its own work memory

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha *
                    (1 - 1.0 * self.alpha_decay * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = sum(
                    train_sentence(self, sentence, alpha, work, neu1)
                    for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
Code Example #5
    def train_sentence(model, sentence, alpha, work=None, neu1=None): #mod
        """
        Update CBOW negative sampling model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_window = random.randint(model.window)  # `b` in the original word2vec code
            else:
                reduced_window = 0

            # Combine all surrounding words into an appropriate input
            start = max(0, pos - model.window + reduced_window)
            l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)#Initialize input

            count = 0
            for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    l1 = l1 + model.syn0[word2.index]               
                    count += 1
            if count > 0:
                l1 = l1 / count
            neu1e = matutils.zeros_aligned((model.layer1_size), dtype=REAL)

            for d in xrange(model.neg_samples+1):
            
                if d == 0:
                    target_index = word.index
                    label = 1

                else:
                    random_integer = random.randint(model.table_size)
                    target_index = model.table[random_integer]
                    if target_index == word.index:
                        continue
                    label = 0
            
                l2a = model.syn1neg[target_index]
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  #  propagate hidden -> output
                ga = (label - fa) * alpha  # vector of error gradients multiplied by the learning rate
                neu1e += dot(ga,l2a)
                model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output

            for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    model.syn0[word2.index] += neu1e

        return len([word for word in sentence if word is not None])
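
The loop above is CBOW with negative sampling in pure NumPy: average the context vectors into l1, then run a small logistic-regression update against syn1neg for the true word (label 1) and a few noise words (label 0), accumulating the input-side error in neu1e. A compact, self-contained sketch of one such update step, using assumed toy dimensions and untrained random vectors rather than a real model:

import numpy as np

rng = np.random.default_rng(0)
dim, vocab = 8, 20
syn0 = (rng.random((vocab, dim), dtype=np.float32) - 0.5) / dim   # input vectors
syn1neg = np.zeros((vocab, dim), dtype=np.float32)                # output vectors
alpha = 0.025

context_idx = [1, 3, 4]      # assumed context word indices
target_idx = 7               # the true centre word
negative_idx = [2, 11, 15]   # assumed noise samples

l1 = syn0[context_idx].mean(axis=0)        # averaged context (the CBOW input)
neu1e = np.zeros(dim, dtype=np.float32)    # accumulated error for the input side

for idx, label in [(target_idx, 1.0)] + [(i, 0.0) for i in negative_idx]:
    f = 1.0 / (1.0 + np.exp(-np.dot(l1, syn1neg[idx])))   # propagate hidden -> output
    g = (label - f) * alpha                               # error gradient times learning rate
    neu1e += g * syn1neg[idx]                             # collect input-side error
    syn1neg[idx] += g * l1                                # learn hidden -> output

syn0[context_idx] += neu1e                                # learn input -> hidden
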
Code Example #6
 def reset_weights(self):
     """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
     random.seed(self.seed)
     self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size),
                                        dtype=REAL)
     self.syn1neg = matutils.zeros_aligned(
         (len(self.vocab), self.layer1_size), dtype=REAL)
     self.syn0 += (random.rand(len(self.vocab), self.layer1_size) -
                   0.5) / self.layer1_size
     self.syn0norm = None
Code Example #7
 def __init__(self, words_model, seq_len=5):
     """
     :param words_model: a word2vec model for words.
     """
     self.__words_model = words_model
     self.__seq_len = seq_len
     self.work = matutils.zeros_aligned(
         self.__words_model.layer1_size,
         dtype=np.float32)  # per-thread private work memory
     self.neu1 = matutils.zeros_aligned(self.__words_model.layer1_size,
                                        dtype=np.float32)
     self.alpha = np.array([0.01])
Code Example #8
    def _get_thread_working_mem(self) -> [ndarray, ndarray]:
        """Computes the memory used per worker thread.

        Returns
        -------
        (np.ndarray, np.ndarray)
            Each worker thread's private work memory.

        """
        mem = zeros_aligned(self.sv.vector_size, dtype=REAL)
        oov_mem = zeros_aligned((self.batch_words, self.batch_ngrams),
                                dtype=uINT)
        return (mem, oov_mem)
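
All of these examples get their scratch buffers from matutils.zeros_aligned so that the data pointer handed to the vectorized inner loops starts on an alignment boundary. As a sketch of how such a helper can be written (modeled loosely on gensim's matutils, so treat the details as an assumption): over-allocate a byte buffer and slice from the first aligned offset.

import numpy as np

def zeros_aligned(shape, dtype, order='C', align=128):
    """Like np.zeros(), but with the start of the data aligned to `align` bytes."""
    nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
    buffer = np.zeros(nbytes + align, dtype=np.uint8)   # over-allocate by one alignment unit
    start = -buffer.ctypes.data % align                 # offset of the next aligned address
    return buffer[start:start + nbytes].view(dtype).reshape(shape, order=order)

work = zeros_aligned(100, np.float32)
assert work.ctypes.data % 128 == 0 and work.shape == (100,)
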
Code Example #9
 def worker_loop():
     """Train the model, lifting lists of sentences from the job_queue."""
     work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
     neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
     jobs_processed = 0
     while True:
         job = job_queue.get()
         if job is None:
             progress_queue.put(None)
             break  # no more jobs => quit this worker
         sentences, alpha = job
         tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1))
         progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
         jobs_processed += 1
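
worker_loop is the consumer half of a standard producer/consumer setup: jobs arrive on job_queue, per-job tallies go back on progress_queue, and a None sentinel tells each worker to shut down. A minimal runnable sketch of that pattern with plain Python threads; the job payloads are stand-ins rather than the real (sentences, alpha) tuples:

import queue
import threading

job_queue, progress_queue = queue.Queue(), queue.Queue()

def worker_loop():
    while True:
        job = job_queue.get()
        if job is None:                  # sentinel: no more jobs => quit this worker
            progress_queue.put(None)
            break
        progress_queue.put(len(job))     # report back some progress for this job

workers = [threading.Thread(target=worker_loop) for _ in range(2)]
for w in workers:
    w.start()
for job in (["a", "b"], ["c"], None, None):   # one sentinel per worker
    job_queue.put(job)
for w in workers:
    w.join()

while not progress_queue.empty():
    print(progress_queue.get())
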
Code Example #10
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                if self.sg:
                    job_words = sum(train_sentence_sg(self, sentence, alpha, work) for sentence in job)
                else:
                    job_words = sum(train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        print "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (
                            100.0 * word_count[0] / total_words,
                            alpha,
                            word_count[0] / elapsed if elapsed else 0.0,
                        )
                        next_report[0] = (
                            elapsed + 1.0
                        )  # don't flood the log, wait at least a second between progress reports
Code Example #11
    def train_sentence(model, sentence, alpha, work=None, neu1=None):  #mod
        """
        Update CBOW hierarchical softmax model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_window = random.randint(
                    model.window)  # `b` in the original word2vec code
            else:
                reduced_window = 0

            # Combine all surrounding words into an appropriate input
            start = max(0, pos - model.window + reduced_window)
            l1 = matutils.zeros_aligned((model.layer1_size),
                                        dtype=REAL)  #Initialize input
            weights = 0.
            weights_2 = 0.
            for pos2, word2 in enumerate(
                    sentence[start:pos + model.window + 1 - reduced_window],
                    start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    weights += word2.count_power
                    weights_2 += word2.count_power_2
                    l1 = l1 + word2.count_power * model.syn0[word2.index]

            if weights > 0.0000000000000001:
                regularization = weights / weights_2
                l1 = l1 / weights

                l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

                for pos2, word2 in enumerate(sentence[start:pos + model.window + 1 - reduced_window], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        model.syn0[word2.index] += regularization * word2.count_power * dot(ga, l2a)  # MUST BE MODIFIED

        return len([word for word in sentence if word is not None])
Code Example #12
    def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Infer a vector for a given document, after bulk training.

        Document should be a list of (word) tokens.
        """
        doctag_vectors = empty((1, self.vector_size), dtype=REAL)
        doctag_vectors[0] = self.seeded_vector(' '.join(doc_words))
        doctag_locks = ones(1, dtype=REAL)
        doctag_indexes = [0]

        work = zeros(self.layer1_size, dtype=REAL)
        if not self.sg:
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        for i in range(steps):
            if self.sg:
                train_document_dbow(self, doc_words, doctag_indexes, alpha, work,
                                    learn_words=False, learn_hidden=False,
                                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            elif self.dm_concat:
                train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1,
                                         learn_words=False, learn_hidden=False,
                                         doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            else:
                train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1,
                                  learn_words=False, learn_hidden=False,
                                  doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
            alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha

        return doctag_vectors[0]
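
infer_vector above repeatedly runs the selected doc2vec training routine on a freshly seeded document vector while keeping the word and hidden weights frozen (learn_words=False, learn_hidden=False). For reference, a short usage sketch against gensim's public Doc2Vec API; the toy corpus and hyperparameters are assumptions, and keyword names vary slightly between gensim releases:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [
    TaggedDocument(words=["human", "machine", "interface"], tags=["doc0"]),
    TaggedDocument(words=["graph", "of", "trees"], tags=["doc1"]),
]
model = Doc2Vec(corpus, vector_size=32, min_count=1, epochs=40)  # assumed toy settings

# infer a vector for an unseen document; repeated calls differ slightly unless a seed is fixed
vec = model.infer_vector(["human", "graph", "interface"])
print(vec.shape)
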
Code Example #13
File: word2vec.py  Project: rishabh135/ShallowLearn
 def score_document_labeled_cbow(model,
                                 document,
                                 label,
                                 work=ones(1, dtype=REAL),
                                 neu1=None):
     if neu1 is None:
         neu1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
     return sdlc(model, document, label, work, neu1)
Code Example #14
File: bimodal.py  Project: ronxin/codemend
    def worker_loop():
      """Train the model, lifting lists of train_pairs from the job_queue."""

      # per-thread private work memory - useless in numpy implementation
      work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
      neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
      jobs_processed = 0
      while True:
        job = job_queue.get()
        if job is None:
          progress_queue.put(None)
          break  # no more jobs => quit this worker
        train_pairs, alpha = job
        tally, raw_tally = self._do_train_job(train_pairs, alpha, (work, neu1))
        progress_queue.put((len(train_pairs), tally, raw_tally))  # report back progress
        jobs_processed += 1
      logger.debug("worker exiting, processed %i jobs", jobs_processed)
Code Example #15
    def train_sentence(model, sentence, alpha, work=None):
        """
        Update skip-gram negative sampling model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_window = random.randint(
                    model.window)  # `b` in the original word2vec code
            else:
                reduced_window = 0

            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)
            for pos2, word2 in enumerate(
                    sentence[start:pos + model.window + 1 - reduced_window],
                    start):
                if pos2 == pos or word2 is None:
                    # don't train on OOV words and on the `word` itself
                    continue

                l1 = model.syn0[word2.index]
                neu1e = matutils.zeros_aligned((model.layer1_size), dtype=REAL)

                for d in xrange(model.neg_samples + 1):

                    if d == 0:
                        target_index = word.index
                        label = 1

                    else:
                        random_integer = random.randint(
                            model.table_size - 1)  #exclude the upper bound
                        target_index = model.table[random_integer]
                        if target_index == word.index:
                            continue
                        label = 0

                    l2a = model.syn1neg[target_index]
                    fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  # propagate hidden -> output
                    ga = (label - fa) * alpha  # vector of error gradients multiplied by the learning rate
                    neu1e += dot(ga, l2a)
                    model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output

                l1 += neu1e  # learn input -> hidden

        return len([word for word in sentence if word is not None])
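
The negative samples here come from indexing a precomputed table (model.table) in which each vocabulary id appears in proportion to its smoothed unigram frequency, so a uniform random index into the table approximates the noise distribution. A small sketch of building and sampling such a table; the 0.75 exponent follows the usual word2vec convention and the toy counts are assumptions:

import numpy as np

counts = np.array([50, 20, 10, 5], dtype=np.float64)   # assumed frequencies for word ids 0..3
table_size = 1000

probs = counts ** 0.75
probs /= probs.sum()
table = np.repeat(np.arange(len(counts)), np.round(probs * table_size).astype(int))

rng = np.random.default_rng(0)
negatives = table[rng.integers(0, len(table), size=5)]   # draw 5 noise word ids
print(negatives)
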
Code Example #16
        def worker_loop():
            """Train the model, lifting lists of train_pairs from the job_queue."""

            # per-thread private work memory - useless in numpy implementation
            work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                train_pairs, alpha = job
                tally, raw_tally = self._do_train_job(train_pairs, alpha,
                                                      (work, neu1))
                progress_queue.put((len(train_pairs), tally,
                                    raw_tally))  # report back progress
                jobs_processed += 1
            logger.debug("worker exiting, processed %i jobs", jobs_processed)
Code Example #17
    def train_sentence(model, sentence, alpha, work=None):
        """
        Update skip-gram negative sampling model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_window = random.randint(model.window)  # `b` in the original word2vec code
            else:
                reduced_window = 0

            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)
            for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                if pos2 == pos or word2 is None:
                    # don't train on OOV words and on the `word` itself
                    continue

                l1 = model.syn0[word2.index]
                neu1e = matutils.zeros_aligned((model.layer1_size), dtype=REAL)

                for d in xrange(model.neg_samples+1):
                
                    if d == 0:
                        target_index = word.index
                        label = 1

                    else:
                        random_integer = random.randint(model.table_size)
                        target_index = model.table[random_integer]
                        if target_index == word.index:
                            continue
                        label = 0
                
                    l2a = model.syn1neg[target_index]
                    fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  #  propagate hidden -> output
                    ga = (label - fa) * alpha  # vector of error gradients multiplied by the learning rate
                    neu1e += dot(ga,l2a)
                    model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output

                l1 += neu1e  # learn input -> hidden

        return len([word for word in sentence if word is not None])
Code Example #18
File: word2vec.py  Project: watereals/ShallowLearn
 def score_document_labeled_cbow(model,
                                 document,
                                 labels=None,
                                 work=None,
                                 neu1=None):
     if model.bucket > 0:
         document = HashIter.hash_doc(document, model.bucket)
     if work is None:
         work = ones(len(model.lvocab) if labels is None else len(labels),
                     dtype=REAL)
     if neu1 is None:
         neu1 = matutils.zeros_aligned(model.layer1_size, dtype=REAL)
     labels = labels or model.lvocab.keys()
     scores = sdlc(model, document, labels, work, neu1)
     return zip(labels, scores)
Code Example #19
    def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
        """
        Update CBOW hierarchical softmax model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            reduced_window = random.randint(
                model.window)  # `b` in the original word2vec code

            # Combine all context words into an appropriate input
            start = max(0, pos - model.window + reduced_window)
            l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)
            count = 0
            for pos2, word2 in enumerate(
                    sentence[start:pos + model.window + 1 - reduced_window],
                    start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    count += 1
                    l1 += model.syn0[word2.index]

            if count > 0:
                l1 = l1 / count

            l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

            for pos2, word2 in enumerate(
                    sentence[start:pos + model.window + 1 - reduced_window],
                    start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    model.syn0[word2.index] += dot(ga, l2a)

        return len([word for word in sentence if word is not None])
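
In the hierarchical-softmax variants, word.point holds the indices of the inner Huffman-tree nodes on the path to the word and word.code holds the 0/1 branch labels along that path, so predicting a word costs only a handful of binary logistic regressions instead of a full softmax. A toy sketch of scoring one word along an assumed path, using the same 1 - code target convention as the update above (node indices, codes, and sizes are made up for illustration):

import numpy as np

layer1_size = 8
syn1 = np.zeros((10, layer1_size), dtype=np.float32)   # inner-node weights, 10 assumed nodes
l1 = np.full(layer1_size, 0.1, dtype=np.float32)       # a context (hidden-layer) vector

point = np.array([0, 3, 7])   # assumed inner nodes on the word's Huffman path
code = np.array([1, 0, 1])    # assumed branch labels along that path

fa = 1.0 / (1.0 + np.exp(-np.dot(syn1[point], l1)))    # one sigmoid per inner node
log_prob = np.sum(np.log(np.where(code == 1, 1.0 - fa, fa)))
print(log_prob)   # log-likelihood of following the word's path
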
Code Example #20
    def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Infer a vector for a given document, after bulk training.

        Parameters
        ----------
        doc_words : :obj: `list` of :obj: `str`
            Document should be a list of (word) tokens.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        steps : int
            Number of times to train the new document.

        Returns
        -------
        :obj: `numpy.ndarray`
            Returns the inferred vector for the new document.

        """
        doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size)
        doctag_indexes = [0]
        work = zeros(self.trainables.layer1_size, dtype=REAL)
        if not self.sg:
            neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)

        for i in range(steps):
            if self.sg:
                train_document_dbow(
                    self, doc_words, doctag_indexes, alpha, work,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            elif self.dm_concat:
                train_document_dm_concat(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            else:
                train_document_dm(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha

        return doctag_vectors[0]
Code Example #21
File: doc2vec.py  Project: abs51295/gensim
    def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Infer a vector for a given document, after bulk training.

        Parameters
        ----------
        doc_words : :obj: `list` of :obj: `str`
            Document should be a list of (word) tokens.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        steps : int
            Number of times to train the new document.

        Returns
        -------
        :obj: `numpy.ndarray`
            Returns the inferred vector for the new document.

        """
        doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size)
        doctag_indexes = [0]
        work = zeros(self.trainables.layer1_size, dtype=REAL)
        if not self.sg:
            neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)

        for i in range(steps):
            if self.sg:
                train_document_dbow(
                    self, doc_words, doctag_indexes, alpha, work,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            elif self.dm_concat:
                train_document_dm_concat(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            else:
                train_document_dm(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha

        return doctag_vectors[0]
Code Example #22
    def train_sentence(model, sentence, alpha, work=None, neu1=None): #mod
        """
        Update CBOW hierarchical softmax model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_window = random.randint(model.window)  # `b` in the original word2vec code
            else:
                reduced_window = 0

            # Combine all surrounding words into an appropriate input
            start = max(0, pos - model.window + reduced_window)
            l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)#Initialize input
            weights = 0.
            weights_2 = 0.
            for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                if pos2 == pos or word2 is None:
                    pass
                else:            
                    weights += word2.count_power
                    weights_2 += word2.count_power_2
                    l1 = l1 + word2.count_power*model.syn0[word2.index] 

            if weights > 0.0000000000000001:
                regularization = weights/weights_2
                l1 = l1 / weights
                
                l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  #  propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
                
                for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        model.syn0[word2.index] += regularization*word2.count_power*dot(ga, l2a) #MUST BE MODIFIED

        return len([word for word in sentence if word is not None])
Code Example #23
File: word2vec.py  Project: ssword/gensim
    def load_word2vec_format(cls, fname, binary=False):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information loaded is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        """
        logger.info("loading projection weights from %s" % (fname))
        with open(fname) as fin:
            header = fin.readline()
            vocab_size, layer1_size = map(
                int, header.split())  # throws for invalid file format
            result = Word2Vec(size=layer1_size)
            result.syn0 = matutils.zeros_aligned((vocab_size, layer1_size),
                                                 dtype=REAL)
            if binary:
                binary_len = dtype(REAL).itemsize * layer1_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == ' ':
                            word = ''.join(word)
                            break
                        if ch != '\n':  # ignore newlines in front of words (some binary files have newline, some not)
                            word.append(ch)
                    result.vocab[word] = Vocab(index=line_no,
                                               count=vocab_size - line_no)
                    result.index2word.append(word)
                    result.syn0[line_no] = fromstring(fin.read(binary_len),
                                                      dtype=REAL)
            else:
                for line_no, line in enumerate(fin):
                    parts = line.split()
                    assert len(parts) == layer1_size + 1
                    word, weights = parts[0], map(REAL, parts[1:])
                    result.vocab[word] = Vocab(index=line_no,
                                               count=vocab_size - line_no)
                    result.index2word.append(word)
                    result.syn0[line_no] = weights
        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        result.init_sims()
        return result
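
Both of these loaders re-implement parsing of the original C word2vec format: a header line with vocab_size and layer1_size, then one row per word, either as text or as packed float32 binary. In current gensim the same format is read through KeyedVectors; a brief usage sketch, with the file path as a placeholder (attribute names shown are those of gensim 4.x):

from gensim.models import KeyedVectors

# "vectors.bin" is a placeholder path to a model saved in the C binary format
kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)
print(kv.vector_size, len(kv.index_to_key))
print(kv.most_similar("king", topn=3))   # query a word assumed to be in the vocabulary
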
Code Example #24
    def load_word2vec_format(cls, fname, binary=False):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information loaded is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        """
        logger.info("loading projection weights from %s" % (fname))
        fin = open(fname) if type(fname) == str else fname
        if True:  # with open(fname) as fin:
            header = fin.readline()
            vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
            result = Word2Vec(size=layer1_size)
            result.syn0 = matutils.zeros_aligned((vocab_size, layer1_size), dtype=REAL)
            if binary:
                binary_len = dtype(REAL).itemsize * layer1_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == " ":
                            word = "".join(word)
                            break
                        if ch != "\n":  # ignore newlines in front of words (some binary files have newline, some not)
                            word.append(ch)
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    result.index2word.append(word)
                    result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
            else:
                for line_no, line in enumerate(fin):
                    parts = line.split()
                    assert len(parts) == layer1_size + 1
                    word, weights = parts[0], map(REAL, parts[1:])
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    result.index2word.append(word)
                    result.syn0[line_no] = weights
        fin.close()  # [DiN]
        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        result.init_sims()
        return result
Code Example #25
File: temp.py  Project: MichiganNLP/TextSimilarity
    def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
        """
        Update CBOW hierarchical softmax model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            reduced_window = random.randint(model.window)  # `b` in the original word2vec code

            # Combine all context words into an appropriate input
            start = max(0, pos - model.window + reduced_window)
            l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)
            count = 0
            for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    count += 1
                    l1 += model.syn0[word2.index]

            if count > 0:
                l1 = l1 / count

            l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  #  propagate hidden -> output
            ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

            for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    model.syn0[word2.index] += dot(ga, l2a)

        return len([word for word in sentence if word is not None])
Code Example #26
File: feat2vec.py  Project: StevenLOL/feat2vec
        def worker_train():
            """Train the model, lifting lists of instances from the jobs queue."""
            # multiple working space
            work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * feat_count[0] / total_feats))
                # how many words did we train on? out-of-vocabulary (unknown) features do not count
                job_words = sum(train_instance(self, instance, alpha, work) for instance in job)
                with lock:
                    feat_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% features, alpha %.05f, %.0f features/s" %
                            (100.0 * feat_count[0] / total_feats, alpha, feat_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
Code Example #27
File: base_any2vec.py  Project: abs51295/gensim
 def _get_thread_working_mem(self):
     work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)  # per-thread private work memory
     neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)
     return work, neu1
Code Example #28
    def train_sentence(model, sentence, alpha, work=None, neu1=None): # This implementation has not been tested
        """
        Update skip-gram CBOW hybrid hierarchical softmax model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_half_bags = random.randint(model.half_bags)
            else:
                reduced_half_bags = 0

            bags_before = min(model.half_bags - reduced_half_bags, (pos - 1)/model.words_per_bag + 1) #Verify?
            bags_after = min(model.half_bags - reduced_half_bags, (len(sentence)-pos-2)/model.words_per_bag +1) #Verify?
            
            for bag_index in xrange(-bags_before,0):
                start = max(0, pos + bag_index*model.words_per_bag)
                end = pos + (bag_index+1)*model.words_per_bag
                l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)#Initialize input
                count = 0
                for pos2, word2 in enumerate(sentence[start: end], start):            
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        l1 = l1 + model.syn0[word2.index]                     
                        count += 1
                if count > 0:
                    l1 = l1 / count #divide or not?
                
                l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  #  propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

                for pos2, word2 in enumerate(sentence[start: end], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        model.syn0[word2.index] += dot(ga, l2a)
            
            for bag_index in xrange(0, bags_after):
                start = pos + bag_index*model.words_per_bag + 1
                end = min(len(sentence), pos + (bag_index+1) * model.words_per_bag + 1) #Verify?
                l1 = matutils.zeros_aligned((model.layer1_size), dtype=REAL)#Initialize input
                count = 0

                for pos2, word2 in enumerate(sentence[start: end], start):            
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        l1 = l1 + model.syn0[word2.index]                     
                        count += 1
                if count > 0:
                    l1 = l1 / count #divide or not?
                
                l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  #  propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

                for pos2, word2 in enumerate(sentence[start: end], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        model.syn0[word2.index] += dot(ga, l2a)

        return len([word for word in sentence if word is not None])
Code Example #29
File: word2mat_v3.py  Project: Yelrose/WordMatrix
 def worker_init():
     work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
     neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
     context_vector = matutils.zeros_aligned(self.topic_size,dtype=REAL)
     return (work, neu1,context_vector)
Code Example #30
 def worker_init():
     work = matutils.zeros_aligned(
         self.layer1_size, dtype=REAL)  # per-thread private work memory
     neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
     return (work, neu1)
Code Example #31
    def train_sentence(model,
                       sentence,
                       alpha,
                       work=None,
                       neu1=None):  # This implementation has not been tested
        """
        Update skip-gram CBOW hybrid hierarchical softmax model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_half_bags = random.randint(model.half_bags)
            else:
                reduced_half_bags = 0

            bags_before = min(model.half_bags - reduced_half_bags,
                              (pos - 1) / model.words_per_bag + 1)  #Verify?
            bags_after = min(model.half_bags - reduced_half_bags,
                             (len(sentence) - pos - 2) / model.words_per_bag +
                             1)  #Verify?

            for bag_index in xrange(-bags_before, 0):
                start = max(0, pos + bag_index * model.words_per_bag)
                end = pos + (bag_index + 1) * model.words_per_bag
                l1 = matutils.zeros_aligned((model.layer1_size),
                                            dtype=REAL)  #Initialize input
                count = 0
                for pos2, word2 in enumerate(sentence[start:end], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        l1 = l1 + model.syn0[word2.index]
                        count += 1
                if count > 0:
                    l1 = l1 / count  #divide or not?

                l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

                for pos2, word2 in enumerate(sentence[start:end], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        model.syn0[word2.index] += dot(ga, l2a)

            for bag_index in xrange(0, bags_after):
                start = pos + bag_index * model.words_per_bag + 1
                end = min(len(sentence), pos +
                          (bag_index + 1) * model.words_per_bag + 1)  #Verify?
                l1 = matutils.zeros_aligned((model.layer1_size),
                                            dtype=REAL)  #Initialize input
                count = 0

                for pos2, word2 in enumerate(sentence[start:end], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        l1 = l1 + model.syn0[word2.index]
                        count += 1
                if count > 0:
                    l1 = l1 / count  #divide or not?

                l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output

                for pos2, word2 in enumerate(sentence[start:end], start):
                    if pos2 == pos or word2 is None:
                        pass
                    else:
                        model.syn0[word2.index] += dot(ga, l2a)

        return len([word for word in sentence if word is not None])
Code Example #32
    def train_sentence(model, sentence, alpha, work=None, neu1=None):  #mod
        """
        Update CBOW negative sampling model by training on a single sentence.

        The sentence is a list of Vocab objects (or None, where the corresponding
        word is not in the vocabulary). Called internally from `Word2Vec.train()`.

        """

        for pos, word in enumerate(sentence):
            if word is None:
                continue  # OOV word in the input sentence => skip
            if model.reduce > 0:
                reduced_window = random.randint(
                    model.window)  # `b` in the original word2vec code
            else:
                reduced_window = 0

            # Combine all surrounding words into an appropriate input
            start = max(0, pos - model.window + reduced_window)
            l1 = matutils.zeros_aligned((model.layer1_size),
                                        dtype=REAL)  #Initialize input

            count = 0
            for pos2, word2 in enumerate(
                    sentence[start:pos + model.window + 1 - reduced_window],
                    start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    l1 = l1 + model.syn0[word2.index]
                    count += 1
            if count > 0:
                l1 = l1 / count
            neu1e = matutils.zeros_aligned((model.layer1_size), dtype=REAL)

            for d in xrange(model.neg_samples + 1):

                if d == 0:
                    target_index = word.index
                    label = 1

                else:
                    random_integer = random.randint(
                        model.table_size - 1)  #exclude the upper bound
                    target_index = model.table[random_integer]
                    if target_index == word.index:
                        continue
                    label = 0

                l2a = model.syn1neg[target_index]
                fa = 1.0 / (1.0 + exp(-dot(l1, l2a)))  # propagate hidden -> output
                ga = (label - fa) * alpha  # vector of error gradients multiplied by the learning rate
                neu1e += dot(ga, l2a)
                model.syn1neg[target_index] += dot(ga, l1)  # learn hidden -> output

            for pos2, word2 in enumerate(
                    sentence[start:pos + model.window + 1 - reduced_window],
                    start):
                if pos2 == pos or word2 is None:
                    pass
                else:
                    model.syn0[word2.index] += neu1e

        return len([word for word in sentence if word is not None])