import random
import time

from gensim.models import Doc2Vec

# Note: this snippet targets an older gensim API (pre-1.0 style: size= instead
# of vector_size=, and model.train(docs) without total_examples/epochs).
# Project helpers such as log_to_info, convert_to_labeled_review,
# alpha_for_epoch and random_int come from the surrounding package and are not
# shown here.
def train_model(self):
        log_to_info('Loading training sentences')

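        # Pair each review's tokens with its doc2vec id and its two strongest
        # topics; the topic fields are presumably used as extra document tags
        # when hierarchical paragraph vectors are enabled.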
        review_d2v_id_list = zip(self.reviews['words'], self.reviews['d2v_id'], self.reviews['best_topics'],
                                 self.reviews['second_best_topics'])

        labeled_reviews = []
        if self.dm == 0:
            log_to_info('applying dbow with hpv={}'.format(self.mp.hierarchical_paragraph_vectors_dbow))
        elif self.dm == 1:
            log_to_info('applying dm with hpv={}'.format(self.mp.hierarchical_paragraph_vectors_dm))
            
        for space_separated_words, d2v_id, best_topic, second_best_topic in review_d2v_id_list:
            if self.dm == 0:
                labeled_reviews.extend(
                    convert_to_labeled_review(self.mp.hierarchical_paragraph_vectors_dbow, space_separated_words, d2v_id, best_topic,
                                              second_best_topic))
            elif self.dm == 1:
                labeled_reviews.extend(
                    convert_to_labeled_review(self.mp.hierarchical_paragraph_vectors_dm, space_separated_words, d2v_id, best_topic,
                                              second_best_topic))

        log_to_info('Loading Doc2Vec model...')
        start_epoch = self.epochs + 1
        model = None
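        # Resume support: scan cached models from the newest epoch backwards;
        # training will continue from the epoch after the newest cache hit.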
        for epoch in range(self.epochs, 0, -1):
            model = self.load_model(epoch)
            if model:
                log_to_info('Found model in cache!')
                break
            start_epoch = epoch

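        # No cached model at any epoch: configure a fresh Doc2Vec model.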
        if not model:
            if self.dm == 0:
                # PV-DBOW
                log_to_info('Yep, this is DBOW!')
                model = Doc2Vec(dm=self.dm, hs=0, workers=self.workers, size=self.mp.word_vector_dimensionality, min_count=self.min_count,
                                window=self.mp.word_context_window, sample=self.mp.frequent_words_downsampling_dbow, seed=random_int(),
                                negative=self.mp.negative)
                # model = Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=8)
            elif self.dm == 1:
                # PV-DM w/average
                log_to_info('Yep, this is DM!')
                model = Doc2Vec(dm=self.dm, dm_mean=1, hs=0, workers=self.workers, size=self.mp.word_vector_dimensionality,
                                min_count=self.min_count, window=self.mp.word_context_window, sample=self.mp.frequent_words_downsampling_dm,
                                seed=random_int(), negative=self.mp.negative)
                # model = Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=8)

            start1 = time.time()
            model.build_vocab(labeled_reviews)
            end1 = time.time()
            log_to_info('Vocab building for dm{0} took {1} seconds'.format(self.dm, end1 - start1))

        log_to_info('Training Doc2Vec model...')
        for epoch in range(start_epoch, self.epochs + 1):
            log_to_info('Epoch {0} of {1}'.format(epoch, self.epochs))
            m = self.load_model(epoch)
            if m is not None:
                log_to_info('Found model in cache!')
                model = m
                continue

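            # Train on a freshly shuffled copy each epoch so document order
            # varies between passes.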
            permuted_labeled_reviews = labeled_reviews[:]
            random.shuffle(permuted_labeled_reviews)

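            # Pinning min_alpha to alpha disables gensim's internal decay
            # within the pass; the per-epoch schedule comes from
            # alpha_for_epoch instead.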
            alpha = alpha_for_epoch(epoch, self.mp.epochs_total, self.mp.alpha_max, self.mp.alpha_min, self.mp.learning_rate_type)
            model.min_alpha, model.alpha = alpha, alpha

            start2 = time.time()
            model.train(permuted_labeled_reviews)
            end2 = time.time()
            log_to_info('DM HPV is {0}, DBOW HPV is {1}'.format(self.mp.hierarchical_paragraph_vectors_dm,
                                                                self.mp.hierarchical_paragraph_vectors_dbow))
            log_to_info('Model training for dm{0} took {1} seconds'.format(self.dm, end2 - start2))

            self.model = model
            self.store_model(epoch)

        self.model = model
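
# NOTE: convert_to_labeled_review is not shown in this example. Below is a
# minimal sketch of what it might do, assuming it wraps each review in a gensim
# (pre-1.0) LabeledSentence tagged with its doc2vec id and, when hierarchical
# paragraph vectors are enabled, with its two strongest topics as extra tags.
# The signature details and the tag scheme are assumptions, not the original
# implementation.
from gensim.models.doc2vec import LabeledSentence

def convert_to_labeled_review(use_hpv, space_separated_words, d2v_id, best_topic, second_best_topic):
    tags = [d2v_id]
    if use_hpv:
        # Shared topic tags let reviews on the same topic train a common
        # "hierarchical" paragraph vector in addition to their own.
        tags.append('TOPIC_{0}'.format(best_topic))
        tags.append('TOPIC_{0}'.format(second_best_topic))
    return [LabeledSentence(words=space_separated_words.split(), tags=tags)]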
# Example #2
def test_alpha_for_3_linear():
    assert_similar(alpha_for_epoch(1, 3, 0.025, 0.001, 'linear'), .025)
    assert_similar(alpha_for_epoch(2, 3, 0.025, 0.001, 'linear'), .013)
    assert_similar(alpha_for_epoch(3, 3, 0.025, 0.001, 'linear'), .001)
# Example #3
def test_alpha_for_3_exp():
    assert_similar(alpha_for_epoch(1, 3, 0.025, 0.001, 'exp'), .025)
    assert_similar(alpha_for_epoch(2, 3, 0.025, 0.001, 'exp'), .005)
    assert_similar(alpha_for_epoch(3, 3, 0.025, 0.001, 'exp'), .001)
# Example #4
def test_alpha_for_4_exp():
    assert_similar(alpha_for_epoch(1, 4, 0.025, 0.001, 'exp'), .025)
    assert_similar(alpha_for_epoch(2, 4, 0.025, 0.001, 'exp'), .00854988)
    assert_similar(alpha_for_epoch(3, 4, 0.025, 0.001, 'exp'), .002924018)
    assert_similar(alpha_for_epoch(4, 4, 0.025, 0.001, 'exp'), .001)
# Example #5
def test_alpha_for_4_linear():
    assert_similar(alpha_for_epoch(1, 4, 0.025, 0.001, 'linear'), .025)
    assert_similar(alpha_for_epoch(2, 4, 0.025, 0.001, 'linear'), .017)
    assert_similar(alpha_for_epoch(3, 4, 0.025, 0.001, 'linear'), .009)
    assert_similar(alpha_for_epoch(4, 4, 0.025, 0.001, 'linear'), .001)
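
# NOTE: alpha_for_epoch and assert_similar are not shown in this example. The
# sketches below are reconstructions consistent with the tests above, not the
# original implementations: 'linear' interpolates the learning rate evenly
# between alpha_max and alpha_min, 'exp' decays it geometrically, and
# assert_similar compares floats with a small absolute tolerance.
def alpha_for_epoch(epoch, epochs_total, alpha_max, alpha_min, learning_rate_type):
    if epochs_total <= 1:
        return alpha_max
    progress = float(epoch - 1) / (epochs_total - 1)
    if learning_rate_type == 'linear':
        return alpha_max - (alpha_max - alpha_min) * progress
    elif learning_rate_type == 'exp':
        return alpha_max * (alpha_min / alpha_max) ** progress
    raise ValueError('Unknown learning rate type: {0}'.format(learning_rate_type))

def assert_similar(actual, expected, tolerance=1e-4):
    assert abs(actual - expected) <= tolerance, '{0} != {1}'.format(actual, expected)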