コード例 #1
0
def ten_fold_SVM(fold_type, feature_type, if_doc2vec, model_no):
    """Cross-validate the SVM classifier over 10 folds and save the outcome.

    Args:
        fold_type: ``'consecutive'`` selects ``cv.n_fold_cons`` together with
            ``cv.prepare_data_tenfold``; any other value selects
            ``cv.n_fold_RR`` together with ``cv.prepare_data_roundrobin``.
        feature_type: Passed through to ``svm.SVM_classifier`` and
            ``svm.save_results_cv``.
        if_doc2vec: Passed through to ``svm.SVM_classifier`` and
            ``svm.save_results_cv``.
        model_no: Passed through to ``svm.SVM_classifier``.

    Returns:
        None. The per-fold results, their average and their variance are
        handed to ``svm.save_results_cv`` and a confirmation is printed.
    """
    # Load each polarity once so every fold reuses the same data.
    negatives = text.read_data_from_file('neg')
    positives = text.read_data_from_file('pos')

    fold_count, sample_count = 10, 1000
    # Resolve the fold strategy once instead of re-testing it per fold.
    if fold_type == 'consecutive':
        folds = cv.n_fold_cons(fold_count, sample_count)
        split_fold = cv.prepare_data_tenfold
    else:
        folds = cv.n_fold_RR(fold_count, sample_count)
        split_fold = cv.prepare_data_roundrobin

    scores = []
    for fold in folds:
        n_train, n_test, train_set, test_set = split_fold(
            negatives, positives, fold)
        scores.append(
            svm.SVM_classifier(feature_type, True, if_doc2vec, model_no,
                               True, n_train, n_test, train_set, test_set))

    # Each classifier result is used directly as that fold's accuracy, so
    # the same list serves as both "results" and "performances" below.
    mean_score = np.average(scores)
    score_variance = np.var(scores)

    # Persist the cross-validation summary.
    svm.save_results_cv(fold_type, feature_type, if_doc2vec, scores,
                        scores, mean_score, score_variance)
    print("\ncross validation results written to file")
コード例 #2
0
def ten_fold_NB(fold_type, feature_type):
    """Cross-validate the naive Bayes classifier over 10 folds and save it.

    Args:
        fold_type: ``'consecutive'`` selects ``cv.n_fold_cons`` together with
            ``cv.prepare_data_tenfold``; any other value selects
            ``cv.n_fold_RR`` together with ``cv.prepare_data_roundrobin``.
        feature_type: Passed through to ``nb.naive_bayes_classifier`` and
            ``nb.save_results_cv``.

    Returns:
        None. The raw per-fold results, the per-fold means, their average
        and their variance are handed to ``nb.save_results_cv`` and a
        confirmation is printed.
    """
    # Load each polarity once so every fold reuses the same data.
    negatives = text.read_data_from_file('neg')
    positives = text.read_data_from_file('pos')

    fold_count, sample_count = 10, 1000
    # Resolve the fold strategy once instead of re-testing it per fold.
    if fold_type == 'consecutive':
        folds = cv.n_fold_cons(fold_count, sample_count)
        split_fold = cv.prepare_data_tenfold
    else:
        folds = cv.n_fold_RR(fold_count, sample_count)
        split_fold = cv.prepare_data_roundrobin

    fold_outputs = []
    for fold in folds:
        n_train, n_test, train_set, test_set = split_fold(
            negatives, positives, fold)
        fold_outputs.append(
            nb.naive_bayes_classifier(feature_type, 'laplace', True,
                                      n_train, n_test, train_set, test_set))

    # Reduce each fold's output to its mean; these means are the
    # per-fold accuracies that get averaged below.
    fold_means = [sum(output) / len(output) for output in fold_outputs]
    accuracies = np.array(fold_means)
    mean_accuracy = np.average(accuracies)
    accuracy_variance = np.var(accuracies)

    # Persist the cross-validation summary.
    nb.save_results_cv(fold_type, feature_type, fold_outputs, accuracies,
                       mean_accuracy, accuracy_variance)
    print("\ncross validation results written to file")
コード例 #3
0
 def test_freq_cutoff(self):
     """Print the unigram and bigram vocabulary sizes for all reviews.

     Builds one vocabulary with ``feat.get_vocab`` (second argument 9) and
     one with ``feat.get_vocab_bigram`` (second argument 14) over the
     'neg' reviews followed by the 'pos' reviews. Nothing is asserted.
     """
     negatives = text.read_data_from_file('neg')
     positives = text.read_data_from_file('pos')
     # Each builder gets its own freshly concatenated list.
     # NOTE(review): 9 and 14 are presumably frequency cutoffs, judging by
     # the test name -- confirm against feat.get_vocab.
     unigrams = feat.get_vocab(negatives + positives, 9)
     bigrams = feat.get_vocab_bigram(negatives + positives, 14)
     for vocabulary in (unigrams, bigrams):
         print(len(vocabulary))
コード例 #4
0
 def test_bag_words2vec_bigram_real(self):
     """Check the bigram bag-of-words matrix against the naive version.

     Uses only the first 10 'neg' reviews to keep the fixture small, builds
     a bigram vocabulary from them, and verifies that

     * every row of the feature matrix has one entry per vocabulary item;
     * ``feat.bag_words2vec_bigram`` and ``feat.bag_words2vec_bigram_naive``
       produce identical matrices.
     """
     # Local import so this block does not depend on how the file imports
     # numpy at module level.
     import numpy as np

     # take only 10 reviews into test part
     texts = text.read_data_from_file('neg')[:10]
     sample_vocab = feat.get_vocab_bigram(texts, 14)
     mat_feat = feat.bag_words2vec_bigram(sample_vocab, texts)
     mat_feat_naive = feat.bag_words2vec_bigram_naive(sample_vocab, texts)

     # Check every row rather than one random row, so the test is
     # deterministic and cannot miss a malformed row by chance.
     assert all(len(row) == len(sample_vocab) for row in mat_feat)

     # Compare the matrices themselves. The previous check,
     # ``a.all() == b.all()``, only compared two scalar truthiness
     # reductions and so passed for almost any pair of outputs.
     assert np.array_equal(mat_feat_naive, mat_feat)
コード例 #5
0
 def test_visual_data(self):
     """Smoke test: run ``visual_data`` on the 'neg' reviews.

     Nothing is asserted; the test only passes if no exception is raised.
     """
     negative_reviews = read_data_from_file('neg')
     visual_data(negative_reviews)
コード例 #6
0
 def test_read_data_from_file(self):
     """Check that both polarities load ``self.number_reviews`` reviews."""
     # Load both sets before asserting, 'neg' first and then 'pos'.
     negative_reviews, positive_reviews = (
         read_data_from_file(polarity) for polarity in ('neg', 'pos'))
     assert len(negative_reviews) == self.number_reviews
     assert len(positive_reviews) == self.number_reviews