# Shuffle the corpus in place, then split it into train/test sets at _train_ratio.
random.shuffle(news_samples)
n_samples = len(news_samples)
split_at = int(n_samples * _train_ratio)
train_samples = news_samples[:split_at]
test_samples = news_samples[split_at:]

# Report the section distribution of the full corpus and of each split.
for heading, subset in (("Samples distribution:", news_samples),
                        ("Train set distribution:", train_samples),
                        ("Test set distribution:", test_samples)):
    print(heading, preprocessing.samples_statistics(subset, _section_filter, get_section))

# Vectorize the raw texts into feature matrices sharing one vocabulary.
train_texts = [s.text for s in train_samples]
test_texts = [s.text for s in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src = "samples", normalize_flag = False,
    reduction = _reduction, reduce_n_attr = _reduce_n_attr, stem_words = _stem_words)

print("Generating labels..")
train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)

print("Training..")
# One cluster per section; fit_predict returns the cluster id of each training row.
kmeans = KMeans(n_clusters=len(_section_filter))
reference_output = kmeans.fit_predict(train_matrix)

# count[c, j]: for the cth cluster, how many texts belong to the jth section
n_sections = len(_section_filter)
count = np.zeros((n_sections, n_sections))
for cluster_id, sample in zip(reference_output, train_samples):
    section_idx = _section_filter.index(get_section(sample))
    count[cluster_id, section_idx] += 1

# Assign each cluster to the section it contains most often.
cluster_section_map = count.argmax(axis=1)
# ---- Example 2 ----
# Build a TF-IDF vectorizer over the top essay terms and use it as the
# word source for feature extraction (instead of the "samples" mode used
# by the other snippets in this file).
tfidf_vectorizer = get_tfidfVectorizer_of_essay_top_tf_words()
print("Vectorizer built..")
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts,
    test_texts,
    savedir=_save_dir,
    words_src=tfidf_vectorizer,
    normalize_flag=False,
    reduction=_reduction,
    reduce_n_attr=_reduce_n_attr,
    stem_words=_stem_words)
model = None
print("Generating labels..")
# Dispatch on the configured model type: the SVM branch trains on hard
# per-sample labels, the NN branch on per-class distributions.
if _model == "SVM":
    # One section label per sample, derived via get_section over _sections.
    train_labels = preprocessing.samples_to_label(train_samples, _sections,
                                                  get_section)
    test_labels = preprocessing.samples_to_label(test_samples, _sections,
                                                 get_section)

    model = SVM()
    print("Training.. ")
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)

elif _model == "NN":
    # Soft targets: per-sample distributions over the sections.
    train_dists = preprocessing.samples_to_dists(train_samples, _sections,
                                                 get_section)
    test_dists = preprocessing.samples_to_dists(test_samples, _sections,
                                                get_section)
    # NOTE(review): this constructor call is truncated in the visible
    # source; the remaining keyword arguments lie beyond this excerpt.
    model = Neural_Network(_n_factors=train_matrix.shape[1],
                           _learning_rate=_learning_rate,
# ---- Example 3 ----
# Keep only samples from the target batch that actually carry a question,
# then shuffle and split into train/test sets at _train_ratio.
samples = [s for s in samples if s.batch_name == _batch_name and s.question is not None]
random.shuffle(samples)
n_samples = len(samples)
train_samples = samples[0:int(n_samples*_train_ratio)]
test_samples = samples[int(n_samples*_train_ratio):n_samples]

# Report the class distribution of the full set and of each split.
print("Samples distribution:", preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _classes, get_question))

# Vectorize the raw texts into feature matrices sharing one vocabulary.
train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(train_texts, test_texts, words_src = "samples", normalize_flag = False)

if _model == "SVM":
	# Hard per-sample labels for the SVM classifier.
	train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
	test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)

	model = SVM()
	model.train(train_matrix, train_labels)
	predict = model.predict(test_matrix)

elif _model == "NN":
	# Soft targets: per-sample distributions over the classes.
	train_dists = preprocessing.samples_to_dists(train_samples, _classes, get_question)
	test_dists = preprocessing.samples_to_dists(test_samples, _classes, get_question)
	model = Neural_Network(_n_factors = train_matrix.shape[1], _learning_rate = _learning_rate, _hidden_nodes = _hidden_nodes, _last_layer = len(_classes))
	model.train(train_matrix, train_dists, test_matrix, test_dists)
	predict = model.predict(test_matrix)
	predict = preprocessing.dists_to_labels(predict, _classes)
	# FIX: pass the get_question accessor — every other samples_to_label
	# call in this file supplies it; the original call here omitted it.
	test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)
# ---- Example 4 ----
# Second half of a train/test split; n_samples and train_samples are
# defined before this excerpt begins.
test_samples = samples[int(n_samples * _train_ratio):n_samples]

# Report the class distribution of the full set and of each split.
print("Samples distribution:",
      preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

# Vectorize the raw texts into feature matrices sharing one vocabulary.
train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

# Dispatch on the configured model type: the SVM branch trains on hard
# per-sample labels, the NN branch on per-class distributions.
if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes,
                                                  get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes,
                                                 get_question)

    model = SVM()
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)

elif _model == "NN":
    # Soft targets: per-sample distributions over the classes.
    train_dists = preprocessing.samples_to_dists(train_samples, _classes,
                                                 get_question)
    test_dists = preprocessing.samples_to_dists(test_samples, _classes,
                                                get_question)
    # NOTE(review): this constructor call is truncated in the visible
    # source; the remaining keyword arguments lie beyond this excerpt.
    model = Neural_Network(_n_factors=train_matrix.shape[1],
                           _learning_rate=_learning_rate,
                           _hidden_nodes=_hidden_nodes,