# --- News-section clustering experiment --------------------------------------
# Read news samples, filter and split them, vectorize the texts, then fit a
# KMeans model with one cluster per section in the filter.

def get_section(sample):
    """Key function: the section label of a news sample."""
    return sample.section

print("Reading samples.. ")
news_samples = preprocessing.news_sample.get_samples_multithread(
    _news_dir, _max_thread, _max_sample_count)

print("Preprocessing.. ")
# Keep only sufficiently long samples whose section is in the filter list.
news_samples = [
    sample for sample in news_samples
    if sample.word_count >= _min_word_count and sample.section in _section_filter
]
random.shuffle(news_samples)

# Shuffled train/test split at the configured ratio.
n_samples = len(news_samples)
train_samples = news_samples[:int(n_samples * _train_ratio)]
test_samples = news_samples[int(n_samples * _train_ratio):]

print("Samples distribution:",
      preprocessing.samples_statistics(news_samples, _section_filter, get_section))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _section_filter, get_section))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _section_filter, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False,
    reduction=_reduction, reduce_n_attr=_reduce_n_attr, stem_words=_stem_words)

print("Generating labels..")
train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)

print("Training..")
# One cluster per target section; cluster assignments on the train matrix
# serve as the reference output.
kmeans = KMeans(n_clusters=len(_section_filter))
reference_output = kmeans.fit_predict(train_matrix)
# NN parameters
_learning_rate = 1
_hidden_nodes = []


def get_question(sample):
    """Key function: the question associated with a tp sample."""
    return sample.question


samples = preprocessing.tp_sample.get_samples(_sample_folder)
# Restrict to the requested batch and drop samples without a question.
samples = [
    s for s in samples
    if s.batch_name == _batch_name and s.question is not None
]
random.shuffle(samples)

# Shuffled train/test split at the configured ratio.
n_samples = len(samples)
train_samples = samples[:int(n_samples * _train_ratio)]
test_samples = samples[int(n_samples * _train_ratio):]

print("Samples distribution:",
      preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)
    model = SVM()
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
# --- News-section SVM experiment ---------------------------------------------
# Read news samples, filter and split them, vectorize the texts, then train
# an SVM on the section labels when that model is selected.

def get_section(sample):
    """Key function: the section label of a news sample."""
    return sample.section

print("Reading samples.. ")
news_samples = preprocessing.news_sample.get_samples_multithread(
    _news_dir, _max_thread, _max_sample_count)

print("Preprocessing.. ")
# Keep only sufficiently long samples whose section is in the filter list.
news_samples = [
    sample for sample in news_samples
    if sample.word_count >= _min_word_count and sample.section in _section_filter
]
random.shuffle(news_samples)

# Shuffled train/test split at the configured ratio.
n_samples = len(news_samples)
train_samples = news_samples[:int(n_samples * _train_ratio)]
test_samples = news_samples[int(n_samples * _train_ratio):]

print("Samples distribution:",
      preprocessing.samples_statistics(news_samples, _section_filter, get_section))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _section_filter, get_section))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _section_filter, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False,
    reduction=_reduction, reduce_n_attr=_reduce_n_attr, stem_words=_stem_words)

print("Generating labels..")
if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
    test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)
    model = SVM()
    print("Training.. ")
    model.train(train_matrix, train_labels)
random.shuffle(news_samples) n_samples = len(news_samples) _sections = _section_filter if _section_group_map is not None: _sections = {section: True for section in _section_group_map.values()} _sections = list(_sections.keys()) print("Grouped sections:", _sections) for sample in news_samples: sample.section = _section_group_map[sample.section] train_samples = news_samples[0:int(n_samples * _train_ratio)] test_samples = news_samples[int(n_samples * _train_ratio):n_samples] print("Samples distribution:", preprocessing.samples_statistics(news_samples, _sections, get_section)) print("Train set distribution:", preprocessing.samples_statistics(train_samples, _sections, get_section)) print("Test set distribution:", preprocessing.samples_statistics(test_samples, _sections, get_section)) train_texts = [sample.text for sample in train_samples] test_texts = [sample.text for sample in test_samples] tfidf_vectorizer = get_tfidfVectorizer_of_essay_top_tf_words() print("Vectorizer built..") train_matrix, test_matrix, words = preprocessing.preprocess( train_texts, test_texts, savedir=_save_dir, words_src=tfidf_vectorizer,
def get_question(sample):
    """Key function: the question associated with a tp sample."""
    return sample.question


samples = preprocessing.tp_sample.get_samples(_sample_folder)
# Restrict to the requested batch and drop samples without a question.
samples = [
    s for s in samples
    if s.batch_name == _batch_name and s.question is not None
]
random.shuffle(samples)

# Shuffled train/test split at the configured ratio.
n_samples = len(samples)
train_samples = samples[:int(n_samples * _train_ratio)]
test_samples = samples[int(n_samples * _train_ratio):]

print("Samples distribution:",
      preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)