import numpy as np

import common  # project module: N_DISASTER, N_CLASS, N_ROWS, N_COLS, categorize, is_certain, classify

# UNCERTAIN_LABEL is not defined in this fragment; it is assumed here to equal
# common.N_CLASS, mirroring the uncertain branch of get_codes below.
UNCERTAIN_LABEL = common.N_CLASS


def get_types(fullset):
    """Label every row: a class id when certain, -99 when feature values are
    missing, UNCERTAIN_LABEL otherwise."""
    results = []
    for row in fullset:
        disaster_prob_vec = row[:common.N_DISASTER]
        feature_vec = row[common.N_DISASTER:]
        categorized_vec = common.categorize(disaster_prob_vec, 0.4, 0.6)
        if common.is_certain(categorized_vec):
            label = common.classify(categorized_vec)
            results.append(label)
        elif -99 in feature_vec:
            results.append(-99)  # sentinel for missing feature values
        else:
            results.append(UNCERTAIN_LABEL)
    return np.array(results)
def get_codes(fullset):
    """Like get_types, but uncertain rows are coded as common.N_CLASS
    (one past the last real class id)."""
    results = []
    for row in fullset:
        disaster_prob_vec = row[:common.N_DISASTER]
        feature_vec = row[common.N_DISASTER:]
        categorized_vec = common.categorize(disaster_prob_vec, 0.4, 0.6)
        if common.is_certain(categorized_vec):
            label = common.classify(categorized_vec)
            results.append(label)
        elif -99 in feature_vec:
            results.append(-99)
        else:
            results.append(common.N_CLASS)
    return np.array(results)
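# Illustrative usage (requires the project's common module to be importable;
# the feature width of 3 below is an assumption, not taken from the original).
# get_types and get_codes differ only in the value assigned to uncertain rows:
# UNCERTAIN_LABEL vs. common.N_CLASS.
def _demo_label_helpers():
    rng = np.random.default_rng(0)
    rows = rng.random((10, common.N_DISASTER + 3))
    print(get_types(rows))
    print(get_codes(rows))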
def get_trainset(fullset, upward=False, with_id=False):
    """Build a training matrix of [features..., label] rows, keeping only
    rows that are certain and have no missing (-99) feature values."""
    result = []
    if upward:
        # Flip the grid vertically, then flatten back to rows.
        fullset = np.flipud(
            fullset.reshape(common.N_ROWS, common.N_COLS, -1)
        ).reshape(-1, fullset.shape[-1])
    for row in fullset:
        disaster_prob_vec = row[:common.N_DISASTER]
        feature_vec = row[common.N_DISASTER:]
        categorized_vec = common.categorize(disaster_prob_vec, 0.4, 0.6)
        if common.is_certain(categorized_vec) and -99 not in feature_vec:
            label = common.classify(categorized_vec)
            result.append(feature_vec.tolist() + [label])
    if with_id:
        # Prepend a running row id as the first column.
        return np.concatenate(
            (np.arange(len(result))[:, np.newaxis], np.array(result)), axis=1
        )
    return np.array(result)
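# Usage sketch for get_trainset (illustrative only: the grid shape comes from
# common.N_ROWS x common.N_COLS, but the feature width of 3 is an assumption):
def _demo_trainset():
    rng = np.random.default_rng(1)
    rows = rng.random((common.N_ROWS * common.N_COLS, common.N_DISASTER + 3))
    train = get_trainset(rows, upward=True, with_id=True)
    print(train.shape)  # columns: [row_id, feature_1..feature_n, label]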
""" data = np.loadtxt(f'{DATA_PATH}/{data_name}') labels = np.loadtxt(f'{DATA_PATH}/{labels_name}') # Initialize dict to map class label to feature vector class_data = defaultdict(list) for k in range(len(labels)): # Save class specific data class_data[int(labels[k])].append(data[k]) for k in class_data: class_data[k] = np.array(class_data[k]) print(f'shape of class entry {k}: ', class_data[k].shape) return class_data class_data = read_data('X_combined.txt', 'Y_combined.txt') #render(class_data) priors, gaussians, variances = create_distributions(class_data) #render_pca(class_data, variances) labeled_data, classifications = classify(class_data, gaussians, priors) assess_classification(class_data, labeled_data, classifications, priors)
# data_set, feature_extraction_algorithms, and max_documents are assumed to be
# defined earlier in this script (e.g. parsed from the command line).
if data_set == 'reuters':
    documents = common.retrieve_reuters_documents(max_documents=max_documents)
    print('Loaded reuters documents')
elif data_set == 'imdb':
    documents = common.retrieve_imdb_movie_reviews(max_documents=max_documents)
    print('Loaded imdb reviews')
elif data_set == 'newsgroups':
    documents = common.retrieve_newsgroup_articles(max_documents=max_documents)
    print('Loaded newsgroup articles')
else:
    documents = []

for feature_extraction_algorithm in feature_extraction_algorithms:
    print('using {} algorithm on data set: {}'.format(feature_extraction_algorithm, data_set))
    if feature_extraction_algorithm == 'doc2vec':
        doc2vec = common.create_or_load_doc2vec_model(
            'model/{}-doc2vec-{}.bin'.format(data_set, len(documents)), documents)
        common.add_feature_vectors_doc2vec(documents, doc2vec)
    if feature_extraction_algorithm == 'word_count':
        word_count_vectorizer = common.create_word_count_vectorizer(documents)
        common.add_feature_vectors_text_vectorizer(documents, word_count_vectorizer)

    # Visualize
    png_file_name = 'fig/{}-tsne-{}-{}.png'.format(data_set, feature_extraction_algorithm, len(documents))
    common.visualize(documents, png_file_name)

    # Classify
    print('classify data_set: {}, feature_extraction_algorithm: {}, num_documents: {}'.format(
        data_set, feature_extraction_algorithm, len(documents)))
    common.classify(documents)
    print()
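# The common helpers above are not shown in this fragment. A minimal sketch of
# what create_word_count_vectorizer might do, using scikit-learn's
# CountVectorizer; both the .text attribute on documents and the parameter
# choices are assumptions for illustration, not the project's actual code.
from sklearn.feature_extraction.text import CountVectorizer


def create_word_count_vectorizer_sketch(documents):
    vectorizer = CountVectorizer(max_features=5000, stop_words='english')
    vectorizer.fit(doc.text for doc in documents)  # fit on raw document strings
    return vectorizer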