def test_nNN(model, data_descriptions, data_codes, nNNmin=2, nNNmax=10):
    """Compare two kNN prediction modes over a range of neighbour counts.

    For every ``nNN`` in ``nNNmin..nNNmax`` (both inclusive) a fresh
    ``KNNClassifier`` is fitted on the official embeddings, labelled with
    their section codes, and then evaluated twice on the data embeddings:

    1. ``predict_with_edit_dist`` (reported as "edit dist kNN"), and
    2. ``predict`` (reported as "euclidean kNN").

    The share of exactly matching section codes and the elapsed wall-clock
    time are printed for each mode.

    NOTE: the printed score is a fraction in [0, 1] even though the message
    says "percent"; the message text is kept unchanged on purpose.

    Args:
        model: Embedder exposing ``official_embeddings``, ``official_codes``,
            ``official_descriptions`` and ``data_embeddings``.
        data_descriptions: Descriptions of the data items, forwarded to
            ``predict_with_edit_dist``.
        data_codes: True codes of the data items; reduced with
            ``get_section_codes`` before comparison.
        nNNmin: Smallest number of neighbours to try.
        nNNmax: Largest number of neighbours to try (inclusive).

    Returns:
        None. Results are written to stdout only.
    """
    # The labels do not depend on nNN, so derive them once up front rather
    # than on every iteration; this also keeps label extraction out of the
    # timed sections below.
    official_section_codes = get_section_codes(model.official_codes)
    true_section_codes = get_section_codes(data_codes)

    separator = '------------------------------'

    for nNN in range(nNNmin, nNNmax + 1):
        classifier = KNNClassifier(n_neighbors=nNN)

        # --- kNN with edit distance (timing includes the fit) ---
        t1 = time()
        classifier.fit(model.official_embeddings, official_section_codes)
        pred_section_codes = classifier.predict_with_edit_dist(
            model.data_embeddings,
            data_descriptions,
            model.official_descriptions,
        )
        # A zero difference means predicted and true code are identical.
        errors = pred_section_codes - true_section_codes
        print(separator)
        print('nNN: %s' % (nNN,))
        print(
            'Correctly predicted %s percent of top level codes w/ edit dist kNN'
            % (1.0 * np.sum(errors == 0) / errors.shape[0],)
        )
        print('Took %s seconds' % (time() - t1,))

        # --- plain kNN prediction (classifier is already fitted) ---
        t1 = time()
        pred_section_codes = classifier.predict(model.data_embeddings)
        errors = pred_section_codes - true_section_codes
        print(
            'Correctly predicted %s percent of top level code w/ euclidean kNN'
            % (1.0 * np.sum(errors == 0) / errors.shape[0],)
        )
        print('Took %s seconds' % (time() - t1,))
        print(separator)
def match_codes(use_section=False, nNN=4):
    """Match the 'slim' dataset to official codes with kNN and print the score.

    The data descriptions are embedded with a character n-gram
    ``HashingEmbedder``; a ``KNNClassifier`` is fitted on the official
    embeddings and used to predict a code for every data item. The share of
    exact matches against the true codes is printed.

    Afterwards the same fit/predict steps are repeated with a
    ``word2vecEmbedder``.

    NOTE(review): the original docstring described this as matching "in
    parallel"; no parallelism is visible in this function itself.
    NOTE: the printed score is a fraction in [0, 1] even though the message
    says "percent"; the message text is kept unchanged on purpose.

    Args:
        use_section: If true, compare codes reduced with
            ``get_section_codes``; otherwise (the default, matching the
            previously hard-coded behaviour) use ``coarsen_codes``.
        nNN: Number of neighbours for the ``KNNClassifier`` (default 4, the
            previously hard-coded value).

    Returns:
        None. Results are written to stdout only.
    """
    # Scores recorded by the original author: 0.398657046863 hard,
    # 0.275397642306 soft
    data_codes, data_descriptions = get_data_to_match('slim')
    # NOTE(review): the returned official data is not used below (the
    # embedders expose their own official_* attributes); the call is kept in
    # case loading has side effects -- confirm before removing.
    official_codes, official_descriptions = get_official_data()

    level = 1
    # Alternatives tried previously: word2vecEmbedder(), HashingEmbedder(),
    # HashingEmbedder(level=level, analyzer='char', ngram_range=(3,5), norm='l2'),
    # HashingEmbedder(level=level, analyzer='char', ngram_range=(2,3)).
    model = HashingEmbedder(
        level=level,
        analyzer='char_wb',
        ngram_range=(4, 5),
        norm='l2',
    )
    model.embed_data(data_descriptions)
    print('loaded and embedded data')
    # Optional sweep over neighbour counts:
    # test_nNN(model, data_descriptions, data_codes)

    # Reduce official and true codes to the same granularity so that they
    # can be compared element-wise.
    if use_section:
        official_code_labels = get_section_codes(model.official_codes)
        true_data_codes = get_section_codes(data_codes)
    else:
        official_code_labels = coarsen_codes(model.official_codes)
        true_data_codes = coarsen_codes(data_codes)

    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    # Alternative: classifier.predict_with_edit_dist(
    #     model.data_embeddings, data_descriptions, model.official_descriptions)
    pred_codes = classifier.predict(model.data_embeddings, pbar=True)

    # A zero difference means predicted and true code are identical.
    errors = pred_codes - true_data_codes
    print(
        'Correctly predicted %s percent of top level codes'
        % (1.0 * np.sum(errors == 0) / errors.shape[0],)
    )
    # plot_confusion_matrix(true_data_codes, pred_codes)

    # Second pass with word2vec embeddings.
    # NOTE(review): the labels still come from the hashing model's
    # official_codes -- presumably both embedders expose the official data
    # in the same order; confirm. The predictions are not evaluated in the
    # code visible here.
    model = word2vecEmbedder()
    model.embed_data(data_descriptions)
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    # Alternative: classifier.predict_with_edit_dist(
    #     model.data_embeddings, data_descriptions, model.official_descriptions)
    pred_codes = classifier.predict(model.data_embeddings)