def classify_other(training, test, use_priors=False, add_features=False):
    """Classify the samples that were routed to the "other" branch.

    Both ``training`` and ``test`` are sequences of ``(label, features)``
    pairs. Only ``label[0]`` (echoed back in the answers) and ``label[1]``
    (the class code) are read here.

    A coarse classifier first maps every test sample to a group:

    * group 0 -> handed to a sub-classifier trained on class codes 6 and 7
    * group 1 -> handed to a sub-classifier trained on class codes 8 and 9
    * group 2 or 3 -> resolved directly through ``REVERSE_OUTER_MAPPING``
    * any other group value is ignored

    Side effect: the module-level accumulator ``ola`` is increased by
    ``other_level_accuracy(result, test)``.

    :param training: training pairs for this branch.
    :param test: test pairs for this branch; must support indexing.
    :param use_priors: when true, KDE priors (translated through
        ``OTHER_MAPPING``) are passed to the coarse classifier.
    :param add_features: when true, the coarse classifier's inputs are
        extended with the ``'gender'`` scores from ``extend_features``.
    :return: ``(accurate, count, answers)`` where ``answers`` holds
        ``(label[0], predicted, label[1])`` entries.
    """
    global ola

    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    priors_others = None
    if use_priors:
        kde_priors = priors_with_kde(y_test, y_train)
        priors_others = [OTHER_MAPPING[p] for p in kde_priors]

    if add_features:
        # Only 'gender' is enabled; 'working', 'age_group' and 'label' were
        # tried here as well and are currently switched off.
        x_test, x_train = extend_features(x_train, y_train, x_test, y_test,
                                          'gender')

    coarse_labels = [OTHER_MAPPING[label[1]] for label in y_train]
    result = classify_top_level(x_train, coarse_labels, x_test, priors_others)

    ola += other_level_accuracy(result, test)

    # The sub-classifiers are trained on the original (un-extended) pairs.
    sports_training = [pair for pair in training if pair[0][1] in [6, 7]]
    shop_and_food_training = [pair for pair in training
                              if pair[0][1] in [8, 9]]

    accurate = 0.0
    count = 0.0
    answers = []
    sports_test = []
    food_shop_test = []

    for position, group in enumerate(result):
        if group == 0:
            sports_test.append(test[position])
        elif group == 1:
            food_shop_test.append(test[position])
        elif group in (2, 3):
            # Final answer is available straight from the coarse prediction.
            truth = y_test[position]
            count += 1
            accurate += REVERSE_OUTER_MAPPING[group] == truth[1]
            answers.append((truth[0], REVERSE_OUTER_MAPPING[group], truth[1]))

    # An 'age_group' feature extension of both sub-problems (via
    # extend_features_with_split) is currently disabled.
    # Shop/food is evaluated before sports so the answer order is stable.
    for sub_training, sub_test in ((shop_and_food_training, food_shop_test),
                                   (sports_training, sports_test)):
        hits, total, sub_answers = train_classifier_and_predict(sub_training,
                                                                sub_test)
        accurate += hits
        count += total
        answers.extend(sub_answers)

    return accurate, count, answers
def extend_features(x_train, y_train, x_test, y_test, attribute, method='dbscan'):
    """Append per-sample KDE scores for ``attribute`` to the feature rows.

    Scores come from ``priors_with_kde(..., return_predictions=False)`` and
    are looked up by ``label[0]`` for every label in ``y_train`` / ``y_test``.

    :param x_train: training feature rows, aligned with ``y_train``.
    :param y_train: training labels; only ``label[0]`` is used here.
    :param x_test: test feature rows, aligned with ``y_test``.
    :param y_test: test labels; only ``label[0]`` is used here.
    :param attribute: forwarded to ``priors_with_kde``.
    :param method: accepted for interface compatibility; not used here.
    :return: ``(modified_test_set, modified_training_set)`` -- note that the
        test set comes first.
    """
    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    def _with_scores(labels, rows, scores):
        # One widened row per sample: original features followed by scores.
        return [np.hstack((row, scores[label[0]]))
                for label, row in zip(labels, rows)]

    modified_training_set = _with_scores(y_train, x_train, training_scores)
    modified_test_set = _with_scores(y_test, x_test, test_scores)
    return modified_test_set, modified_training_set
def extend_features_with_split(train, test, attribute):
    """Append KDE scores for ``attribute`` to paired ``(label, features)`` data.

    Unlike ``extend_features`` this takes and returns sequences of
    ``(label, features)`` pairs. Scores come from
    ``priors_with_kde(..., return_predictions=False)`` and are looked up by
    ``label[0]``.

    :param train: non-empty sequence of ``(label, features)`` training pairs.
    :param test: non-empty sequence of ``(label, features)`` test pairs.
    :param attribute: forwarded to ``priors_with_kde``.
    :return: ``(modified_test_set, modified_training_set)`` -- test first;
        each is a list of ``(label, widened_features)`` pairs.
    """
    y_train, x_train = zip(*train)
    y_test, x_test = zip(*test)

    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    def _with_scores(labels, rows, scores):
        # Keep the label next to its widened feature row.
        return [(label, np.hstack((row, scores[label[0]])))
                for label, row in zip(labels, rows)]

    modified_training_set = _with_scores(y_train, x_train, training_scores)
    modified_test_set = _with_scores(y_test, x_test, test_scores)
    return modified_test_set, modified_training_set
def classify_other(training, test, use_priors=False, add_features=False):
    """Resolve the final class of samples sent to the "other" branch.

    ``training`` and ``test`` are sequences of ``(label, features)`` pairs;
    this function reads ``label[0]`` (returned in the answers) and
    ``label[1]`` (the class code) only.

    The coarse classifier's output decides what happens to each test sample:
    0 goes to the sub-classifier trained on codes 6/7, 1 goes to the
    sub-classifier trained on codes 8/9, 2 and 3 are translated immediately
    with ``REVERSE_OUTER_MAPPING``; other values are skipped.

    Side effect: adds ``other_level_accuracy(result, test)`` to the
    module-level ``ola`` accumulator.

    :param training: training pairs for this branch.
    :param test: indexable sequence of test pairs for this branch.
    :param use_priors: pass KDE priors (mapped through ``OTHER_MAPPING``)
        to the coarse classifier.
    :param add_features: extend the coarse classifier's inputs with the
        ``'gender'`` scores produced by ``extend_features``.
    :return: ``(accurate, count, answers)``; ``answers`` contains
        ``(label[0], predicted, label[1])`` entries.
    """
    global ola

    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    if use_priors:
        priors_others = [OTHER_MAPPING[p]
                         for p in priors_with_kde(y_test, y_train)]
    else:
        priors_others = None

    if add_features:
        # 'working', 'age_group' and 'label' extensions are disabled for now.
        x_test, x_train = extend_features(x_train, y_train, x_test, y_test,
                                          'gender')

    y_training_other = [OTHER_MAPPING[label[1]] for label in y_train]
    result = classify_top_level(x_train, y_training_other, x_test,
                                priors_others)
    ola += other_level_accuracy(result, test)

    # Sub-classifier training data uses the caller's original feature rows.
    sports_training = [(y, x) for (y, x) in training if y[1] in [6, 7]]
    shop_and_food_training = [(y, x) for (y, x) in training
                              if y[1] in [8, 9]]

    accurate, count = 0.0, 0.0
    answers = []
    sports_test, food_shop_test = [], []

    for i, coarse in enumerate(result):
        if coarse == 0:
            sports_test.append(test[i])
            continue
        if coarse == 1:
            food_shop_test.append(test[i])
            continue
        if coarse in (2, 3):
            label = y_test[i]
            count += 1
            accurate += REVERSE_OUTER_MAPPING[coarse] == label[1]
            answers.append((label[0], REVERSE_OUTER_MAPPING[coarse], label[1]))

    # 'age_group' extension through extend_features_with_split is disabled.
    # Order matters for the answer list: shop/food first, then sports.
    stages = [
        (shop_and_food_training, food_shop_test),
        (sports_training, sports_test),
    ]
    for stage_training, stage_test in stages:
        a, c, d = train_classifier_and_predict(stage_training, stage_test)
        accurate += a
        count += c
        answers.extend(d)

    return accurate, count, answers
def extend_features_with_split(train, test, attribute):
    """Widen paired ``(label, features)`` samples with KDE scores.

    The pairs are split into labels and feature rows, scores for
    ``attribute`` are obtained from
    ``priors_with_kde(..., return_predictions=False)``, and every feature row
    gets the score stored under its ``label[0]`` appended.

    :param train: non-empty sequence of ``(label, features)`` training pairs.
    :param test: non-empty sequence of ``(label, features)`` test pairs.
    :param attribute: forwarded to ``priors_with_kde``.
    :return: ``(modified_test_set, modified_training_set)`` in that order,
        each a list of ``(label, widened_features)`` pairs.
    """
    y_train, x_train = zip(*train)
    y_test, x_test = zip(*test)

    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    modified_training_set = [
        (label, np.hstack((row, training_scores[label[0]])))
        for label, row in zip(y_train, x_train)
    ]
    modified_test_set = [
        (label, np.hstack((row, test_scores[label[0]])))
        for label, row in zip(y_test, x_test)
    ]
    return modified_test_set, modified_training_set
def do_classification(X, Y, train_index, test_index, kde_as_priors=False,
                      kde_as_features=False, add_features=False):
    """Run the two-level classification for one train/test split.

    A top-level classifier routes every test sample to one of three
    branches (prediction 0 -> home, 1 -> work, anything else -> other); each
    branch is then resolved by its own classifier trained on the matching
    class codes (home: 1, 2; work: 3, 5; other: 4, 6, 7, 8, 9, 10).

    :param X: feature container indexable by ``train_index``/``test_index``.
    :param Y: label container indexable the same way; each label is read via
        ``label[1]`` (class code) here.
    :param train_index: index selecting the training samples.
    :param test_index: index selecting the test samples.
    :param kde_as_priors: pass KDE priors (mapped through
        ``TOP_LEVEL_MAPPING``) to the top-level classifier and enable priors
        in the "other" branch.
    :param kde_as_features: extend the features with the ``'label'`` scores
        from ``extend_features`` before classifying.
    :param add_features: forwarded to ``classify_other``.
    :return: ``(tla, overall_accuracy, answers)`` where ``tla`` is the result
        of ``top_level_accuracy`` and ``answers`` concatenates the home, work
        and other branch answers in that order.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    priors_top_level = None
    if kde_as_priors:
        priors = priors_with_kde(y_test, y_train)
        priors_top_level = [TOP_LEVEL_MAPPING[p] for p in priors]

    if kde_as_features:
        # A further 'time' extension (method='simple') is currently disabled.
        X_test, X_train = extend_features(X_train, y_train, X_test, y_test,
                                          'label')

    # Materialise the pairs: they are iterated several times and the test
    # pairs are indexed below. A bare zip() is a one-shot, non-indexable
    # iterator on Python 3.
    training_dataset = list(zip(y_train, X_train))
    test_set = list(zip(y_test, X_test))

    X_train = [features for (_, features) in training_dataset]
    X_test = [features for (_, features) in test_set]

    home_training_dataset = [(y, x) for (y, x) in training_dataset
                             if y[1] in [1, 2]]
    work_training_dataset = [(y, x) for (y, x) in training_dataset
                             if y[1] in [3, 5]]
    other_training_dataset = [(y, x) for (y, x) in training_dataset
                              if y[1] in [4, 6, 7, 8, 9, 10]]

    y_train_top_level = [TOP_LEVEL_MAPPING[label[1]] for label in y_train]
    top_level_predictions = classify_top_level(X_train, y_train_top_level,
                                               X_test, priors_top_level)
    tla = top_level_accuracy(top_level_predictions, test_set)

    # Route every test sample to the branch chosen by the top-level model.
    home_input = []
    work_input = []
    other_input = []
    for index, pred in enumerate(top_level_predictions):
        if pred == 0:
            home_input.append(test_set[index])
        elif pred == 1:
            work_input.append(test_set[index])
        else:
            other_input.append(test_set[index])
    logging.debug((len(home_input), len(work_input), len(other_input)))

    h_n, h_d, home_answers = train_classifier_and_predict(
        home_training_dataset, home_input, use_priors=False)
    w_n, w_d, work_answers = train_classifier_and_predict(
        work_training_dataset, work_input, use_priors=False)
    o_n, o_d, other_answers = classify_other(
        other_training_dataset, other_input,
        use_priors=kde_as_priors, add_features=add_features)

    # "* 1.0" forces true division even when all counts are integers.
    overall_accuracy = ((h_n + w_n + o_n) * 1.0) / ((h_d + w_d + o_d) * 1.0)

    answers = []
    for branch_answers in [home_answers, work_answers, other_answers]:
        answers.extend(branch_answers)
    return tla, overall_accuracy, answers
def extend_features(x_train, y_train, x_test, y_test, attribute, method='dbscan'):
    """Widen train and test feature rows with KDE scores for ``attribute``.

    ``priors_with_kde`` is called with ``return_predictions=False`` and
    returns one score container for training and one for test. Each feature
    row is horizontally stacked with the entry stored under its
    ``label[0]``.

    :param x_train: training feature rows, aligned with ``y_train``.
    :param y_train: training labels; only ``label[0]`` is read.
    :param x_test: test feature rows, aligned with ``y_test``.
    :param y_test: test labels; only ``label[0]`` is read.
    :param attribute: forwarded to ``priors_with_kde``.
    :param method: kept for interface compatibility; unused in this body.
    :return: ``(modified_test_set, modified_training_set)`` -- test first.
    """
    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    modified_training_set = [
        np.hstack((row, training_scores[label[0]]))
        for label, row in zip(y_train, x_train)
    ]
    modified_test_set = [
        np.hstack((row, test_scores[label[0]]))
        for label, row in zip(y_test, x_test)
    ]
    return modified_test_set, modified_training_set
def train_classifier_and_predict(training, test, use_priors=False, class_weight=None):
    """Fit the best estimator on ``training`` and score it on ``test``.

    Both arguments are sequences of ``(label, features)`` pairs, where
    ``label[0]`` is echoed back in the answers and ``label[1]`` is the class
    code used as the classification target.

    :param training: training pairs; must be non-empty when ``test`` is.
    :param test: test pairs; an empty sequence short-circuits to
        ``(0, 0, [])`` without touching ``training``.
    :param use_priors: when true, KDE priors from ``priors_with_kde`` are
        passed to ``get_best_estimator``.
    :param class_weight: accepted for interface compatibility; not used here.
    :return: ``(correct, total, answers)`` where ``answers`` is a list of
        ``(label[0], predicted, label[1])`` tuples.
    """
    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    # Priors are computed from the full labels, before they are reduced to
    # class codes below.
    priors = priors_with_kde(y_test, y_train) if use_priors else None

    places = [label[0] for label in y_test]
    y_test = [label[1] for label in y_test]
    y_train = [label[1] for label in y_train]

    clf = get_best_estimator(x_train, y_train, x_test, priors)
    logging.debug(clf)
    predictions = clf.predict(x_test)

    # Return a real list: a bare zip() is a single-use iterator on Python 3,
    # and the empty-input path above already returns a list.
    answers = list(zip(places, predictions, y_test))
    result = [predicted == actual
              for predicted, actual in zip(predictions, y_test)]
    return result.count(True), len(result), answers
def train_classifier_and_predict(training, test, use_priors=False, class_weight=None):
    """Train a classifier on ``training`` and evaluate it on ``test``.

    ``training`` and ``test`` hold ``(label, features)`` pairs. ``label[1]``
    is the target class code; ``label[0]`` is carried through to the answers.

    :param training: training pairs; must be non-empty when ``test`` is.
    :param test: test pairs; if empty, ``(0, 0, [])`` is returned at once.
    :param use_priors: pass KDE priors from ``priors_with_kde`` to
        ``get_best_estimator`` instead of ``None``.
    :param class_weight: accepted for interface compatibility; not used here.
    :return: ``(correct, total, answers)``; ``answers`` is a list of
        ``(label[0], predicted, label[1])`` tuples.
    """
    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    if use_priors:
        # Uses the full labels, so this must happen before they are
        # stripped down to class codes.
        priors = priors_with_kde(y_test, y_train)
    else:
        priors = None

    places = [label[0] for label in y_test]
    y_test = [label[1] for label in y_test]
    y_train = [label[1] for label in y_train]

    clf = get_best_estimator(x_train, y_train, x_test, priors)
    logging.debug(clf)
    predictions = clf.predict(x_test)

    # Materialise the answers so the return type is a list on every path
    # (zip() alone is a lazy, single-use iterator on Python 3).
    answers = list(zip(places, predictions, y_test))
    result = [predicted == actual
              for predicted, actual in zip(predictions, y_test)]
    return result.count(True), len(result), answers
def do_classification(X, Y, train_index, test_index, kde_as_priors=False,
                      kde_as_features=False, add_features=False):
    """Classify one train/test split with the hierarchical pipeline.

    The top-level classifier assigns each test sample to a branch
    (0 -> home, 1 -> work, otherwise -> other). Each branch has its own
    training subset selected by class code (home: 1, 2; work: 3, 5;
    other: 4, 6, 7, 8, 9, 10) and produces the final answers.

    :param X: feature container indexable by ``train_index``/``test_index``.
    :param Y: label container indexable the same way; ``label[1]`` is read
        as the class code.
    :param train_index: index selecting the training samples.
    :param test_index: index selecting the test samples.
    :param kde_as_priors: use KDE priors (translated with
        ``TOP_LEVEL_MAPPING``) at the top level and enable priors in
        ``classify_other``.
    :param kde_as_features: widen the features with the ``'label'`` scores
        from ``extend_features`` before any classification.
    :param add_features: forwarded to ``classify_other``.
    :return: ``(tla, overall_accuracy, answers)``; ``answers`` lists the
        home answers, then work, then other.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    priors_top_level = None
    if kde_as_priors:
        priors = priors_with_kde(y_test, y_train)
        priors_top_level = [TOP_LEVEL_MAPPING[p] for p in priors]

    if kde_as_features:
        # The additional 'time' extension (method='simple') stays disabled.
        X_test, X_train = extend_features(X_train, y_train, X_test, y_test,
                                          'label')

    # Build the pair lists once, as real lists. They are traversed multiple
    # times and test_set is indexed; zip() by itself would be exhausted after
    # the first pass and cannot be indexed on Python 3.
    training_dataset = list(zip(y_train, X_train))
    test_set = list(zip(y_test, X_test))

    X_train = [features for (_, features) in training_dataset]
    X_test = [features for (_, features) in test_set]

    def _subset(codes):
        # Training pairs whose class code belongs to the given branch.
        return [(y, x) for (y, x) in training_dataset if y[1] in codes]

    home_training_dataset = _subset([1, 2])
    work_training_dataset = _subset([3, 5])
    other_training_dataset = _subset([4, 6, 7, 8, 9, 10])

    y_train_top_level = [TOP_LEVEL_MAPPING[label[1]] for label in y_train]
    top_level_predictions = classify_top_level(X_train, y_train_top_level,
                                               X_test, priors_top_level)
    tla = top_level_accuracy(top_level_predictions, test_set)

    home_input = []
    work_input = []
    other_input = []
    for index, pred in enumerate(top_level_predictions):
        if pred == 0:
            home_input.append(test_set[index])
        elif pred == 1:
            work_input.append(test_set[index])
        else:
            other_input.append(test_set[index])
    logging.debug((len(home_input), len(work_input), len(other_input)))

    h_n, h_d, home_answers = train_classifier_and_predict(
        home_training_dataset, home_input, use_priors=False)
    w_n, w_d, work_answers = train_classifier_and_predict(
        work_training_dataset, work_input, use_priors=False)
    o_n, o_d, other_answers = classify_other(
        other_training_dataset, other_input,
        use_priors=kde_as_priors, add_features=add_features)

    # Multiplying by 1.0 guarantees floating-point division.
    overall_accuracy = ((h_n + w_n + o_n) * 1.0) / ((h_d + w_d + o_d) * 1.0)

    answers = []
    for branch_answers in [home_answers, work_answers, other_answers]:
        answers.extend(branch_answers)
    return tla, overall_accuracy, answers