コード例 #1
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def classify_other(training, test, use_priors=False, add_features=False):
    """Classify the test samples that were routed to the "other" branch.

    A first classifier (``classify_top_level``) assigns each test sample a
    group taken from ``OTHER_MAPPING``.  Depending on that group:

    * group 0 -> forwarded to a classifier trained on labels 6 and 7,
    * group 1 -> forwarded to a classifier trained on labels 8 and 9,
    * group 2 or 3 -> resolved directly through ``REVERSE_OUTER_MAPPING``,
    * anything else -> skipped (neither counted nor reported).

    Side effect: ``other_level_accuracy(result, test)`` is added to the
    module-level accumulator ``ola``.

    Args:
        training: sequence of ``(y, x)`` pairs; ``y[1]`` is read as the label.
        test: indexable sequence of ``(y, x)`` pairs; ``y[0]`` becomes the
            first field of each answer (presumably a place identifier --
            confirm against the caller) and ``y[1]`` is the true label.
        use_priors: when true, the output of ``priors_with_kde`` is mapped
            through ``OTHER_MAPPING`` and handed to the first classifier.
        add_features: when true, the features used by the *first* classifier
            are extended via ``extend_features`` with the ``'gender'``
            attribute.  The second-stage classifiers still receive the
            original ``training`` / ``test`` features.

    Returns:
        ``(accurate, count, answers)``: number of correct predictions,
        number of predictions made, and a list of
        ``(y[0], predicted_label, true_label)`` tuples.  For an empty
        ``test`` this is ``(0, 0, [])``.
    """
    global ola

    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    priors_others = None
    if use_priors:
        kde_priors = priors_with_kde(y_test, y_train)
        priors_others = [OTHER_MAPPING[prior] for prior in kde_priors]

    if add_features:
        x_test, x_train = extend_features(x_train, y_train, x_test, y_test,
                                          'gender')

    group_targets = [OTHER_MAPPING[label[1]] for label in y_train]
    result = classify_top_level(x_train, group_targets, x_test,
                                priors_others)
    ola += other_level_accuracy(result, test)

    accurate = 0.0
    count = 0.0
    answers = []

    # Training subsets for the two second-stage classifiers.
    sports_training = [(label, feats) for (label, feats) in training
                       if label[1] in (6, 7)]
    shop_and_food_training = [(label, feats) for (label, feats) in training
                              if label[1] in (8, 9)]
    sports_test = []
    food_shop_test = []

    for position, group in enumerate(result):
        if group in (2, 3):
            # These groups correspond to a single label, so they are
            # scored here without a second classifier.
            predicted = REVERSE_OUTER_MAPPING[group]
            truth = y_test[position][1]
            count += 1
            accurate += predicted == truth
            answers.append((y_test[position][0], predicted, truth))
        elif group == 0:
            sports_test.append(test[position])
        elif group == 1:
            food_shop_test.append(test[position])

    # Shop/food is evaluated before sports so the answer order is stable.
    second_stage = (
        (shop_and_food_training, food_shop_test),
        (sports_training, sports_test),
    )
    for subset_training, subset_test in second_stage:
        hits, total, subset_answers = train_classifier_and_predict(
            subset_training, subset_test)
        accurate += hits
        count += total
        answers.extend(subset_answers)

    return accurate, count, answers
コード例 #2
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def extend_features(x_train, y_train, x_test, y_test, attribute, method='dbscan'):
    """Append the scores from ``priors_with_kde`` to every feature vector.

    ``priors_with_kde`` is called with ``return_predictions=False`` and the
    given ``attribute``; it returns one score lookup for the training set
    and one for the test set.  Each lookup is indexed by ``y[0]`` and the
    value found is horizontally stacked onto the matching feature vector.

    Args:
        x_train: training feature vectors, aligned with ``y_train``.
        y_train: training labels; ``y[0]`` is the key into the score lookup.
        x_test: test feature vectors, aligned with ``y_test``.
        y_test: test labels; ``y[0]`` is the key into the score lookup.
        attribute: forwarded to ``priors_with_kde``.
        method: accepted but not used by this function.

    Returns:
        ``(extended_test, extended_train)`` -- two lists of arrays.  Note
        that the test set comes first.
    """
    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    def _augment(labels, vectors, scores):
        # One stacked array per (label, vector) pair, in input order.
        return [np.hstack((vector, scores[label[0]]))
                for label, vector in zip(labels, vectors)]

    modified_training_set = _augment(y_train, x_train, training_scores)
    modified_test_set = _augment(y_test, x_test, test_scores)
    return modified_test_set, modified_training_set
コード例 #3
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def extend_features_with_split(train, test, attribute):
    """Extend the features of ``(y, x)`` pairs with ``priors_with_kde`` scores.

    Both inputs are split into labels and feature vectors, the scores for
    ``attribute`` are fetched from ``priors_with_kde`` (called with
    ``return_predictions=False``), and the score found under ``y[0]`` is
    horizontally stacked onto each feature vector.

    Args:
        train: sequence of ``(y, x)`` training pairs.
        test: sequence of ``(y, x)`` test pairs.
        attribute: forwarded to ``priors_with_kde``.

    Returns:
        ``(extended_test, extended_train)`` -- two lists of
        ``(y, extended_x)`` pairs.  Note that the test set comes first.
    """
    y_train, x_train = zip(*train)
    y_test, x_test = zip(*test)
    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    modified_training_set = [
        (label, np.hstack((vector, training_scores[label[0]])))
        for label, vector in zip(y_train, x_train)
    ]
    modified_test_set = [
        (label, np.hstack((vector, test_scores[label[0]])))
        for label, vector in zip(y_test, x_test)
    ]
    return modified_test_set, modified_training_set
コード例 #4
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def classify_other(training, test, use_priors = False, add_features = False):
    """Run the two-stage classification for samples in the "other" branch.

    Stage one calls ``classify_top_level`` with training targets mapped
    through ``OTHER_MAPPING``.  Its prediction for each test sample decides
    what happens next:

    * 0 -> the sample goes to a classifier trained on labels 6 and 7;
    * 1 -> the sample goes to a classifier trained on labels 8 and 9;
    * 2 or 3 -> the label is read from ``REVERSE_OUTER_MAPPING`` and scored
      immediately;
    * any other value -> the sample is dropped from the result.

    Side effect: the module-level ``ola`` is incremented by
    ``other_level_accuracy(result, test)``.

    Args:
        training: sequence of ``(y, x)`` pairs; ``y[1]`` is the label.
        test: indexable sequence of ``(y, x)`` pairs; ``y[0]`` is copied
            into each answer (presumably a place identifier -- confirm
            against the caller) and ``y[1]`` is the true label.
        use_priors: if true, ``priors_with_kde`` output mapped through
            ``OTHER_MAPPING`` is passed to the stage-one classifier.
        add_features: if true, stage-one features are extended through
            ``extend_features`` using the ``'gender'`` attribute; stage-two
            classifiers keep using the original features.

    Returns:
        ``(accurate, count, answers)`` where ``answers`` is a list of
        ``(y[0], predicted_label, true_label)`` tuples; ``(0, 0, [])`` when
        ``test`` is empty.
    """
    global ola

    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    if use_priors:
        priors_others = [OTHER_MAPPING[prior]
                         for prior in priors_with_kde(y_test, y_train)]
    else:
        priors_others = None

    if add_features:
        x_test, x_train = extend_features(x_train, y_train, x_test, y_test, 'gender')

    stage_one_targets = [OTHER_MAPPING[label[1]] for label in y_train]
    result = classify_top_level(x_train, stage_one_targets, x_test, priors_others)
    ola += other_level_accuracy(result, test)

    accurate, count = 0.0, 0.0
    answers = []

    sports_training = [(label, vec) for (label, vec) in training if label[1] in (6, 7)]
    shop_and_food_training = [(label, vec) for (label, vec) in training if label[1] in (8, 9)]
    sports_test, food_shop_test = [], []

    for idx, predicted_group in enumerate(result):
        if predicted_group == 0:
            sports_test.append(test[idx])
            continue
        if predicted_group == 1:
            food_shop_test.append(test[idx])
            continue
        if predicted_group not in (2, 3):
            # Unknown group: not counted and not reported.
            continue
        # Groups 2 and 3 map straight back to a label.
        label_guess = REVERSE_OUTER_MAPPING[predicted_group]
        true_label = y_test[idx][1]
        count += 1
        accurate += label_guess == true_label
        answers.append((y_test[idx][0], label_guess, true_label))

    # Stage two: shop/food first, then sports, to keep the answer order.
    for stage_training, stage_test in ((shop_and_food_training, food_shop_test),
                                       (sports_training, sports_test)):
        a, c, d = train_classifier_and_predict(stage_training, stage_test)
        accurate += a
        count += c
        answers.extend(d)

    return accurate, count, answers
コード例 #5
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def extend_features_with_split(train, test, attribute):
    """Stack ``priors_with_kde`` scores onto the features of labelled pairs.

    The ``(y, x)`` pairs in ``train`` and ``test`` are unzipped, the score
    lookups for ``attribute`` are obtained from ``priors_with_kde`` (with
    ``return_predictions=False``), and for every pair the score stored
    under ``y[0]`` is appended to ``x`` with ``np.hstack``.

    Args:
        train: sequence of ``(y, x)`` training pairs.
        test: sequence of ``(y, x)`` test pairs.
        attribute: forwarded to ``priors_with_kde``.

    Returns:
        ``(extended_test, extended_train)``: lists of ``(y, extended_x)``
        pairs, with the test list first.
    """
    y_train, x_train = zip(*train)
    y_test, x_test = zip(*test)
    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    def _with_scores(labels, vectors, scores):
        # Keep each label paired with its widened feature vector.
        return [(label, np.hstack((vector, scores[label[0]])))
                for label, vector in zip(labels, vectors)]

    modified_training_set = _with_scores(y_train, x_train, training_scores)
    modified_test_set = _with_scores(y_test, x_test, test_scores)
    return modified_test_set, modified_training_set
コード例 #6
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def do_classification(X, Y, train_index, test_index, kde_as_priors=False, kde_as_features=False, add_features=False):
    """Run the hierarchical classification for one train/test split.

    A top-level classifier (``classify_top_level``) first predicts a coarse
    group for each test sample; the sample is then handed to the matching
    second-level classifier:

    * prediction 0 -> classifier trained on labels 1 and 2 ("home"),
    * prediction 1 -> classifier trained on labels 3 and 5 ("work"),
    * anything else -> ``classify_other`` trained on labels 4, 6-10.

    Args:
        X: feature container indexable by ``train_index`` / ``test_index``.
        Y: label container indexed the same way; each label ``y`` is read
            as ``y[1]`` for the class label.
        train_index: index selecting the training rows.
        test_index: index selecting the test rows.
        kde_as_priors: when true, ``priors_with_kde`` output mapped through
            ``TOP_LEVEL_MAPPING`` is passed to the top-level classifier, and
            ``classify_other`` is told to use priors as well.
        kde_as_features: when true, features are extended with
            ``extend_features(..., 'label')`` before any classification.
        add_features: forwarded to ``classify_other``.

    Returns:
        ``(tla, overall_accuracy, answers)``: the value of
        ``top_level_accuracy`` for the coarse predictions, the fraction of
        correct final predictions, and the concatenated answers of the
        home, work and other classifiers (in that order).

    Raises:
        ZeroDivisionError: if no second-level prediction was made at all.
    """
    answers = []
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    priors_top_level = None

    if kde_as_priors:
        priors = priors_with_kde(y_test, y_train)
        priors_top_level = [TOP_LEVEL_MAPPING[y] for y in priors]

    if kde_as_features:
        X_test, X_train = extend_features(X_train, y_train, X_test, y_test, 'label')

    # Materialize the pairs: both collections are iterated several times and
    # ``test_set`` is indexed below.  A bare ``zip`` is a one-shot,
    # non-indexable iterator on Python 3.
    training_dataset = list(zip(y_train, X_train))
    test_set = list(zip(y_test, X_test))

    X_train = [x for (y, x) in training_dataset]
    X_test = [x for (y, x) in test_set]
    home_training_dataset = [(y, x) for (y, x) in training_dataset if y[1] in (1, 2)]
    work_training_dataset = [(y, x) for (y, x) in training_dataset if y[1] in (3, 5)]
    other_training_dataset = [(y, x) for (y, x) in training_dataset if y[1] in (4, 6, 7, 8, 9, 10)]

    y_train_top_level = [TOP_LEVEL_MAPPING[y[1]] for y in y_train]
    top_level_predictions = classify_top_level(X_train, y_train_top_level, X_test, priors_top_level)
    tla = top_level_accuracy(top_level_predictions, test_set)

    # Route every test sample to the classifier of its predicted group.
    home_input = []
    work_input = []
    other_input = []
    for index, pred in enumerate(top_level_predictions):
        if pred == 0:
            home_input.append(test_set[index])
        elif pred == 1:
            work_input.append(test_set[index])
        else:
            other_input.append(test_set[index])
    logging.debug((len(home_input), len(work_input), len(other_input)))

    h_n, h_d, home_answers = train_classifier_and_predict(home_training_dataset, home_input, use_priors=False)
    w_n, w_d, work_answers = train_classifier_and_predict(work_training_dataset, work_input, use_priors=False)
    o_n, o_d, other_answers = classify_other(other_training_dataset, other_input, use_priors=kde_as_priors, add_features=add_features)

    # float() forces true division on Python 2 as well.
    overall_accuracy = float(h_n + w_n + o_n) / float(h_d + w_d + o_d)
    for a in [home_answers, work_answers, other_answers]:
        answers.extend(a)
    return tla, overall_accuracy, answers
コード例 #7
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def extend_features(x_train,
                    y_train,
                    x_test,
                    y_test,
                    attribute,
                    method='dbscan'):
    """Widen each feature vector with its score from ``priors_with_kde``.

    ``priors_with_kde`` (called with ``return_predictions=False``) yields a
    training score lookup and a test score lookup for ``attribute``.  For
    every sample the entry stored under ``y[0]`` is appended to its feature
    vector with ``np.hstack``.

    Args:
        x_train: training feature vectors, aligned with ``y_train``.
        y_train: training labels; ``y[0]`` keys the training score lookup.
        x_test: test feature vectors, aligned with ``y_test``.
        y_test: test labels; ``y[0]`` keys the test score lookup.
        attribute: forwarded to ``priors_with_kde``.
        method: accepted but not used by this function.

    Returns:
        ``(extended_test, extended_train)``: two lists of arrays, with the
        test list first.
    """
    training_scores, test_scores = priors_with_kde(
        y_test, y_train, return_predictions=False, attribute=attribute)

    modified_training_set = [
        np.hstack((features, training_scores[label[0]]))
        for label, features in zip(y_train, x_train)
    ]
    modified_test_set = [
        np.hstack((features, test_scores[label[0]]))
        for label, features in zip(y_test, x_test)
    ]
    return modified_test_set, modified_training_set
コード例 #8
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def train_classifier_and_predict(training, test, use_priors=False, class_weight=None):
    """Fit the best estimator on ``training`` and score it on ``test``.

    Args:
        training: sequence of ``(y, x)`` pairs; ``y[1]`` is the class label.
        test: sequence of ``(y, x)`` pairs; ``y[0]`` is reported back in the
            answers (presumably a place identifier -- confirm against the
            caller) and ``y[1]`` is the true label.
        use_priors: when true, ``priors_with_kde`` output is passed to
            ``get_best_estimator``; otherwise ``None`` is passed.
        class_weight: accepted for interface compatibility; not used here.

    Returns:
        ``(correct, total, answers)`` where ``answers`` is a list of
        ``(y[0], predicted_label, true_label)`` tuples.  For an empty
        ``test`` this is ``(0, 0, [])``.
    """
    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    # The priors need the full label tuples, so compute them before the
    # labels are reduced to their ``[1]`` component below.
    priors = priors_with_kde(y_test, y_train) if use_priors else None

    places = [y[0] for y in y_test]
    y_test = [y[1] for y in y_test]
    y_train = [y[1] for y in y_train]

    clf = get_best_estimator(x_train, y_train, x_test, priors)
    logging.debug(clf)
    predictions = clf.predict(x_test)

    # Always return a real list: the empty path above returns ``[]`` and a
    # bare ``zip`` would be a one-shot iterator on Python 3.
    answers = list(zip(places, predictions, y_test))
    matches = [predicted == actual
               for predicted, actual in zip(predictions, y_test)]

    return matches.count(True), len(matches), answers
コード例 #9
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def train_classifier_and_predict(training,
                                 test,
                                 use_priors=False,
                                 class_weight=None):
    """Train on ``training``, predict ``test`` and report the hit count.

    Args:
        training: sequence of ``(y, x)`` pairs; ``y[1]`` is the class label.
        test: sequence of ``(y, x)`` pairs; ``y[0]`` is echoed in the
            answers (presumably a place identifier -- confirm against the
            caller) and ``y[1]`` is the true label.
        use_priors: when true, the result of ``priors_with_kde`` is handed
            to ``get_best_estimator``; otherwise ``None`` is.
        class_weight: accepted for interface compatibility; not used here.

    Returns:
        ``(correct, total, answers)``; ``answers`` is a list of
        ``(y[0], predicted_label, true_label)`` tuples.  An empty ``test``
        yields ``(0, 0, [])``.
    """
    if len(test) == 0:
        return 0, len(test), []

    y_train, x_train = zip(*training)
    y_test, x_test = zip(*test)

    # Priors are derived from the complete label tuples, so this must run
    # before the labels are narrowed to ``y[1]``.
    if use_priors:
        priors = priors_with_kde(y_test, y_train)
    else:
        priors = None

    places = [y[0] for y in y_test]
    y_test = [y[1] for y in y_test]
    y_train = [y[1] for y in y_train]

    clf = get_best_estimator(x_train, y_train, x_test, priors)
    logging.debug(clf)
    predictions = clf.predict(x_test)

    # Materialize so the return type matches the empty path (a list) and
    # the answers can be iterated more than once on Python 3.
    answers = list(zip(places, predictions, y_test))
    result = [predicted == expected
              for predicted, expected in zip(predictions, y_test)]

    return result.count(True), len(result), answers
コード例 #10
0
ファイル: classifier.py プロジェクト: siddharthsarda/spams
def do_classification(X,
                      Y,
                      train_index,
                      test_index,
                      kde_as_priors=False,
                      kde_as_features=False,
                      add_features=False):
    """Classify one train/test split with the two-level label hierarchy.

    ``classify_top_level`` predicts a coarse group for every test sample,
    and the sample is then passed to the classifier for that group:

    * prediction 0 -> classifier trained on labels 1 and 2 ("home"),
    * prediction 1 -> classifier trained on labels 3 and 5 ("work"),
    * anything else -> ``classify_other`` trained on labels 4, 6-10.

    Args:
        X: feature container indexable by ``train_index`` / ``test_index``.
        Y: label container indexed the same way; ``y[1]`` of each entry is
            the class label.
        train_index: index selecting the training rows.
        test_index: index selecting the test rows.
        kde_as_priors: when true, ``priors_with_kde`` output mapped through
            ``TOP_LEVEL_MAPPING`` is given to the top-level classifier and
            ``classify_other`` is asked to use priors too.
        kde_as_features: when true, features are extended with
            ``extend_features(..., 'label')`` before classification.
        add_features: forwarded to ``classify_other``.

    Returns:
        ``(tla, overall_accuracy, answers)``: the ``top_level_accuracy``
        value for the coarse predictions, the fraction of correct final
        predictions, and the answers of the home, work and other
        classifiers concatenated in that order.

    Raises:
        ZeroDivisionError: if no second-level prediction was made at all.
    """
    answers = []
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    priors_top_level = None

    if kde_as_priors:
        priors = priors_with_kde(y_test, y_train)
        priors_top_level = [TOP_LEVEL_MAPPING[y] for y in priors]

    if kde_as_features:
        X_test, X_train = extend_features(X_train, y_train, X_test, y_test,
                                          'label')

    # Build real lists: the pairs are traversed repeatedly and ``test_set``
    # is indexed further down, neither of which a Python 3 ``zip`` iterator
    # supports.
    test_set = list(zip(y_test, X_test))
    training_dataset = list(zip(y_train, X_train))

    X_train = [x for (y, x) in training_dataset]
    X_test = [x for (y, x) in test_set]
    home_training_dataset = [(y, x) for (y, x) in training_dataset
                             if y[1] in (1, 2)]
    work_training_dataset = [(y, x) for (y, x) in training_dataset
                             if y[1] in (3, 5)]
    other_training_dataset = [(y, x) for (y, x) in training_dataset
                              if y[1] in (4, 6, 7, 8, 9, 10)]

    y_train_top_level = [TOP_LEVEL_MAPPING[y[1]] for y in y_train]
    top_level_predictions = classify_top_level(X_train, y_train_top_level,
                                               X_test, priors_top_level)
    tla = top_level_accuracy(top_level_predictions, test_set)

    # Distribute the test samples according to the coarse prediction.
    home_input = []
    work_input = []
    other_input = []
    for index, pred in enumerate(top_level_predictions):
        if pred == 0:
            home_input.append(test_set[index])
        elif pred == 1:
            work_input.append(test_set[index])
        else:
            other_input.append(test_set[index])
    logging.debug((len(home_input), len(work_input), len(other_input)))

    h_n, h_d, home_answers = train_classifier_and_predict(
        home_training_dataset, home_input, use_priors=False)
    w_n, w_d, work_answers = train_classifier_and_predict(
        work_training_dataset, work_input, use_priors=False)
    o_n, o_d, other_answers = classify_other(other_training_dataset,
                                             other_input,
                                             use_priors=kde_as_priors,
                                             add_features=add_features)

    # float() forces true division on Python 2 as well.
    overall_accuracy = float(h_n + w_n + o_n) / float(h_d + w_d + o_d)
    for a in [home_answers, work_answers, other_answers]:
        answers.extend(a)
    return tla, overall_accuracy, answers