def _update_models_and_counts_(X_pool, y_pool, X_test, y_test, X_train, y_train, feature_expert, instance_model, rmw_n, rmw_a, reasoning_model, docs,
                                discovered_features, discovered_class0_features, discovered_class1_features,
                                covered_docs, num_a_feat_chosen, instance_model_scores, reasoning_model_scores,
                                num_training_samples, discovered_feature_counts, num_docs_covered):
    """Refit the models on newly labeled documents and extend the running statistics.

    The instance model is refit on ``(X_train, y_train)``.  For every id in
    ``docs`` the feature expert is asked for a feature, the reasoning model is
    updated with ``partial_fit`` and the bookkeeping containers are updated.
    Finally both models are evaluated on ``(X_test, y_test)`` and one entry is
    appended to each history container.

    Nothing is returned; all results are written into the arguments.

    Parameters that are only read:
        X_pool: sparse matrix indexed by document id (must offer ``tocsc()``).
        y_pool: labels indexed by document id; a label equal to 0 is booked
            as class 0, any other label as class 1.
        X_test, y_test: data passed to ``evaluate_model``.
        X_train, y_train: data passed to ``instance_model.fit``.
        feature_expert: object with ``any_informative_feature(x, label)``.
            NOTE(review): assumed to return a column index or None -- the
            value is used directly as a column index below.
        rmw_n, rmw_a: forwarded unchanged to ``reasoning_model.partial_fit``.
        docs: iterable of document ids to process.

    Parameters that are mutated:
        instance_model, reasoning_model: refit / incrementally updated.
        discovered_features, discovered_class0_features,
        discovered_class1_features: sets that receive the returned features.
        covered_docs: set that receives the row indices stored in the
            returned feature's column of ``X_pool``.
        num_a_feat_chosen: per-feature counter, incremented at the feature.
        instance_model_scores, reasoning_model_scores: dicts with 'auc' and
            'accu' lists; one value is appended to each list.
        num_training_samples: list; receives ``X_train.shape[0]``.
        discovered_feature_counts: dict with 'class0' and 'class1' lists;
            receives the current sizes of the per-class feature sets.
        num_docs_covered: list; receives ``len(covered_docs)``.
    """
    # Train instance model
    instance_model.fit(X_train, y_train)

    # Column-oriented copy so the rows stored for one feature can be read
    # straight from ``.indices``.
    X_pool_csc = X_pool.tocsc()

    for doc_id in docs:
        x_doc = X_pool[doc_id]
        label = y_pool[doc_id]
        feature = feature_expert.any_informative_feature(x_doc, label)

        # Compare against None explicitly: feature index 0 is a valid column
        # but is falsy, so a plain truthiness test would silently drop it.
        has_feature = feature is not None

        if has_feature:
            discovered_features.add(feature)
            if label == 0:
                discovered_class0_features.add(feature)
            else:
                discovered_class1_features.add(feature)

        # The reasoning model is updated for every document, with or without
        # a feature.
        reasoning_model.partial_fit(x_doc, label, feature, rmw_n, rmw_a)

        if has_feature:
            # docs covered by this feature
            covered_docs.update(X_pool_csc[:, feature].indices)
            # number of times a feature is chosen as a reason
            num_a_feat_chosen[feature] += 1

    for model, history in ((instance_model, instance_model_scores),
                           (reasoning_model, reasoning_model_scores)):
        (accu, auc) = evaluate_model(model, X_test, y_test)
        history['auc'].append(auc)
        history['accu'].append(accu)

    num_training_samples.append(X_train.shape[0])

    # discovered feature counts
    discovered_feature_counts['class0'].append(len(discovered_class0_features))
    discovered_feature_counts['class1'].append(len(discovered_class1_features))

    num_docs_covered.append(len(covered_docs))
def learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert,
          selection_strategy, disagree_strat, coverage, budget, instance_model, feature_model,
          pooling_model, reasoning_model, rmw_n, rmw_a, seed=0, Debug=False,
          reasoning_strategy='random', switch=40):
    """Run the active-learning loop and record evaluation curves for four models.

    The documents listed in ``training_set`` (if any) bootstrap the models.
    Then, for at most ``budget`` iterations, the selection strategy picks one
    document id from ``pool_set``; the id is moved to ``training_set``, the
    feature expert is asked for a feature for that document, all models are
    updated and every model is evaluated on ``(X_test, y_test)``.  The loop
    stops early when the strategy returns None.

    Args:
        X_pool: sparse matrix indexed by document id (must offer ``tocsc()``).
        y_pool: labels indexed by document id; a label equal to 0 is booked
            as class 0, any other label as class 1.
        X_test, y_test: data passed to ``evaluate_model``.
        training_set: list of document ids used for bootstrapping.  Chosen
            ids are appended to it (the caller's list is mutated).
        pool_set: candidate document ids.  Chosen ids are removed from it
            (the caller's collection is mutated).
        feature_expert: object with ``rg.seed(seed)`` and
            ``any_informative_feature(x, label)``.
            NOTE(review): the latter is assumed to return a column index or
            None -- the value is used directly as a column index.
        selection_strategy: name of the document selection strategy.
        disagree_strat: forwarded to the disagreement-based strategies.
        coverage: forwarded as ``percentage`` to the 'cover_then_*' strategies.
        budget: maximum number of selection iterations.
        instance_model: refit with ``fit(X_train, y_train)`` every iteration.
        feature_model: updated with ``fit(feature, label)`` when the expert
            returned a feature.
        pooling_model: refit with ``fit(instance_model, feature_model,
            weights=[0.5, 0.5])``.
        reasoning_model: updated with ``partial_fit(x, label, feature, rmw_n,
            rmw_a)`` for every document.
        rmw_n, rmw_a: forwarded unchanged to the reasoning model (and to the
            'optauc*' strategies' ``choice``).
        seed: seed for the expert's generator and the seeded strategies.
        Debug: forwarded to the strategies that accept it.
        reasoning_strategy, switch: forwarded to
            'reasoning_then_featureCertainty'.

    Returns:
        A 9-tuple ``(num_training_samples, instance_model_scores,
        feature_model_scores, pooling_model_scores, reasoning_model_scores,
        discovered_feature_counts, num_docs_covered, transition,
        num_a_feat_chosen)``.  The ``*_scores`` values are dicts with 'auc'
        and 'accu' lists; ``transition`` is ``doc_pick_model.transition`` for
        'cover_then_*' strategies and None otherwise.

    Raises:
        ValueError: if ``selection_strategy`` is unknown, or if an
            'uncertainty*' / 'disagreement' strategy is requested with an
            empty ``training_set``.
    """
    start = time()
    # A parenthesised single argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('-' * 50)
    print('Starting Active Learning...')

    instance_model_scores = {'auc': [], 'accu': []}
    feature_model_scores = {'auc': [], 'accu': []}
    pooling_model_scores = {'auc': [], 'accu': []}
    reasoning_model_scores = {'auc': [], 'accu': []}

    discovered_feature_counts = {'class0': [], 'class1': []}
    num_docs_covered = []
    covered_docs = set()
    # Column-oriented copy so the rows stored for one feature can be read
    # straight from ``.indices``.
    X_pool_csc = X_pool.tocsc()

    num_samples = len(pool_set) + len(training_set)
    num_feat = X_pool.shape[1]
    num_a_feat_chosen = np.zeros(num_feat)

    discovered_features = set()
    discovered_class0_features = set()
    discovered_class1_features = set()

    feature_expert.rg.seed(seed)

    # Strategy families that differ only in one argument.
    uncertainty_models = {
        'uncertaintyIM': instance_model,
        'uncertaintyFM': feature_model,
        'uncertaintyPM': pooling_model,
        'uncertaintyRM': reasoning_model,
    }
    covering_types = {'covering': 'unknown', 'covering_fewest': 'fewest'}
    cover_then_uncertainty_models = {
        'cover_then_uncertaintyPM': pooling_model,
        'cover_then_uncertaintyRM': reasoning_model,
    }
    optauc_targets = {'optaucP': 'P', 'optaucI': 'I',
                      'optaucF': 'F', 'optaucR': 'R'}

    # The strategy classes are looked up only inside the selected branch, so
    # just the class that is actually requested has to be available.
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy in uncertainty_models:
        doc_pick_model = UNCSampling(uncertainty_models[selection_strategy],
                                     feature_expert, y_pool, Debug)
    elif selection_strategy == 'disagreement':
        doc_pick_model = DisagreementStrategy(
            instance_model, feature_model, feature_expert, y_pool,
            disagree_strat, Debug=Debug)
    elif selection_strategy in covering_types:
        doc_pick_model = CoveringStrategy(
            feature_expert, num_samples, y_pool,
            type=covering_types[selection_strategy], seed=seed, Debug=Debug)
    elif selection_strategy == 'cheating':
        doc_pick_model = CheatingApproach(
            feature_expert, num_samples, y_pool, seed=seed, Debug=Debug)
    elif selection_strategy == 'cover_then_disagree':
        doc_pick_model = CoveringThenDisagreement(
            feature_expert, instance_model, feature_model, num_samples,
            percentage=coverage, y=y_pool, type='unknown',
            metric=disagree_strat, seed=seed, Debug=Debug)
    elif selection_strategy in cover_then_uncertainty_models:
        doc_pick_model = CoverThenUncertainty(
            feature_expert, cover_then_uncertainty_models[selection_strategy],
            num_samples, percentage=coverage, y=y_pool, type='unknown',
            seed=seed, Debug=Debug)
    elif selection_strategy == 'cover_then_featureCertainty':
        doc_pick_model = CoverThenFeatureCertainty(
            feature_expert, feature_model, num_samples, percentage=coverage,
            y=y_pool, type='unknown', seed=seed, Debug=Debug)
    elif selection_strategy in optauc_targets:
        doc_pick_model = OptimizeAUC(
            X_test, y_test, feature_expert,
            optimize=optauc_targets[selection_strategy], seed=seed,
            Debug=Debug)
    elif selection_strategy == 'reasoning_then_featureCertainty':
        doc_pick_model = ReasoningThenFeatureCertainty(
            feature_expert, instance_model, feature_model, switch=switch,
            reasoning_strategy=reasoning_strategy, y=y_pool, type='unknown',
            seed=seed, Debug=Debug)
    elif selection_strategy == "unc_insuff_R":
        doc_pick_model = UNCForInsufficientReason(reasoning_model)
    elif selection_strategy == "unc_no_conflict_R":
        doc_pick_model = UNCWithNoConflict(reasoning_model)
    elif selection_strategy == "unc_prefer_no_conflict_R":
        doc_pick_model = UNCPreferNoConflict(reasoning_model)
    elif selection_strategy == "unc_prefer_conflict_R":
        doc_pick_model = UNCPreferConflict(reasoning_model)
    elif selection_strategy == "unc_three_types_R":
        doc_pick_model = UNCThreeTypes(reasoning_model)
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)

    is_cover_then = selection_strategy.startswith('cover_then')

    def record_feature(feature, label):
        # Book-keep one feature returned by the expert.
        discovered_features.add(feature)
        if label == 0:
            discovered_class0_features.add(feature)
        else:
            discovered_class1_features.add(feature)
        # docs covered by this feature
        covered_docs.update(X_pool_csc[:, feature].indices)
        # number of times a feature is chosen as a reason
        num_a_feat_chosen[feature] += 1

    def notify_strategy(doc, feature, label):
        # Tell the strategies that track annotations about the labeled doc.
        if selection_strategy in ('covering', 'covering_fewest'):
            doc_pick_model.update(X_pool, feature, doc)
        elif selection_strategy == 'cheating':
            doc_pick_model.update(X_pool, feature, label)
        elif is_cover_then and doc_pick_model.phase == 'covering':
            doc_pick_model.covering.update(X_pool, feature, doc)

    def snapshot_metrics():
        # Append one evaluation point to every history container.
        for model, history in ((instance_model, instance_model_scores),
                               (feature_model, feature_model_scores),
                               (pooling_model, pooling_model_scores),
                               (reasoning_model, reasoning_model_scores)):
            (accu, auc) = evaluate_model(model, X_test, y_test)
            history['auc'].append(auc)
            history['accu'].append(accu)

        # discovered feature counts (nothing is appended for any other
        # feature model type)
        if isinstance(feature_model, FeatureMNBUniform):
            discovered_feature_counts['class0'].append(
                len(feature_model.class0_features))
            discovered_feature_counts['class1'].append(
                len(feature_model.class1_features))
        elif isinstance(feature_model, FeatureMNBWeighted):
            nz = np.sum(feature_model.feature_count_ > 0, axis=1)
            discovered_feature_counts['class0'].append(nz[0])
            discovered_feature_counts['class1'].append(nz[1])

        num_docs_covered.append(len(covered_docs))

    bootstrap_size = len(training_set)
    training_set_empty = (bootstrap_size == 0)

    if not training_set_empty:
        X_train = X_pool[training_set]
        y_train = y_pool[training_set]

        # Train all models using the bootstrap data
        instance_model.fit(X_train, y_train)

        for doc in training_set:
            label = y_pool[doc]
            feature = feature_expert.any_informative_feature(
                X_pool[doc], label)

            # Compare against None explicitly: feature index 0 is a valid
            # column but is falsy, so a truthiness test would drop it.
            if feature is not None:
                feature_model.fit(feature, label)  # one feature at a time
                record_feature(feature, label)

            reasoning_model.partial_fit(X_pool[doc], label, feature, rmw_n,
                                        rmw_a)

            notify_strategy(doc, feature, label)

        pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])
        snapshot_metrics()
    elif selection_strategy.startswith(
            'uncertainty') or selection_strategy == 'disagreement':
        raise ValueError('\'%s\' requires bootstrapping!' %
                         selection_strategy)

    for i in range(budget):
        train_set_size = len(training_set)

        # Choose a document based on the strategy chosen
        if is_cover_then:
            doc_id = doc_pick_model.choice(X_pool, i + 1, pool_set)
        elif selection_strategy.startswith('optauc'):
            doc_id = doc_pick_model.choice(X_pool, y_pool, pool_set,
                                           training_set, feature_model,
                                           reasoning_model, rmw_n, rmw_a)
        elif selection_strategy == 'reasoning_then_featureCertainty':
            doc_id = doc_pick_model.choice(X_pool, i + 1, pool_set,
                                           train_set_size)
        elif selection_strategy == "unc_insuff_R":
            doc_id = doc_pick_model.choice(X_pool, pool_set,
                                           discovered_features,
                                           max_num_feats=1)
        elif selection_strategy == "unc_no_conflict_R":
            doc_id = doc_pick_model.choice(X_pool, pool_set,
                                           discovered_class0_features,
                                           discovered_class1_features)
        elif selection_strategy in ("unc_prefer_no_conflict_R",
                                    "unc_prefer_conflict_R",
                                    "unc_three_types_R"):
            doc_id = doc_pick_model.choice(X_pool, pool_set,
                                           discovered_class0_features,
                                           discovered_class1_features,
                                           top_k=10)
        else:
            doc_id = doc_pick_model.choice(X_pool, pool_set)

        # The strategy signals "nothing left to pick" with None.
        if doc_id is None:
            break

        # Remove the chosen document from pool and add it to the training set
        pool_set.remove(doc_id)
        training_set.append(doc_id)

        if i == 0 and training_set_empty:
            X_train = X_pool[doc_id]
            y_train = np.array([y_pool[doc_id]])
        else:
            X_train = sp.vstack((X_train, X_pool[doc_id]))
            y_train = np.hstack((y_train, np.array([y_pool[doc_id]])))

        # Ask the expert for the instance label (the true label of the dataset)
        label = y_pool[doc_id]

        # Ask the expert for a feature given the label
        feature = feature_expert.any_informative_feature(X_pool[doc_id], label)

        # Update the instance model
        instance_model.fit(X_train, y_train)

        # Update the feature model and the bookkeeping; see the bootstrap
        # loop for why the test is "is not None".
        if feature is not None:
            feature_model.fit(feature, label)
            record_feature(feature, label)

        reasoning_model.partial_fit(X_pool[doc_id], label, feature, rmw_n,
                                    rmw_a)

        # Update the pooling model
        pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])

        notify_strategy(doc_id, feature, label)

        # Evaluate all models after this iteration
        snapshot_metrics()

    transition = doc_pick_model.transition if is_cover_then else None

    # compute the # of training samples for plot: the first recorded point
    # corresponds to the bootstrap set, or to one document without bootstrap
    first_size = 1 if training_set_empty else bootstrap_size
    num_training_samples = np.arange(len(
        instance_model_scores['accu'])) + first_size

    print('Active Learning took %2.2fs' % (time() - start))

    return (num_training_samples, instance_model_scores, feature_model_scores,
            pooling_model_scores, reasoning_model_scores,
            discovered_feature_counts, num_docs_covered, transition,
            num_a_feat_chosen)
# Example #3
# 0
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, smoothing=0, poolingMNBWeights=[0.5, 0.5], poolingFM_r=100.0, lr_C=1, svm_C=1, \
          zaidan_C=1, zaidan_Ccontrast=1, zaidan_nu=1, cvTrain=False, Debug=False):

    start = time()
    print '-' * 50
    print 'Starting Active Learning...'

    _, num_feat = X_pool.shape
    model_scores = {
        'auc': [],
        'accu': [],
        'wr': [],
        'wo': [],
        'alpha': [],
        'svm_C': [],
        'zaidan_C': [],
        'zaidan_Ccontrast': [],
        'zaidan_nu': [],
        'FMrvalue': [],
        'IMweight': [],
        'FMweight': []
    }

    rationales = set()
    rationales_c0 = set()
    rationales_c1 = set()

    number_of_docs = 0

    feature_expert.rg.seed(seed)

    num_training_samples = []

    all_features = []

    # keep all the training data instance ids in docs list

    docs = training_set

    X_train = None
    y_train = []
    sample_weight = []

    for doc_id in docs:
        #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
        feature = feature_expert.any_informative_feature(
            X_pool[doc_id], y_pool[doc_id])

        number_of_docs = number_of_docs + 1

        # append feature to all_features, even if it is None
        all_features.append(feature)

        if feature is not None:
            rationales.add(feature)

            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

    if cvTrain:
        # get optimal parameters depending on the model_type

        if model_type == 'mnb_LwoR':
            w_r, w_o = optimalMNBLwoRParameters(X_pool[training_set],
                                                y_pool[training_set],
                                                all_features)

            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        elif model_type == 'mnb':
            w_r, w_o = optimalMNBParameters(X_pool[training_set],
                                            y_pool[training_set], all_features)

            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        elif model_type == 'svm_linear':
            w_r, w_o, C = optimalSVMParameters(X_pool[training_set],
                                               y_pool[training_set],
                                               all_features, seed)

            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        elif model_type == 'svm_linear_LwoR':

            w_r, w_o, C = optimalSVMLwoRParameters(X_pool[training_set],
                                                   y_pool[training_set],
                                                   all_features, seed)
            feature_counter = 0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0, f] = w_r * x[0, f]
                    else:
                        x[0, f] = w_o * x[0, f]
                feature_counter = feature_counter + 1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        if model_type == 'poolingMNB':
            classpriors = np.zeros(2)
            classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
            classpriors[0] = 1. - classpriors[1]

            alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(
                X_pool[training_set], y_pool[training_set], all_features,
                smoothing, num_feat)

            feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                              num_feat, smoothing, classpriors,
                                              poolingFM_r)

            feature_counter = 0
            for doc_id in docs:
                if all_features[feature_counter]:
                    # updates feature model with features one at a time
                    feature_model.fit(all_features[feature_counter],
                                      y_pool[doc_id])
                feature_counter = feature_counter + 1

                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))

                y_train.append(y_pool[doc_id])

        if model_type == 'Zaidan':

            zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(
                X_pool[training_set], y_pool[training_set], all_features, seed)

            feature_counter = 0

            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if all_features[feature_counter] is not None:
                    x_pseudo = (X_pool[doc_id]).todense()

                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices

                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            test = x[0, f]
                            x_pseudo[0, f] = x[0, f] / zaidan_nu
                        else:
                            x_pseudo[0, f] = 0.0
                    x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)

                if not y_train:
                    X_train = x
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
                else:
                    X_train = sp.vstack((X_train, x))
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))

                y_train.append(y_pool[doc_id])
                if all_features[feature_counter] is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])

                sample_weight.append(zaidan_C)
                if all_features[feature_counter] is not None:
                    # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                    sample_weight.append(zaidan_Ccontrast)

                feature_counter = feature_counter + 1

    # Train the model

    if model_type == 'lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C,
                                   penalty='l2',
                                   random_state=random_state)
    elif model_type == 'lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C,
                                   penalty='l1',
                                   random_state=random_state)
    elif model_type == 'mnb':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'mnb_LwoR':
        model = MultinomialNB(alpha=alpha)
    elif model_type == 'svm_linear_LwoR':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type == 'poolingMNB':
        instance_model = MultinomialNB(alpha=alpha)
        model = PoolingMNB()
    elif model_type == 'Zaidan':
        random_state = np.random.RandomState(seed=seed)
        model = svm.SVC(kernel='linear', C=1.0, random_state=random_state)

    if model_type == 'poolingMNB':
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model,
                  weights=poolingMNBWeights)  # train pooling_model
    elif model_type == 'Zaidan':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))

    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    if model_type == 'poolingMNB':
        model_scores['alpha'].append(alpha)
        model_scores['FMrvalue'].append(poolingFM_r)
        model_scores['IMweight'].append(poolingMNBWeights[0])
        model_scores['FMweight'].append(poolingMNBWeights[1])
    else:
        model_scores['FMrvalue'].append(0.0)
        model_scores['IMweight'].append(0.0)
        model_scores['FMweight'].append(0.0)

    if model_type == 'Zaidan':
        model_scores['zaidan_C'].append(zaidan_C)
        model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
        model_scores['zaidan_nu'].append(zaidan_nu)
    else:
        model_scores['zaidan_C'].append(0.0)
        model_scores['zaidan_Ccontrast'].append(0.0)
        model_scores['zaidan_nu'].append(0.0)

    if model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['alpha'].append(alpha)
    else:
        model_scores['alpha'].append(0.0)

    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
        model_scores['svm_C'].append(C)
    else:
        model_scores['svm_C'].append(0.0)

    if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
        model_scores['wr'].append(w_r)
        model_scores['wo'].append(w_o)
    else:
        model_scores['wr'].append(0.0)
        model_scores['wo'].append(0.0)

    num_training_samples.append(number_of_docs)

    feature_expert.rg.seed(seed)

    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'unc':
        doc_pick_model = UNCSampling()
    elif selection_strategy == "pnc":
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == "pnr":
        doc_pick_model = UNCPreferNoRationale()
    elif selection_strategy == "pr":
        doc_pick_model = UNCPreferRationale()
    elif selection_strategy == "pc":
        doc_pick_model = UNCPreferConflict()
    elif selection_strategy == "tt":
        doc_pick_model = UNCThreeTypes()
    elif selection_strategy == "pipe":
        doc_pick_model = Pipe([UNCSampling(), UNCPreferConflict()], [10, 30])
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)

    k = step_size

    #while X_train.shape[0] < budget:
    while number_of_docs < budget:

        # Choose a document based on the strategy chosen
        if selection_strategy == "pnc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pnr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "tt":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        elif selection_strategy == "pipe":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(
                X_pool[doc_id], y_pool[doc_id])

            all_features.append(feature)

            number_of_docs = number_of_docs + 1

            if feature is not None:
                rationales.add(feature)

                if y_pool[doc_id] == 0:
                    rationales_c0.add(feature)
                else:
                    rationales_c1.add(feature)

            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(long(doc_id))

        if cvTrain:
            # get optimal parameters depending on the model_type

            X_train = None
            y_train = []
            sample_weight = []

            if model_type == 'mnb_LwoR':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o = optimalMNBLwoRParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            elif model_type == 'mnb':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o = optimalMNBParameters(X_pool[training_set],
                                                    y_pool[training_set],
                                                    all_features)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]
                    feature_counter = feature_counter + 1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            elif model_type == 'svm_linear_LwoR':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o, C = optimalSVMLwoRParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features, seed)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]

                    feature_counter = feature_counter + 1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            elif model_type == 'svm_linear':
                if np.mod(number_of_docs, 20) == 10:
                    w_r, w_o, C = optimalSVMParameters(X_pool[training_set],
                                                       y_pool[training_set],
                                                       all_features, seed)

                feature_counter = 0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0, f] = w_r * x[0, f]
                        else:
                            x[0, f] = w_o * x[0, f]

                    feature_counter = feature_counter + 1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            if model_type == 'poolingMNB':
                classpriors = np.zeros(2)
                classpriors[1] = (np.sum(y_pool[docs]) * 1.) / (len(docs) * 1.)
                classpriors[0] = 1. - classpriors[1]

                if np.mod(number_of_docs, 20) == 10:
                    alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features, smoothing, num_feat)

                feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                                  num_feat, smoothing,
                                                  classpriors, poolingFM_r)

                feature_counter = 0
                for doc_id in training_set:
                    if all_features[feature_counter]:
                        # updates feature model with features one at a time
                        feature_model.fit(all_features[feature_counter],
                                          y_pool[doc_id])
                    feature_counter = feature_counter + 1

                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))

                    y_train.append(y_pool[doc_id])

            if model_type == 'Zaidan':

                if np.mod(number_of_docs, 20) == 10:
                    zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(
                        X_pool[training_set], y_pool[training_set],
                        all_features, seed)

                feature_counter = 0

                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    if all_features[feature_counter] is not None:
                        x_pseudo = (X_pool[doc_id]).todense()

                        # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                        x_feats = x[0].indices

                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                test = x[0, f]
                                x_pseudo[0, f] = x[0, f] / zaidan_nu
                            else:
                                x_pseudo[0, f] = 0.0

                        x_pseudo = sp.csr_matrix(x_pseudo, dtype=np.float64)

                    if not y_train:
                        X_train = x
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))
                    else:
                        X_train = sp.vstack((X_train, x))
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))

                    y_train.append(y_pool[doc_id])
                    if all_features[feature_counter] is not None:
                        # append y label again for the pseudoinstance created
                        y_train.append(y_pool[doc_id])

                    sample_weight.append(zaidan_C)
                    if all_features[feature_counter] is not None:
                        # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                        sample_weight.append(zaidan_Ccontrast)

                    feature_counter = feature_counter + 1

        # Train the model

        if model_type == 'lrl2':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C,
                                       penalty='l2',
                                       random_state=random_state)
        elif model_type == 'lrl1':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C,
                                       penalty='l1',
                                       random_state=random_state)
        elif model_type == 'mnb':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'mnb_LwoR':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear_LwoR':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type == 'svm_linear':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type == 'poolingMNB':
            instance_model = MultinomialNB(alpha=alpha)
            model = PoolingMNB()
        elif model_type == 'Zaidan':
            random_state = np.random.RandomState(seed=seed)
            model = svm.SVC(kernel='linear',
                            C=svm_C,
                            random_state=random_state)

        if model_type == 'poolingMNB':
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model,
                      weights=poolingMNBWeights)  # train pooling_model
        elif model_type == 'Zaidan':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))

        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        if model_type == 'poolingMNB':
            model_scores['alpha'].append(alpha)
            model_scores['FMrvalue'].append(poolingFM_r)
            model_scores['IMweight'].append(poolingMNBWeights[0])
            model_scores['FMweight'].append(poolingMNBWeights[1])
        else:
            model_scores['FMrvalue'].append(0.0)
            model_scores['IMweight'].append(0.0)
            model_scores['FMweight'].append(0.0)

        if model_type == 'Zaidan':
            model_scores['zaidan_C'].append(zaidan_C)
            model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
            model_scores['zaidan_nu'].append(zaidan_nu)
        else:
            model_scores['zaidan_C'].append(0.0)
            model_scores['zaidan_Ccontrast'].append(0.0)
            model_scores['zaidan_nu'].append(0.0)

        if model_type == 'mnb' or model_type == 'mnb_LwoR':
            model_scores['alpha'].append(alpha)
        else:
            model_scores['alpha'].append(0.0)

        if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR':
            model_scores['svm_C'].append(C)
        else:
            model_scores['svm_C'].append(0.0)

        if model_type == 'svm_linear' or model_type == 'svm_linear_LwoR' or model_type == 'mnb' or model_type == 'mnb_LwoR':
            model_scores['wr'].append(w_r)
            model_scores['wo'].append(w_o)
        else:
            model_scores['wr'].append(0.0)
            model_scores['wo'].append(0.0)

        num_training_samples.append(number_of_docs)

    print 'Active Learning took %2.2fs' % (time() - start)

    return (np.array(num_training_samples), model_scores)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, poolingMNBWeights=[0.5, 0.5], Meville_etal_r=100.0, lr_C=1, svm_C=1, \
          Zaidan_etal_C=1, Zaidan_etal_Ccontrast=1, Zaidan_etal_mu=1, Debug=False):
    """Run active learning where each labelled document may come with a rationale feature.

    The documents in ``training_set`` are used to train a first model, which is
    scored with ``evaluate_model`` on ``(X_test, y_test)``.  Then, until
    ``budget`` documents have been labelled (or the strategy returns nothing),
    ``step_size`` documents are picked from ``pool_set``, moved to
    ``training_set``, and the model is rebuilt from scratch and scored again.

    For every labelled document ``feature_expert.any_informative_feature`` is
    asked for one rationale (a feature index, or ``None``).  How it is used
    depends on ``model_type``:

    * ``'lrl2'``, ``'lrl1'``, ``'mnb'``, ``'svm_linear'``: the stored entries of
      the document row are multiplied by ``w_r`` (rationale) or ``w_o`` (others).
    * ``'Melville_etal'``: the row is left unweighted; the rationale is fed to a
      ``FeatureMNBUniform`` model that is pooled with a ``MultinomialNB``
      instance model through ``PoolingMNB`` using ``poolingMNBWeights``.
    * ``'Zaidan_etal'``: the row is left unweighted; a pseudo-instance holding
      only the rationale value divided by ``Zaidan_etal_mu`` is added, and
      sample weights ``Zaidan_etal_C`` / ``Zaidan_etal_Ccontrast`` are passed to
      a linear ``svm.SVC``.

    ``selection_strategy`` must be one of ``'RND'``, ``'UNC'``, ``'UNC_PNC'``,
    ``'UNC_PC'``.  ``training_set`` and ``pool_set`` are modified in place.
    ``Debug`` is accepted but not used by this function.

    Returns:
        ``(np.array(num_training_samples), model_scores)`` where
        ``model_scores`` has the keys ``'auc'`` and ``'accu'`` and both hold one
        entry per training round.

    Raises:
        ValueError: if ``model_type`` or ``selection_strategy`` is not one of
            the values listed above.
    """
    start = time()
    print('-' * 50)
    print('Starting Active Learning...')

    _, num_feat = X_pool.shape
    model_scores = {'auc': [], 'accu': []}

    # Rationales gathered so far, split by the class of the document they came from.
    rationales_c0 = set()
    rationales_c1 = set()

    feature_expert.rg.seed(seed)

    num_training_samples = []
    y_train = []
    sample_weight = []

    feature_model = None
    if model_type == 'Melville_etal':
        # Class priors are estimated from the initial training documents.
        classpriors = np.zeros(2)
        classpriors[1] = (np.sum(y_pool[training_set]) * 1.) / (len(training_set) * 1.)
        classpriors[0] = 1. - classpriors[1]

        feature_model = FeatureMNBUniform(rationales_c0, rationales_c1,
                                          num_feat, classpriors,
                                          Meville_etal_r)

    def record_rationale(doc_id, feature):
        """Register a rationale; a missing rationale (None) is ignored."""
        # `is not None` rather than truthiness: feature index 0 is a valid rationale.
        if feature is None:
            return
        if model_type == 'Melville_etal':
            feature_model.fit(feature, y_pool[doc_id])
        if y_pool[doc_id] == 0:
            rationales_c0.add(feature)
        else:
            rationales_c1.add(feature)

    def make_rows(doc_id, feature):
        """Return the training rows (1 x num_feat sparse) contributed by one document."""
        x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

        if model_type == 'Zaidan_etal':
            if feature is None:
                return [x]
            # One pseudo-instance per rationale: only the rationale entry is
            # kept, scaled by 1/mu.  Built directly in float64 so the division
            # is never truncated by an integer-typed pool.
            value = x[0, feature] / Zaidan_etal_mu
            x_pseudo = sp.csr_matrix(([value], ([0], [feature])),
                                     shape=x.shape, dtype=np.float64)
            x_pseudo.eliminate_zeros()
            return [x, x_pseudo]

        if "Melville_etal" not in model_type:
            # Scale every stored entry: w_r for the rationale, w_o otherwise.
            if feature is None:
                scale = np.where(np.zeros(x.indices.shape, dtype=bool), w_r, w_o)
            else:
                scale = np.where(x.indices == feature, w_r, w_o)
            # Rebind instead of scaling in place so no buffer shared with
            # X_pool can ever be modified.
            x.data = x.data * scale
        return [x]

    def append_document(X_so_far, doc_id, feature):
        """Stack the rows of one document under X_so_far; extend labels and weights."""
        rows = make_rows(doc_id, feature)
        for row in rows:
            X_so_far = row if X_so_far is None else sp.vstack((X_so_far, row))
            # The pseudo-instance (if any) carries the document's label too.
            y_train.append(y_pool[doc_id])
        if model_type == 'Zaidan_etal':
            sample_weight.append(Zaidan_etal_C)
            if len(rows) > 1:
                sample_weight.append(Zaidan_etal_Ccontrast)
        return X_so_far

    def train_model(X):
        """Build a fresh model of the requested type and fit it on X / y_train."""
        if model_type == 'Melville_etal':
            instance_model = MultinomialNB(alpha=alpha)
            instance_model.fit(X, y_train)
            pooled = PoolingMNB()
            pooled.fit(instance_model, feature_model,
                       weights=poolingMNBWeights)  # train pooling_model
            return pooled

        if model_type == 'Zaidan_etal':
            clf = svm.SVC(kernel='linear', C=svm_C,
                          random_state=RandomState(seed=seed))
            clf.fit(X, np.array(y_train), sample_weight=sample_weight)
            return clf

        if model_type == 'lrl2':
            clf = LogisticRegression(C=lr_C, penalty='l2',
                                     random_state=RandomState(seed=seed))
        elif model_type == 'lrl1':
            clf = LogisticRegression(C=lr_C, penalty='l1',
                                     random_state=RandomState(seed=seed))
        elif model_type == 'mnb':
            clf = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear':
            clf = LinearSVC(C=svm_C, random_state=RandomState(seed=seed))
        else:
            # Previously an unknown type surfaced as a NameError on `model`.
            raise ValueError('Model type: \'%s\' invalid!' % model_type)

        clf.fit(X, np.array(y_train))
        return clf

    def record_scores(clf, docs_so_far):
        """Evaluate clf on the test set and log the scores for this round."""
        (accu, auc) = evaluate_model(clf, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        num_training_samples.append(docs_so_far)

    # Bootstrap: label the initial training documents.
    X_train = None
    number_of_docs = 0
    for doc_id in training_set:
        number_of_docs += 1
        feature = feature_expert.any_informative_feature(
            X_pool[doc_id], y_pool[doc_id])
        record_rationale(doc_id, feature)
        X_train = append_document(X_train, doc_id, feature)

    model = train_model(X_train)
    record_scores(model, number_of_docs)

    feature_expert.rg.seed(seed)

    if selection_strategy == 'RND':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'UNC':
        doc_pick_model = UNCSampling()
    elif selection_strategy == 'UNC_PNC':
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == 'UNC_PC':
        doc_pick_model = UNCPreferConflict()
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)

    k = step_size

    # The budget counts labelled documents.  X_train.shape[0] would also count
    # the Zaidan pseudo-instances and exhaust the budget too early.
    while number_of_docs < budget:

        # Choose documents based on the strategy chosen
        if selection_strategy in ('UNC_PNC', 'UNC_PC'):
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1,
                                             topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            feature = feature_expert.any_informative_feature(
                X_pool[doc_id], y_pool[doc_id])
            record_rationale(doc_id, feature)
            number_of_docs += 1
            X_train = append_document(X_train, doc_id, feature)

        # Retrain from scratch on everything labelled so far
        model = train_model(X_train)
        record_scores(model, number_of_docs)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)
Exemple #5
0
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, svm_gamma=0, Debug=False):
    """Run active learning with rationale-weighted training rows.

    The documents in ``training_set`` are used to train a first model, which is
    scored with ``evaluate_model`` on ``(X_test, y_test)``.  Then, while the
    training matrix has fewer than ``budget`` rows (and the strategy still
    returns documents), ``step_size`` documents are picked from ``pool_set``,
    moved to ``training_set``, and the model is rebuilt from scratch and scored
    again.

    For every labelled document ``feature_expert.any_informative_feature`` is
    asked for one rationale (a feature index, or ``None``); the stored entries
    of the document row are multiplied by ``w_r`` for the rationale and by
    ``w_o`` for every other feature.

    ``model_type`` must be one of ``'lrl2'``, ``'mnb'``, ``'svm_linear'``,
    ``'svm_rbf'``, ``'svm_poly'``, ``'adaptive_lr'``, ``'adaptive_svm'``,
    ``'SGD'``.  ``selection_strategy`` must be one of ``'random'``,
    ``'uncertainty'``, ``'optauc'``, ``'unc_prefer_no_conflict'``,
    ``'unc_prefer_conflict'``, ``'unc_three_types'``.  ``training_set`` and
    ``pool_set`` are modified in place.

    Returns:
        ``(np.array(num_training_samples), model_scores)`` where
        ``model_scores`` has the keys ``'auc'`` and ``'accu'`` and both hold one
        entry per training round.

    Raises:
        ValueError: if ``model_type`` or ``selection_strategy`` is not one of
            the values listed above.
    """
    start = time()
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('-' * 50)
    print('Starting Active Learning...')

    model_scores = {'auc': [], 'accu': []}

    # Rationales gathered so far, split by the class of the document they came from.
    rationales_c0 = set()
    rationales_c1 = set()

    feature_expert.rg.seed(seed)

    num_training_samples = []
    y_train = []

    def add_document(X_so_far, doc_id):
        """Query the expert for doc_id and stack its weighted row under X_so_far."""
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])

        # A missing rationale (None) is not a feature and is not recorded.
        if feature is not None:
            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

        x = sp.csr_matrix(X_pool[doc_id], dtype=float)

        # Scale every stored entry: w_r for the rationale, w_o otherwise.
        if feature is None:
            scale = np.where(np.zeros(x.indices.shape, dtype=bool), w_r, w_o)
        else:
            scale = np.where(x.indices == feature, w_r, w_o)
        # Rebind instead of scaling in place so no buffer shared with X_pool
        # can ever be modified.
        x.data = x.data * scale

        y_train.append(y_pool[doc_id])
        if X_so_far is None:
            return x
        return sp.vstack((X_so_far, x))

    def train_model(X):
        """Build a fresh model of the requested type and fit it on X / y_train."""
        if model_type == 'lrl2':
            # A newly seeded generator for every fit keeps retraining reproducible.
            clf = LogisticRegression(C=lr_C, penalty='l2',
                                     random_state=np.random.RandomState(seed=seed))
        elif model_type == 'mnb':
            clf = MultinomialNB(alpha=1)
        elif model_type == 'svm_linear':
            clf = svm.SVC(kernel='linear', C=svm_C, probability=True)
        elif model_type == 'svm_rbf':
            clf = svm.SVC(kernel='rbf', gamma=svm_gamma, C=svm_C, probability=True)
        elif model_type == 'svm_poly':
            # Was kernel='rbf', i.e. identical to 'svm_rbf'.
            clf = svm.SVC(kernel='poly', gamma=svm_gamma, C=svm_C, probability=True)
        elif model_type == 'adaptive_lr':
            clf = LogisticRegressionAdaptive()
        elif model_type == 'adaptive_svm':
            clf = AdaptiveSVM()
        elif model_type == 'SGD':
            clf = SGDClassifier(loss="log", penalty='l2', n_iter=100)
        else:
            # Previously an unknown type surfaced as a NameError on `model`.
            raise ValueError('Model type: \'%s\' invalid!' % model_type)

        clf.fit(X, np.array(y_train))
        return clf

    def record_scores(clf, X):
        """Evaluate clf on the test set and log the scores for this round."""
        (accu, auc) = evaluate_model(clf, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        num_training_samples.append(X.shape[0])

    # Bootstrap: label the initial training documents.
    X_train = None
    for doc_id in training_set:
        X_train = add_document(X_train, doc_id)

    model = train_model(X_train)
    record_scores(model, X_train)

    feature_expert.rg.seed(seed)

    # NOTE(review): these strategies receive the *initial* model at construction
    # and `choices` is not handed the retrained one below; confirm that the
    # strategy classes do not keep ranking with a stale model.
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'uncertainty':
        doc_pick_model = UNCSampling(model, feature_expert, y_pool, Debug)
    elif selection_strategy == "optauc":
        doc_pick_model = OptimizeAUC(X_test, y_test, feature_expert, \
            optimize="R", seed=seed, Debug=Debug)
    elif selection_strategy == "unc_prefer_no_conflict":
        doc_pick_model = UNCPreferNoConflict(model)
    elif selection_strategy == "unc_prefer_conflict":
        doc_pick_model = UNCPreferConflict(model)
    elif selection_strategy == "unc_three_types":
        doc_pick_model = UNCThreeTypes(model)
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)

    k = step_size
    rationale_aware = ("unc_prefer_no_conflict", "unc_prefer_conflict", "unc_three_types")

    while X_train.shape[0] < budget:

        # Choose documents based on the strategy chosen
        if selection_strategy in rationale_aware:
            doc_ids = doc_pick_model.choices(X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(X_pool, pool_set, k)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            X_train = add_document(X_train, doc_id)

        # Retrain from scratch on everything labelled so far, then evaluate
        model = train_model(X_train)
        record_scores(model, X_train)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert,
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1,
          poolingMNBWeights=[0.5, 0.5], Meville_etal_r=100.0, lr_C=1, svm_C=1,
          Zaidan_etal_C=1, Zaidan_etal_Ccontrast=1, Zaidan_etal_mu=1, Debug=False):
    """Run pool-based active learning where each labeled document may carry one rationale.

    A model is first trained on the documents listed in ``training_set``.  Then,
    until ``budget`` documents have been labeled (or the selection strategy
    returns nothing), ``step_size`` documents are picked from ``pool_set``,
    moved to ``training_set``, the model is rebuilt from scratch on all
    accumulated training rows and evaluated with ``evaluate_model``.

    For every labeled document ``feature_expert.any_informative_feature`` is
    asked for a rationale (a feature index, or ``None``).  How it is used
    depends on ``model_type``:

    * ``'lrl2'``, ``'lrl1'``, ``'mnb'``, ``'svm_linear'``: the rationale feature
      of the row is multiplied by ``w_r`` and every other non-zero feature by
      ``w_o``.
    * ``'Melville_etal'``: rows are left unweighted; rationales are fed to a
      ``FeatureMNBUniform`` model that is pooled with a ``MultinomialNB``
      instance model through ``PoolingMNB`` using ``poolingMNBWeights``.
    * ``'Zaidan_etal'``: the row is kept as is and, when a rationale exists, a
      pseudo-instance holding only the rationale feature (divided by
      ``Zaidan_etal_mu``) is added with the same label.  Real rows get sample
      weight ``Zaidan_etal_C``, pseudo-instances ``Zaidan_etal_Ccontrast``.

    Parameters
    ----------
    model_type : str
        One of the six model names listed above.
    X_pool, y_pool
        Feature matrix (row-indexable, sparse) and labels of the pool; both are
        indexed by document id.
    X_test, y_test
        Held-out data forwarded to ``evaluate_model``.
    training_set : list
        Ids of the initially labeled documents.  Mutated: chosen ids are appended.
    pool_set
        Ids still available for selection.  Mutated: chosen ids are removed.
    feature_expert
        Object exposing ``rg.seed(seed)`` and ``any_informative_feature(x, y)``.
    selection_strategy : str
        ``'RND'``, ``'UNC'``, ``'UNC_PNC'`` or ``'UNC_PC'``.
    budget : int
        Maximum number of labeled documents (pseudo-instances are not counted).
    step_size : int
        Number of documents requested from the strategy per round.
    topk
        Forwarded to the ``'UNC_PNC'`` / ``'UNC_PC'`` strategies.
    w_o, w_r
        Weights for non-rationale / rationale features (see above).
    seed, alpha, poolingMNBWeights, Meville_etal_r, lr_C, svm_C,
    Zaidan_etal_C, Zaidan_etal_Ccontrast, Zaidan_etal_mu
        Hyper-parameters forwarded to the respective models.
    Debug
        Unused; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(np.array(num_training_samples), model_scores)`` where
        ``model_scores`` is ``{'auc': [...], 'accu': [...]}`` with one entry per
        training round (the first entry is the initial training set).

    Raises
    ------
    ValueError
        If ``model_type`` or ``selection_strategy`` is unknown, or if
        ``training_set`` is empty.
    """
    model_types = ('lrl2', 'lrl1', 'mnb', 'svm_linear', 'Melville_etal', 'Zaidan_etal')
    if model_type not in model_types:
        # Previously an unknown type only surfaced later as an unbound `model`.
        raise ValueError('Model type: \'%s\' invalid!' % model_type)

    rationale_strategies = ('UNC_PNC', 'UNC_PC')
    if selection_strategy not in ('RND', 'UNC') + rationale_strategies:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)

    if len(training_set) == 0:
        raise ValueError('training_set must contain at least one document')

    start = time()
    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print('-' * 50)
    print('Starting Active Learning...')

    _, num_feat = X_pool.shape
    model_scores = {'auc': [], 'accu': []}

    rationales_c0 = set()
    rationales_c1 = set()

    feature_expert.rg.seed(seed)

    num_training_samples = []
    number_of_docs = 0

    is_melville = model_type == 'Melville_etal'
    is_zaidan = model_type == 'Zaidan_etal'

    # Training data is accumulated as parallel lists and stacked once per
    # training round instead of one sp.vstack call per document.
    rows = []
    y_train = []
    sample_weight = []  # only consumed by the Zaidan_etal model

    feature_model = None
    if is_melville:
        # Class priors are estimated once, from the initial training set.
        classpriors = np.zeros(2)
        classpriors[1] = (np.sum(y_pool[training_set]) * 1.) / (len(training_set) * 1.)
        classpriors[0] = 1. - classpriors[1]
        feature_model = FeatureMNBUniform(rationales_c0, rationales_c1, num_feat,
                                          classpriors, Meville_etal_r)

    def _add_document(doc_id):
        """Ask the expert for a rationale and append the training row(s) for doc_id."""
        label = y_pool[doc_id]
        row = X_pool[doc_id]
        feature = feature_expert.any_informative_feature(row, label)
        # `is not None` rather than truthiness: feature index 0 is a valid rationale.
        has_rationale = feature is not None

        if has_rationale:
            if is_melville:
                feature_model.fit(feature, label)
            if label == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

        x = sp.csr_matrix(row, dtype=np.float64)

        if is_zaidan:
            rows.append(x)
            y_train.append(label)
            sample_weight.append(Zaidan_etal_C)
            if has_rationale:
                # Pseudo-instance: only the rationale feature survives, scaled by 1/mu.
                pseudo = x.copy()
                keep = pseudo.indices == feature
                pseudo.data[~keep] = 0.0
                pseudo.data[keep] /= Zaidan_etal_mu
                pseudo.eliminate_zeros()
                rows.append(pseudo)
                y_train.append(label)
                sample_weight.append(Zaidan_etal_Ccontrast)
            return

        if not is_melville:
            # Re-weight the stored entries: w_r for the rationale, w_o for the rest.
            if has_rationale:
                weights = np.where(x.indices == feature, w_r, w_o)
            else:
                weights = w_o
            x.data = x.data * weights  # rebinding keeps `row` untouched

        rows.append(x)
        y_train.append(label)

    def _train_and_score(n_docs):
        """Build a fresh model on all accumulated rows, evaluate it and record the scores."""
        X_train = sp.vstack(rows, format='csr')

        if model_type == 'lrl2':
            model = LogisticRegression(C=lr_C, penalty='l2',
                                       random_state=np.random.RandomState(seed=seed))
        elif model_type == 'lrl1':
            model = LogisticRegression(C=lr_C, penalty='l1',
                                       random_state=np.random.RandomState(seed=seed))
        elif model_type == 'mnb':
            model = MultinomialNB(alpha=alpha)
        elif model_type == 'svm_linear':
            model = LinearSVC(C=svm_C, random_state=np.random.RandomState(seed=seed))
        elif is_melville:
            model = PoolingMNB()
        else:  # Zaidan_etal
            model = svm.SVC(kernel='linear', C=svm_C,
                            random_state=np.random.RandomState(seed=seed))

        if is_melville:
            instance_model = MultinomialNB(alpha=alpha)
            instance_model.fit(X_train, y_train)
            # train pooling_model
            model.fit(instance_model, feature_model, weights=poolingMNBWeights)
        elif is_zaidan:
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))

        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        num_training_samples.append(n_docs)
        return model

    # Initial round: everything already in the training set.
    for doc_id in training_set:
        number_of_docs += 1
        _add_document(doc_id)

    model = _train_and_score(number_of_docs)

    feature_expert.rg.seed(seed)

    if selection_strategy == 'RND':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'UNC':
        doc_pick_model = UNCSampling()
    elif selection_strategy == 'UNC_PNC':
        doc_pick_model = UNCPreferNoConflict()
    else:  # 'UNC_PC' (validated above)
        doc_pick_model = UNCPreferConflict()

    k = step_size

    # Count labeled documents, not matrix rows: for Zaidan_etal the matrix also
    # holds pseudo-instances, which must not consume the labeling budget.
    while number_of_docs < budget:

        # Choose documents based on the strategy chosen
        if selection_strategy in rationale_strategies:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k,
                                             rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            number_of_docs += 1
            _add_document(doc_id)

        model = _train_and_score(number_of_docs)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)
# Example #7
# 0
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, alpha=1, smoothing=0, poolingMNBWeights=[0.5, 0.5], poolingFM_r=100.0, lr_C=1, svm_C=1, \
          zaidan_C=1, zaidan_Ccontrast=1, zaidan_nu=1, cvTrain=False, Debug=False):
    
    start = time()
    print '-' * 50
    print 'Starting Active Learning...'
    
    _, num_feat = X_pool.shape
    model_scores = {'auc':[], 'accu':[], 'wr':[], 'wo':[], 'alpha':[], 'svm_C':[], 'zaidan_C':[], 'zaidan_Ccontrast':[], 'zaidan_nu':[], 'FMrvalue':[], 'IMweight':[], 'FMweight':[]}
    
    rationales  = set()
    rationales_c0  = set()
    rationales_c1  = set()

    number_of_docs = 0    

    feature_expert.rg.seed(seed)
    
    num_training_samples = []
    
    all_features=[]
    
    # keep all the training data instance ids in docs list    
    
    docs = training_set          
    
    X_train = None
    y_train = []
    sample_weight = []
      

    for doc_id in docs:
        #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
        feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])         
        
        number_of_docs=number_of_docs+1       
        
        # append feature to all_features, even if it is None
        all_features.append(feature)                     

        if feature is not None:
            rationales.add(feature)

            if y_pool[doc_id] == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

    if cvTrain:
        # get optimal parameters depending on the model_type

        if model_type=='mnb_LwoR':
            w_r, w_o=optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)

            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])

                
        elif model_type=='mnb':      
            w_r, w_o=optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)

            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])

        
        elif model_type=='svm_linear':                  
            w_r, w_o, C= optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)            

            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])       
                
        elif model_type=='svm_linear_LwoR':                  

            w_r, w_o, C= optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
            feature_counter=0
            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                x_feats = x[0].indices
                for f in x_feats:
                    if f == all_features[feature_counter]:
                        x[0,f] = w_r*x[0,f]
                    else:
                        x[0,f] = w_o*x[0,f]
                feature_counter=feature_counter+1
                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])                                    
            

        if model_type=='poolingMNB':   
            classpriors=np.zeros(2)            
            classpriors[1]=(np.sum(y_pool[docs])*1.)/(len(docs)*1.)
            classpriors[0]= 1. - classpriors[1]     
            
            alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
            
            feature_model=FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)

            feature_counter=0
            for doc_id in docs:
                if all_features[feature_counter]:
                    # updates feature model with features one at a time
                    feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                feature_counter=feature_counter+1

                x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                if not y_train:
                    X_train = x
                else:
                    X_train = sp.vstack((X_train, x))
        
                y_train.append(y_pool[doc_id])

        if model_type=='Zaidan':

            zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)

            feature_counter=0

            for doc_id in docs:
                x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                if all_features[feature_counter] is not None:
                    x_pseudo = (X_pool[doc_id]).todense()
                                
                    # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                    x_feats = x[0].indices
        
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            test= x[0,f]
                            x_pseudo[0,f] = x[0,f]/zaidan_nu
                        else:                                              
                            x_pseudo[0,f] = 0.0                          
                    x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64)                
        
                if not y_train:
                    X_train = x      
                    if all_features[feature_counter] is not None:      
                        X_train = sp.vstack((X_train, x_pseudo))
                else:
                    X_train = sp.vstack((X_train, x))
                    if all_features[feature_counter] is not None:
                        X_train = sp.vstack((X_train, x_pseudo))
        
                y_train.append(y_pool[doc_id])
                if all_features[feature_counter] is not None:
                    # append y label again for the pseudoinstance created
                    y_train.append(y_pool[doc_id])
        

                sample_weight.append(zaidan_C)
                if all_features[feature_counter] is not None:
                    # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                    sample_weight.append(zaidan_Ccontrast)  

                feature_counter = feature_counter+1
    
    # Train the model
    
    if model_type=='lrl2':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
    elif model_type=='lrl1':
        random_state = np.random.RandomState(seed=seed)
        model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)        
    elif model_type=='mnb':        
        model = MultinomialNB(alpha=alpha)        
    elif model_type=='mnb_LwoR':        
        model = MultinomialNB(alpha=alpha)  
    elif model_type=='svm_linear_LwoR':        
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type=='svm_linear':
        random_state = np.random.RandomState(seed=seed)
        model = LinearSVC(C=C, random_state=random_state)
    elif model_type=='poolingMNB':
        instance_model=MultinomialNB(alpha=alpha)        
        model = PoolingMNB()
    elif model_type=='Zaidan':
        random_state = np.random.RandomState(seed=seed)        
        model = svm.SVC(kernel='linear', C=1.0, random_state=random_state)
        
    if model_type=='poolingMNB':                        
        instance_model.fit(X_train, y_train)
        model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
    elif model_type=='Zaidan':
        model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
    else:
        model.fit(X_train, np.array(y_train))
    
    
            
    (accu, auc) = evaluate_model(model, X_test, y_test)
    model_scores['auc'].append(auc)
    model_scores['accu'].append(accu)
    if model_type=='poolingMNB':
        model_scores['alpha'].append(alpha)
        model_scores['FMrvalue'].append(poolingFM_r)
        model_scores['IMweight'].append(poolingMNBWeights[0])
        model_scores['FMweight'].append(poolingMNBWeights[1])
    else:
        model_scores['FMrvalue'].append(0.0)
        model_scores['IMweight'].append(0.0)
        model_scores['FMweight'].append(0.0)

    if model_type=='Zaidan':
        model_scores['zaidan_C'].append(zaidan_C)
        model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
        model_scores['zaidan_nu'].append(zaidan_nu)
    else:
        model_scores['zaidan_C'].append(0.0)
        model_scores['zaidan_Ccontrast'].append(0.0)
        model_scores['zaidan_nu'].append(0.0)

    if model_type=='mnb' or model_type=='mnb_LwoR':
        model_scores['alpha'].append(alpha)        
    else:
        model_scores['alpha'].append(0.0)        

    if model_type=='svm_linear' or model_type=='svm_linear_LwoR':
        model_scores['svm_C'].append(C)        
    else:
        model_scores['svm_C'].append(0.0)
        
    if model_type=='svm_linear' or model_type=='svm_linear_LwoR' or model_type=='mnb' or model_type=='mnb_LwoR':
        model_scores['wr'].append(w_r)
        model_scores['wo'].append(w_o)
    else:
        model_scores['wr'].append(0.0)
        model_scores['wo'].append(0.0)
    
    num_training_samples.append(number_of_docs)
    
    feature_expert.rg.seed(seed)        
    
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'unc':
        doc_pick_model = UNCSampling()         
    elif selection_strategy == "pnc":
        doc_pick_model = UNCPreferNoConflict()
    elif selection_strategy == "pnr":
        doc_pick_model = UNCPreferNoRationale()
    elif selection_strategy == "pr":
        doc_pick_model = UNCPreferRationale()
    elif selection_strategy == "pc":
        doc_pick_model = UNCPreferConflict()
    elif selection_strategy == "tt":
        doc_pick_model = UNCThreeTypes()
    elif selection_strategy == "pipe":
        doc_pick_model = Pipe([UNCSampling(), UNCPreferConflict()], [10, 30])
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
 
  
    k = step_size  


    #while X_train.shape[0] < budget:     
    while number_of_docs < budget:                       

        # Choose a document based on the strategy chosen
        if selection_strategy == "pnc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pnr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pr":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pc":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "tt":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        elif selection_strategy == "pipe":
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k, rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, k)
        
        if doc_ids is None or len(doc_ids) == 0:
            break        

        

        for doc_id in doc_ids:            
            #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
            feature = feature_expert.any_informative_feature(X_pool[doc_id], y_pool[doc_id])

            all_features.append(feature)                
            
            number_of_docs=number_of_docs + 1
        
            if feature is not None:
                rationales.add(feature)

                if y_pool[doc_id] == 0:
                    rationales_c0.add(feature)
                else:
                    rationales_c1.add(feature)

            # Remove the chosen document from pool and add it to the training set            
            pool_set.remove(doc_id)                        
            training_set.append(long(doc_id))               


        if cvTrain:
        # get optimal parameters depending on the model_type

            X_train = None
            y_train = []
            sample_weight = []

            if model_type=='mnb_LwoR':
                if np.mod(number_of_docs,20)==10:
                    w_r, w_o=optimalMNBLwoRParameters(X_pool[training_set], y_pool[training_set], all_features)

                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]
                    feature_counter=feature_counter+1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])

                
            elif model_type=='mnb':      
                if np.mod(number_of_docs,20)==10:
                    w_r, w_o=optimalMNBParameters(X_pool[training_set], y_pool[training_set], all_features)

                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]
                    feature_counter=feature_counter+1
                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])

        
            elif model_type=='svm_linear_LwoR': 
                if np.mod(number_of_docs,20)==10:                 
                    w_r, w_o, C= optimalSVMLwoRParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                
                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]

                    feature_counter=feature_counter+1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])       
                
            elif model_type=='svm_linear':                  
                if np.mod(number_of_docs,20)==10:
                    w_r, w_o, C= optimalSVMParameters(X_pool[training_set], y_pool[training_set], all_features, seed)

                feature_counter=0
                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)
                            
                    x_feats = x[0].indices
                    for f in x_feats:
                        if f == all_features[feature_counter]:
                            x[0,f] = w_r*x[0,f]
                        else:
                            x[0,f] = w_o*x[0,f]

                    feature_counter=feature_counter+1

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])                                    
            

            if model_type=='poolingMNB':   
                classpriors=np.zeros(2)            
                classpriors[1]=(np.sum(y_pool[docs])*1.)/(len(docs)*1.)
                classpriors[0]= 1. - classpriors[1]     
                
                if np.mod(number_of_docs,20)==10:
                    alpha, poolingFM_r, poolingMNBWeights = optimalPoolingMNBParameters(X_pool[training_set], y_pool[training_set], all_features, smoothing, num_feat)
            
                feature_model=FeatureMNBUniform(rationales_c0, rationales_c1, num_feat, smoothing, classpriors, poolingFM_r)

                feature_counter=0
                for doc_id in training_set:
                    if all_features[feature_counter]:
                        # updates feature model with features one at a time
                        feature_model.fit(all_features[feature_counter], y_pool[doc_id])
                    feature_counter=feature_counter+1

                    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

                    if not y_train:
                        X_train = x
                    else:
                        X_train = sp.vstack((X_train, x))
        
                    y_train.append(y_pool[doc_id])

            if model_type=='Zaidan':

                if np.mod(number_of_docs,20)==10:
                    zaidan_C, zaidan_Ccontrast, zaidan_nu = optimalZaidanParameters(X_pool[training_set], y_pool[training_set], all_features, seed)
                
                feature_counter=0

                for doc_id in training_set:
                    x = sp.csr_matrix(X_pool[doc_id], dtype=np.float64)

                    if all_features[feature_counter] is not None:
                        x_pseudo = (X_pool[doc_id]).todense()
                                
                        # create pseudoinstances based on rationales provided; one pseudoinstance is created for each rationale.
                        x_feats = x[0].indices
        
                        for f in x_feats:
                            if f == all_features[feature_counter]:
                                test= x[0,f]
                                x_pseudo[0,f] = x[0,f]/zaidan_nu
                            else:                                              
                                x_pseudo[0,f] = 0.0                
                                          
                        x_pseudo=sp.csr_matrix(x_pseudo, dtype=np.float64)                
        
                    if not y_train:
                        X_train = x      
                        if all_features[feature_counter] is not None:      
                            X_train = sp.vstack((X_train, x_pseudo))
                    else:
                        X_train = sp.vstack((X_train, x))
                        if all_features[feature_counter] is not None:
                            X_train = sp.vstack((X_train, x_pseudo))
        
                    y_train.append(y_pool[doc_id])
                    if all_features[feature_counter] is not None:
                        # append y label again for the pseudoinstance created
                        y_train.append(y_pool[doc_id])
        

                    sample_weight.append(zaidan_C)
                    if all_features[feature_counter] is not None:
                        # append instance weight=zaidan_Ccontrast for the pseudoinstance created
                        sample_weight.append(zaidan_Ccontrast)  

                    feature_counter = feature_counter+1
        
        # Train the model

        if model_type=='lrl2':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l2', random_state=random_state)
        elif model_type=='lrl1':
            random_state = np.random.RandomState(seed=seed)
            model = LogisticRegression(C=lr_C, penalty='l1', random_state=random_state)        
        elif model_type=='mnb':        
            model = MultinomialNB(alpha=alpha)        
        elif model_type=='mnb_LwoR':        
            model = MultinomialNB(alpha=alpha)  
        elif model_type=='svm_linear_LwoR':        
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type=='svm_linear':
            random_state = np.random.RandomState(seed=seed)
            model = LinearSVC(C=C, random_state=random_state)
        elif model_type=='poolingMNB':
            instance_model=MultinomialNB(alpha=alpha)        
            model = PoolingMNB()
        elif model_type=='Zaidan':
            random_state = np.random.RandomState(seed=seed)        
            model = svm.SVC(kernel='linear', C=svm_C, random_state=random_state)
        
        if model_type=='poolingMNB':                        
            instance_model.fit(X_train, y_train)
            model.fit(instance_model, feature_model, weights=poolingMNBWeights) # train pooling_model
        elif model_type=='Zaidan':
            model.fit(X_train, np.array(y_train), sample_weight=sample_weight)
        else:
            model.fit(X_train, np.array(y_train))
        
                    
        (accu, auc) = evaluate_model(model, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        if model_type=='poolingMNB':
            model_scores['alpha'].append(alpha)
            model_scores['FMrvalue'].append(poolingFM_r)
            model_scores['IMweight'].append(poolingMNBWeights[0])
            model_scores['FMweight'].append(poolingMNBWeights[1])
        else:
            model_scores['FMrvalue'].append(0.0)
            model_scores['IMweight'].append(0.0)
            model_scores['FMweight'].append(0.0)

        if model_type=='Zaidan':
            model_scores['zaidan_C'].append(zaidan_C)
            model_scores['zaidan_Ccontrast'].append(zaidan_Ccontrast)
            model_scores['zaidan_nu'].append(zaidan_nu)
        else:
            model_scores['zaidan_C'].append(0.0)
            model_scores['zaidan_Ccontrast'].append(0.0)
            model_scores['zaidan_nu'].append(0.0)

        if model_type=='mnb' or model_type=='mnb_LwoR':
            model_scores['alpha'].append(alpha)        
        else:
            model_scores['alpha'].append(0.0)        

        if model_type=='svm_linear' or model_type=='svm_linear_LwoR':
            model_scores['svm_C'].append(C)        
        else:
            model_scores['svm_C'].append(0.0)
        
        if model_type=='svm_linear' or model_type=='svm_linear_LwoR' or model_type=='mnb' or model_type=='mnb_LwoR':
            model_scores['wr'].append(w_r)
            model_scores['wo'].append(w_o)
        else:
            model_scores['wr'].append(0.0)
            model_scores['wo'].append(0.0)
        
        num_training_samples.append(number_of_docs)
        
  
    print 'Active Learning took %2.2fs' % (time() - start)
    
    return (np.array(num_training_samples), model_scores)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, Debug=False):
    """Run pool-based active learning on rationale-weighted training rows.

    For every labeled document the feature expert is asked for one rationale
    feature.  A float copy of the document row is made, the rationale feature
    is multiplied by ``w_r`` and every other stored feature by ``w_o``, and
    the row is appended to the training matrix.  A fresh model is fitted and
    scored with ``evaluate_model`` once after the bootstrap documents and once
    after every batch returned by the selection strategy.

    Parameters:
        model_type: 'lrl2', 'lrl1', 'mnb' or 'svm_linear'.
        X_pool, y_pool: pool features (row-indexable by document id) and labels.
        X_test, y_test: evaluation data handed to ``evaluate_model``.
        training_set: list of bootstrap document ids; chosen ids are appended
            to it in place.
        pool_set: candidate document ids; chosen ids are removed in place.
        feature_expert: object exposing ``any_informative_feature(x, y)`` and
            ``rg.seed(seed)``.
        selection_strategy: 'random', 'unc', 'pnc', 'pnr', 'pr', 'pc', 'tt'
            or 'pipe'.
        budget: the loop stops once the training matrix has this many rows.
        step_size: number of documents requested from the strategy per round.
        topk: forwarded to the rationale-aware strategies.
        w_o, w_r: multipliers for non-rationale and rationale features.
        seed: seeds the expert, the random strategy and the logistic models.
        lr_C, svm_C: regularisation constants for the respective model types.
        Debug: accepted for interface compatibility; not read here.

    Returns:
        ``(np.array(num_training_samples), model_scores)`` where
        ``model_scores`` is ``{'auc': [...], 'accu': [...]}`` with one entry
        per training round.

    Raises:
        ValueError: for an unknown ``model_type`` or ``selection_strategy``,
            or when ``training_set`` is empty (nothing to bootstrap from).
    """
    # Factories are lambdas so a new, identically seeded estimator is built
    # for every round and estimator names are only resolved when needed.
    model_factories = {
        'lrl2': lambda: LogisticRegression(
            C=lr_C, penalty='l2', random_state=np.random.RandomState(seed=seed)),
        'lrl1': lambda: LogisticRegression(
            C=lr_C, penalty='l1', random_state=np.random.RandomState(seed=seed)),
        'mnb': lambda: MultinomialNB(alpha=1),
        'svm_linear': lambda: LinearSVC(C=svm_C),
    }
    # Fail early: previously an unknown type surfaced as an unbound `model`.
    if model_type not in model_factories:
        raise ValueError('Model type: \'%s\' invalid!' % model_type)
    build_model = model_factories[model_type]

    strategy_factories = {
        'random': lambda: RandomStrategy(seed),
        'unc': lambda: UNCSampling(),
        'pnc': lambda: UNCPreferNoConflict(),
        'pnr': lambda: UNCPreferNoRationale(),
        'pr': lambda: UNCPreferRationale(),
        'pc': lambda: UNCPreferConflict(),
        'tt': lambda: UNCThreeTypes(),
        'pipe': lambda: Pipe([UNCSampling(), UNCPreferConflict()], [10, 30]),
    }
    # Strategies whose choices() call also receives the per-class rationales.
    rationale_aware = ('pnc', 'pnr', 'pr', 'pc', 'tt', 'pipe')

    start = time()
    print('-' * 50)
    print('Starting Active Learning...')

    model_scores = {'auc': [], 'accu': []}
    num_training_samples = []
    rationales_c0 = set()
    rationales_c1 = set()

    def annotate(doc_id):
        """Query the expert for doc_id and return its re-weighted 1-row matrix."""
        label = y_pool[doc_id]
        feature = feature_expert.any_informative_feature(X_pool[doc_id], label)

        # A document without a rationale must not put None among the features.
        if feature is not None:
            if label == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

        x = sp.csr_matrix(X_pool[doc_id], dtype=float)
        # Scale all stored entries at once: w_r on the rationale, w_o elsewhere.
        if feature is None:
            x.data = x.data * w_o
        else:
            x.data = x.data * np.where(x.indices == feature, w_r, w_o)
        return x

    def train_and_record(X, y):
        """Fit a fresh model on (X, y), append its scores, and return it."""
        fitted = build_model()
        fitted.fit(X, np.array(y))
        (accu, auc) = evaluate_model(fitted, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        num_training_samples.append(X.shape[0])
        return fitted

    feature_expert.rg.seed(seed)

    X_train = None
    y_train = []

    # Bootstrap: annotate every document that is already labeled.
    for doc_id in training_set:
        x = annotate(doc_id)
        X_train = x if X_train is None else sp.vstack((X_train, x))
        y_train.append(y_pool[doc_id])

    if X_train is None:
        raise ValueError('training_set must contain at least one document!')

    model = train_and_record(X_train, y_train)

    # The expert is re-seeded after bootstrapping, as in the original flow.
    feature_expert.rg.seed(seed)

    if selection_strategy not in strategy_factories:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)
    doc_pick_model = strategy_factories[selection_strategy]()

    while X_train.shape[0] < budget:
        # Choose a batch of documents based on the strategy chosen
        if selection_strategy in rationale_aware:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, step_size,
                                             rationales_c0, rationales_c1, topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, step_size)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            X_train = sp.vstack((X_train, annotate(doc_id)))
            y_train.append(y_pool[doc_id])

        model = train_and_record(X_train, y_train)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)
def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, svm_gamma=0, zaidan_C=0.01, zaidan_Ccontrast=1.0, zaidan_nu=1, Debug=False):
    """Run pool-based active learning with rationale pseudo-instances.

    Each labeled document contributes its own row with sample weight
    ``zaidan_C``.  When the feature expert names a rationale feature, a second
    "pseudo-instance" row is added that keeps only that feature's value (all
    other features zero), with the same label and sample weight
    ``zaidan_Ccontrast``.  A fresh model is fitted with these sample weights
    and scored with ``evaluate_model`` after the bootstrap documents and after
    every batch returned by the selection strategy.

    Parameters:
        model_type: 'lrl2', 'mnb', 'svm_linear', 'svm_rbf', 'svm_poly',
            'adaptive_lr', 'adaptive_svm' or 'SGD'.
        X_pool, y_pool: pool features (row-indexable by document id) and labels.
        X_test, y_test: evaluation data handed to ``evaluate_model``.
        training_set: list of bootstrap document ids; chosen ids are appended
            to it in place.
        pool_set: candidate document ids; chosen ids are removed in place.
        feature_expert: object exposing ``any_informative_feature(x, y)`` and
            ``rg.seed(seed)``.
        selection_strategy: 'random', 'uncertainty', 'optauc',
            'unc_prefer_no_conflict', 'unc_prefer_conflict' or
            'unc_three_types'.
        budget: the loop stops once this many documents have been labeled.
        step_size: number of documents requested from the strategy per round.
        topk: forwarded to the three ``unc_*`` strategies.
        w_o, w_r, zaidan_nu: accepted for interface compatibility; not read by
            this function.
        seed: seeds the expert, the random/optauc strategies and 'lrl2'.
        lr_C, svm_C, svm_gamma: hyper-parameters of the respective models.
        zaidan_C, zaidan_Ccontrast: sample weights of real rows and
            pseudo-instance rows.
        Debug: forwarded to the strategies that take it.

    Returns:
        ``(np.array(num_training_samples), model_scores)`` where
        ``num_training_samples`` counts labeled documents (pseudo-instances
        excluded) and ``model_scores`` is ``{'auc': [...], 'accu': [...]}``.

    Raises:
        ValueError: for an unknown ``model_type`` or ``selection_strategy``,
            or when ``training_set`` is empty (nothing to bootstrap from).
    """
    # Factories are lambdas so every round gets a new estimator and, for
    # 'lrl2', a freshly seeded RandomState (the retrain branch used to reuse
    # the RandomState already consumed by the bootstrap model).
    model_factories = {
        'lrl2': lambda: LogisticRegression(
            C=lr_C, penalty='l2', random_state=np.random.RandomState(seed=seed)),
        'mnb': lambda: MultinomialNB(alpha=1),
        'svm_linear': lambda: svm.SVC(kernel='linear', C=svm_C, probability=True),
        'svm_rbf': lambda: svm.SVC(kernel='rbf', gamma=svm_gamma, C=svm_C,
                                   probability=True),
        # Previously built with kernel='rbf', which made 'svm_poly' an alias
        # of 'svm_rbf'.
        'svm_poly': lambda: svm.SVC(kernel='poly', gamma=svm_gamma, C=svm_C,
                                    probability=True),
        'adaptive_lr': lambda: LogisticRegressionAdaptive(),
        'adaptive_svm': lambda: AdaptiveSVM(),
        'SGD': lambda: SGDClassifier(loss="log", penalty='l2', n_iter=100),
    }
    # Fail early: previously an unknown type surfaced as an unbound `model`.
    if model_type not in model_factories:
        raise ValueError('Model type: \'%s\' invalid!' % model_type)
    build_model = model_factories[model_type]

    # Strategies whose choices() call also receives the per-class rationales.
    rationale_aware = ('unc_prefer_no_conflict', 'unc_prefer_conflict',
                       'unc_three_types')

    start = time()
    print('-' * 50)
    print('Starting Active Learning...')

    model_scores = {'auc': [], 'accu': []}
    num_training_samples = []

    rationales = set()
    rationales_c0 = set()
    rationales_c1 = set()

    y_train = []
    sample_weight = []

    def annotate(doc_id):
        """Query the expert for doc_id; return its training rows and weights."""
        label = y_pool[doc_id]
        feature = feature_expert.any_informative_feature(X_pool[doc_id], label)

        x = sp.csr_matrix(X_pool[doc_id], dtype=float)
        rows = [x]
        weights = [zaidan_C]

        if feature is not None:
            rationales.add(feature)
            if label == 0:
                rationales_c0.add(feature)
            else:
                rationales_c1.add(feature)

            # Pseudo-instance: the same row with everything but the rationale
            # zeroed, built sparsely instead of densifying the whole row.
            x_pseudo = x.copy()
            x_pseudo.data[x_pseudo.indices != feature] = 0.0
            x_pseudo.eliminate_zeros()
            rows.append(x_pseudo)
            weights.append(zaidan_Ccontrast)

        return rows, weights

    def extend_training_data(X, doc_id):
        """Append doc_id's rows, labels and weights; return the grown matrix."""
        rows, weights = annotate(doc_id)
        for row in rows:
            X = row if X is None else sp.vstack((X, row))
            # the label is repeated for the pseudo-instance
            y_train.append(y_pool[doc_id])
        sample_weight.extend(weights)
        return X

    def train_and_record(X, n_docs):
        """Fit a fresh weighted model on X, append its scores, and return it."""
        fitted = build_model()
        fitted.fit(X, np.array(y_train), sample_weight=sample_weight)
        (accu, auc) = evaluate_model(fitted, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)
        num_training_samples.append(n_docs)
        return fitted

    feature_expert.rg.seed(seed)

    X_train = None
    number_of_docs = 0

    # Bootstrap: annotate every document that is already labeled.
    for doc_id in training_set:
        number_of_docs = number_of_docs + 1
        X_train = extend_training_data(X_train, doc_id)

    if X_train is None:
        raise ValueError('training_set must contain at least one document!')

    model = train_and_record(X_train, number_of_docs)

    # The expert is re-seeded after bootstrapping, as in the original flow.
    feature_expert.rg.seed(seed)

    # NOTE(review): the model-based strategies are constructed with the
    # bootstrap model object, while every later round fits a new object.
    # Confirm the strategies are meant to keep using the bootstrap model.
    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy == 'uncertainty':
        doc_pick_model = UNCSampling(model, feature_expert, y_pool, Debug)
    elif selection_strategy == "optauc":
        doc_pick_model = OptimizeAUC(X_test, y_test, feature_expert, \
            optimize="R", seed=seed, Debug=Debug)
    elif selection_strategy == "unc_prefer_no_conflict":
        doc_pick_model = UNCPreferNoConflict(model)
    elif selection_strategy == "unc_prefer_conflict":
        doc_pick_model = UNCPreferConflict(model)
    elif selection_strategy == "unc_three_types":
        doc_pick_model = UNCThreeTypes(model)
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)

    while number_of_docs < budget:
        # Choose a batch of documents based on the strategy chosen
        if selection_strategy in rationale_aware:
            doc_ids = doc_pick_model.choices(X_pool, pool_set, step_size,
                                             rationales_c0, rationales_c1,
                                             topk)
        else:
            doc_ids = doc_pick_model.choices(X_pool, pool_set, step_size)

        if doc_ids is None or len(doc_ids) == 0:
            break

        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)
            number_of_docs = number_of_docs + 1
            X_train = extend_training_data(X_train, doc_id)

        model = train_and_record(X_train, number_of_docs)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)
Exemple #10
0
def learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, disagree_strat, coverage, budget, instance_model, feature_model, \
          pooling_model, reasoning_model, rmw_n, rmw_a, seed=0, Debug=False, \
          reasoning_strategy='random', switch=40):
    """Run one-document-at-a-time active learning over four cooperating models.

    After the optional bootstrap set and after every newly labeled document
    the function refits ``instance_model`` on the labeled rows, feeds the
    expert's rationale feature (if any) to ``feature_model``, updates
    ``reasoning_model`` with ``partial_fit``, refits ``pooling_model`` from
    the instance and feature models with weights ``[0.5, 0.5]``, and scores
    all four models with ``evaluate_model``.

    Parameters:
        X_pool, y_pool: pool features (sparse, row-indexable by document id)
            and labels (indexable by id and by a list of ids).
        X_test, y_test: evaluation data handed to ``evaluate_model``.
        training_set: list of bootstrap document ids (may be empty for
            strategies that do not need a trained model); extended in place.
        pool_set: candidate document ids; chosen ids are removed in place.
        feature_expert: object exposing ``any_informative_feature(x, y)`` and
            ``rg.seed(seed)``.
        selection_strategy: name of the document selection strategy; see the
            dispatch below for the accepted values.
        disagree_strat, coverage, reasoning_strategy, switch: forwarded to the
            strategies that take them.
        budget: maximum number of documents to label in the active loop.
        instance_model, feature_model, pooling_model, reasoning_model: model
            objects updated in place.
        rmw_n, rmw_a: forwarded to ``reasoning_model.partial_fit`` (and to the
            'optauc*' strategies).
        seed: seeds the expert and the strategies that accept a seed.
        Debug: forwarded to the strategies that take it.

    Returns:
        A 9-tuple ``(num_training_samples, instance_model_scores,
        feature_model_scores, pooling_model_scores, reasoning_model_scores,
        discovered_feature_counts, num_docs_covered, transition,
        num_a_feat_chosen)``.  ``transition`` is ``doc_pick_model.transition``
        for the 'cover_then*' strategies and ``None`` otherwise.

    Raises:
        ValueError: for an unknown ``selection_strategy``, or when an
            'uncertainty*' / 'disagreement' strategy is used with an empty
            ``training_set``.
    """
    start = time()
    print('-' * 50)
    print('Starting Active Learning...')

    instance_model_scores = {'auc': [], 'accu': []}
    feature_model_scores = {'auc': [], 'accu': []}
    pooling_model_scores = {'auc': [], 'accu': []}
    reasoning_model_scores = {'auc': [], 'accu': []}
    # Evaluation order is instance, feature, pooling, reasoning.
    scored_models = [
        (instance_model, instance_model_scores),
        (feature_model, feature_model_scores),
        (pooling_model, pooling_model_scores),
        (reasoning_model, reasoning_model_scores),
    ]

    discovered_feature_counts = {'class0': [], 'class1': []}
    num_docs_covered = []
    covered_docs = set()
    # Column-oriented copy: a column slice's indices are the rows holding it.
    X_pool_csc = X_pool.tocsc()

    num_samples = len(pool_set) + len(training_set)
    num_feat = X_pool.shape[1]
    num_a_feat_chosen = np.zeros(num_feat)

    discovered_features = set()
    discovered_class0_features = set()
    discovered_class1_features = set()

    feature_expert.rg.seed(seed)

    # Lookup tables for strategy families that differ only in one argument.
    uncertainty_models = {
        'uncertaintyIM': instance_model,
        'uncertaintyFM': feature_model,
        'uncertaintyPM': pooling_model,
        'uncertaintyRM': reasoning_model,
    }
    cover_then_uncertainty_models = {
        'cover_then_uncertaintyPM': pooling_model,
        'cover_then_uncertaintyRM': reasoning_model,
    }
    optauc_targets = {
        'optaucP': "P",
        'optaucI': "I",
        'optaucF': "F",
        'optaucR': "R",
    }

    if selection_strategy == 'random':
        doc_pick_model = RandomStrategy(seed)
    elif selection_strategy in uncertainty_models:
        doc_pick_model = UNCSampling(uncertainty_models[selection_strategy],
                                     feature_expert, y_pool, Debug)
    elif selection_strategy == 'disagreement':
        doc_pick_model = DisagreementStrategy(instance_model, feature_model, \
            feature_expert, y_pool, disagree_strat, Debug=Debug)
    elif selection_strategy == 'covering':
        doc_pick_model = CoveringStrategy(feature_expert, num_samples, y_pool, \
            type='unknown', seed=seed, Debug=Debug)
    elif selection_strategy == 'covering_fewest':
        doc_pick_model = CoveringStrategy(feature_expert, num_samples, y_pool, \
            type='fewest', seed=seed, Debug=Debug)
    elif selection_strategy == 'cheating':
        doc_pick_model = CheatingApproach(feature_expert, num_samples, y_pool, \
            seed=seed, Debug=Debug)
    elif selection_strategy == 'cover_then_disagree':
        doc_pick_model = CoveringThenDisagreement(feature_expert, instance_model, \
            feature_model, num_samples, percentage=coverage, y=y_pool, type='unknown', \
            metric=disagree_strat, seed=seed, Debug=Debug)
    elif selection_strategy in cover_then_uncertainty_models:
        doc_pick_model = CoverThenUncertainty(feature_expert, \
            cover_then_uncertainty_models[selection_strategy], \
            num_samples, percentage=coverage, y=y_pool, type='unknown', \
            seed=seed, Debug=Debug)
    elif selection_strategy == 'cover_then_featureCertainty':
        doc_pick_model = CoverThenFeatureCertainty(feature_expert, feature_model, \
            num_samples, percentage=coverage, y=y_pool, type='unknown', \
            seed=seed, Debug=Debug)
    elif selection_strategy in optauc_targets:
        doc_pick_model = OptimizeAUC(X_test, y_test, feature_expert, \
            optimize=optauc_targets[selection_strategy], seed=seed, Debug=Debug)
    elif selection_strategy == 'reasoning_then_featureCertainty':
        doc_pick_model = ReasoningThenFeatureCertainty(feature_expert, instance_model, \
            feature_model, switch=switch, reasoning_strategy=reasoning_strategy, y=y_pool, type='unknown', \
            seed=seed, Debug=Debug)
    elif selection_strategy == "unc_insuff_R":
        doc_pick_model = UNCForInsufficientReason(reasoning_model)
    elif selection_strategy == "unc_no_conflict_R":
        doc_pick_model = UNCWithNoConflict(reasoning_model)
    elif selection_strategy == "unc_prefer_no_conflict_R":
        doc_pick_model = UNCPreferNoConflict(reasoning_model)
    elif selection_strategy == "unc_prefer_conflict_R":
        doc_pick_model = UNCPreferConflict(reasoning_model)
    elif selection_strategy == "unc_three_types_R":
        doc_pick_model = UNCThreeTypes(reasoning_model)
    else:
        raise ValueError('Selection strategy: \'%s\' invalid!' % selection_strategy)

    def absorb_feedback(doc, feature):
        """Update feature/reasoning models and rationale bookkeeping for doc."""
        label = y_pool[doc]
        # `feature` is a column index, so 0 is a valid rationale; only None
        # means the expert offered nothing (a truthiness test dropped index 0).
        has_rationale = feature is not None

        if has_rationale:
            feature_model.fit(feature, label)  # train feature_model one by one
            discovered_features.add(feature)
            if label == 0:
                discovered_class0_features.add(feature)
            else:
                discovered_class1_features.add(feature)

        # The reasoning model is updated even when there is no rationale.
        reasoning_model.partial_fit(X_pool[doc], label, feature, rmw_n, rmw_a)

        if has_rationale:
            # docs covered: every pool document that stores this feature
            covered_docs.update(X_pool_csc[:, feature].indices)
            # number of times a feat is chosen as a reason
            num_a_feat_chosen[feature] += 1

    def notify_strategy(doc, feature):
        """Tell the stateful selection strategies about the new annotation."""
        if selection_strategy == 'covering' or selection_strategy == 'covering_fewest':
            doc_pick_model.update(X_pool, feature, doc)
        elif selection_strategy == 'cheating':
            doc_pick_model.update(X_pool, feature, y_pool[doc])
        elif selection_strategy.startswith('cover_then') and doc_pick_model.phase == 'covering':
            doc_pick_model.covering.update(X_pool, feature, doc)

    def record_progress():
        """Score all four models and append the coverage statistics."""
        for scored_model, scores in scored_models:
            (accu, auc) = evaluate_model(scored_model, X_test, y_test)
            scores['auc'].append(auc)
            scores['accu'].append(accu)

        # discovered feature counts
        if isinstance(feature_model, FeatureMNBUniform):
            discovered_feature_counts['class0'].append(len(feature_model.class0_features))
            discovered_feature_counts['class1'].append(len(feature_model.class1_features))
        elif isinstance(feature_model, FeatureMNBWeighted):
            nz = np.sum(feature_model.feature_count_ > 0, axis=1)
            discovered_feature_counts['class0'].append(nz[0])
            discovered_feature_counts['class1'].append(nz[1])

        num_docs_covered.append(len(covered_docs))

    bootstrap_size = len(training_set)
    training_set_empty = (bootstrap_size == 0)

    X_train = None
    y_train = None

    if not training_set_empty:
        X_train = X_pool[training_set]
        y_train = y_pool[training_set]

        # Train all models using the bootstrap data
        instance_model.fit(X_train, y_train)

        for doc in training_set:
            feature = feature_expert.any_informative_feature(X_pool[doc], y_pool[doc])
            absorb_feedback(doc, feature)
            notify_strategy(doc, feature)

        pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])

        record_progress()
    elif selection_strategy.startswith('uncertainty') or selection_strategy == 'disagreement':
        raise ValueError('\'%s\' requires bootstrapping!' % selection_strategy)

    for i in range(budget):
        # Size before this round's document is added.
        train_set_size = len(training_set)

        # Choose a document based on the strategy chosen
        if selection_strategy.startswith('cover_then'):
            doc_id = doc_pick_model.choice(X_pool, i+1, pool_set)
        elif selection_strategy.startswith('optauc'):
            doc_id = doc_pick_model.choice(X_pool, y_pool, pool_set, training_set, feature_model, reasoning_model, rmw_n, rmw_a)
        elif selection_strategy == 'reasoning_then_featureCertainty':
            doc_id = doc_pick_model.choice(X_pool, i+1, pool_set, train_set_size)
        elif selection_strategy == "unc_insuff_R":
            doc_id = doc_pick_model.choice(X_pool, pool_set, discovered_features, max_num_feats=1)
        elif selection_strategy == "unc_no_conflict_R":
            doc_id = doc_pick_model.choice(X_pool, pool_set, discovered_class0_features, discovered_class1_features)
        elif selection_strategy in ("unc_prefer_no_conflict_R",
                                    "unc_prefer_conflict_R",
                                    "unc_three_types_R"):
            doc_id = doc_pick_model.choice(X_pool, pool_set, discovered_class0_features, discovered_class1_features, top_k=10)
        else:
            doc_id = doc_pick_model.choice(X_pool, pool_set)

        # The strategy returns None when it has nothing left to offer.
        if doc_id is None:
            break

        # Remove the chosen document from pool and add it to the training set
        pool_set.remove(doc_id)
        training_set.append(doc_id)

        # Ask the expert for instance label (returns the true label from the dataset)
        label = y_pool[doc_id]

        if X_train is None:
            # First document of a run that started without a bootstrap set.
            X_train = X_pool[doc_id]
            y_train = np.array([label])
        else:
            X_train = sp.vstack((X_train, X_pool[doc_id]))
            y_train = np.hstack((y_train, np.array([label])))

        # Ask the expert for an informative feature given the label
        feature = feature_expert.any_informative_feature(X_pool[doc_id], label)

        # Update the instance model
        instance_model.fit(X_train, y_train)

        # Update the feature model, the reasoning model and the bookkeeping
        absorb_feedback(doc_id, feature)

        # Update the pooling model
        pooling_model.fit(instance_model, feature_model, weights=[0.5, 0.5])

        notify_strategy(doc_id, feature)

        record_progress()

    if selection_strategy.startswith('cover_then'):
        transition = doc_pick_model.transition
    else:
        transition = None

    # compute the # of training samples for plot
    num_scores = len(instance_model_scores['accu'])
    if training_set_empty:
        num_training_samples = np.arange(num_scores) + 1
    else:
        num_training_samples = np.arange(num_scores) + bootstrap_size

    print('Active Learning took %2.2fs' % (time() - start))

    return (num_training_samples, instance_model_scores, feature_model_scores, pooling_model_scores, reasoning_model_scores, discovered_feature_counts, num_docs_covered, transition, num_a_feat_chosen)
# Selection strategies whose ``choices`` method is additionally given the
# per-class rationale sets and ``topk``.
_RATIONALE_AWARE_STRATEGIES = frozenset(['pnc', 'pnr', 'pr', 'pc', 'tt', 'pipe'])


def _build_learn_model(model_type, seed, lr_C, svm_C):
    """Return a fresh, unfitted classifier for ``model_type``.

    Raises ValueError for an unknown ``model_type`` so the caller fails with
    a clear message instead of using an unbound (or stale) model.
    """
    if model_type in ('lrl2', 'lrl1'):
        penalty = 'l2' if model_type == 'lrl2' else 'l1'
        random_state = np.random.RandomState(seed=seed)
        return LogisticRegression(C=lr_C,
                                  penalty=penalty,
                                  random_state=random_state)
    if model_type == 'mnb':
        return MultinomialNB(alpha=1)
    if model_type == 'svm_linear':
        return LinearSVC(C=svm_C)
    raise ValueError('Model type: \'%s\' invalid!' % model_type)


def _build_doc_pick_model(selection_strategy, seed):
    """Return the document selection object for ``selection_strategy``.

    Raises ValueError when the strategy name is not recognised.
    """
    # Lambdas defer construction so only the requested strategy is built.
    factories = {
        'random': lambda: RandomStrategy(seed),
        'unc': lambda: UNCSampling(),
        'pnc': lambda: UNCPreferNoConflict(),
        'pnr': lambda: UNCPreferNoRationale(),
        'pr': lambda: UNCPreferRationale(),
        'pc': lambda: UNCPreferConflict(),
        'tt': lambda: UNCThreeTypes(),
        'pipe': lambda: Pipe([UNCSampling(), UNCPreferConflict()], [10, 30]),
    }
    factory = factories.get(selection_strategy)
    if factory is None:
        raise ValueError('Selection strategy: \'%s\' invalid!' %
                         selection_strategy)
    return factory()


def _expert_weighted_row(X_pool, y_pool, doc_id, feature_expert, w_o, w_r,
                         rationales_c0, rationales_c1):
    """Ask the expert for a rationale of ``doc_id`` and return its weighted row.

    The rationale feature (if any) is recorded in the rationale set matching
    the document's label (``rationales_c0`` for label 0, ``rationales_c1``
    otherwise). The returned 1-row CSR matrix is a float copy of
    ``X_pool[doc_id]`` in which the rationale feature is scaled by ``w_r``
    and every other stored feature by ``w_o``.
    """
    #feature = feature_expert.most_informative_feature(X_pool[doc_id], y_pool[doc_id])
    feature = feature_expert.any_informative_feature(
        X_pool[doc_id], y_pool[doc_id])

    # Only real features are recorded; a missing rationale (None) must not
    # end up in the sets handed to the selection strategies.
    if feature is not None:
        if y_pool[doc_id] == 0:
            rationales_c0.add(feature)
        else:
            rationales_c1.add(feature)

    x = sp.csr_matrix(X_pool[doc_id], dtype=float)

    # Scale all stored entries at once instead of assigning element by element.
    if feature is None:
        is_rationale = np.zeros(x.indices.shape, dtype=bool)
    else:
        is_rationale = x.indices == feature
    # Rebind (rather than modify in place) so no buffer possibly shared with
    # X_pool is ever written to.
    x.data = x.data * np.where(is_rationale, w_r, w_o)

    return x


def learn(model_type, X_pool, y_pool, X_test, y_test, training_set, pool_set, feature_expert, \
          selection_strategy, budget, step_size, topk, w_o, w_r, seed=0, lr_C=1, svm_C=1, Debug=False):
    """Run active learning with rationale-weighted training documents.

    Every training document is turned into a row in which the feature named
    by ``feature_expert`` as its rationale is multiplied by ``w_r`` and all
    other features by ``w_o``. A model is fit on the bootstrap documents in
    ``training_set``; then, until ``budget`` training rows are reached or the
    strategy returns no documents, ``step_size`` documents are picked from
    ``pool_set``, added to the training data, and a fresh model is fit and
    evaluated on ``X_test`` / ``y_test``.

    Parameters:
        model_type: one of 'lrl2', 'lrl1', 'mnb', 'svm_linear'.
        X_pool, y_pool: feature rows and labels, indexed by document id.
        X_test, y_test: data passed to ``evaluate_model`` after every fit.
        training_set: non-empty list of bootstrap document ids; chosen ids
            are appended to it in place.
        pool_set: candidate document ids; chosen ids are removed in place.
        feature_expert: provides ``any_informative_feature(x, y)`` and an
            ``rg`` attribute with a ``seed`` method.
        selection_strategy: one of 'random', 'unc', 'pnc', 'pnr', 'pr',
            'pc', 'tt', 'pipe'.
        budget: training stops once the number of training rows reaches it.
        step_size: number of documents requested from the strategy per round.
        topk: forwarded to the rationale-aware strategies.
        w_o, w_r: weights for non-rationale and rationale features.
        seed: seeds ``feature_expert.rg``, the logistic regression models
            and the 'random' strategy.
        lr_C, svm_C: regularisation constants for the LR / linear SVM models.
        Debug: unused; kept for interface compatibility.

    Returns:
        (num_training_samples, model_scores): a numpy array with the training
        set size after each fit, and a dict with 'auc' and 'accu' score lists.

    Raises:
        ValueError: if ``training_set`` is empty, or ``model_type`` or
            ``selection_strategy`` is not recognised.
    """
    if len(training_set) == 0:
        raise ValueError('training_set must contain at least one document')

    start = time()
    print('-' * 50)
    print('Starting Active Learning...')

    model_scores = {'auc': [], 'accu': []}
    num_training_samples = []

    rationales_c0 = set()
    rationales_c1 = set()

    feature_expert.rg.seed(seed)

    # Build the bootstrap training matrix from the initial training documents.
    bootstrap_rows = []
    y_train = []
    for doc_id in training_set:
        bootstrap_rows.append(
            _expert_weighted_row(X_pool, y_pool, doc_id, feature_expert,
                                 w_o, w_r, rationales_c0, rationales_c1))
        y_train.append(y_pool[doc_id])
    X_train = sp.vstack(bootstrap_rows, format='csr')

    def train_and_score(X_current):
        # A fresh model is fit from scratch on all training rows each time.
        fitted = _build_learn_model(model_type, seed, lr_C, svm_C)
        fitted.fit(X_current, np.array(y_train))

        (accu, auc) = evaluate_model(fitted, X_test, y_test)
        model_scores['auc'].append(auc)
        model_scores['accu'].append(accu)

        num_training_samples.append(X_current.shape[0])
        return fitted

    model = train_and_score(X_train)

    # Re-seed so the expert's choices during the active-learning phase do
    # not depend on how many bootstrap documents were processed.
    feature_expert.rg.seed(seed)

    doc_pick_model = _build_doc_pick_model(selection_strategy, seed)

    while X_train.shape[0] < budget:

        # Choose documents based on the strategy chosen
        if selection_strategy in _RATIONALE_AWARE_STRATEGIES:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, step_size,
                                             rationales_c0, rationales_c1,
                                             topk)
        else:
            doc_ids = doc_pick_model.choices(model, X_pool, pool_set, step_size)

        if doc_ids is None or len(doc_ids) == 0:
            break

        new_rows = []
        for doc_id in doc_ids:
            # Remove the chosen document from pool and add it to the training set
            pool_set.remove(doc_id)
            training_set.append(doc_id)

            new_rows.append(
                _expert_weighted_row(X_pool, y_pool, doc_id, feature_expert,
                                     w_o, w_r, rationales_c0, rationales_c1))
            y_train.append(y_pool[doc_id])

        # Stack the whole batch at once rather than once per document.
        X_train = sp.vstack([X_train] + new_rows, format='csr')

        model = train_and_score(X_train)

    print('Active Learning took %2.2fs' % (time() - start))

    return (np.array(num_training_samples), model_scores)