예제 #1
0
def covering(dataset, type='any', metric='L1', C=0.1, smoothing=1e-6):
    """Count how often each feature is picked as a pool document's top feature.

    The pool split returned by ``load_dataset(dataset)`` is scanned one
    document at a time.  For every document a single feature is selected
    according to ``type`` and its counter is incremented; documents for which
    no feature can be selected are tallied per label instead.  A short summary
    is printed and both tallies are pickled to ``feature_count.pickle`` and
    ``uncovered_count.pickle`` in the current working directory.

    Args:
        dataset: identifier forwarded unchanged to ``load_dataset``.
        type: selection strategy (the name shadows the builtin but is kept
            because it is part of the public signature).
            ``'agnostic'`` ignores the label: it takes the document's top
            class-0 and top class-1 feature and keeps the one whose
            ``fe.L1_weights`` entry has the larger absolute value (class 0
            wins ties).  ``'sensitive'`` asks
            ``fe.most_informative_feature`` for the document and its label.
            Any other value -- including the default ``'any'`` -- matches
            neither branch, so every counter stays at zero.
        metric: forwarded to ``feature_expert``.
        C: forwarded to ``feature_expert``.
        smoothing: forwarded to ``feature_expert``.

    Returns:
        None.  Results are reported on stdout and through the two pickles.
    """
    # Only the pool split is needed; the remaining return values are unused.
    X_pool, y_pool, _, _, _ = load_dataset(dataset)
    num_samples, num_feat = X_pool.shape

    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    feature_count = np.zeros(num_feat)

    # no_feature_docs[0] counts documents labeled 0 that have no usable feature
    # no_feature_docs[1] counts documents labeled 1 that have no usable feature
    # NOTE(review): indexing by label assumes labels are the integers 0/1.
    no_feature_docs = np.zeros(2)

    for doc in range(num_samples):
        label = y_pool[doc]

        if type == 'agnostic':
            top_class0_feature = fe.top_n_class0_features(X_pool[doc], 1)
            top_class1_feature = fe.top_n_class1_features(X_pool[doc], 1)
            has_class0 = len(top_class0_feature) != 0
            has_class1 = len(top_class1_feature) != 0

            if not has_class0 and not has_class1:
                no_feature_docs[label] += 1
                continue

            if not has_class0:
                # no class 0 feature: class 1's top feature is the only candidate
                top_feature = top_class1_feature[0]
            elif not has_class1:
                # no class 1 feature: class 0's top feature is the only candidate
                top_feature = top_class0_feature[0]
            else:
                # both classes offer a candidate: keep the one with the larger
                # absolute weight, preferring class 0 on a tie
                class0_feature_weight = fe.L1_weights[top_class0_feature[0]]
                class1_feature_weight = fe.L1_weights[top_class1_feature[0]]

                if np.absolute(class0_feature_weight) >= np.absolute(
                        class1_feature_weight):
                    top_feature = top_class0_feature[0]
                else:
                    top_feature = top_class1_feature[0]

            feature_count[top_feature] += 1

        elif type == 'sensitive':
            feature = fe.most_informative_feature(X_pool[doc], label)
            # identity test: ``== None`` can be overridden by the operand's type
            if feature is None:
                no_feature_docs[label] += 1
            else:
                feature_count[feature] += 1

    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print('number of features needed to cover the entire corpus = %d' % len(
        np.nonzero(feature_count)[0]))
    print('number of uncovered class 0 documents: %d' % no_feature_docs[0])
    print('number of uncovered class 1 documents: %d' % no_feature_docs[1])

    # Context managers make sure the pickles are flushed and closed.
    with open('feature_count.pickle', 'wb') as count_file:
        pickle.dump(feature_count, count_file)
    with open('uncovered_count.pickle', 'wb') as uncovered_file:
        pickle.dump(no_feature_docs, uncovered_file)
예제 #2
0
def covering(dataset, type='any', metric='L1', C=0.1, smoothing=1e-6):
    """Count how often each feature is picked as a pool document's top feature.

    The pool split returned by ``load_dataset(dataset)`` is scanned one
    document at a time.  For every document a single feature is selected
    according to ``type`` and its counter is incremented; documents for which
    no feature can be selected are tallied per label instead.  A short summary
    is printed and both tallies are pickled to ``feature_count.pickle`` and
    ``uncovered_count.pickle`` in the current working directory.

    Args:
        dataset: identifier forwarded unchanged to ``load_dataset``.
        type: selection strategy (the name shadows the builtin but is kept
            because it is part of the public signature).
            ``'agnostic'`` ignores the label: it takes the document's top
            class-0 and top class-1 feature and keeps the one whose
            ``fe.L1_weights`` entry has the larger absolute value (class 0
            wins ties).  ``'sensitive'`` asks
            ``fe.most_informative_feature`` for the document and its label.
            Any other value -- including the default ``'any'`` -- matches
            neither branch, so every counter stays at zero.
        metric: forwarded to ``feature_expert``.
        C: forwarded to ``feature_expert``.
        smoothing: forwarded to ``feature_expert``.

    Returns:
        None.  Results are reported on stdout and through the two pickles.
    """
    # Only the pool split is needed; the remaining return values are unused.
    X_pool, y_pool, _, _, _ = load_dataset(dataset)
    num_samples, num_feat = X_pool.shape

    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    feature_count = np.zeros(num_feat)

    # no_feature_docs[0] counts documents labeled 0 that have no usable feature
    # no_feature_docs[1] counts documents labeled 1 that have no usable feature
    # NOTE(review): indexing by label assumes labels are the integers 0/1.
    no_feature_docs = np.zeros(2)

    for doc in range(num_samples):
        label = y_pool[doc]

        if type == 'agnostic':
            top_class0_feature = fe.top_n_class0_features(X_pool[doc], 1)
            top_class1_feature = fe.top_n_class1_features(X_pool[doc], 1)
            has_class0 = len(top_class0_feature) != 0
            has_class1 = len(top_class1_feature) != 0

            if not has_class0 and not has_class1:
                no_feature_docs[label] += 1
                continue

            if not has_class0:
                # no class 0 feature: class 1's top feature is the only candidate
                top_feature = top_class1_feature[0]
            elif not has_class1:
                # no class 1 feature: class 0's top feature is the only candidate
                top_feature = top_class0_feature[0]
            else:
                # both classes offer a candidate: keep the one with the larger
                # absolute weight, preferring class 0 on a tie
                class0_feature_weight = fe.L1_weights[top_class0_feature[0]]
                class1_feature_weight = fe.L1_weights[top_class1_feature[0]]

                if np.absolute(class0_feature_weight) >= np.absolute(class1_feature_weight):
                    top_feature = top_class0_feature[0]
                else:
                    top_feature = top_class1_feature[0]

            feature_count[top_feature] += 1

        elif type == 'sensitive':
            feature = fe.most_informative_feature(X_pool[doc], label)
            # identity test: ``== None`` can be overridden by the operand's type
            if feature is None:
                no_feature_docs[label] += 1
            else:
                feature_count[feature] += 1

    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print('number of features needed to cover the entire corpus = %d' % len(np.nonzero(feature_count)[0]))
    print('number of uncovered class 0 documents: %d' % no_feature_docs[0])
    print('number of uncovered class 1 documents: %d' % no_feature_docs[1])

    # Context managers make sure the pickles are flushed and closed.
    with open('feature_count.pickle', 'wb') as count_file:
        pickle.dump(feature_count, count_file)
    with open('uncovered_count.pickle', 'wb') as uncovered_file:
        pickle.dump(no_feature_docs, uncovered_file)
예제 #3
0
def IM_explore(num_trials, dataset, bootstrap_size=0, balance=True, budget=500, seed=2343, Debug=False):
    """Run repeated 'random' learning trials for a fixed set of instance models.

    For each of three models (one MultinomialNB and two L1 LogisticRegression
    settings) the function runs ``num_trials`` trials.  Trial ``i`` uses seed
    ``seed + i`` both for ``RandomBootstrap`` and for ``no_reasoning_learn``.
    The per-trial results are passed through ``average_results`` and handed to
    ``save_result`` under the filename ``<dataset>_<model key>_result.txt``;
    a list-valued ``dataset`` is joined with underscores first.

    Args:
        num_trials: number of trials per model.
        dataset: forwarded to ``load_dataset``; a string or a list of strings.
        bootstrap_size: forwarded to ``RandomBootstrap``.
        balance: forwarded to ``RandomBootstrap``.
        budget: forwarded to ``no_reasoning_learn``.
        seed: base seed; each trial adds its index to it.
        Debug: forwarded to ``no_reasoning_learn``.

    Returns:
        None.
    """
    divider = '-' * 50

    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)

    # The keys double as the model part of the output filename.
    models = {
        'MultinomialNB(alpha=1)': MultinomialNB(alpha=1),
        "LogisticRegression(C=0.1, penalty='l1')": LogisticRegression(C=0.1, penalty='l1'),
        "LogisticRegression(C=1, penalty='l1')": LogisticRegression(C=1, penalty='l1'),
    }

    print(divider)
    print('Instance Model Performance Evaluation')

    # One slot per trial; the array is overwritten for every model.
    result = np.ndarray(num_trials, dtype=object)

    for model_key, learner in models.items():
        print(divider)
        print('Instance Model: %s' % learner)

        for trial in range(num_trials):
            print(divider)
            print('Starting Trial %d of %d...' % (trial + 1, num_trials))

            # every trial gets its own deterministic seed
            trial_seed = seed + trial

            training_set, pool_set = RandomBootstrap(
                X_pool, y_pool, bootstrap_size, balance, trial_seed)

            result[trial] = no_reasoning_learn(
                X_pool, y_pool, X_test, y_test, training_set, pool_set,
                'random', budget, learner, trial_seed, Debug=Debug)

        # A list of dataset names collapses into one underscore-joined prefix.
        prefix = '_'.join(dataset) if isinstance(dataset, list) else dataset
        save_result(average_results(result),
                    filename='_'.join([prefix, model_key, 'result.txt']))
예제 #4
0
 # The help text used to be built from two literals joined without a space
 # ("...used topartition..."); it is now one correctly spaced literal.
 parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'],
                     help='Type of metric used to partition the features into the two classes')
 args = parser.parse_args()

 # Binary unigram vectorizer; the minimum document frequency comes from args.d.
 vect = CountVectorizer(min_df=args.d, max_df=1.0, binary=True, ngram_range=(1, 1))

 # Each branch must define X_pool, y_pool, X_test, y_test, the raw document
 # lists and feature_names.
 # NOTE(review): there is no fallback branch, so any other args.dataset value
 # leaves these names unset unless they are defined earlier -- confirm.
 if args.dataset == 'imdb':
     X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
         path='./aclImdb', shuffle=True, vectorizer=vect)
     feature_names = np.array(vect.get_feature_names())
 elif args.dataset == '20newsgroups':
     X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(
         args.cat[0], args.cat[1], shuffle=True, random_state=42,
         remove=('headers', 'footers'), vectorizer=vect)
     feature_names = vect.get_feature_names()
 elif args.dataset == 'SRAA':
     X_pool, y_pool, X_test, y_test, feat_names = load_dataset(args.dataset, vect=vect)

     # Raw documents and feature names come from pickles in the working
     # directory.  pickle can execute code on load, so these files must be
     # trusted, locally produced artifacts.  The context managers close each
     # handle as soon as it has been read.
     with open('SRAA_X_train_corpus.pickle', 'rb') as pickle_file:
         X_pool_docs = pickle.load(pickle_file)
     with open('SRAA_X_test_corpus.pickle', 'rb') as pickle_file:
         X_test_docs = pickle.load(pickle_file)
     with open('SRAA_feature_names.pickle', 'rb') as pickle_file:
         feature_names = pickle.load(pickle_file)

     # NOTE(review): this summary is only printed for SRAA, as in the original
     # indentation -- confirm whether it was meant for every dataset.
     print("n_samples: %d, n_features: %d" % X_pool.shape)

 fe = alt_L1_feature_expert(X_pool, y_pool, args.type, smoothing=1e-6, C=args.c)

 # Single-argument print(...) behaves identically on Python 2 and Python 3.
 # Each entry is printed as (feature index, feature name, L1 weight).
 print('class 0 features (ranked):')
 print(', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class0_features_by_rank()]))
 print('-' * 50)

 print('class 1 features (ranked):')
 print(', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class1_features_by_rank()]))
 print('-' * 50)
예제 #5
0
    # Binary unigram vectorizer; the minimum document frequency comes from
    # args.d (parsed outside this excerpt).
    vect = CountVectorizer(min_df=args.d,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    # Each branch defines X_pool, y_pool, X_test, y_test, the raw document
    # lists and feature_names.
    # NOTE(review): there is no fallback branch, so any other args.dataset
    # value leaves these names unset unless they are defined earlier -- confirm.
    if args.dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            path='./aclImdb', shuffle=True, vectorizer=vect)
        # Only this branch wraps the feature names in a numpy array.
        feature_names = np.array(vect.get_feature_names())
    elif args.dataset == '20newsgroups':
        # The two group names are taken from args.cat[0] and args.cat[1].
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
            load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42, \
                remove=('headers', 'footers'), vectorizer=vect)
        feature_names = vect.get_feature_names()
    elif args.dataset == 'SRAA':
        # feat_names returned here is not used in the visible code; the names
        # used below come from the pickle instead.
        X_pool, y_pool, X_test, y_test, feat_names = load_dataset(args.dataset,
                                                                  vect=vect)
        # Raw documents and feature names are read from pickles in the current
        # working directory.
        # NOTE(review): the open() handles are never closed explicitly.
        X_pool_docs = pickle.load(open('SRAA_X_train_corpus.pickle', 'rb'))
        X_test_docs = pickle.load(open('SRAA_X_test_corpus.pickle', 'rb'))
        feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))

        # NOTE(review): as indented, this summary is printed for SRAA only --
        # confirm whether it was meant for every dataset.
        print "n_samples: %d, n_features: %d" % X_pool.shape

    # Feature expert built on the pool split; args.type and args.c come from
    # the command line.
    fe = alt_L1_feature_expert(X_pool,
                               y_pool,
                               args.type,
                               smoothing=1e-6,
                               C=args.c)

    # Header for the ranked class 0 feature listing that follows.
    print 'class 0 features (ranked):'
    print ', '.join([
        str((f, feature_names[f], fe.L1_weights[f]))
예제 #6
0
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from models import FeatureMNBUniform

if __name__ == '__main__':
    # ---- command-line options -------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-dataset', default=['imdb'], nargs='*',
        help="Dataset to be used: ['imdb', '20newsgroups'] 20newsgroups must have 2 valid group names")
    parser.add_argument(
        '-c', type=float, default=0.1,
        help='Penalty term for the L1 feature expert')
    parser.add_argument(
        '-k', type=int, default=10,
        help='number of features to use from each class')
    parser.add_argument(
        '-smoothing', type=float, default=0,
        help='smoothing parameter for the feature MNB model')

    args = parser.parse_args()

    # ---- data -----------------------------------------------------------
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(args.dataset)

    # ---- baseline instance models, keyed by a descriptive label ---------
    models = {
        'MultinomialNB(alpha=1)': MultinomialNB(alpha=1),
        "LogisticRegression(C=1, penalty='l1')": LogisticRegression(C=1, penalty='l1'),
        "LogisticRegression(C=0.1, penalty='l1')": LogisticRegression(C=0.1, penalty='l1'),
    }

    # Fit every model on the pool split and keep the second value returned by
    # evaluate_model (stored as the model's AUC) under the same label.
    aucs = {}

    for mk in models:
        models[mk].fit(X_pool, y_pool)
        _, auc = evaluate_model(models[mk], X_test, y_test)
        aucs[mk] = auc

    # ---- feature expert and the feature model built from its rankings ---
    fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)

    all_feature_model = FeatureMNBUniform(
        fe.feature_rank[0],
        fe.feature_rank[1],
        fe.num_features,
        smoothing=args.smoothing)
예제 #7
0
    # Remaining command-line options; `parser` is created outside this excerpt.
    # -c: float, default 0.1; passed to feature_expert as C below.
    parser.add_argument('-c',
                        type=float,
                        default=0.1,
                        help='Penalty term for the L1 feature expert')
    # -k: int, default 10.  Not read anywhere in the visible code.
    parser.add_argument('-k',
                        type=int,
                        default=10,
                        help='number of features to use from each class')
    # -smoothing: float, default 0.
    parser.add_argument('-smoothing',
                        type=float,
                        default=0,
                        help='smoothing parameter for the feature MNB model')

    args = parser.parse_args()

    # Load the pool/test splits for the dataset named on the command line.
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)

    # Baseline instance models, keyed by a descriptive label.
    models = {'MultinomialNB(alpha=1)':MultinomialNB(alpha=1), \
              'LogisticRegression(C=1, penalty=\'l1\')':LogisticRegression(C=1, penalty='l1'), \
              'LogisticRegression(C=0.1, penalty=\'l1\')':LogisticRegression(C=0.1, penalty='l1')}

    # Label -> second value returned by evaluate_model for that model.
    aucs = {}

    # Fit each model on the pool split and evaluate it on the test split.
    for mk in models.keys():
        models[mk].fit(X_pool, y_pool)
        _, auc = evaluate_model(models[mk], X_test, y_test)
        aucs[mk] = auc

    # Feature expert built on the pool split with the "L1" metric and C from -c.
    fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)

    all_feature_model = FeatureMNBUniform(fe.feature_rank[0],