def covering(dataset, type='any', metric='L1', C=0.1, smoothing=1e-6):
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)
    num_samples, num_feat = X_pool.shape
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    feature_count = np.zeros(num_feat)
    # no_feature_docs[0] counts # of documents labeled 0 but without any features
    # no_feature_docs[1] counts # of documents labeled 1 but without any features
    no_feature_docs = np.zeros(2)

    # note: only type='agnostic' and type='sensitive' are handled below; any
    # other value (including the default 'any') leaves all counts at zero
    for doc in range(num_samples):
        label = y_pool[doc]
        if type == 'agnostic':
            top_class0_feature = fe.top_n_class0_features(X_pool[doc], 1)
            top_class1_feature = fe.top_n_class1_features(X_pool[doc], 1)

            if len(top_class0_feature) == 0 and len(top_class1_feature) == 0:
                no_feature_docs[label] += 1
            elif len(top_class0_feature) == 0 and len(top_class1_feature) != 0:
                # if there is no class 0 feature, the top feature is class 1's top feature
                top_feature = top_class1_feature[0]
                feature_count[top_feature] += 1
            elif len(top_class0_feature) != 0 and len(top_class1_feature) == 0:
                # if there is no class 1 feature, the top feature is class 0's top feature
                top_feature = top_class0_feature[0]
                feature_count[top_feature] += 1
            else:
                # if both classes have a valid top feature, compare the absolute
                # values of the two weights to determine this document's top feature
                class0_feature_weight = fe.L1_weights[top_class0_feature[0]]
                class1_feature_weight = fe.L1_weights[top_class1_feature[0]]

                if np.absolute(class0_feature_weight) >= np.absolute(class1_feature_weight):
                    top_feature = top_class0_feature[0]
                else:
                    top_feature = top_class1_feature[0]

                feature_count[top_feature] += 1
        elif type == 'sensitive':
            feature = fe.most_informative_feature(X_pool[doc], label)

            if feature is None:
                no_feature_docs[label] += 1
            else:
                feature_count[feature] += 1

    print 'number of features needed to cover the entire corpus = %d' % len(np.nonzero(feature_count)[0])
    print 'number of uncovered class 0 documents: %d' % no_feature_docs[0]
    print 'number of uncovered class 1 documents: %d' % no_feature_docs[1]

    pickle.dump(feature_count, open('feature_count.pickle', 'wb'))
    pickle.dump(no_feature_docs, open('uncovered_count.pickle', 'wb'))
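# Hedged usage sketch (added for illustration, not part of the original file):
# 'imdb' is an assumed dataset name accepted by load_dataset(). covering()
# overwrites feature_count.pickle and uncovered_count.pickle on every call,
# so run one mode at a time and move the pickles aside between runs if both
# results are needed.
def covering_demo(dataset='imdb'):
    covering(dataset, type='agnostic', metric='L1', C=0.1, smoothing=1e-6)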
def IM_explore(num_trials, dataset, bootstrap_size=0, balance=True, budget=500, seed=2343, Debug=False):
    sep = '-' * 50
    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(dataset)

    models = {'MultinomialNB(alpha=1)': MultinomialNB(alpha=1), \
              'LogisticRegression(C=0.1, penalty=\'l1\')': LogisticRegression(C=0.1, penalty='l1'), \
              'LogisticRegression(C=1, penalty=\'l1\')': LogisticRegression(C=1, penalty='l1')}
    # models = {'LogisticRegression(C=0.1, penalty=\'l1\')': LogisticRegression(C=0.1, penalty='l1')}

    print sep
    print 'Instance Model Performance Evaluation'

    result = np.ndarray(num_trials, dtype=object)

    for model in models.keys():
        print sep
        print 'Instance Model: %s' % models[model]

        for i in range(num_trials):
            print sep
            print 'Starting Trial %d of %d...' % (i + 1, num_trials)

            trial_seed = seed + i  # initialize the seed for the trial
            training_set, pool_set = RandomBootstrap(X_pool, y_pool, bootstrap_size, balance, trial_seed)

            result[i] = no_reasoning_learn(X_pool, y_pool, X_test, y_test, training_set, pool_set, \
                                           'random', budget, models[model], trial_seed, Debug=Debug)
            # save_result(result[i], filename='_'.join([dataset, 'trial'+str(i), 'result.txt']))

        if isinstance(dataset, list):
            name = '_'.join(dataset)
            save_result(average_results(result), filename='_'.join([name, model, 'result.txt']))
        else:
            save_result(average_results(result), filename='_'.join([dataset, model, 'result.txt']))
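# Hedged usage sketch: ten random-sampling trials per instance model. The
# bootstrap size below is an illustrative value, not one taken from the
# original experiments; one averaged result file is written per model.
def IM_explore_demo(dataset='imdb'):
    IM_explore(num_trials=10, dataset=dataset, bootstrap_size=10, balance=True,
               budget=500, seed=2343, Debug=False)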
parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'], \
                    help='Type of metric used to partition the features into the two classes')
args = parser.parse_args()

vect = CountVectorizer(min_df=args.d, max_df=1.0, binary=True, ngram_range=(1, 1))

if args.dataset == 'imdb':
    X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_imdb(path='./aclImdb', shuffle=True, vectorizer=vect)
    feature_names = np.array(vect.get_feature_names())
elif args.dataset == '20newsgroups':
    X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42, \
                        remove=('headers', 'footers'), vectorizer=vect)
    feature_names = vect.get_feature_names()
elif args.dataset == 'SRAA':
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(args.dataset, vect=vect)
    X_pool_docs = pickle.load(open('SRAA_X_train_corpus.pickle', 'rb'))
    X_test_docs = pickle.load(open('SRAA_X_test_corpus.pickle', 'rb'))
    feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))

print "n_samples: %d, n_features: %d" % X_pool.shape

fe = alt_L1_feature_expert(X_pool, y_pool, args.type, smoothing=1e-6, C=args.c)

print 'class 0 features (ranked):'
print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class0_features_by_rank()])
print '-' * 50
print 'class 1 features (ranked):'
print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class1_features_by_rank()])
print '-' * 50
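# Hedged follow-up sketch: the full ranked lists can run to thousands of
# features, so printing only the top k per class is often more readable. The
# slicing assumes class0_features_by_rank()/class1_features_by_rank() return
# feature indices in rank order, as the prints above suggest; k is illustrative.
k = 20
print 'top %d class 0 features:' % k
print ', '.join([str((feature_names[f], round(fe.L1_weights[f], 4))) for f in fe.class0_features_by_rank()[:k]])
print 'top %d class 1 features:' % k
print ', '.join([str((feature_names[f], round(fe.L1_weights[f], 4))) for f in fe.class1_features_by_rank()[:k]])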
import argparse

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from models import FeatureMNBUniform

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default=['imdb'], nargs='*', \
                        help='Dataset to be used: [\'imdb\', \'20newsgroups\']; 20newsgroups must have 2 valid group names')
    parser.add_argument('-c', type=float, default=0.1, help='Penalty term for the L1 feature expert')
    parser.add_argument('-k', type=int, default=10, help='number of features to use from each class')
    parser.add_argument('-smoothing', type=float, default=0, help='smoothing parameter for the feature MNB model')
    args = parser.parse_args()

    (X_pool, y_pool, X_test, y_test, feat_names) = load_dataset(args.dataset)

    models = {'MultinomialNB(alpha=1)': MultinomialNB(alpha=1), \
              'LogisticRegression(C=1, penalty=\'l1\')': LogisticRegression(C=1, penalty='l1'), \
              'LogisticRegression(C=0.1, penalty=\'l1\')': LogisticRegression(C=0.1, penalty='l1')}

    aucs = {}

    for mk in models.keys():
        models[mk].fit(X_pool, y_pool)
        _, auc = evaluate_model(models[mk], X_test, y_test)
        aucs[mk] = auc

    fe = feature_expert(X_pool, y_pool, metric="L1", C=args.c)
    all_feature_model = FeatureMNBUniform(fe.feature_rank[0], fe.feature_rank[1], \
                                          fe.num_features, smoothing=args.smoothing)
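    # Hedged continuation sketch: the -k flag above is unused in this excerpt;
    # one plausible use is a feature model restricted to the top k features per
    # class. The slicing assumes fe.feature_rank[0] and fe.feature_rank[1] list
    # feature indices in rank order, mirroring the all-feature model above.
    top_k_model = FeatureMNBUniform(fe.feature_rank[0][:args.k], fe.feature_rank[1][:args.k], \
                                    fe.num_features, smoothing=args.smoothing)

    # Evaluating the feature models would mirror the instance-model loop above,
    # assuming evaluate_model() accepts any classifier exposing predict_proba:
    # _, auc = evaluate_model(all_feature_model, X_test, y_test)
    for mk in aucs:
        print '%s AUC = %0.4f' % (mk, aucs[mk])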