def check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10, smoothing=1e-6, C=0.1,
                         vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):
    class_label = {0: 'negative', 1: 'positive'}

    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(dataset[1], dataset[2], vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))

    print '\n'
    print '=' * 50

    for doc in doc_ids:
        print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool, X_pool_docs)
        print '=' * 50
        ch = raw_input('Display the next document? Press Enter to continue or type \'n\' to exit... ')
        if ch == 'n':
            break

    return
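# Hedged usage sketch (not part of the original source): check_feature_expert can be
# pointed at a 20newsgroups pair by passing a 3-tuple whose first element is
# 'newsgroup'. The wrapper name and the category pair below are only examples.
def demo_check_newsgroup_expert():
    check_feature_expert(dataset=('newsgroup', 'alt.atheism', 'talk.religion.misc'),
                         metric='L1', top_n=10, C=0.1)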
def load_dataset(dataset):
    if dataset == ['imdb']:
        #(X_pool, y_pool, X_test, y_test) = load_data()
        #vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = load_imdb(path='./aclImdb/', shuffle=True, vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(dataset, list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
            load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('./SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('./SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('./SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('./SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('./SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['ibnsina']:
        (X_pool, y_pool, X_test, y_test) = load_ibnsina()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['creditg']:
        (X_pool, y_pool, X_test, y_test) = load_creditg()
        return (X_pool, y_pool, X_test, y_test, None)
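# Hedged usage sketch (not part of the original source): a tiny driver showing how
# the tuple returned by load_dataset() might feed a classifier. Assumes scikit-learn
# is installed and the imdb corpus is available at ./aclImdb/; the function name
# demo_load_and_train is hypothetical.
def demo_load_and_train(dataset=['imdb']):
    from sklearn.linear_model import LogisticRegression
    X_pool, y_pool, X_test, y_test, feat_names = load_dataset(dataset)
    clf = LogisticRegression(penalty='l1', C=0.1)
    clf.fit(X_pool, y_pool)
    print 'held-out accuracy: %0.4f' % clf.score(X_test, y_test)
    return clf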
def output_features(filename='features.txt', dataset='imdb', metric='L1', smoothing=1e-6, C=0.1,
                    vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):
    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(dataset[1], dataset[2], vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'saving into \'%s\'...' % filename
    with open(filename, 'w') as f:
        f.write('-' * 50 + '\n')
        f.write('class 0 features:\n')
        f.write('-' * 50 + '\n')
        c0_feat = fe.class0_features_by_rank()
        for i in range(len(c0_feat)):
            feature = c0_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') + ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

        f.write('-' * 50 + '\n')
        f.write('class 1 features:\n')
        f.write('-' * 50 + '\n')
        c1_feat = fe.class1_features_by_rank()
        for i in range(len(c1_feat)):
            feature = c1_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') + ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

    return
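# Hedged usage sketch (not part of the original source): dumping the ranked L1
# features for one 20newsgroups pair to a text file. The output filename and the
# category pair are arbitrary examples.
def demo_output_newsgroup_features():
    output_features(filename='atheism_vs_religion_features.txt',
                    dataset=('newsgroup', 'alt.atheism', 'talk.religion.misc'),
                    metric='L1', C=0.1)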
python test1.py -cat comp.os.ms-windows.misc comp.sys.ibm.pc.hardware
python test1.py -cat rec.sport.baseball sci.crypt
'''

parser = argparse.ArgumentParser()
parser.add_argument('-cat', default=['alt.atheism', 'talk.religion.misc'], nargs=2,
                    help='2 class labels from the 20newsgroup dataset')
parser.add_argument('-c', type=float, default=0.1,
                    help='Penalty term for the L1 feature expert')
parser.add_argument('-d', type=int, default=5, help='Min_df for CountVectorizer')
parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'],
                    help='Type of metric used to partition the features into the two classes')
args = parser.parse_args()

vect = CountVectorizer(min_df=args.d, max_df=1.0, binary=True, ngram_range=(1, 1))

X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
    load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42,
                    remove=('headers', 'footers'), vectorizer=vect)
feature_names = vect.get_feature_names()
fe = alt_L1_feature_expert(X_pool, y_pool, args.type, smoothing=1e-6, C=args.c)

print 'class 0 features (ranked):'
print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class0_features_by_rank()])
print '-' * 50

print 'class 1 features (ranked):'
print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class1_features_by_rank()])
print '-' * 50

doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))
top_n = 20
def covering(dataset='imdb', first='positive', agreement='any', metric='mutual_info', smoothing=1e-6, C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0

    class_label = {0: 'negative', 1: 'positive'}
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))

    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
            load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'class 0 features (ranked):'
    print ', '.join([str((f, feature_names[f])) for f in fe.class0_features_by_rank()])
    print 'class 1 features (ranked):'
    print ', '.join([str((f, feature_names[f])) for f in fe.class1_features_by_rank()])

    sample_pool = range(X_pool.shape[0])
    feature_list = list()
    X_csc = X_pool.tocsc()
    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2  # label for the document
        rank = feature_num / 2  # rank of the feature in the list
        feature_num += 1

        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print '*' * 50
            print ', '.join(['#' + str(doc) for doc in sample_pool]) + ' are uncovered'
            for doc in sample_pool:
                print '-' * 50
                print 'Document #%d:' % doc
                print '=' * 50
                print 'length = %d' % len(X_pool_docs[doc])
                print X_pool_docs[doc]
                print '=' * 50
                print X_pool[doc].indices
            break

        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices
        docs_in_pool_with_feature = list(set(sample_pool).intersection(set(docs_with_feature)))

        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])
            poolsize_before_removal = len(sample_pool)

            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(set(docs_in_pool_with_feature).intersection(set(docs_with_label)))
                sample_pool = list(set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(set(sample_pool).difference(set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)
            feature_list.append(result)

    return feature_list
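# Hedged usage sketch (not part of the original source): iterate over the per-feature
# dictionaries returned by covering() and report how quickly the pool is covered.
# Only keys that covering() actually sets are used here.
def demo_covering_report(dataset='imdb'):
    feature_list = covering(dataset=dataset, first='positive', agreement='any')
    for i, r in enumerate(feature_list):
        print '#%d %s (%s): covered %d docs (%d pos, %d neg), pool %d -> %d' % \
            (i, r['name'], r['class'], r['num_docs_covered'],
             r['num_positive_docs'], r['num_negative_docs'],
             r['poolsize_before_removal'], r['poolsize_after_removal'])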
        index += 1

    return feature_cover_counts, uncovered_docs

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default=['imdb'], nargs='*',
                        help='Dataset to be used: [\'imdb\', \'20newsgroups\'] 20newsgroups must have 2 valid group names')
    args = parser.parse_args()

    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))

    if args.dataset == ['imdb']:
        X_pool, y_pool, _, _, _, _ = load_imdb("../aclImdb", shuffle=True, vectorizer=vect)
    elif len(args.dataset) == 3 and args.dataset[0] == '20newsgroups':
        X_pool, y_pool, _, _, _, _ = load_newsgroups(args.dataset[1], args.dataset[2], shuffle=True, vectorizer=vect)
    elif args.dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
    elif args.dataset == ['nova']:
        X_pool, y_pool, _, _ = load_nova()
    else:
        raise ValueError('Invalid Dataset!')

    if args.dataset != ['SRAA'] and args.dataset != ['nova']:
        feature_names = vect.get_feature_names()
    elif args.dataset == ['SRAA']:
        feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
    # Note: nova dataset has no feature_names

    clf_l1 = LogisticRegression(penalty='l1', C=0.1)