Example #1
def load_dataset(dataset):
    if dataset == ['imdb']:
        #(X_pool, y_pool, X_test, y_test) = load_data()
        #vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5,
                               max_df=1.0,
                               binary=True,
                               ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _, = load_imdb(
            path='C:\\Users\\mbilgic\\Desktop\\aclImdb',
            shuffle=True,
            vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(
            dataset,
            list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5,
                               max_df=1.0,
                               binary=True,
                               ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
def load_Debug_data(top_n=10, min_df=5, max_df=1.0, binary=True, ngram_range=(1,1), \
                    shuffle=False, path='./aclImdb'):
    vect = CountVectorizer(min_df=min_df, max_df=max_df, binary=binary, ngram_range=ngram_range)
    print '=' * 50
    X_pool, y_pool, _, _, X_pool_docs, _ = load_imdb(path, shuffle=shuffle, vectorizer=vect)
    feature_names = np.array(vect.get_feature_names())
    return (top_n, X_pool, y_pool, X_pool_docs, feature_names)
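A short usage sketch for the two helpers above. The corpus paths and the availability of load_imdb / load_newsgroups in the surrounding project are assumptions; the list-valued dataset argument mirrors the equality checks inside load_dataset.
# Hypothetical usage; paths and loader modules are assumed to exist locally.
X_pool, y_pool, X_test, y_test, feat_names = load_dataset(['imdb'])
print 'pool: %s, test: %s' % (X_pool.shape, X_test.shape)

# The 20newsgroups variant expects the dataset id plus two group names:
X_pool, y_pool, X_test, y_test, feat_names = \
    load_dataset(['20newsgroups', 'alt.atheism', 'talk.religion.misc'])

# load_Debug_data additionally returns the raw pool documents for inspection:
top_n, X_pool, y_pool, X_pool_docs, feature_names = load_Debug_data(top_n=20,
                                                                    path='./aclImdb')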
Example #3
def check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10, smoothing=1e-6, C=0.1, \
                         vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):
    class_label = {0:'negative', 1:'positive'}
    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(dataset[1], dataset[2], vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))
    
    print '\n'
    print '=' * 50
    
    for doc in doc_ids:
        print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool, X_pool_docs)
        
        print '=' * 50
        ch = raw_input('Display the next document? Press Enter to continue or type \'n\' to exit...  ')
        
        if ch == 'n':
            break
    
    return
Example #4
def check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10, smoothing=1e-6, C=0.1, \
                         vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):
    class_label = {0: 'negative', 1: 'positive'}
    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset,
                    tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(
            dataset[1], dataset[2], vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))

    print '\n'
    print '=' * 50

    for doc in doc_ids:
        print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool,
                           X_pool_docs)

        print '=' * 50
        ch = raw_input(
            'Display the next document? Press Enter to continue or type \'n\' to exit...  '
        )

        if ch == 'n':
            break

    return
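check_feature_expert is interactive (it blocks on raw_input between documents). A minimal sketch of invoking it, assuming the IMDB corpus sits at ./aclImdb and that feature_expert and print_all_features are importable:
# Walk the pool in random order, printing the top-n expert features per document;
# press Enter to advance, type 'n' to stop.
check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10, C=0.1)

# Two-class newsgroups variant; the dataset argument is a 3-tuple here:
# check_feature_expert(dataset=('newsgroup', 'alt.atheism', 'talk.religion.misc'))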
Example #5
def load_dataset(dataset):
    if dataset == ['imdb']:
        #(X_pool, y_pool, X_test, y_test) = load_data()
        #vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1,1))        
        X_pool, y_pool, X_test, y_test, _, _, = load_imdb(path='./aclImdb/', shuffle=True, vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(dataset, list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('./SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('./SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('./SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('./SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('./SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['ibnsina']:
        (X_pool, y_pool, X_test, y_test) = load_ibnsina()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['creditg']:
        (X_pool, y_pool, X_test, y_test) = load_creditg()
        return (X_pool, y_pool, X_test, y_test, None)
Example #6
def write_features(filename='feature.txt'):
    with open(filename, 'w') as f:
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        (X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs) = \
            load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
        feature_names = vect.get_feature_names()
        for feature in feature_names:
            f.write(feature.encode('utf8') + '\n')
    return
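A usage sketch for write_features; the output filename is arbitrary and the ./aclImdb path hard-coded inside the function is assumed to exist:
# Write the IMDB vocabulary, one UTF-8 encoded feature per line.
write_features(filename='imdb_vocab.txt')

with open('imdb_vocab.txt') as f:
    vocab = [line.rstrip('\n') for line in f]
print '%d features written' % len(vocab)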
Example #7
def write_features(filename='feature.txt'):
    with open(filename, 'w') as f:
        vect = CountVectorizer(min_df=5,
                               max_df=1.0,
                               binary=True,
                               ngram_range=(1, 1))
        (X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs) = \
            load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
        feature_names = vect.get_feature_names()
        for feature in feature_names:
            f.write(feature.encode('utf8') + '\n')
    return
def load_Debug_data(top_n=10, min_df=5, max_df=1.0, binary=True, ngram_range=(1,1), \
                    shuffle=False, path='./aclImdb'):
    vect = CountVectorizer(min_df=min_df,
                           max_df=max_df,
                           binary=binary,
                           ngram_range=ngram_range)
    print '=' * 50
    X_pool, y_pool, _, _, X_pool_docs, _ = load_imdb(path,
                                                     shuffle=shuffle,
                                                     vectorizer=vect)
    feature_names = np.array(vect.get_feature_names())
    return (top_n, X_pool, y_pool, X_pool_docs, feature_names)
Example #9
def output_features(filename='features.txt', dataset='imdb', metric='L1', smoothing=1e-6, C=0.1, \
                    vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):

    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset,
                    tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(
            dataset[1], dataset[2], vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'saving into \'%s\'...' % filename
    with open(filename, 'w') as f:
        f.write('-' * 50 + '\n')
        f.write('class 0 features:\n')
        f.write('-' * 50 + '\n')
        c0_feat = fe.class0_features_by_rank()
        for i in range(len(c0_feat)):
            feature = c0_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') +
                    ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

        f.write('-' * 50 + '\n')
        f.write('class 1 features:\n')
        f.write('-' * 50 + '\n')
        c1_feat = fe.class1_features_by_rank()
        for i in range(len(c1_feat)):
            feature = c1_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') +
                    ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

    return
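A sketch of calling output_features; the filename and penalty value are illustrative, and the IMDB corpus is assumed to be at ./aclImdb:
# Rank features with the L1 feature expert and dump both class lists to disk.
output_features(filename='imdb_L1_features.txt', dataset='imdb',
                metric='L1', smoothing=1e-6, C=0.1)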
Example #10
def output_features(filename='features.txt', dataset='imdb', metric='L1', smoothing=1e-6, C=0.1, \
                    vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):

    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(dataset[1], dataset[2], vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'saving into \'%s\'...' % filename
    with open(filename, 'w') as f:
        f.write('-' * 50 + '\n')
        f.write('class 0 features:\n')
        f.write('-' * 50 + '\n')
        c0_feat = fe.class0_features_by_rank()
        for i in range(len(c0_feat)):
            feature = c0_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') + ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

        f.write('-' * 50 + '\n')
        f.write('class 1 features:\n')
        f.write('-' * 50 + '\n')
        c1_feat = fe.class1_features_by_rank()
        for i in range(len(c1_feat)):
            feature = c1_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') + ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n') 
    
    return
Example #11
def covering(dataset='imdb',
             first='positive',
             agreement='any',
             metric='mutual_info',
             smoothing=1e-6,
             C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0
    class_label = {0: 'negative', 1: 'positive'}
    vect = CountVectorizer(min_df=5,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(
            dataset,
            tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, \
            vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'class 0 features (ranked):'
    print ', '.join(
        [str((f, feature_names[f])) for f in fe.class0_features_by_rank()])

    print 'class 1 features (ranked):'
    print ', '.join(
        [str((f, feature_names[f])) for f in fe.class1_features_by_rank()])

    sample_pool = range(X_pool.shape[0])
    feature_list = list()
    X_csc = X_pool.tocsc()

    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2  # label for the document
        rank = feature_num / 2  # rank of the feature in the list
        feature_num += 1

        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print '*' * 50
            print ', '.join(['#' + str(doc)
                             for doc in sample_pool]) + ' are uncovered'
            for doc in sample_pool:
                print '-' * 50
                print 'Document #%d:' % doc
                print '=' * 50
                print 'length = %d' % len(X_pool_docs[doc])
                print X_pool_docs[doc]
                print '=' * 50
                print X_pool[doc].indices
            break

        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices

        docs_in_pool_with_feature = list(
            set(sample_pool).intersection(set(docs_with_feature)))
        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(
                np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(
                np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])

            poolsize_before_removal = len(sample_pool)

            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(
                    set(docs_in_pool_with_feature).intersection(
                        set(docs_with_label)))
                sample_pool = list(
                    set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(
                    set(sample_pool).difference(
                        set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)

            feature_list.append(result)

    return feature_list
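covering returns a list of dictionaries, one per selected feature. A sketch of calling it and summarising the greedy cover, using the keys of the result dict built above; the call parameters are illustrative:
# Greedily cover the IMDB pool, alternating classes, starting with 'positive'.
cover = covering(dataset='imdb', first='positive', agreement='any',
                 metric='mutual_info', smoothing=1e-6, C=1)

for r in cover[:5]:
    print '%s (%s): covered %d docs (%d pos / %d neg), pool %d -> %d' % \
        (r['name'], r['class'], r['num_docs_covered'],
         r['num_positive_docs'], r['num_negative_docs'],
         r['poolsize_before_removal'], r['poolsize_after_removal'])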
Example #12
    return feature_cover_counts, uncovered_docs


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default=['imdb'], nargs='*', \
                    help='Dataset to be used: [\'imdb\', \'20newsgroups\'] 20newsgroups must have 2 valid group names')
    args = parser.parse_args()
    vect = CountVectorizer(min_df=5,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    if args.dataset == ['imdb']:
        X_pool, y_pool, _, _, _, _ = load_imdb("../aclImdb",
                                               shuffle=True,
                                               vectorizer=vect)
    elif len(args.dataset) == 3 and args.dataset[0] == '20newsgroups':
        X_pool, y_pool, _, _, _, _ = load_newsgroups(args.dataset[1],
                                                     args.dataset[2],
                                                     shuffle=True,
                                                     vectorizer=vect)
    elif args.dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
    elif args.dataset == ['nova']:
        X_pool, y_pool, _, _, = load_nova()
    else:
        raise ValueError('Invalid Dataset!')

    if args.dataset != ['SRAA'] and args.dataset != ['nova']:
Example #13
 python explore_feature.py -cat rec.sport.baseball sci.crypt
 '''
 parser = argparse.ArgumentParser()
 parser.add_argument('-cat', default=['alt.atheism', 'talk.religion.misc'], nargs=2, \
                     help='2 class labels from the 20newsgroup dataset')
 parser.add_argument('-dataset', default='SRAA', choices=['20newsgroups', 'SRAA', 'imdb'], help='dataset')
 parser.add_argument('-c', type=float, default=0.1, help='Penalty term for the L1 feature expert')
 parser.add_argument('-d', type=int, default=5, help='Min_df for CountVectorizer')
 parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'], help='Type of metric used to ' + \
                     'partition the features into the two classes')
 args = parser.parse_args()
 
 vect = CountVectorizer(min_df=args.d, max_df=1.0, binary=True, ngram_range=(1, 1))
 
 if args.dataset == 'imdb':
     X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(path='./aclImdb', shuffle=True, vectorizer=vect)
     feature_names = np.array(vect.get_feature_names())
 elif args.dataset == '20newsgroups':
     X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
         load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42, \
             remove=('headers', 'footers'), vectorizer=vect)
     feature_names = vect.get_feature_names()
 elif args.dataset == 'SRAA':
     X_pool, y_pool, X_test, y_test, feat_names = load_dataset(args.dataset, vect=vect)
     X_pool_docs = pickle.load(open('SRAA_X_train_corpus.pickle', 'rb'))
     X_test_docs = pickle.load(open('SRAA_X_test_corpus.pickle', 'rb'))
     feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
         
     print "n_samples: %d, n_features: %d" % X_pool.shape
     
 fe = alt_L1_feature_expert(X_pool, y_pool, args.type, smoothing=1e-6, C=args.c)
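Example invocations of this script (named explore_feature.py in the docstring above); the flag values are illustrative:
 python explore_feature.py -dataset imdb -c 0.1 -d 5 -type weight
 python explore_feature.py -dataset 20newsgroups -cat rec.sport.baseball sci.crypt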
Example #14
        return input_shape[0], input_shape[-1]


# set parameters:
wandb.init()
config = wandb.config
config.vocab_size = 1000
config.maxlen = 300
config.batch_size = 32
config.embedding_dims = 50
config.filters = 250
config.kernel_size = 3
config.hidden_dims = 100
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(config.vocab_size,
                                    config.embedding_dims,
                                    input_length=config.maxlen))
model.add(tf.keras.layers.CuDNNLSTM(config.hidden_dims, return_sequences=True))
model.add(AttentionWithContext())
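The snippet ends right after the attention layer. A plausible continuation, assuming a binary sentiment target; the dense head, loss, optimizer, and the WandbCallback logging hook are assumptions, not part of the original example:
from wandb.keras import WandbCallback

# Hypothetical classification head and training loop (not in the original snippet).
model.add(tf.keras.layers.Dense(config.hidden_dims, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=config.batch_size,
          epochs=config.epochs,
          validation_data=(X_test, y_test),
          callbacks=[WandbCallback()])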
Example #15
def covering(dataset='imdb', first='positive', agreement='any', metric='mutual_info', smoothing=1e-6, C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0
    class_label = {0:'negative', 1:'positive'}
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    
    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, \
            vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
        
    print 'class 0 features (ranked):'
    print ', '.join([str((f, feature_names[f])) for f in fe.class0_features_by_rank()])
    
    print 'class 1 features (ranked):'
    print ', '.join([str((f, feature_names[f])) for f in fe.class1_features_by_rank()])
    
    sample_pool = range(X_pool.shape[0])
    feature_list = list()
    X_csc = X_pool.tocsc()
    
    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2 # label for the document
        rank = feature_num / 2 # rank of the feature in the list
        feature_num += 1
        
        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print '*' * 50
            print ', '.join(['#'+str(doc) for doc in sample_pool]) + ' are uncovered'
            for doc in sample_pool:
                print '-' * 50
                print 'Document #%d:' % doc
                print '=' * 50
                print 'length = %d' % len(X_pool_docs[doc])
                print X_pool_docs[doc]
                print '=' * 50
                print X_pool[doc].indices
            break
            
        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices

        docs_in_pool_with_feature = list(set(sample_pool).intersection(set(docs_with_feature)))
        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])

            poolsize_before_removal = len(sample_pool)
            
            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(set(docs_in_pool_with_feature).intersection(set(docs_with_label)))
                sample_pool = list(set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(set(sample_pool).difference(set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing   
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)
            
            feature_list.append(result)

    return feature_list
Example #16
                        help='Penalty term for the L1 feature expert')
    parser.add_argument('-d',
                        type=int,
                        default=5,
                        help='Min_df for CountVectorizer')
    parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'], help='Type of metric used to ' + \
                        'partition the features into the two classes')
    args = parser.parse_args()

    vect = CountVectorizer(min_df=args.d,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    if args.dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            path='./aclImdb', shuffle=True, vectorizer=vect)
        feature_names = np.array(vect.get_feature_names())
    elif args.dataset == '20newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
            load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42, \
                remove=('headers', 'footers'), vectorizer=vect)
        feature_names = vect.get_feature_names()
    elif args.dataset == 'SRAA':
        X_pool, y_pool, X_test, y_test, feat_names = load_dataset(args.dataset,
                                                                  vect=vect)
        X_pool_docs = pickle.load(open('SRAA_X_train_corpus.pickle', 'rb'))
        X_test_docs = pickle.load(open('SRAA_X_test_corpus.pickle', 'rb'))
        feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))

        print "n_samples: %d, n_features: %d" % X_pool.shape
            feature_cover_counts[feature_rank[index]] = len(nzdocs)
            uncovered_docs.difference_update(nzdocs)
        
        index += 1
    
    return feature_cover_counts, uncovered_docs

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default=['imdb'], nargs='*', \
                    help='Dataset to be used: [\'imdb\', \'20newsgroups\'] 20newsgroups must have 2 valid group names')
    args = parser.parse_args()
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    
    if args.dataset == ['imdb']:
        X_pool, y_pool, _, _, _, _ = load_imdb("../aclImdb", shuffle=True, vectorizer=vect)
    elif len(args.dataset) == 3 and args.dataset[0] == '20newsgroups':
        X_pool, y_pool, _, _, _, _ = load_newsgroups(args.dataset[1], args.dataset[2], shuffle=True, vectorizer=vect)
    elif args.dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
    elif args.dataset == ['nova']:
        X_pool, y_pool, _, _, = load_nova()
    else:
        raise ValueError('Invalid Dataset!')
    
    if args.dataset != ['SRAA'] and args.dataset != ['nova']:
        feature_names = vect.get_feature_names()
    elif args.dataset == ['SRAA']:
        feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
    # Note: nova dataset has no feature_names
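Because -dataset takes nargs='*', invocations of this script look like the following; the script filename cover_features.py is an assumption (it is not shown in the example):
python cover_features.py -dataset imdb
python cover_features.py -dataset 20newsgroups comp.graphics sci.space
python cover_features.py -dataset SRAA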