Example 1
def load_dataset(dataset):
    if dataset == ['imdb']:
        #(X_pool, y_pool, X_test, y_test) = load_data()
        #vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5,
                               max_df=1.0,
                               binary=True,
                               ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = load_imdb(
            path='C:\\Users\\mbilgic\\Desktop\\aclImdb',
            shuffle=True,
            vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(
            dataset,
            list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5,
                               max_df=1.0,
                               binary=True,
                               ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
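
A minimal call sketch for load_dataset, assuming it is importable alongside the loaders above; the list-valued dataset argument mirrors the -dataset command-line option used in the later examples, and the two newsgroup names are only the illustrative defaults that appear there:

# Hypothetical call sites for load_dataset (not part of the original module).
X_pool, y_pool, X_test, y_test, feat_names = load_dataset(['imdb'])
X_pool, y_pool, X_test, y_test, feat_names = load_dataset(
    ['20newsgroups', 'alt.atheism', 'talk.religion.misc'])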
Example 2
def check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10, smoothing=1e-6, C=0.1, \
                         vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):
    class_label = {0:'negative', 1:'positive'}
    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(dataset[1], dataset[2], vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))
    
    print '\n'
    print '=' * 50
    
    for doc in doc_ids:
        print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool, X_pool_docs)
        
        print '=' * 50
        ch = raw_input('Display the next document? Press Enter to continue or type \'n\' to exit...  ')
        
        if ch == 'n':
            break
    
    return
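
A hedged usage sketch for check_feature_expert; the tuple form of the dataset argument follows the isinstance check above, and the newsgroup pair is taken from the sample invocations shown in a later example:

# Hypothetical calls (assumes the imdb / newsgroups data is available locally).
check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10)
check_feature_expert(dataset=('newsgroup', 'rec.sport.baseball', 'sci.crypt'),
                     metric='L1', C=0.1)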
Example 3
def load_dataset(dataset):
    if dataset == ['imdb']:
        #(X_pool, y_pool, X_test, y_test) = load_data()
        #vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1,1))        
        X_pool, y_pool, X_test, y_test, _, _ = load_imdb(path='./aclImdb/', shuffle=True, vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(dataset, list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('./SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('./SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('./SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('./SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('./SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['ibnsina']:
        (X_pool, y_pool, X_test, y_test) = load_ibnsina()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['creditg']:
        (X_pool, y_pool, X_test, y_test) = load_creditg()
        return (X_pool, y_pool, X_test, y_test, None)
Example 4
def check_feature_expert(dataset='imdb', metric='mutual_info', top_n=10, smoothing=1e-6, C=0.1, \
                         vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):
    class_label = {0: 'negative', 1: 'positive'}
    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset,
                    tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(
            dataset[1], dataset[2], vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
    doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))

    print '\n'
    print '=' * 50

    for doc in doc_ids:
        print_all_features(feature_names, fe, top_n, doc, X_pool, y_pool,
                           X_pool_docs)

        print '=' * 50
        ch = raw_input(
            'Display the next document? Press Enter to continue or type \'n\' to exit...  '
        )

        if ch == 'n':
            break

    return
Example 5
def output_features(filename='features.txt', dataset='imdb', metric='L1', smoothing=1e-6, C=0.1, \
                    vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):

    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset,
                    tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(
            dataset[1], dataset[2], vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'saving into \'%s\'...' % filename
    with open(filename, 'w') as f:
        f.write('-' * 50 + '\n')
        f.write('class 0 features:\n')
        f.write('-' * 50 + '\n')
        c0_feat = fe.class0_features_by_rank()
        for i in range(len(c0_feat)):
            feature = c0_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') +
                    ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

        f.write('-' * 50 + '\n')
        f.write('class 1 features:\n')
        f.write('-' * 50 + '\n')
        c1_feat = fe.class1_features_by_rank()
        for i in range(len(c1_feat)):
            feature = c1_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') +
                    ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

    return
Example 6
def output_features(filename='features.txt', dataset='imdb', metric='L1', smoothing=1e-6, C=0.1, \
                    vect=CountVectorizer(min_df=5, max_df=1.0, binary=False)):

    if isinstance(dataset, str) and dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroup':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_newsgroups(dataset[1], dataset[2], vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'saving into \'%s\'...' % filename
    with open(filename, 'w') as f:
        f.write('-' * 50 + '\n')
        f.write('class 0 features:\n')
        f.write('-' * 50 + '\n')
        c0_feat = fe.class0_features_by_rank()
        for i in range(len(c0_feat)):
            feature = c0_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') + ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n')

        f.write('-' * 50 + '\n')
        f.write('class 1 features:\n')
        f.write('-' * 50 + '\n')
        c1_feat = fe.class1_features_by_rank()
        for i in range(len(c1_feat)):
            feature = c1_feat[i]
            f.write('rank: #%d, feature: #%d, ' % (i, feature))
            f.write('feature name: ' + feature_names[feature].encode('utf8') + ' ')
            f.write('L1 weight: %f' % fe.L1_weights[feature])
            f.write('\n') 
    
    return
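
For reference, a sketch of how output_features might be invoked; the output file name is illustrative and the default vectorizer from the signature is reused:

# Hypothetical call: write ranked class 0 / class 1 features and their L1 weights to disk.
output_features(filename='imdb_L1_features.txt', dataset='imdb', metric='L1', C=0.1)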
Example 7
 python test1.py -cat comp.os.ms-windows.misc comp.sys.ibm.pc.hardware
 python test1.py -cat rec.sport.baseball sci.crypt
 '''
 parser = argparse.ArgumentParser()
 parser.add_argument('-cat', default=['alt.atheism', 'talk.religion.misc'], nargs=2, \
                     help='2 class labels from the 20newsgroup dataset')
 parser.add_argument('-c', type=float, default=0.1, help='Penalty term for the L1 feature expert')
 parser.add_argument('-d', type=int, default=5, help='Min_df for CountVectorizer')
 parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'], help='Type of metric used to ' + \
                     'partition the features into the two classes')
 args = parser.parse_args()
 
 vect = CountVectorizer(min_df=args.d, max_df=1.0, binary=True, ngram_range=(1, 1))
 
 X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
     load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42, \
         remove=('headers', 'footers'), vectorizer=vect)
 
 feature_names = vect.get_feature_names()
 fe = alt_L1_feature_expert(X_pool, y_pool, args.type, smoothing=1e-6, C=args.c)
 
 print 'class 0 features (ranked):'
 print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class0_features_by_rank()])
 print '-' * 50
 
 print 'class 1 features (ranked):'
 print ', '.join([str((f, feature_names[f], fe.L1_weights[f])) for f in fe.class1_features_by_rank()])
 print '-' * 50
 
 doc_ids = np.random.permutation(np.arange(X_pool.shape[0]))
 top_n = 20
 
Example 8
                        help='Penalty term for the L1 feature expert')
    parser.add_argument('-d',
                        type=int,
                        default=5,
                        help='Min_df for CountVectorizer')
    parser.add_argument('-type', default='weight', choices=['weight', 'non_zero'], help='Type of metric used to ' + \
                        'partition the features into the two classes')
    args = parser.parse_args()

    vect = CountVectorizer(min_df=args.d,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(args.cat[0], args.cat[1], shuffle=True, random_state=42, \
            remove=('headers', 'footers'), vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = alt_L1_feature_expert(X_pool,
                               y_pool,
                               args.type,
                               smoothing=1e-6,
                               C=args.c)

    print 'class 0 features (ranked):'
    print ', '.join([
        str((f, feature_names[f], fe.L1_weights[f]))
        for f in fe.class0_features_by_rank()
    ])
    print '-' * 50
Example 9
def covering(dataset='imdb', first='positive', agreement='any', metric='mutual_info', smoothing=1e-6, C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0
    class_label = {0:'negative', 1:'positive'}
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    
    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(dataset, tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, \
            vectorizer=vect)
    
    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)
        
    print 'class 0 features (ranked):'
    print ', '.join([str((f, feature_names[f])) for f in fe.class0_features_by_rank()])
    
    print 'class 1 features (ranked):'
    print ', '.join([str((f, feature_names[f])) for f in fe.class1_features_by_rank()])
    
    sample_pool = range(X_pool.shape[0])
    feature_list = list()
    X_csc = X_pool.tocsc()
    
    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2 # label for the document
        rank = feature_num / 2 # rank of the feature in the list
        feature_num += 1
        
        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print '*' * 50
            print ', '.join(['#'+str(doc) for doc in sample_pool]) + ' are uncovered'
            for doc in sample_pool:
                print '-' * 50
                print 'Document #%d:' % doc
                print '=' * 50
                print 'length = %d' % len(X_pool_docs[doc])
                print X_pool_docs[doc]
                print '=' * 50
                print X_pool[doc].indices
            break
            
        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices

        docs_in_pool_with_feature = list(set(sample_pool).intersection(set(docs_with_feature)))
        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])

            poolsize_before_removal = len(sample_pool)
            
            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(set(docs_in_pool_with_feature).intersection(set(docs_with_label)))
                sample_pool = list(set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(set(sample_pool).difference(set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing   
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)
            
            feature_list.append(result)

    return feature_list
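
covering returns a list of per-feature dictionaries; a small reporting loop, shown only as a sketch (it is not part of the original code), might look like this:

features = covering(dataset='imdb', first='positive', agreement='any')
for r in features:
    # Each entry records which feature was picked and how many pool documents it covered.
    print '%s (feature #%d, %s): covered %d of %d docs (%d pos, %d neg), %d left in pool' % \
        (r['name'], r['num'], r['class'], r['num_docs_covered'],
         r['poolsize_before_removal'], r['num_positive_docs'],
         r['num_negative_docs'], r['poolsize_after_removal'])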
Example 10
        
        index += 1
    
    return feature_cover_counts, uncovered_docs

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default=['imdb'], nargs='*', \
                    help='Dataset to be used: [\'imdb\', \'20newsgroups\'] 20newsgroups must have 2 valid group names')
    args = parser.parse_args()
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    
    if args.dataset == ['imdb']:
        X_pool, y_pool, _, _, _, _ = load_imdb("../aclImdb", shuffle=True, vectorizer=vect)
    elif len(args.dataset) == 3 and args.dataset[0] == '20newsgroups':
        X_pool, y_pool, _, _, _, _ = load_newsgroups(args.dataset[1], args.dataset[2], shuffle=True, vectorizer=vect)
    elif args.dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
    elif args.dataset == ['nova']:
        X_pool, y_pool, _, _ = load_nova()
    else:
        raise ValueError('Invalid Dataset!')
    
    if args.dataset != ['SRAA'] and args.dataset != ['nova']:
        feature_names = vect.get_feature_names()
    elif args.dataset == ['SRAA']:
        feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
    # Note: nova dataset has no feature_names

    clf_l1 = LogisticRegression(penalty='l1', C=0.1)
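
The snippet stops right after constructing the L1-penalized classifier; a plausible continuation, sketched here under the assumption that the two feature classes are derived from the signs of the fitted weights, would be:

    # Sketch only: fit the L1 model on the pool and split features by weight sign.
    clf_l1.fit(X_pool, y_pool)
    weights = clf_l1.coef_[0]
    class0_features = np.nonzero(weights < 0)[0]
    class1_features = np.nonzero(weights > 0)[0]
    print '%d features lean class 0, %d lean class 1' % \
        (len(class0_features), len(class1_features))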
Example 11
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset', default=['imdb'], nargs='*', \
                    help='Dataset to be used: [\'imdb\', \'20newsgroups\'] 20newsgroups must have 2 valid group names')
    args = parser.parse_args()
    vect = CountVectorizer(min_df=5,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    if args.dataset == ['imdb']:
        X_pool, y_pool, _, _, _, _ = load_imdb("../aclImdb",
                                               shuffle=True,
                                               vectorizer=vect)
    elif len(args.dataset) == 3 and args.dataset[0] == '20newsgroups':
        X_pool, y_pool, _, _, _, _ = load_newsgroups(args.dataset[1],
                                                     args.dataset[2],
                                                     shuffle=True,
                                                     vectorizer=vect)
    elif args.dataset == ['SRAA']:
        X_pool = pickle.load(open('SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('SRAA_y_train.pickle', 'rb'))
    elif args.dataset == ['nova']:
        X_pool, y_pool, _, _ = load_nova()
    else:
        raise ValueError('Invalid Dataset!')

    if args.dataset != ['SRAA'] and args.dataset != ['nova']:
        feature_names = vect.get_feature_names()
    elif args.dataset == ['SRAA']:
        feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))
    # Note: nova dataset has no feature_names
Example 12
def covering(dataset='imdb',
             first='positive',
             agreement='any',
             metric='mutual_info',
             smoothing=1e-6,
             C=1):
    if first == 'positive':
        offset = 1
    else:
        offset = 0
    class_label = {0: 'negative', 1: 'positive'}
    vect = CountVectorizer(min_df=5,
                           max_df=1.0,
                           binary=True,
                           ngram_range=(1, 1))

    if dataset == 'imdb':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = load_imdb(
            "./aclImdb", shuffle=True, vectorizer=vect)
    elif isinstance(
            dataset,
            tuple) and len(dataset) == 3 and dataset[0] == 'newsgroups':
        X_pool, y_pool, X_test, y_test, X_pool_docs, X_test_docs = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], shuffle=False, random_state=42, \
            vectorizer=vect)

    feature_names = vect.get_feature_names()
    fe = feature_expert(X_pool, y_pool, metric, smoothing, C)

    print 'class 0 features (ranked):'
    print ', '.join(
        [str((f, feature_names[f])) for f in fe.class0_features_by_rank()])

    print 'class 1 features (ranked):'
    print ', '.join(
        [str((f, feature_names[f])) for f in fe.class1_features_by_rank()])

    sample_pool = range(X_pool.shape[0])
    feature_list = list()
    X_csc = X_pool.tocsc()

    feature_num = 0

    while len(sample_pool) != 0:
        label = (feature_num + offset) % 2  # label for the document
        rank = feature_num / 2  # rank of the feature in the list
        feature_num += 1

        if rank < len(fe.feature_rank[label]):
            feature = fe.feature_rank[label][rank]
        else:
            print '*' * 50
            print ', '.join(['#' + str(doc)
                             for doc in sample_pool]) + ' are uncovered'
            for doc in sample_pool:
                print '-' * 50
                print 'Document #%d:' % doc
                print '=' * 50
                print 'length = %d' % len(X_pool_docs[doc])
                print X_pool_docs[doc]
                print '=' * 50
                print X_pool[doc].indices
            break

        feature_name = feature_names[feature]
        docs_with_feature = X_csc.getcol(feature).indices

        docs_in_pool_with_feature = list(
            set(sample_pool).intersection(set(docs_with_feature)))
        if len(docs_in_pool_with_feature) == 0:
            continue
        else:
            num_docs_covered = len(docs_in_pool_with_feature)
            num_positive_docs = len(
                np.nonzero(y_pool[docs_in_pool_with_feature] == 1)[0])
            num_negative_docs = len(
                np.nonzero(y_pool[docs_in_pool_with_feature] == 0)[0])

            poolsize_before_removal = len(sample_pool)

            if agreement == 'agree':
                docs_with_label = np.nonzero(y_pool == label)[0]
                docs_to_remove = list(
                    set(docs_in_pool_with_feature).intersection(
                        set(docs_with_label)))
                sample_pool = list(
                    set(sample_pool).difference(set(docs_to_remove)))
            else:
                sample_pool = list(
                    set(sample_pool).difference(
                        set(docs_in_pool_with_feature)))

            # pack the information into a dictionary for easy printing
            result = dict()
            result['name'] = feature_name
            result['num'] = feature
            result['class'] = class_label[label]
            result['poolsize_before_removal'] = poolsize_before_removal
            result['num_docs_covered'] = num_docs_covered
            result['num_positive_docs'] = num_positive_docs
            result['num_negative_docs'] = num_negative_docs
            result['poolsize_after_removal'] = len(sample_pool)

            feature_list.append(result)

    return feature_list