Example #1
    def test_lime_text_tabular_not_equal_random_state(self):
        categories = ['alt.atheism', 'soc.religion.christian']
        newsgroups_train = fetch_20newsgroups(subset='train',
                                              categories=categories)
        newsgroups_test = fetch_20newsgroups(subset='test',
                                             categories=categories)
        class_names = ['atheism', 'christian']
        vectorizer = TfidfVectorizer(lowercase=False)
        train_vectors = vectorizer.fit_transform(newsgroups_train.data)
        test_vectors = vectorizer.transform(newsgroups_test.data)
        nb = MultinomialNB(alpha=.01)
        nb.fit(train_vectors, newsgroups_train.target)
        pred = nb.predict(test_vectors)
        f1_score(newsgroups_test.target, pred, average='weighted')
        c = make_pipeline(vectorizer, nb)

        explainer = LimeTextExplainer(
            class_names=class_names, random_state=10)
        exp_1 = explainer.explain_instance(newsgroups_test.data[83],
                                           c.predict_proba, num_features=6)

        explainer = LimeTextExplainer(
            class_names=class_names, random_state=20)
        exp_2 = explainer.explain_instance(newsgroups_test.data[83],
                                           c.predict_proba, num_features=6)

        self.assertFalse(exp_1.as_map() == exp_2.as_map())
Example #2
def get_data():
    categories = ['talk.politics.guns', 'talk.politics.mideast', 'alt.atheism',
                  'talk.politics.misc', 'talk.religion.misc']
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                          categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                         categories=categories)
    # texts_train, target_train = newsgroups_train.data, newsgroups_train.target
    # texts_test, target_test = newsgroups_test.data,newsgroups_test.target
    #return texts_train, target_train, newsgroups_train.filenames, texts_test, target_test, newsgroups_test.filenames
    return newsgroups_train, newsgroups_test
def file():
    cats = ["alt.atheism", "sci.electronics"]

    newsgroups_train = fetch_20newsgroups(subset="train", categories=cats)

    newsgroups_test = fetch_20newsgroups(subset="test", categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize every document and build term statistics

    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print vectors.shape[1]
    # f=open('test_all.txt','wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # extract the tokenized words
        # print tokens

        word_sort = np.argsort(-vectors[j].data)
        print "顶点" + str(j)
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # this is the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))

        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # extract the tf-idf words

        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
Example #4
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
Example #5
def News():
    from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
    newsgroups_train = datasets.fetch_20newsgroups(subset='train')
    vectorizer = CountVectorizer(encoding='latin-1', max_features=30000)
    #vectorizer = HashingVectorizer(encoding='latin-1')
    x_train = vectorizer.fit_transform(newsgroups_train.data)
    x_train = numpy.asarray(x_train.todense(), dtype='float32')
    y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
    newsgroups_test = datasets.fetch_20newsgroups(subset='test')
    x_test = vectorizer.transform(newsgroups_test.data)
    x_test = numpy.asarray(x_test.todense(), dtype='float32')
    y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
    dnn=RegularizedNet(numpy_rng=numpy.random.RandomState(123), theano_rng=None, 
            n_ins=x_train.shape[1],
            layers_types=[ReLU, ReLU, LogisticRegression],
            layers_sizes=[1000, 1000],
            n_outs=len(set(y_train)),
            rho=0.95, 
            eps=1.E-6,
            max_norm=0.,
            debugprint=False,
            L1_reg=0.,
            L2_reg=1./x_train.shape[0])
    print len(set(y_train))
    dnn.fit(x_train, y_train, max_epochs=30, method='adadelta_nesterov', verbose=True, plot=False)
    test_error = dnn.score(x_test, y_test)
    print("score: %f" % (1. - test_error))
def train_20_news(n_jobs, n_folds):
    from sklearn.datasets import fetch_20newsgroups
    train = fetch_20newsgroups(subset='train', shuffle=False, random_state=100,
                               remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test', shuffle=False, random_state=100,
                              remove=('headers', 'footers', 'quotes'))

    x_train = map(dt.clean_str, train.data)
    x_test = map(dt.clean_str, test.data)

    text_clf = Pipeline([
                         # ('clean', Cleaner()),
                         ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('clf', SGDClassifier(fit_intercept=True, random_state=0))
                         ])

    SGDClassifier_params = {
        'clf__alpha': np.arange(4e-5, 2e-3, 2e-5),
        'clf__loss': ('squared_loss', 'hinge', 'squared_hinge'),
        'clf__penalty': ('l2', 'elasticnet'),
    }

    gs_clf = GridSearchCV(text_clf, SGDClassifier_params, n_jobs=n_jobs, cv=n_folds, refit=True, verbose=3)
    gs_clf.fit(x_train, train.target)

    result_str = list()
    result_str.append('\n')
    result_str.append('best params:')
    result_str.append(str(gs_clf.best_params_))
    result_str.append('best score = %f' % gs_clf.best_score_)
    result_str = '\n'.join(result_str)
    print result_str

    print "test score = " % gs_clf.score(x_test, test.target)
Example #7
    def load_sklearn_data(self,name):
        if name == "digits":
            training = fetch_20newsgroups(subset='train',shuffle=True,random_state=42);
            testing = fetch_20newsgroups(subset='test',shuffle=True,random_state=100);
            validation = fetch_20newsgroups(subset='test',shuffle=True,random_state=200);
            categories = training.target_names
            data_train_size_mb = size_mb(training.data)
            data_test_size_mb = size_mb(testing.data)
            data_validation_size_mb = size_mb(validation.data)
            
            print("%d documents - %0.3fMB (training set)" % (
                len(training.data), data_train_size_mb))
            print("%d documents - %0.3fMB (test set)" % (
                len(testing.data), data_test_size_mb))

            print("%d documents - %0.3fMB (test set)" % (
                len(validation.data), data_test_size_mb))
                
            print("%d categories" % len(categories))
            print()        
            
            training=[training.data,training.target_names]
            testing=[testing.data,testing.target_names]
            validation=[validation.data,validation.target_names]
            
            return [training,testing,validation];
Example #8
def export_20ng(remove_headers=False, remove_footers=False, remove_quotes=False, categories=None):
    output_dir = os.path.join('..', 'datasets', '20ng', 'data')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    remove = []
    if remove_headers:
        remove.append('headers')
    if remove_footers:
        remove.append('footers')
    if remove_quotes:
        remove.append('quotes')

    print categories

    ng_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
    keys = ['train' + str(i) for i in range(len(ng_train.data))]
    print len(keys)
    train_text = dict(zip(keys, ng_train.data))
    fh.write_to_json(train_text, os.path.join(output_dir, 'train.json'))

    train_labels = pd.DataFrame(ng_train.target, columns=['target'], index=keys)
    train_labels.to_csv(os.path.join(output_dir, 'train.csv'))
    print train_labels.shape

    ng_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
    keys = ['test' + str(i) for i in range(len(ng_test.data))]
    test_text = dict(zip(keys, ng_test.data))
    fh.write_to_json(test_text, os.path.join(output_dir, 'test.json'))

    test_labels = pd.DataFrame(ng_test.target, columns=['target'], index=keys)
    test_labels.to_csv(os.path.join(output_dir, 'test.csv'))
Example #9
def load_20newsgroups(category=None, shuffle=True, rnd=1):
    categories = {'religion': ['alt.atheism', 'talk.religion.misc'],
                  'graphics': ['comp.graphics', 'comp.windows.x'],
                  'hardware': ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  'baseball': ['rec.sport.baseball', 'sci.crypt']}
    cat = None
    if category is not None:
        cat = categories[category]

    data = bunch.Bunch()
    data.train = fetch_20newsgroups(subset='train', categories=cat, remove=('headers', 'footers', 'quotes'),
                                    shuffle=shuffle, random_state=rnd)

    # data.train.data = np.array([keep_header_subject(text) for text in data.train.data], dtype=object)
    data.train.data = np.array(data.train.data, dtype=object)
    data.test = fetch_20newsgroups(subset='test', categories=cat, remove=('headers', 'footers', 'quotes'),
                                   shuffle=shuffle, random_state=rnd)

    # data.test.data = np.array([keep_header_subject(text) for text in data.test.data], dtype=object)
    data.test.data = np.array(data.test.data, dtype=object)
    data = minimum_size(data)

    if shuffle:
        random_state = np.random.RandomState(rnd)
        indices = np.arange(data.train.target.shape[0])
        random_state.shuffle(indices)
        data.train.filenames = data.train.filenames[indices]
        data.train.target = data.train.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.train.data, dtype=object)
        data_lst = data_lst[indices]
        data.train.data = data_lst

    return data
Example #10
def Load20NG():
    cats = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    train, train_labels = newsgroups_train.data, newsgroups_train.target
    test, test_labels = newsgroups_test.data, newsgroups_test.target
    return train, train_labels, test, test_labels
    def loadData(self, opts):
        if opts.all_categories:
            categories = None
        else:
            categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics',
                          'sci.space']

        if opts.filtered:
            remove = ('headers', 'footers', 'quotes')
        else:
            remove = ()

        print('Loading 20 newsgroups dataset for categories:')
        print((categories if categories else 'all'))

        data_train = fetch_20newsgroups(subset='train', categories=categories,
                                        shuffle=True, random_state=42,
                                        remove=remove)

        data_test = fetch_20newsgroups(subset='test', categories=categories,
                                       shuffle=True, random_state=42,
                                       remove=remove)
        
        categories = data_train.target_names  # for case categories == None
        # print(len(data_train))
        print('data loaded')
        
        return data_train, data_test, categories
Example #12
def uai(params):#, **kwargs):
    print 'Params: ', params, '\n'
    #y = benchmark_functions.save_svm_on_grid(params, opt_time=ret_time, **kwargs)
    logreg = linear_model.LogisticRegression(penalty=params['penalty'],tol=float(params['tol']),C=float(params['strength']))
    if params['n_min'] > params['n_max']:
      z=params['n_min']
      params['n_min']=params['n_max']
      params['n_max']=z
    if params['stop_words']==True:
      st='english'
    else:
      st=None 
    vectorizer = TfidfVectorizer(ngram_range=(int(params['n_min']),int(params['n_max'])),binary=params['binary'],use_idf=params['idf'],smooth_idf=True,stop_words=st)
    if params['cats'] == 'all':
        cats = None
    elif params['cats'] == 'science':
        cats = ['sci.med','sci.space','sci.crypt','sci.electronics']
    elif params['cats'] == 'religion':
        cats = ['alt.atheism', 'talk.religion.misc']
    elif params['cats'] == 'graphics':
        cats = ['comp.windows.x','comp.graphics']
    #cats = ['sci.med','sci.space']
    #cats = ['comp.sys.ibm.pc.hardware','comp.sys.mac.hardware']
    print 'preprocess data'
    #newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=cats)
    #vectors = vectorizer.fit_transform(newsgroups_train.data)
    #print vectors.shape
    #newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=cats)   
    #print 'preprocess test data'
    #vectors_test = vectorizer.fit_transform(newsgroups_test.data)
    if params['rm_footers']:
        to_remove = ('headers', 'footers')
    else:
        to_remove = ('headers',)
        
    print_20n(to_remove, cats, params)
    newsgroups_all = fetch_20newsgroups(subset='all', remove=to_remove, categories=cats)#,'footers'))#,'footers','quotes'), categories=cats)   
    vectors_all = vectorizer.fit_transform(newsgroups_all.data)
    #nrow=round(7.0/10.0*vectors_all.shape[0])
    newsgroups_train = fetch_20newsgroups(subset='train',remove=to_remove, categories=cats)
    nrow=newsgroups_train.target.shape[0]
    #print nrow
    #print vectors_all.shape
    vectors=vectors_all[0:nrow,:]
    vectors_test=vectors_all[nrow:,:]
    #print vectors.shape
    #print vectors_test.shape
    print 'fit model'
    logreg.fit(vectors,newsgroups_all.target[0:nrow])
    print 'predict model'
    pred=logreg.predict(vectors_test)
    print 'evaluate'
    y=metrics.accuracy_score(newsgroups_all.target[nrow:], pred)
    print 'Result: ', y
    print('idf: ', params['idf'], 'rm_footers: ', params['rm_footers'], 'cats: ', params['cats'])
    return -y
Example #13
def exercise():
    groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 
        'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
    train_data = fetch_20newsgroups(subset='train', categories=groups)
    clusterizer = DocumentClusterizer()
    clusterizer.train(train_data.data)
    test_data = fetch_20newsgroups(subset='test', categories=groups)
    for i in range(10):
        sample = test_data.data[np.random.randint(len(test_data.data))]
        clusterizer.find_most_similar(sample)
def load_data():
    twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
    twenty_test = fetch_20newsgroups(subset='test',shuffle=True, random_state=42)
     
    x_train = twenty_train.data
    y_train = twenty_train.target
    x_test = twenty_test.data
    y_test = twenty_test.target
    print 'data loaded!'
    return (x_train, y_train, x_test, y_test)
Example #15
def get_login_pages(keywords):
    from sklearn.datasets import fetch_20newsgroups
    import gensim
    import re
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    for  news in newsgroups_train.target_names:
        print news

    alt.atheism
    comp.graphics
    comp.os.ms-windows.misc
    comp.sys.ibm.pc.hardware
    comp.sys.mac.hardware
    comp.windows.x
    misc.forsale
    rec.autos
    rec.motorcycles
    rec.sport.baseball
    rec.sport.hockey
    sci.crypt
    sci.electronics
    sci.med
    sci.space
    soc.religion.christian
    talk.politics.guns
    talk.politics.mideast
    talk.politics.misc
    talk.religion.misc
    """
    #cats = ['sci.crypt']
    #newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    newsgroups=[]
    newsgroups.append(newsgroups_train.data)
    newsgroups.append(newsgroups_test.data)
    #newsgroups_train = fetch_20newsgroups()
    #print len(newsgroups_train.data)
    print newsgroups_train.data
    sentences=[re.findall("[a-z\-]+",s.lower()) for s in newsgroups_train.data]
    #sentences = [s.lower().split() for s in newsgroups_train.data]
    #print sentences

    model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=1, workers=4,iter=20)

    #print len(sentences)

    for key in keywords:
        print "[%s] most_similar:" % key
        results=model.most_similar(positive=[key], topn=10)
        for i in results:
            print i
Example #16
 def testNaiveBayesSK2(self):
     categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
     newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
     newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
     vectorizer = TfidfVectorizer()
     # Both vectors and vectors_test are SciPy CSR matrix
     vectors = vectorizer.fit_transform(newsgroups_train.data)
     vectors_test = vectorizer.transform(newsgroups_test.data)
     nb = NaiveBayes(sqlCtx)
     nb.fit(vectors, newsgroups_train.target)
     pred = nb.predict(vectors_test)
     score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
     self.failUnless(score > 0.8)
def load_dataset(category_list):
    """
    :return: Load the 20_newsgroup dataset depending on category_list.
             If [] is provided, return everything.
    """

    if category_list == []:  # read all categories from news20 dataset
        train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
        test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
    else:            # read only computer technology & recreational activity categories
        train = fetch_20newsgroups(subset='train',  shuffle=True, random_state=42, categories=category_list)
        test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42,  categories=category_list)

    return train, test
 def test_naive_bayes1(self):
     categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
     newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
     newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
     vectorizer = TfidfVectorizer()
     # Both vectors and vectors_test are SciPy CSR matrix
     vectors = vectorizer.fit_transform(newsgroups_train.data)
     vectors_test = vectorizer.transform(newsgroups_test.data)
     nb = NaiveBayes(sparkSession)
     mllearn_predicted = nb.fit(vectors, newsgroups_train.target).predict(vectors_test)
     from sklearn.naive_bayes import MultinomialNB
     clf = MultinomialNB()
     sklearn_predicted = clf.fit(vectors, newsgroups_train.target).predict(vectors_test)
     self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 )
Example #19
  def __init__(self):
    data_train = fetch_20newsgroups(subset='train', categories=None,
                                    shuffle=True, random_state=42)
    data_test = fetch_20newsgroups(subset='test', categories=None,
                                   shuffle=True, random_state=42)
    self.train_data = data_train.data
    self.train_target = data_train.target
    self.alltest_data = data_test.data
    self.alltest_target = data_test.target

    self.categories = data_train.target_names
    self.num_classes = 20

    DataGatherer.__init__(self)
Example #20
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
def load_20_news_data(data_path=None, max_size=None):
    newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=100,
                                    remove=('headers', 'footers', 'quotes'))
    data = pd.DataFrame({"text" : newsgroups.data, "label" : newsgroups.target})
    if max_size is not None:
        data = data[0:max_size]
    return data
Example #22
def _download_20newsgroup():
    """
    Download the 20 newsgroups dataset from scikit-learn.
    :return: The train and test sets.
    """
    from sklearn.datasets import fetch_20newsgroups
    print "downloading 20 newsgroup train data...."
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'))
    print "downloading 20 newsgroup test data...."
    newsgroups_test = fetch_20newsgroups(
        subset='test', remove=('headers', 'footers', 'quotes'))
    train_set = (newsgroups_train.data, newsgroups_train.target)
    test_set = (newsgroups_test.data, newsgroups_test.target)

    return train_set,test_set
def get_train_data():
    try:
        twenty_train = pickle.load(open("twenty_train.p", "rb"))
    except:
        twenty_train = fetch_20newsgroups(subset='train', categories=CATEGORIES, shuffle=True, random_state=42)
        pickle.dump(twenty_train, open("twenty_train.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    return twenty_train
def main():
    newsgoups = fetch_20newsgroups(subset='train', categories=['sci.crypt', 'talk.politics.guns'])

    vectorizer = CountVectorizer()
    vector = vectorizer.fit_transform(newsgoups.data, newsgoups.target)
    vocab = np.array(vectorizer.get_feature_names())
    print "number of positive examples:", np.sum(newsgoups.target)

    t0 = time.time()
    ig_scores, _ = ig(vector, newsgoups.target)
    print "Information Gain top 50  scored terms:"
    print vocab[np.argsort(ig_scores)][-50:]
    print "time: %.4f secs" % (time.time()-t0)

    t0 = time.time()
    bns_scores, _ = bns(vector, newsgoups.target)
    print "Bi-Normal Separation top 50  scored terms:"
    print vocab[np.argsort(bns_scores)][-50:]
    print "time: %.4f secs" % (time.time()-t0)

    t0 = time.time()
    chi2_scores, _ = chi2(vector, newsgoups.target)
    print "Chi Squared top 50  scored terms:"
    print vocab[np.argsort(chi2_scores)][-50:]
    print "time: %.4f secs" % (time.time()-t0)
def fetch_and_save(dirpath, vocpath=None, min_df=6, tokenizer=None):
    """
    Fetches the 20 newsgroups corpus, vectorizes the documents, stores them in
    a database of lists and saves it to file.
    """
    # Loading data
    newsgroups_dataset = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"), random_state=123)

    # uses a predefined vocabulary list if available
    if vocpath:
        vocabulary = load_vocabulary(vocpath)
        newsgroups_counter = CountVectorizer(
            stop_words="english", tokenizer=tokenizer, vocabulary=vocabulary, min_df=min_df
        )
    else:
        newsgroups_counter = CountVectorizer(stop_words="english", tokenizer=tokenizer, min_df=min_df)

    # generates csr matrix with the vectors of term frequencies
    newsgroups_mat = newsgroups_counter.fit_transform(newsgroups_dataset.data)

    # converts csr matrix to a database of lists
    num_of_docs, vocab_size = newsgroups_mat.shape
    newsgroups_list = [[] for i in xrange(num_of_docs)]
    newsgroups_coo = newsgroups_mat.tocoo()
    for i, j, v in itertools.izip(newsgroups_coo.row, newsgroups_coo.col, newsgroups_coo.data):
        newsgroups_list[i].append([j, v])

    # saves corpus, vocabulary and indices
    save_corpus_to_file(dirpath + "/20newsgroups.corpus", newsgroups_list)
    save_vocabulary_to_file(dirpath + "/20newsgroups.vocab", newsgroups_list, newsgroups_counter)
    save_idx_to_file(dirpath + "/20newsgroups.idx", newsgroups_dataset)
def load_20news():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

    vectorizer = TfidfVectorizer()

    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)

    y_train = newsgroups_train.target
    y_test = newsgroups_test.target
    
    X_train, X_val = X_train[:-1000], X_train[-1000:]
    y_train, y_val = y_train[:-1000], y_train[-1000:]

    return X_train, y_train, X_val, y_val, X_test, y_test
Example #27
File: c.py Project: chengxwcq/ee219
def retrieve_data():
    graphics_train = fetch_20newsgroups(subset = 'train', shuffle = True, random_state = 42)
    categories = graphics_train.target_names
    # cluster the data from one class
    all_data = graphics_train.data
    filenames = graphics_train.filenames
    return all_data, filenames, categories
Example #28
def category_docs_frequency_count(category):

    category_train = fetch_20newsgroups(subset='train', categories=category, shuffle=True, random_state=42)
    frequency = []
    frequency = collections.Counter(category_train.target) #count frequency of category ids
    docs_count = sum(frequency.values()) #sum up frequencies of docs of a category
    return docs_count
Example #29
def load_newsgroup_data(V, cats, sort_data=True):
    from sklearn.datasets import fetch_20newsgroups
    print("Downloading newsgroups data...")
    print('cats = %s' % cats)
    newsgroups = fetch_20newsgroups(
        subset="train", categories=cats, remove=('headers', 'footers', 'quotes'))
    return get_sparse_repr(newsgroups.data, V, sort_data)
Example #30
stop_words = text.ENGLISH_STOP_WORDS
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        new_doc = re.sub('[,.-:/()?{}*$#&]', ' ', doc)  # remove symbols
        new_doc = ''.join([ch for ch in new_doc if ch not in string.punctuation])  # remove all punctuation
        new_doc = "".join(ch for ch in new_doc if ord(ch) < 128)  # remove all non-ascii characters
        new_doc = new_doc.lower()  # convert to lowercase
        return [self.wnl.lemmatize(t) for t in word_tokenize(new_doc)]  # lemmatized tokens


# Load the eight categories
categories_8 = ['comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'rec.autos','rec.motorcycles','rec.sport.baseball','rec.sport.hockey']

eight_train = fetch_20newsgroups(subset='train', categories=categories_8, shuffle=True, random_state=42)
eight_test = fetch_20newsgroups(subset='test', categories=categories_8, shuffle=True, random_state=42)

# Tokenize each document into words,
# get rid of stop words, and keep the lemmatized version of each word.
# Ignore words appearing in fewer than 5 (or 2 if min_df = 2) documents.
vectorizer = CountVectorizer(min_df=5, stop_words= stop_words, tokenizer=LemmaTokenizer() )
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TFIDF
# We set smooth_idf = false so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
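# A quick sanity check of the idf equation mentioned above (a sketch only, assuming
# the X_train_counts and tfidf_transformer objects defined in this snippet).
import numpy as np
n_docs = X_train_counts.shape[0]
df = np.asarray((X_train_counts > 0).sum(axis=0)).ravel()  # document frequency per term
manual_idf = np.log(n_docs / df) + 1
print(np.allclose(manual_idf, tfidf_transformer.idf_))     # True: idf(d, t) = log[n / df(d, t)] + 1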
'''
Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing)
Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques
Author: Yuxi (Hayden) Liu
'''

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space']

groups_3 = fetch_20newsgroups(categories=categories_3)


def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
    return True


from nltk.corpus import names
all_names = set(names.words())

count_vector_sw = CountVectorizer(stop_words="english", max_features=500)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []
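# (the snippet is cut off here; a plausible continuation, given the helpers
#  defined above, is a cleaning loop along these lines -- not the author's exact code)
for doc in groups_3.data:
    doc_cleaned = ' '.join(lemmatizer.lemmatize(word)
                           for word in doc.lower().split()
                           if is_letter_only(word) and word not in all_names)
    data_cleaned.append(doc_cleaned)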
Example #32
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'])

#print(emails.target_names)
#print(emails.data[5])
#print(emails.target[5])
train_emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'],
    subset='train',
    shuffle=True,
    random_state=108)

test_emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'],
    subset='test',
    shuffle=True,
    random_state=108)

counter = CountVectorizer()
counter.fit(test_emails.data + train_emails.data)
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)

print(classifier.score(test_counts, test_emails.target))
import scipy as sp
from sklearn import (datasets, svm, metrics)
from nlp02_onehot_word import build_vocab
from nlp02_bow_hand    import build_idf, doc2bow_hit, doc2bow_count, doc2bow_tfidf

# Load the 20 newsgroup dataset
remove = ('headers', 'footers', 'quotes')
train = datasets.fetch_20newsgroups(subset='train', remove=remove)
test  = datasets.fetch_20newsgroups(subset='test',  remove=remove)

# Build a vocabulary and its document frequency
vocab = build_vocab(train.data)
idf = build_idf(train.data, vocab)

# Vectorize training and test data
dataset_vectors = [
    {'name' : 'Hit',
     # Stack document vectors vertically for the whole dataset
     'train': sp.sparse.vstack([doc2bow_hit(doc, vocab) for doc in train.data]),
     'test' : sp.sparse.vstack([doc2bow_hit(doc, vocab) for doc in test.data])},
    {'name' : 'Count',
     'train': sp.sparse.vstack([doc2bow_count(doc, vocab) for doc in train.data]),
     'test' : sp.sparse.vstack([doc2bow_count(doc, vocab) for doc in test.data])},
    {'name' : 'TF-IDF',
     'train': sp.sparse.vstack([doc2bow_tfidf(doc, vocab, idf) for doc in train.data]),
     'test' : sp.sparse.vstack([doc2bow_tfidf(doc, vocab, idf) for doc in test.data])},
]

# Test with the SVM classifier
print('### Classification test (accuracy)')
for vector in dataset_vectors:
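    # (the loop body is cut off in the source; a minimal sketch of what it presumably
    #  does, fitting one SVM per representation -- LinearSVC is used here only to keep
    #  the sketch fast on bag-of-words features, not because the original did)
    classifier = svm.LinearSVC()
    classifier.fit(vector['train'], train.target)
    accuracy = classifier.score(vector['test'], test.target)
    print('* %s: %.3f' % (vector['name'], accuracy))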
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords

#problem1
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

tfidf_Vect = TfidfVectorizer()
X_train_tfidf = tfidf_Vect.fit_transform(twenty_train.data)
clf = MultinomialNB()
classifier = SVC(kernel='linear', random_state=0)

clf.fit(X_train_tfidf, twenty_train.target)
classifier.fit(X_train_tfidf, twenty_train.target)

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test_tfidf = tfidf_Vect.transform(twenty_test.data)

predicted = clf.predict(X_test_tfidf)
predicted1 = classifier.predict(X_test_tfidf)

score = metrics.accuracy_score(twenty_test.target, predicted)
score1 = metrics.accuracy_score(twenty_test.target, predicted1)
print("accuracy score with multinomialNB",score)
print("accuracy score after applyingSVM",score1)
Example #35
def main3():
    newsgroups = fetch_20newsgroups(subset='all')
    count_vec = CountVectorizer(analyzer='word', stop_words='english')
    vec = count_vec.fit_transform(newsgroups.data)
    lab = newsgroups.target
    newvec = SelectKBest(chi2, k=features_num).fit_transform(vec,
                                                             lab).todense()
    print(numpy.shape(newvec))

    #print(newvec)

    def add_layer(
        inputs,
        in_size,
        out_size,
        activation_function=None,
    ):
        weights = {
            'h1': tf.Variable(tf.random_normal([features_num, 100])),
            'h2': tf.Variable(tf.random_normal([100, 100])),
            'out': tf.Variable(tf.random_normal([100, 20]))
        }
        biases = {
            'b1': tf.Variable(tf.zeros([1, 100]) + 0.1, ),
            'b2': tf.Variable(tf.zeros([1, 100]) + 0.1, ),
            'out': tf.Variable(tf.zeros([1, 20]) + 0.1, )
        }
        layer_1_multiplication = tf.matmul(inputs, weights['h1'])
        layer_1_addition = tf.add(layer_1_multiplication, biases['b1'])
        layer_1 = tf.nn.relu(layer_1_addition)

        layer_2_multiplication = tf.matmul(layer_1, weights['h2'])
        layer_2_addition = tf.add(layer_2_multiplication, biases['b2'])
        layer_2 = tf.nn.relu(layer_2_addition)

        out_layer_multiplication = tf.matmul(layer_2, weights['out'])
        out_layer_addition = out_layer_multiplication + biases['out']
        return out_layer_addition

    def compute_accuracy(v_xs, v_ys):
        y_pre = sess.run(prediction, feed_dict={xs: v_xs})
        v_ys = numpy.array(v_ys)
        correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(v_ys, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        result = sess.run(accuracy, feed_dict={xs: v_xs, ys: v_ys})
        return result

    xs = tf.placeholder(tf.float32, [None, features_num])
    ys = tf.placeholder(tf.float32, [None, 20])

    prediction = add_layer(xs,
                           features_num,
                           20,
                           activation_function=tf.nn.softmax)

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=ys))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(
        cross_entropy)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    batchsize = 100

    def get_batch(vec, lab, i):
        batches = []
        results = []
        texts = vec[i * batchsize:i * batchsize + batchsize]
        categories = lab[i * batchsize:i * batchsize + batchsize]
        for text in texts:
            features = numpy.zeros((features_num), dtype=float)
            for i in range(features_num):
                features[i] = text[0, i]
            batches.append(features)
        for category in categories:
            y = numpy.zeros((20), dtype=int)
            y[category] = 1
            results.append(y)
        return batches, results

    def getall(vec, lab):
        batches = []
        results = []
        texts = vec[0:8000]
        categories = lab[0:8000]
        for text in texts:
            features = numpy.zeros((features_num), dtype=float)
            for i in range(features_num):
                features[i] = text[0, i]
            batches.append(features)
        for category in categories:
            y = numpy.zeros((20), dtype=int)
            y[category] = 1
            results.append(y)
        return batches, results

    for i in range(1000):
        print(i)
        batch_xs, batch_ys = get_batch(newvec, lab, i)
        if (len(batch_xs) == 0):
            break
        sess.run(train_step, feed_dict={xs: batch_xs, ys: batch_ys})

    all_xs, all_ys = getall(newvec, lab)
    print(compute_accuracy(all_xs, all_ys))
Example #36
#first extract the 20 news_group dataset to /scikit_learn_data
from sklearn.datasets import fetch_20newsgroups
#all categories
#newsgroup_train = fetch_20newsgroups(subset='train')
#part categories
categories = ['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x'];
newsgroup_train = fetch_20newsgroups(subset = 'train',categories = categories);

def calculate_result(actual,pred):
    m_precision = metrics.precision_score(actual,pred,average='macro');
    m_recall = metrics.recall_score(actual,pred,average='macro');
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall);
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual,pred,average='macro'));
    

#print category names
from pprint import pprint
pprint(list(newsgroup_train.target_names))



# newsgroup_train.data contains the original documents, but we need to extract the
# TF-IDF vectors in order to model the text data
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
#vectorizer = TfidfVectorizer(sublinear_tf = True,
# TextBlob is a wrapper over NLTK and provides easy-to-use built-in functions
# and methods

from nltk.corpus import names
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# download the dataset
from sklearn.datasets import fetch_20newsgroups
# see the unique
import numpy as np

from sklearn.decomposition import NMF

# download the dataset
groups = fetch_20newsgroups()
ps = PorterStemmer()
lm = WordNetLemmatizer()

# let's check a few names
print(names.words()[:20])

# The difference between stemming and lemmatization is that lemmatization is a
# more cautious version of stemming
# examples
ps.stem("machines")
ps.stem('learning')
# the lemmatization algorithm is based on the built-in WordNet corpus
lm.lemmatize('machines')
lm.lemmatize('learning')  # lm defaults to nouns, so verbs are not reduced
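# A quick check of the note above (a sketch; outputs assume a standard WordNet
# installation): lemmatization defaults to the noun POS, so pass pos='v' for verbs.
print(lm.lemmatize('learning'))           # 'learning' (treated as a noun)
print(lm.lemmatize('learning', pos='v'))  # 'learn' (lemmatized as a verb)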
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.decomposition import PCA
from time import time
import matplotlib.pyplot as plt
from sklearn.utils.extmath import density
from sklearn import metrics

# #############################################################################
# Load data set

print("Loading 20 newsgroups dataset:")
data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
print('data loaded')

target_names = data_train.target_names

print("%d documents (training set)" % len(data_train.data))
print("%d documents (test set)" % len(data_test.data))
print("%d categories" % len(data_train.target_names))
print()

# #############################################################################
# split into train set and test set
y_train, y_test = data_train.target, data_test.target

# #############################################################################
Example #39
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]

if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

data_train = fetch_20newsgroups(subset='train',
                                categories=categories,
                                shuffle=True,
                                random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test',
                               categories=categories,
                               shuffle=True,
                               random_state=42,
                               remove=remove)
print('data loaded')

target_names = data_train.target_names


def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6
Example #40
# coding: utf-8

# In[2]:

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups()
data.target_names

# In[3]:

categories = [
    'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
    'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
    'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
    'talk.politics.misc', 'talk.religion.misc'
]

# In[4]:

## Training the data on these categories

train = fetch_20newsgroups(subset="train", categories=categories)

## testing the data for these categories
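# (the notebook is cut off here; the matching test-set load would presumably be)
test = fetch_20newsgroups(subset="test", categories=categories)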
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

if __name__ == '__main__':
    # Fetch the dataset
    data = fetch_20newsgroups(subset="all")
    texts = np.array(data.data)
    labels = np.array(data.target)
    df = pd.DataFrame(data={'texts': texts, 'labels': labels})
    df.to_csv('20newsgroups.csv', index=False)
    # Read split indices
    with open('splits/test', 'r') as f:
        test_idx = np.array(list(map(int, f.read().splitlines())))
    with open('splits/validation', 'r') as f:
        validation_idx = np.array(list(map(int, f.read().splitlines())))

    assert not set(test_idx).intersection(set(validation_idx))

    test_texts, test_labels = texts[test_idx], labels[test_idx]
    val_texts, val_labels = texts[validation_idx], labels[validation_idx]

    concat_idx = np.append(test_idx, validation_idx)
    texts, labels = np.delete(texts, concat_idx), np.delete(labels, concat_idx)

    df_test = pd.DataFrame(data={'texts': test_texts, 'labels': test_labels})
    df_val = pd.DataFrame(data={'texts': val_texts, 'labels': val_labels})

    # the test set is also used for training because this is unsupervised learning
    df_test.to_csv('20newsgroups_train.csv', index=False)
    df_val.to_csv('20newsgroups_val.csv', index=False)
Example #42
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target
from bs4 import BeautifulSoup

import nltk, re


def news_to_sentences(news):
    news_text = BeautifulSoup(news).get_text()

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)

    sentences = []

    for sent in raw_sentences:
        sentences.append(
            re.sub('[^a-zA-Z]', ' ',
                   sent.lower().strip()).split())

    return sentences


sentences = []

for x in X:
    sentences += news_to_sentences(x)

from gensim.models import word2vec
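# (the snippet is cut off at the import above; training typically continues along
#  these lines -- the parameters here are illustrative, not the original's)
model = word2vec.Word2Vec(sentences, workers=2, size=300, min_count=20, window=5)
model.init_sims(replace=True)
print(model.most_similar('email'))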
Example #43
    loss = torch.mul(x, weight)
    return loss.mean()


def train_model(data: GloveDataset):
    optimizer = torch.optim.Adam(data.all_params, weight_decay=1e-8)
    optimizer.zero_grad()
    for epoch in tqdm(range(NUM_EPOCH)):
        logging.info("Start epoch %i", epoch)
        num_batches = int(len(data) / BATCH_SIZE)
        avg_loss = 0.0
        n_batch = int(len(data) / BATCH_SIZE)
        for batch in tqdm(gen_batchs(data), total=n_batch, mininterval=1):
            optimizer.zero_grad()
            loss = get_loss(*batch)
            avg_loss += loss.data.item() / num_batches
            loss.backward()
            optimizer.step()
        logging.info("Average loss for epoch %i: %.5f", epoch + 1, avg_loss)


if __name__ == "__main__":
    logging.info("Fetching data")
    newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
    logging.info("Build dataset")
    glove_data = GloveDataset(newsgroup.data, right_window=RIGHT_WINDOW)
    logging.info("#Words: %s", glove_data.indexer.n_words)
    logging.info("#Ngrams: %s", len(glove_data))
    logging.info("Start training")
    train_model(glove_data)
# coding: utf-8

# # Part 1 - Clustering of Text Data

# ## Question 1: Build TF-IDF Matrix

# In[1]:


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 
              'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

vectorizer = CountVectorizer(min_df=3, stop_words='english')
X_counts = vectorizer.fit_transform(dataset.data)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

labels = [0 if label<4 else 1 for label in dataset.target]

print('X_tfidf Shape:', X_tfidf.shape)


# ## Question 2: Contingency Table of K-means Clustering

# In[2]:
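# (the notebook is truncated here; a minimal sketch of what Question 2 asks for,
#  assuming X_tfidf and labels from the cell above -- not the author's exact code)
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import contingency_matrix

km = KMeans(n_clusters=2, random_state=0)
cluster_labels = km.fit_predict(X_tfidf)
print(contingency_matrix(labels, cluster_labels))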
Example #45
from sklearn import datasets
from sklearn.cross_validation import KFold
from sklearn.feature_extraction import text
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
'''
To begin, you will need to load the data.
In this assignment we use one of the datasets available in scikit-learn: 20 newsgroups.
To do this, use the datasets module:
1. Load the objects from the 20 newsgroups dataset that belong to the categories "space" and "atheism"
   (the instructions are given above). Note that downloading the data may take several minutes.
'''
print("Loading dataset...")
t0 = time()
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
    download_if_missing=True)
data_samples = newsgroups.data
print("done in %0.3fs." % (time() - t0))
'''
After running this code, the array of texts will be in the newsgroups.data field
and the class label in the newsgroups.target field.

One of the difficulties of working with text data is that a numeric representation has to be built for it.
One way to obtain such a representation is to compute TF-IDF.
In scikit-learn this is implemented in the class sklearn.feature_extraction.text.TfidfVectorizer.
The training set must be transformed with fit_transform, the test set with transform.

The SVM classifier implementation is in the class sklearn.svm.SVC.
The weights of each feature of a trained classifier are stored in the coef_ field.
To understand which word the i-th feature corresponds to,
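# A hedged sketch of the step described above (not the original assignment code):
# fit a linear SVC on the TF-IDF matrix and map the weights in coef_ back to
# words through the vectorizer's vocabulary.
vectorizer = text.TfidfVectorizer()
X = vectorizer.fit_transform(data_samples)
clf = SVC(kernel='linear')
clf.fit(X, newsgroups.target)
weights = abs(clf.coef_.toarray()[0])      # one weight per TF-IDF feature
top10 = weights.argsort()[-10:]            # indices of the ten largest weights
words = vectorizer.get_feature_names()
print(sorted(words[i] for i in top10))     # the ten most influential words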
Example #46
print "This run will use min_df=" + str(this_df)

print "numpy version: " + np.__version__
print "sklearn version: " + skl.__version__
print "matplotlib version: " + mpl.__version__
print "nltk version: " + nltk.__version__

categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey'
]

trainset = fetch_20newsgroups(subset='train',
                              categories=categories,
                              shuffle=True,
                              random_state=42)
testset = fetch_20newsgroups(subset='test',
                             categories=categories,
                             shuffle=True,
                             random_state=42)

#plot the histogram for part a
#plt.hist(trainset.target, bins=range(min(trainset.target), (max(trainset.target) + 2)))
#plt.xlabel('Article target number')
#plt.ylabel('Number of articles target number')
#plt.title('Histogram of Article distribution, training set')
#plt.show()
#
#plt.hist(testset.target, bins=range(min(testset.target), (max(testset.target) + 2)))
#plt.xlabel('Article target number')
Example #47

#=================================Obtain data==================================
comp_tech_subclasses = ['comp.graphics', 
                        'comp.os.ms-windows.misc', 
                        'comp.sys.ibm.pc.hardware', 
                        'comp.sys.mac.hardware']
                        
rec_act_subclasses = ['rec.autos', 
                      'rec.motorcycles', 
                      'rec.sport.baseball', 
                      'rec.sport.hockey']
  
dataset = fetch_20newsgroups(subset='all',
                             categories=comp_tech_subclasses+rec_act_subclasses,
                             shuffle=True,
                             random_state=42,
                             remove=('headers', 'footers', 'quotes'))


labels = [1]*len(dataset.data)
for i in range(len(dataset.data)):
    if dataset.target[i] > 3:
        labels[i] = 0
#==============================================================================





#===================Remove Punctuation & Stem & Stop Words=====================
Example #48
from sklearn.datasets import fetch_20newsgroups

Training_data = fetch_20newsgroups(subset='train', shuffle=True)
Training_data.target_names

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

svm_classification = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   n_iter=5,
                   random_state=42)),
])
svm_classification = svm_classification.fit(Training_data.data,
                                            Training_data.target)

import numpy as np

Testing_data = fetch_20newsgroups(subset='test', shuffle=True)
svm_prediction = svm_classification.predict(Testing_data.data)
print("Accuracy of Support Vector Machine in percentage :",
      np.mean(svm_prediction == Testing_data.target) * 100)
Example #49
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
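# (cut off in the source; in scikit-learn's topic-extraction example this comment is
#  followed by a CountVectorizer mirroring the TfidfVectorizer above -- assumes
#  CountVectorizer is imported alongside TfidfVectorizer)
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))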
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups



newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target




text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])


text_clf.fit(X_train, y_train)




predicted = text_clf.predict(X_test)
Example #51
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import GridSearchCV, KFold
# TF-IDF is a statistic equal to the product of two numbers:
# TF (term frequency) and IDF (inverse document frequency).
# The first is the ratio of the number of occurrences of a word in a document to the total length of the document.
# The second depends on how many documents in the collection contain that word.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# the array of texts will be in the newsgroups.data field,
# the class label in the newsgroups.target field.
newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

tfid_newsgroups = TfidfVectorizer()
data_params = tfid_newsgroups.fit_transform(newsgroups.data)

# To find out which word the i-th feature corresponds to,
# use the get_feature_names() method of TfidfVectorizer:
feature_mapping = tfid_newsgroups.get_feature_names()

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = svm.SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(data_params, newsgroups.target)

clf = svm.SVC(kernel='linear', C=gs.best_estimator_.C, random_state=241)
clf.fit(data_params, newsgroups.target)
    'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
    'sci.med', 'sci.space', 'misc.forsale', 'talk.politics.misc',
    'talk.politics.guns', 'talk.politics.mideast', 'talk.religion.misc',
    'alt.atheism', 'soc.religion.christian'
]

# Print information
print("Loading 20 newsgroups dataset for categories...")
print(categories)
print()

# Load dataset and split two groups
dataset = fetch_20newsgroups(subset='all',
                             categories=categories,
                             shuffle=True,
                             random_state=42,
                             remove=('headers', 'footers', 'quotes'))
size = dataset.target.shape[0]
for i in range(0, size):
    if (dataset.target[i] <= 4):
        dataset.target[i] = 0
    if (5 <= dataset.target[i] and dataset.target[i] <= 8):
        dataset.target[i] = 1
    if (9 <= dataset.target[i] and dataset.target[i] <= 12):
        dataset.target[i] = 2
    if (dataset.target[i] == 13):
        dataset.target[i] = 3
    if (14 <= dataset.target[i] and dataset.target[i] <= 16):
        dataset.target[i] = 4
    if (17 <= dataset.target[i] and dataset.target[i] <= 19):
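        # (cut off in the source; following the pattern above, presumably)
        dataset.target[i] = 5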
Example #53
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt

categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey'
]
train_data = fetch_20newsgroups(subset='train',
                                categories=categories,
                                shuffle=True,
                                random_state=42)

length = []
data = []
index = []

for m in range(8):
    temp_index = []
    temp_index.append(list(np.where(train_data.target == m))[0])
    index.append(temp_index)
    temp_data = []
    for n in index[m][0]:
        temp_data.append(train_data.data[n])
    data.append(temp_data)
    length.append(len(temp_data))

plt.figure()
plt_index = range(8)
width = 1
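# (truncated here; presumably the bar chart of documents per category, e.g.)
plt.bar(plt_index, length, width)
plt.xlabel('Category index')
plt.ylabel('Number of documents')
plt.show()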
Example #54
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
from sklearn.datasets import fetch_20newsgroups
import re
import matplotlib.pyplot as plt

# download example data ( may take a while)
train = fetch_20newsgroups()


def clean(text):
    """Remove posting header, split by sentences and words, keep only letters"""
    lines = re.split('[?!.:]\s',
                     re.sub('^.*Lines: \d+', '', re.sub('\n', ' ', text)))
    return [re.sub('[^a-zA-Z]', ' ', line).lower().split() for line in lines]


sentences = [line for text in train.data for line in clean(text)]

model = Word2Vec(sentences,
                 workers=4,
                 size=100,
                 min_count=50,
                 window=10,
                 sample=1e-3)

print(model.most_similar('memory'))

X = model[model.wv.vocab]

tsne = TSNE(n_components=2)
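# (truncated; the usual next step is to project the word vectors and plot them)
X_2d = tsne.fit_transform(X)
plt.scatter(X_2d[:, 0], X_2d[:, 1], s=3)
plt.show()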
Example #55
def getData(categories, subset, shuffle, random_state):
    return fetch_20newsgroups(subset=subset,
                              categories=categories,
                              shuffle=shuffle,
                              random_state=random_state)
Example #56
                ],

                # weight components in ColumnTransformer
                transformer_weights={
                    'subject': 0.8,
                    'body_bow': 0.5,
                    'body_stats': 1.0,
                })),

        # Use a SVC classifier on the combined features
        ('svc', LinearSVC()),
    ],
    verbose=True)

# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(
    random_state=1,
    subset='train',
    categories=categories,
)
test = fetch_20newsgroups(
    random_state=1,
    subset='test',
    categories=categories,
)

pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
print(classification_report(y, test.target))
            pred = self.w @ vector
            # print(pred)

            pred = self.sigmoid(pred)
            labels.append(np.argmax(pred))
        return labels

    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def loss(self, label, pred):
        return np.mean(np.square(label - pred))


newsgroups_train = fetch_20newsgroups(subset='train',
                                      shuffle=True,
                                      random_state=42,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     shuffle=True,
                                     random_state=42,
                                     remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
vectorizer.fit(newsgroups_train.data)
train_vector = vectorizer.transform(newsgroups_train.data)
test_vector = vectorizer.transform(newsgroups_test.data)

decomp = decomposition.TruncatedSVD(n_components=300)
train_pca = decomp.fit_transform(train_vector)
test_pca = decomp.transform(test_vector)
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec
from nltk import sent_tokenize, word_tokenize

newsgroups_data = fetch_20newsgroups(subset='all')
sentences = []
for doc in newsgroups_data.data:
    for sent in sent_tokenize(doc):
        word_list = word_tokenize(sent)
        sentences.append(word_list)

# reference https://radimrehurek.com/gensim/models/word2vec.html
print('Start training!')
model = Word2Vec(sentences, sg=1, hs=0, size=100, min_count=5, max_vocab_size=50000)  # skip-gram with negative sampling
model.save('20news-vectors-negative100.model')
model.wv.save_word2vec_format('20news-vectors-negative100.bin', binary=True)
print('Done training!')
    return int(newsgroups_counts[:, count_vectorizer.vocabulary_["phone"]].
               toarray().sum())


# ## Question 7
#
# Apply `TfidfVectorizer` to the `newsgroups` _data set_ and find the TF-IDF of the word _phone_. Answer with a single scalar rounded to three decimal places.

# In[28]:

tfidf_vectorizer = TfidfVectorizer()

categories = ['sci.electronics', 'comp.graphics', 'rec.motorcycles']

newsgroup = fetch_20newsgroups(subset="train",
                               categories=categories,
                               shuffle=True,
                               random_state=42)

tfidf_vectorizer.fit(newsgroup.data)

newsgroups_tfidf_vectorized = tfidf_vectorizer.transform(newsgroup.data)

# In[29]:


def q7():

    idf_value = newsgroups_tfidf_vectorized[:, tfidf_vectorizer.
                                            vocabulary_["phone"]].toarray(
                                            ).sum()
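    # (cut off in the source; presumably the value is returned rounded to three
    #  decimal places, as the question above asks)
    return float(round(idf_value, 3))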
Example #60
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)