Example #1
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

def getData():
    train_data = load_files('training')
    test_data = load_files('test')
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)  # note: transform here, not fit_transform
    return doc_train.toarray(), train_data.target, doc_test.toarray()
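A minimal usage sketch for getData() above; the classifier choice (MultinomialNB) is an illustrative assumption, not part of the original example.

from sklearn.naive_bayes import MultinomialNB

# Hypothetical usage: train on the dense TF-IDF matrix and predict the test documents.
doc_train, train_target, doc_test = getData()
clf = MultinomialNB().fit(doc_train, train_target)
predictions = clf.predict(doc_test)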
Example #2
def export_classifier():
    #note that this data is not in the git repo
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')

    # Turn the text documents into vectors of word frequencies
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target

    # Fit a classifier on the training set
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))

    # Evaluate the classifier on the testing set
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))
    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
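export_pickle is not defined in this snippet; a plausible stand-in, assuming it simply serializes the object to disk with pickle (hypothetical helper, not the project's actual implementation):

import pickle

def export_pickle(path, obj):
    # Hypothetical helper: serialize the fitted vectorizer/classifier to a file.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)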
Example #3
File: lr.py Project: chen33/nlp
def getData():
	train_data = load_files('dataset/train')
	test_data = load_files('dataset/test')
	count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
	doc_train = count_Vec.fit_transform(train_data.data)
	doc_test = count_Vec.transform(test_data.data)
	return doc_train.toarray(), train_data.target, doc_test.toarray(), test_data.target
def createDataSet(train_path,test_path,category,k):
	"""
	create vectorized text feature
    '0' refer to 'atheism'
    '1' refer to 'sports'

	"""
	train_set = datasets.load_files(train_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0)

	count_vect = CountVectorizer(encoding = 'utf-8',lowercase = True,
	 decode_error = 'ignore',  analyzer = 'word', ngram_range = (2,4),min_df = 1)
	
	tfidf_vecter = TfidfVectorizer( max_df = 0.8, stop_words = 'english')

	test_set = datasets.load_files(test_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8',  decode_error='ignore', random_state=0)

	

	X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
	X_train_counts = count_vect.fit_transform(train_set.data)

	X_test_tfidf = tfidf_vecter.transform(test_set.data)
	X_test_counts = count_vect.transform(test_set.data)


	 
	for i in range(X_train_counts.shape[0]):
		if train_set.target[i] == k:
			train_set.target[i] = 1
		else:
			train_set.target[i] = -1

	for i in range(X_test_counts.shape[0]):
		if test_set.target[i] == k:
			test_set.target[i] = 1
		else:
			test_set.target[i] = -1

	
	
	#X_train_normalize = preprocessing.normalize(X_train_counts, norm = 'l2')

	#print train_set.target_names
	#print train_set.target
	#print size
	#print len(train_set.target)

	#print X_train_tfidf.shape
	#print X_train_counts
	#print X_train_normalize

	return X_train_counts, train_set.target, X_train_counts.shape, X_test_counts, test_set.target, X_test_counts.shape
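The two relabelling loops above can also be written in vectorized form. A sketch, assuming train_set.target and test_set.target are NumPy integer arrays (which load_files returns):

import numpy as np

# Map class k to +1 and every other class to -1 without an explicit loop.
train_labels = np.where(train_set.target == k, 1, -1)
test_labels = np.where(test_set.target == k, 1, -1)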
def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train

    elif dataset == 'split':    
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
Example #6
def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test", categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
def test_grid_search_cv_on_newsgroup():
    ## load news group data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    twenty_train_small = load_files('./data/20news-bydate-train/',
        categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
        categories=categories, charset='latin-1')
    ## model pipeline using tfidf and passive aggressive
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False]
    }
    X, y = twenty_train_small.data, twenty_train_small.target
    ## cross validation on n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass#grid_searcher.abort()
    print len(grid_searcher.partial_result())
    ## run again with naive bayesian
    ## no need to persist_cv_splits
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles = grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass#grid_searcher10.abort()
    print len(grid_searcher10.partial_result())    
Example #8
def main():
    #buildTrainSet()
    #buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.cc
#    for l in train.target_names:
#        print l
#    for l in train.target:
#        print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    X_train = vectorizer.fit_transform(train.data)  # fit on the documents, not the Bunch object
    X_test = vectorizer.transform(test.data)  # reuse the fitted vocabulary; do not refit on the test set
    print vectorizer.get_feature_names()
Example #9
def vector_for_input(train_file_path=path1,
                     test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])

    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
def test_docs(dir):
	# Load documents
	docs = datasets.load_files(container_path="../../sklearn_data/"+dir)
	X, y = docs.data, docs.target

	baseline = 1/float(len(list(np.unique(y))))

	# Select Features via Bag of Words approach without stop words
	#X = CountVectorizer(charset_error='ignore', stop_words='english', strip_accents='unicode', ).fit_transform(X)
	X = TfidfVectorizer(charset_error='ignore', stop_words='english', analyzer='char', ngram_range=(2,4), strip_accents='unicode', sublinear_tf=True, max_df=0.5).fit_transform(X)
	n_samples, n_features = X.shape


	# sklearn's grid search
	parameters = { 'alpha': np.logspace(-100,0,10)}

	bv = Bootstrap(n_samples, n_iter=10, test_size=0.3, random_state=42)
	mnb_gv = GridSearchCV(MultinomialNB(), parameters, cv=bv,)
	#scores = cross_val_score(mnb_gv, X, y, cv=bv)
	mnb_gv.fit(X, y)
	mnb_gv_best_params = mnb_gv.best_params_.values()[0]
	print mnb_gv.best_score_
	print mnb_gv_best_params

	# CV with Bootstrap
	mnb = MultinomialNB(alpha=mnb_gv_best_params)
	boot_scores = cross_val_score(mnb, X, y, cv=bv)
	print mean_sem(boot_scores)

	improvement = (mnb_gv.best_score_ - baseline) / baseline

	rand_baseline.append(baseline)
	test_results.append([mnb_gv.best_score_])
	com_results.append(improvement)
	sem_results.append(sem(boot_scores))
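Bootstrap has since been removed from scikit-learn; a sketch of the closest modern replacement for the cross-validation object above would be ShuffleSplit (a swapped-in splitter, not the original author's choice):

from sklearn.model_selection import ShuffleSplit

# Randomized train/test splits: 10 iterations, 30% of the samples held out, fixed seed.
bv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)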
Example #11
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if param_search:
        # make_pipeline names its steps after the lowercased class names
        params = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                  'logisticregression__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_

    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
def load_data():
    # Download the data and unpack it into the ./data/txt_sentoken folder
    # "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz")
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    return dataset
def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split("/").pop()
    res = load_files(LOAD_FILES_ROOT, description="test", categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
def load_docs(path):
    dataset = load_files(path)  # use the path argument rather than the global args.train_path
    docs = []
    for raw_data in dataset.data:
        docs.append(json.loads(raw_data))
    dataset.data = docs
    return dataset
Example #15
def importData(datadirectory):
	#categories = ['n','u', 'y']
	categories = ['n', 'y']

	data = load_files(datadirectory,categories=categories, shuffle=True, random_state=42, encoding='latin-1') 
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(data.data, data.target, test_size = 0.4, random_state=0)
	print X_train 
	# count_vect = CountVectorizer()
	# X_train_vec = count_vect.fit_transform(X_train)
	# X_test_vec = count_vect.fit_transform(X_test)
	# clf = svm.SVC(kernel='linear', C=1).fit(X_train_vec, y_train)
	# clf.score(X_test_vec, y_test) 

	text_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
	#print text_clf.named_steps['clf']
	print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' Tfidf NB'
	#array([ 0.62376238,  0.57      ,  0.6122449 ])
	text_clf = Pipeline([('vect', CountVectorizer()),('clf', MultinomialNB()),]) 
	print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' CountVec NB'
	#array([ 0.56435644,  0.5       ,  0.57142857])
	clf = Pipeline([('vect', CountVectorizer()), ('svm', LinearSVC())])                        
	print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' CountVec SVM'
	#array([ 0.55445545,  0.48      ,  0.54081633])
	clf = Pipeline([('vect', TfidfVectorizer()), ('svm', LinearSVC())])                    
	print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' Tfidf SVM'
	#array([ 0.62376238,  0.57      ,  0.6122449 ])
	clf_sgdc = Pipeline([('vect', CountVectorizer()),('clf', linear_model.SGDClassifier()),])
	print str(sum(cross_val_score(clf_sgdc, data.data,data.target ))/3.0) + ' SGDC' 
Example #16
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342, \
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]

    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                              test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))

    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target

    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
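The caching calls above rely on the interpreter to close the file handles; a sketch of the same step written with context managers (same file names, only the bookkeeping differs):

import pickle

# Write each cached object through a with-block so the files are closed deterministically.
for fname, obj in [('SRAA_X_train.pickle', X_tr), ('SRAA_y_train.pickle', y_tr),
                   ('SRAA_X_test.pickle', X_te), ('SRAA_y_test.pickle', y_te)]:
    with open(fname, 'wb') as f:
        pickle.dump(obj, f)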
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # train a MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
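Assigning to vectorizer.fixed_vocabulary and vectorizer.vocabulary_ reaches into CountVectorizer internals; a sketch of the documented way to pin the vocabulary is to pass it to the constructor (reusing term_dict from above):

from sklearn.feature_extraction.text import CountVectorizer

# Build a second vectorizer restricted to the selected feature terms.
vsm_vectorizer = CountVectorizer(binary=True, vocabulary=term_dict)
doc_train_vec = vsm_vectorizer.fit_transform(doc_str_list_train)
doc_test_vec = vsm_vectorizer.transform(doc_str_list_test)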
Example #18
def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    res = load_files(load_files_root)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
Example #19
def testdata_stats():
    test_dataset = datasets.load_files(project_root + "/testdata",
                                       encoding='utf-8',
                                       decode_error='ignore')

    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)

    print "*****BAYESIAN STATS****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_nb == test_dataset.target))

    print(metrics.classification_report(test_dataset.target, predicted_nb,
    target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)

    print "*****SVM STATS*****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
    target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
Example #20
def test_load_files_wo_load_content(
        test_category_dir_1, test_category_dir_2, load_files_root):
    res = load_files(load_files_root, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
Example #21
def runClassifiers (dataDir):
    
    data = load_files(dataDir)

    nbClassifier = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('classifier', MultinomialNB())])
    
    parameters = {'vect__ngram_range': [(1,1),(2,2),(3,3),(1,2),(1,3)],
                  'vect__binary': [True, False],
                  'tfidf__use_idf': [True, False],
                  'classifier__alpha': [1e-2, 1e-3]}
    
    gs = GridSearchCV(nbClassifier, parameters, n_jobs=-1, verbose=1)
    gs.fit(data.data, data.target)
    best_parameters = gs.best_estimator_.get_params()
    
    print("Best score: %0.3f" % gs.best_score_)
    for params, mean_score, scores in gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Done")
    
    pass
Example #22
    def __init__(self, file_path):
        self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.file_path = file_path
Example #23
def text_sentiment(docs_new):
   docs_new = [docs_new]
   twenty_train = load_files('./Sentiment')  # the complete training data lives in this directory, one subfolder per class (like comp.graphics etc.)
   count_vect = CountVectorizer()
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted[0]]  # predict() returns an array, so take its single element
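Because the function refits CountVectorizer on the training corpus at every call while loading a previously pickled classifier, the vocabulary can drift out of sync with the model. A sketch of persisting the fitted vectorizer and transformer alongside the classifier (the file name is an assumption):

import pickle

# Hypothetical: store vectorizer, tf-idf transformer and classifier together so new
# documents are encoded exactly as the classifier saw them at training time.
with open('my_sentiment_pipeline.pkl', 'wb') as fid:
    pickle.dump((count_vect, tfidf_transformer, clf), fid)

with open('my_sentiment_pipeline.pkl', 'rb') as fid:
    count_vect, tfidf_transformer, clf = pickle.load(fid)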
Example #24
def get_inbin_depth(which_set):
    if which_set not in ('train', 'test'):
        raise ValueError

    data_dir = os.path.join(here, '../data/inbin_depth_{0}'.format(which_set))
    data = load_files(data_dir, load_content=False, shuffle=False)
    return data
    def train(self):
        """Loading and Training classifier"""
        # Load dataset
        categories = ['neg', 'pos']
        self.train_set = load_files('resources/sentimentDataset/train/', categories=categories, encoding='latin-1')
        self.test_set = load_files('resources/sentimentDataset/test/', categories=categories, encoding='latin-1')

        #Tokenizing text with scikit-learn
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(self.train_set.data)

        # occurrences to frequencies
        tfidf_transformer = TfidfTransformer()
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

        # Pipeline
        self.text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
        self.text_clf.fit(self.train_set.data, self.train_set.target)
Example #26
def test_load_files_w_categories_desc_and_encoding(
        test_category_dir_1, test_category_dir_2, load_files_root):
    category = os.path.abspath(test_category_dir_1).split('/').pop()
    res = load_files(load_files_root, description="test",
                     categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
def load_data(data_path, data_categories):
    return load_files(container_path=data_path,
                      description=None,
                      categories=data_categories,
                      load_content=True,
                      shuffle=True,
                      encoding='latin-1',
                      decode_error='strict',
                      random_state=randint(0, 999999))
Example #28
def test_default_load_files():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.data, [b("Hello World!\n")])
    finally:
        teardown_load_files()
Example #29
def test_load_files_wo_load_content():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT, load_content=False)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.get('data'), None)
    finally:
        teardown_load_files()
Example #30
def build_histogram(path="./data", name="hist"):

    # here we create a Bunch object ['target_names', 'data', 'target', 'DESCR', 'filenames']
    raw_bunch = datasets.load_files(path, description=None, categories=None, load_content=True,
                                    shuffle=True, encoding='utf-8', decode_error='replace')
    quantities = {author: 0 for author in list(raw_bunch['target_names'])}
    for i in list(raw_bunch['target']):
        quantities[list(raw_bunch['target_names'])[i]]+=1
    plt.figure(figsize=(17, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.bar(range(len(quantities)), quantities.values(), align='center')
    plt.xticks(range(len(quantities)), quantities.keys())
    plt.savefig(name + '.png')
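The counting loop above can also be expressed with collections.Counter; a sketch assuming the same raw_bunch returned by load_files:

from collections import Counter

# Count documents per author by mapping each target index to its target name.
names = list(raw_bunch['target_names'])
quantities = Counter(names[i] for i in raw_bunch['target'])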
Example #31
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.regularizers import l1

from keras import backend as K

import numpy as np

documents = load_files('../TEXTDATA/', shuffle=False)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(documents.data,
                                                    documents.target,
                                                    test_size=0.15)

#import code
#code.interact(local=locals())

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
Example #32
def test_default_empty_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 0)
    assert_equal(len(res.target_names), 0)
    assert_equal(res.DESCR, None)
Example #33
def test_load_files_wo_load_content():
    res = load_files(LOAD_FILES_ROOT, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
from nltk.corpus import stopwords
import nltk
import re
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_auc_score

import seaborn as sns

Stop_words = stopwords.words('english')

dataset = load_files('Web Page Classification/')

# build our dataset
# X: documents
# y: class labels

X, y = dataset.data, dataset.target

from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

corpus = []  # we store the documents in corpus
# but first do some preprocessing (strip whitespace, punctuation, stop words such as 'a', 'an')

for i in range(0, len(X)):
Example #35
# movie reviews

import numpy as np

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

resumedir = 'data/resumes_and_reviews'
resumes = load_files(resumedir, shuffle=True)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    resumes.data, resumes.target, test_size=0.50
)

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
resume_matrix = X_train_tfidf[y_train == 0]
top_words = []

svd = TruncatedSVD(n_components=20)
Example #36
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression


ps = PorterStemmer()
bookdir = r'Book'
# loading all files as training data.
book_train = load_files(bookdir, shuffle=True)
#print(book_train.data)
# target names ("classes") are automatically generated from subfolder names
#print(book_train.target_names)
#print(book_train.filenames)

#nltk.download('sentiwordnet')

stopwd  = set(sw.words('english'))

lemmatizer = WordNetLemmatizer()

def lemmatize_texte( token, tag, normalize):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
Example #37
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sklearn.datasets import load_files
# for reproducibility
random_state = 0

print(
    "[+] NLP Clustering by https://sanjayasubedi.com.np/nlp/nlp-with-python-document-clustering/"
)

DATA_DIR = "./bbc/"
print("[+] Load files")
data = load_files(DATA_DIR,
                  encoding="utf-8",
                  decode_error="replace",
                  random_state=random_state)
df = pd.DataFrame(list(zip(data['data'], data['target'])),
                  columns=['text', 'label'])
df.head()

print("[+] Calculate tf-id")
# Learn vocabolary
vec = TfidfVectorizer(stop_words="english")
vec.fit(df.text.values)

print("[+] Define cluster")
# Create the model
cls = MiniBatchKMeans(n_clusters=5, random_state=random_state)
features = vec.transform(df.text.values)
cls.fit(features)
Example #38
def load_dataset(path):
    data = load_files(path)
    dog_files = np.array(data['filenames'])
    dog_targets = np_utils.to_categorical(np.array(data['target']), 120)
    return dog_files, dog_targets
from sklearn.neighbors import KNeighborsClassifier

stemmer = EnglishStemmer()


def stemming_tokenizer(text):
    stemmed_text = [
        stemmer.stem(word) for word in word_tokenize(text, language='english')
    ]
    return stemmed_text


data_folder_training_set = "./Training"
data_folder_test_set = "./Test"

training_dataset = load_files(data_folder_training_set)
test_dataset = load_files(data_folder_test_set)

print(training_dataset.target_names)

# Load Training-Set
X_train, X_test_DUMMY_to_ignore, Y_train, Y_test_DUMMY_to_ignore = train_test_split(
    training_dataset.data, training_dataset.target, test_size=0.0)
target_names = training_dataset.target_names

# Load Test-Set
X_train_DUMMY_to_ignore, X_test, Y_train_DUMMY_to_ignore, Y_test = train_test_split(
    test_dataset.data, test_dataset.target, train_size=0.0)
target_names = training_dataset.target_names
print(Y_train.shape)
print(Y_test.shape)
Example #40
# Author: Olivier Grisel <*****@*****.**>
# License: Simplified BSD
# Adapted by: Francesco Mosconi

import numpy as np

from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# The training data folder must be passed as first argument
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    print(ex)
    print(
        "Couldn't import the data, try running `python fetch_data.py` first ")
    exit(-1)

# TASK: Split the dataset in training and test set
# (use 20% of the data for test):
x_train, x_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    test_size=0.20,
                                                    random_state=0)
# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# characters instead of word tokens, using the class TfidfVectorizer
#coding:utf-8
"""
Kmeans算法聚类文本示例
"""

import matplotlib.pyplot as plt
import numpy as np

# Load the text data
from time import time
from sklearn.datasets import load_files
print("loading documents ...")
t = time()
docs = load_files('data/cluster_data')
print("summary: {0} documents in {1} categories.".format(
    len(docs.data), len(docs.target_names)))
print("done in {0} seconds".format(time() - t))

# Vectorized representation of the documents
from sklearn.feature_extraction.text import TfidfVectorizer
max_features = 20000
print("vectorizing documents ...")
t = time()
vectorizer = TfidfVectorizer(max_df=0.4,
                             min_df=2,
                             max_features=max_features,
                             encoding='latin-1')
X = vectorizer.fit_transform((d for d in docs.data))
print("n_samples: %d, n_features: %d" % X.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics

from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam

resumes = load_files('data/actingAndManagerResumes/', shuffle=True)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(resumes.data,
                                                    resumes.target,
                                                    test_size=0.20)

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

pca = TruncatedSVD(n_components=20)
pca.fit(X_train_tfidf)
Example #43
#!/usr/bin/env python
# coding: utf-8

# # Project ML / DSA: Sentiment Analysis with a Hybrid ANN-NB Classifier

# ### Read Feature (Text) and Target (Rating)

# In[1]:

from sklearn.datasets import load_files
import numpy as np

reviews = load_files("dataset", encoding="ISO-8859-1")
texts, rating = reviews.data, reviews.target

# ### Preprocessing

# In[2]:

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
def load_dataset(path):
    data = load_files(path)
    medical_files = np.array(data['filenames'])
    medical_targets = np_utils.to_categorical(np.array(data['target']), 3)
    return medical_files, medical_targets
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.datasets import load_files

import pickle
from nltk.corpus import stopwords

# In[2]:

dataset = load_files(r"C:\Users\ozgur\Desktop\inputset2", encoding="utf-8")
X = dataset.data
y = dataset.target
print(y)

# In[3]:

documents = []
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
Example #46
    return c_clf


# Out-of-core Training
train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
train_pos = os.path.join(train_path, 'pos')
train_neg = os.path.join(train_path, 'neg')

fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] +\
         [os.path.join(train_neg, f) for f in os.listdir(train_neg)]
y_train = np.zeros((len(fnames), ), dtype=int)
y_train[:12500] = 1
np.bincount(y_train)

sgd = SGDClassifier(loss='log', random_state=1)

sgd = batch_train(clf=sgd,
                  fnames=fnames,
                  labels=y_train)


# Testing
test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')
test = load_files(container_path=(test_path),
                  categories=['pos', 'neg'])
docs_test, y_test = test['data'][12500:], test['target'][12500:]

vec = HashingVectorizer(encoding='latin-1')
print('accuracy:', sgd.score(vec.transform(docs_test), y_test))
Example #47
import shutil
from sklearn import preprocessing

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

NUM_QUESTIONS = 3
PLOT_RESULTS = False
ACTIVE = True
DATA_FOLDER = "/home/af/Downloads/movie_review_kfold/review_polarity/activelearning"
TRAIN_FOLDER = os.path.join(DATA_FOLDER, "train")
TEST_FOLDER = os.path.join(DATA_FOLDER, "test")
UNLABELED_FOLDER = os.path.join(DATA_FOLDER, "unlabeled")
ENCODING = 'latin1'
while True:
    data_train = load_files(TRAIN_FOLDER, encoding=ENCODING)
    data_test = load_files(TEST_FOLDER, encoding=ENCODING)
    data_unlabeled = load_files(UNLABELED_FOLDER, encoding=ENCODING)

    categories = data_train.target_names

    def size_mb(docs):
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    data_unlabeled_size_mb = size_mb(data_unlabeled.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
Example #48
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.externals import joblib

# The training data folder must be passed as first argument
languages_data_folder = '/home/janrn/Development/machinelearning/articles'
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                          dataset.target,
                                                          test_size=0.5)

# TASK: Build a vectorizer
clf = Pipeline([('vect', TfidfVectorizer(max_df=0.9, min_df=2)),
                ('clf', LinearSVC())])

# fit the pipeline on training data
clf.fit(docs_train, y_train)

# fit pipeline on all the data (no test)
#clf.fit(dataset.data, dataset.target)

# get category names
# print dataset.target_names
Example #49
def load_dataset(path):
    data = load_files(path)
    print(data)
    part_files = np.array(sorted(glob("data/*")))
    return part_files
def load_dataset(path):
    data = load_files(path)
    mushroom_files = np.array(data['filenames'])
    mushroom_targets = np_utils.to_categorical(np.array(data['target']), 10)
    return mushroom_files, mushroom_targets
Example #51
import copy
import codecs as cs
from sklearn.datasets import load_files
from sklearn import svm
from sklearn import metrics
from random import shuffle
from ficlearn.feature_extraction.text import BnsVectorizer

if __name__ == '__main__':
    print("-----------------------------------------------")
    print("Load corpus and vectorize with BNSVectorizer")
    print("-----------------------------------------------")
    corpus = "corpus6"
    label_names = ['relevant', 'spam']

    notices = load_files(corpus, categories=label_names, load_content=False)
    data = [
        cs.open(filename, 'r', 'UTF-8').read()
        for filename in notices.filenames
    ]
    n_samples = len(data)
    Y = notices.target

    start = int(n_samples / 10)
    step = start
    recalls = []
    precisions = []
    sizes = []
    N_SAMPLES = copy.deepcopy(n_samples)
    for i in range(2, 10, 1):
        sliceIndex = int((i * 0.1 + 0.1) * N_SAMPLES)
# Text Classification using NLP

# Importing the libraries
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

# Importing the dataset
reviews = load_files('class/')
X, y = reviews.data, reviews.target

# Pickling the dataset
with open('X.pickle', 'wb') as f:
    pickle.dump(X, f)

with open('y.pickle', 'wb') as f:
    pickle.dump(y, f)

# Unpickling dataset
X_in = open('X.pickle', 'rb')
y_in = open('y.pickle', 'rb')
X = pickle.load(X_in)
y = pickle.load(y_in)

# Creating the corpus
corpus = []
for i in range(0, 2000):
Example #53
from sklearn.datasets import load_digits
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn import datasets

datasets.load_digits()

cpath = 'D:\\kaggle\\basicshapes\\shapes\\' + 'circles'

datasets.load_files(cpath)
a = [1, 2, 3, 4, 5]
print(len(a))
print(a[1])
a[2] = 8
print(a.append(88), a)

a = [1, 2, 3, 4, 5]
b = [1, 2, 3, 4, 5, 6]

print(a == b)
print(5 in b)
Example #54
    # categories = [
    #     'alt.atheism',
    #     'talk.religion.misc',
    #     'comp.graphics',
    #     'sci.space',
    # ]
    # load dataset
    #print("Loading 20 newsgroups dataset for categories:")
    #print(categories if categories else "all")

    # data_train = fetch_20newsgroups(subset='train', categories=categories,
    #                                 shuffle=True, random_state=42)
    #data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

    #data_test = fetch_20newsgroups(subset='test',shuffle=True, random_state=42)
    data_train = load_files('train')
    data_test = load_files('test')
    print('data loaded')

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    #data_train_size_mb = size_mb(data_train.data)
    #data_test_size_mb = size_mb(data_test.data)

    # print("%d documents - %0.3fMB (training set)" % (
    #     len(data_train.data), data_train_size_mb))
    # print("%d documents - %0.3fMB (test set)" % (
    #     len(data_test.data), data_test_size_mb))
    #print("%d categories" % len(categories))
    #print()
Example #55
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

language_data_folder = r'C:\Users\Imre\PycharmProjects\dirteszt\Webcrawl Excercise\crawl\data'
dataset = load_files(language_data_folder, encoding='utf-8', shuffle=True)

train_data, test_data, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.3)

vect = TfidfVectorizer(max_df=0.95, min_df=3)

my_clf = Pipeline([('vector', vect),
                   ('clf', LinearSVC(C=1000))])

my_clf.fit(train_data, y_train)

y_predicted = my_clf.predict(test_data)

# print(help(metrics.classification_report))
# print(metrics.classification_report(y_test, y_predicted, labels=range(13),
#                                     target_names=dataset.target_names))

test_sentences = ['Az iráni látogatáson lévő White tavalyi letartóztatásáról egy emigráns szervezet adott először hírt\
, ami egy kiszabadult fogolytól azt tudta meg, hogy 2018 októberében, egy Meshed városában lévő börtönben találkozott\
 vele.']

predicted = my_clf.predict(test_sentences)
Example #56
def load_dataset(path):
    data = load_files(path)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    target_labels = np.array(data['target_names'])
    return files, targets, target_labels
Example #57
def load_dataset(path):
    data = load_files(path)
    fire_files = np.array(data['filenames'])
    fire_targets = np_utils.to_categorical(np.array(data['target']),
                                           num_classes)
    return fire_files, fire_targets
Example #58
def test_default_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
Example #59
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics


if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as first argument
    movie_reviews_data_folder = sys.argv[1]
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    '''Extract TF-IDF features from corpus'''
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus


data_directory = 'tweets'
tweets_sent_data = datasets.load_files(data_directory, shuffle=True)
print('{} files loaded.'.format(len(tweets_sent_data.data)))
print('They contain the following classes: {}.'.format(
    tweets_sent_data.target_names))

tweets_tfidf = extract_features(tweets_sent_data.data)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    tweets_tfidf, tweets_sent_data.target, test_size=0.30, random_state=42)


model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
print('Model performance: {}'.format(model.score(X_test, y_test)))

y_pred = model.predict(X_test)